author    Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit    1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv4
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig  411
-rw-r--r--  net/ipv4/Makefile  33
-rw-r--r--  net/ipv4/af_inet.c  1188
-rw-r--r--  net/ipv4/ah4.c  335
-rw-r--r--  net/ipv4/arp.c  1425
-rw-r--r--  net/ipv4/datagram.c  73
-rw-r--r--  net/ipv4/devinet.c  1508
-rw-r--r--  net/ipv4/esp4.c  510
-rw-r--r--  net/ipv4/fib_frontend.c  611
-rw-r--r--  net/ipv4/fib_hash.c  1086
-rw-r--r--  net/ipv4/fib_lookup.h  43
-rw-r--r--  net/ipv4/fib_rules.c  437
-rw-r--r--  net/ipv4/fib_semantics.c  1332
-rw-r--r--  net/ipv4/icmp.c  1143
-rw-r--r--  net/ipv4/igmp.c  2473
-rw-r--r--  net/ipv4/inetpeer.c  460
-rw-r--r--  net/ipv4/ip_forward.c  127
-rw-r--r--  net/ipv4/ip_fragment.c  691
-rw-r--r--  net/ipv4/ip_gre.c  1290
-rw-r--r--  net/ipv4/ip_input.c  431
-rw-r--r--  net/ipv4/ip_options.c  625
-rw-r--r--  net/ipv4/ip_output.c  1359
-rw-r--r--  net/ipv4/ip_sockglue.c  1093
-rw-r--r--  net/ipv4/ipcomp.c  524
-rw-r--r--  net/ipv4/ipconfig.c  1507
-rw-r--r--  net/ipv4/ipip.c  905
-rw-r--r--  net/ipv4/ipmr.c  1900
-rw-r--r--  net/ipv4/ipvs/Kconfig  244
-rw-r--r--  net/ipv4/ipvs/Makefile  34
-rw-r--r--  net/ipv4/ipvs/ip_vs_app.c  658
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c  920
-rw-r--r--  net/ipv4/ipvs/ip_vs_core.c  1191
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c  2391
-rw-r--r--  net/ipv4/ipvs/ip_vs_dh.c  258
-rw-r--r--  net/ipv4/ipvs/ip_vs_est.c  200
-rw-r--r--  net/ipv4/ipvs/ip_vs_ftp.c  400
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblc.c  624
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblcr.c  888
-rw-r--r--  net/ipv4/ipvs/ip_vs_lc.c  123
-rw-r--r--  net/ipv4/ipvs/ip_vs_nq.c  161
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto.c  244
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_ah.c  177
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_esp.c  175
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_icmp.c  182
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_tcp.c  640
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_udp.c  427
-rw-r--r--  net/ipv4/ipvs/ip_vs_rr.c  118
-rw-r--r--  net/ipv4/ipvs/ip_vs_sched.c  251
-rw-r--r--  net/ipv4/ipvs/ip_vs_sed.c  163
-rw-r--r--  net/ipv4/ipvs/ip_vs_sh.c  255
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c  892
-rw-r--r--  net/ipv4/ipvs/ip_vs_wlc.c  151
-rw-r--r--  net/ipv4/ipvs/ip_vs_wrr.c  235
-rw-r--r--  net/ipv4/ipvs/ip_vs_xmit.c  563
-rw-r--r--  net/ipv4/multipath.c  55
-rw-r--r--  net/ipv4/multipath_drr.c  265
-rw-r--r--  net/ipv4/multipath_random.c  128
-rw-r--r--  net/ipv4/multipath_rr.c  115
-rw-r--r--  net/ipv4/multipath_wrandom.c  344
-rw-r--r--  net/ipv4/netfilter/Kconfig  696
-rw-r--r--  net/ipv4/netfilter/Makefile  89
-rw-r--r--  net/ipv4/netfilter/arp_tables.c  1333
-rw-r--r--  net/ipv4/netfilter/arpt_mangle.c  104
-rw-r--r--  net/ipv4/netfilter/arptable_filter.c  214
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_amanda.c  167
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_core.c  1247
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_ftp.c  501
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_irc.c  313
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_generic.c  75
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_icmp.c  279
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_sctp.c  649
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_tcp.c  1098
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_udp.c  146
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_standalone.c  961
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_tftp.c  159
-rw-r--r--  net/ipv4/netfilter/ip_nat_amanda.c  88
-rw-r--r--  net/ipv4/netfilter/ip_nat_core.c  556
-rw-r--r--  net/ipv4/netfilter/ip_nat_ftp.c  183
-rw-r--r--  net/ipv4/netfilter/ip_nat_helper.c  430
-rw-r--r--  net/ipv4/netfilter/ip_nat_irc.c  125
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_icmp.c  115
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_tcp.c  178
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_udp.c  165
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_unknown.c  70
-rw-r--r--  net/ipv4/netfilter/ip_nat_rule.c  319
-rw-r--r--  net/ipv4/netfilter/ip_nat_snmp_basic.c  1347
-rw-r--r--  net/ipv4/netfilter/ip_nat_standalone.c  349
-rw-r--r--  net/ipv4/netfilter/ip_nat_tftp.c  70
-rw-r--r--  net/ipv4/netfilter/ip_queue.c  741
-rw-r--r--  net/ipv4/netfilter/ip_tables.c  1964
-rw-r--r--  net/ipv4/netfilter/ipt_CLASSIFY.c  92
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c  761
-rw-r--r--  net/ipv4/netfilter/ipt_CONNMARK.c  118
-rw-r--r--  net/ipv4/netfilter/ipt_DSCP.c  106
-rw-r--r--  net/ipv4/netfilter/ipt_ECN.c  175
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c  485
-rw-r--r--  net/ipv4/netfilter/ipt_MARK.c  162
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c  207
-rw-r--r--  net/ipv4/netfilter/ipt_NETMAP.c  117
-rw-r--r--  net/ipv4/netfilter/ipt_NOTRACK.c  76
-rw-r--r--  net/ipv4/netfilter/ipt_REDIRECT.c  129
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c  335
-rw-r--r--  net/ipv4/netfilter/ipt_SAME.c  211
-rw-r--r--  net/ipv4/netfilter/ipt_TCPMSS.c  262
-rw-r--r--  net/ipv4/netfilter/ipt_TOS.c  105
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c  419
-rw-r--r--  net/ipv4/netfilter/ipt_addrtype.c  77
-rw-r--r--  net/ipv4/netfilter/ipt_ah.c  117
-rw-r--r--  net/ipv4/netfilter/ipt_comment.c  59
-rw-r--r--  net/ipv4/netfilter/ipt_connmark.c  81
-rw-r--r--  net/ipv4/netfilter/ipt_conntrack.c  136
-rw-r--r--  net/ipv4/netfilter/ipt_dscp.c  63
-rw-r--r--  net/ipv4/netfilter/ipt_ecn.c  131
-rw-r--r--  net/ipv4/netfilter/ipt_esp.c  118
-rw-r--r--  net/ipv4/netfilter/ipt_hashlimit.c  731
-rw-r--r--  net/ipv4/netfilter/ipt_helper.c  113
-rw-r--r--  net/ipv4/netfilter/ipt_iprange.c  99
-rw-r--r--  net/ipv4/netfilter/ipt_length.c  64
-rw-r--r--  net/ipv4/netfilter/ipt_limit.c  157
-rw-r--r--  net/ipv4/netfilter/ipt_mac.c  79
-rw-r--r--  net/ipv4/netfilter/ipt_mark.c  64
-rw-r--r--  net/ipv4/netfilter/ipt_multiport.c  212
-rw-r--r--  net/ipv4/netfilter/ipt_owner.c  217
-rw-r--r--  net/ipv4/netfilter/ipt_physdev.c  134
-rw-r--r--  net/ipv4/netfilter/ipt_pkttype.c  70
-rw-r--r--  net/ipv4/netfilter/ipt_realm.c  76
-rw-r--r--  net/ipv4/netfilter/ipt_recent.c  1002
-rw-r--r--  net/ipv4/netfilter/ipt_sctp.c  203
-rw-r--r--  net/ipv4/netfilter/ipt_state.c  74
-rw-r--r--  net/ipv4/netfilter/ipt_tcpmss.c  127
-rw-r--r--  net/ipv4/netfilter/ipt_tos.c  64
-rw-r--r--  net/ipv4/netfilter/ipt_ttl.c  79
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c  194
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c  260
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c  156
-rw-r--r--  net/ipv4/proc.c  382
-rw-r--r--  net/ipv4/protocol.c  101
-rw-r--r--  net/ipv4/raw.c  888
-rw-r--r--  net/ipv4/route.c  3177
-rw-r--r--  net/ipv4/syncookies.c  279
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c  698
-rw-r--r--  net/ipv4/tcp.c  2386
-rw-r--r--  net/ipv4/tcp_diag.c  802
-rw-r--r--  net/ipv4/tcp_input.c  4959
-rw-r--r--  net/ipv4/tcp_ipv4.c  2663
-rw-r--r--  net/ipv4/tcp_minisocks.c  1077
-rw-r--r--  net/ipv4/tcp_output.c  1739
-rw-r--r--  net/ipv4/tcp_timer.c  656
-rw-r--r--  net/ipv4/udp.c  1575
-rw-r--r--  net/ipv4/utils.c  59
-rw-r--r--  net/ipv4/xfrm4_input.c  160
-rw-r--r--  net/ipv4/xfrm4_output.c  141
-rw-r--r--  net/ipv4/xfrm4_policy.c  281
-rw-r--r--  net/ipv4/xfrm4_state.c  126
-rw-r--r--  net/ipv4/xfrm4_tunnel.c  144
155 files changed, 82733 insertions, 0 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
new file mode 100644
index 000000000000..6d3e8b1bd1f2
--- /dev/null
+++ b/net/ipv4/Kconfig
@@ -0,0 +1,411 @@
1#
2# IP configuration
3#
4config IP_MULTICAST
5 bool "IP: multicasting"
6 depends on INET
7 help
8 This is code for addressing several networked computers at once,
9 enlarging your kernel by about 2 KB. You need multicasting if you
10 intend to participate in the MBONE, a high bandwidth network on top
11 of the Internet which carries audio and video broadcasts. More
12 information about the MBONE is on the WWW at
13 <http://www-itg.lbl.gov/mbone/>. Information about the multicast
14 capabilities of the various network cards is contained in
15 <file:Documentation/networking/multicast.txt>. For most people, it's
16 safe to say N.
17
18config IP_ADVANCED_ROUTER
19 bool "IP: advanced router"
20 depends on INET
21 ---help---
22 If you intend to run your Linux box mostly as a router, i.e. as a
23 computer that forwards and redistributes network packets, say Y; you
24 will then be presented with several options that allow more precise
25 control about the routing process.
26
27 The answer to this question won't directly affect the kernel:
28 answering N will just cause the configurator to skip all the
29 questions about advanced routing.
30
31 Note that your box can only act as a router if you enable IP
32 forwarding in your kernel; you can do that by saying Y to "/proc
33 file system support" and "Sysctl support" below and executing the
34 line
35
36 echo "1" > /proc/sys/net/ipv4/ip_forward
37
38 at boot time after the /proc file system has been mounted.
39
40 If you turn on IP forwarding, you will also get the rp_filter, which
41 automatically rejects incoming packets if the routing table entry
42 for their source address doesn't match the network interface they're
43 arriving on. This has security advantages because it prevents the
44 so-called IP spoofing, however it can pose problems if you use
45 asymmetric routing (packets from you to a host take a different path
46 than packets from that host to you) or if you operate a non-routing
47 host which has several IP addresses on different interfaces. To turn
48 rp_filter off use:
49
50 echo 0 > /proc/sys/net/ipv4/conf/<device>/rp_filter
51 or
52 echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter
53
54 If unsure, say N here.
55
56config IP_MULTIPLE_TABLES
57 bool "IP: policy routing"
58 depends on IP_ADVANCED_ROUTER
59 ---help---
60 Normally, a router decides what to do with a received packet based
61 solely on the packet's final destination address. If you say Y here,
62 the Linux router will also be able to take the packet's source
63 address into account. Furthermore, the TOS (Type-Of-Service) field
64 of the packet can be used for routing decisions as well.
65
66 If you are interested in this, please see the preliminary
67 documentation at <http://www.compendium.com.ar/policy-routing.txt>
68 and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
69 You will need supporting software from
70 <ftp://ftp.tux.org/pub/net/ip-routing/>.
71
72 If unsure, say N.
73
74config IP_ROUTE_FWMARK
75 bool "IP: use netfilter MARK value as routing key"
76 depends on IP_MULTIPLE_TABLES && NETFILTER
77 help
78 If you say Y here, you will be able to specify different routes for
79 packets with different mark values (see iptables(8), MARK target).
80
81config IP_ROUTE_MULTIPATH
82 bool "IP: equal cost multipath"
83 depends on IP_ADVANCED_ROUTER
84 help
85 Normally, the routing tables specify a single action to be taken in
86 a deterministic manner for a given packet. If you say Y here
87 however, it becomes possible to attach several actions to a packet
88 pattern, in effect specifying several alternative paths to travel
89 for those packets. The router considers all these paths to be of
90 equal "cost" and chooses one of them in a non-deterministic fashion
91 if a matching packet arrives.
92
93config IP_ROUTE_MULTIPATH_CACHED
94 bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
95 depends on IP_ROUTE_MULTIPATH
96 help
97 Normally, equal cost multipath routing is not supported by the
98 routing cache. If you say Y here, alternative routes are cached
99 and on cache lookup a route is chosen in a configurable fashion.
100
101 If unsure, say N.
102
103config IP_ROUTE_MULTIPATH_RR
104 tristate "MULTIPATH: round robin algorithm"
105 depends on IP_ROUTE_MULTIPATH_CACHED
106 help
107 Multipath routes are chosen according to Round Robin.
108
109config IP_ROUTE_MULTIPATH_RANDOM
110 tristate "MULTIPATH: random algorithm"
111 depends on IP_ROUTE_MULTIPATH_CACHED
112 help
113 Multipath routes are chosen in a random fashion. Actually,
114 there is no weight for a route. The advantage of this policy
115 is that it is implemented statelessly and therefore introduces only
116 a very small delay.
117
118config IP_ROUTE_MULTIPATH_WRANDOM
119 tristate "MULTIPATH: weighted random algorithm"
120 depends on IP_ROUTE_MULTIPATH_CACHED
121 help
122 Multipath routes are chosen in a weighted random fashion.
123 The per-route weights are the weights visible via ip route 2. As the
124 corresponding state management introduces some overhead, routing delay
125 is increased.
126
127config IP_ROUTE_MULTIPATH_DRR
128 tristate "MULTIPATH: interface round robin algorithm"
129 depends on IP_ROUTE_MULTIPATH_CACHED
130 help
131 Connections are distributed in a round robin fashion over the
132 available interfaces. This policy makes sense if the connections
133 should be primarily distributed on interfaces and not on routes.
134
135config IP_ROUTE_VERBOSE
136 bool "IP: verbose route monitoring"
137 depends on IP_ADVANCED_ROUTER
138 help
139 If you say Y here, which is recommended, then the kernel will print
140 verbose messages regarding the routing, for example warnings about
141 received packets which look strange and could be evidence of an
142 attack or a misconfigured system somewhere. The information is
143 handled by the klogd daemon which is responsible for kernel messages
144 ("man klogd").
145
146config IP_PNP
147 bool "IP: kernel level autoconfiguration"
148 depends on INET
149 help
150 This enables automatic configuration of IP addresses of devices and
151 of the routing table during kernel boot, based on either information
152 supplied on the kernel command line or by BOOTP or RARP protocols.
153 You need to say Y only for diskless machines requiring network
154 access to boot (in which case you want to say Y to "Root file system
155 on NFS" as well), because all other machines configure the network
156 in their startup scripts.
157
158config IP_PNP_DHCP
159 bool "IP: DHCP support"
160 depends on IP_PNP
161 ---help---
162 If you want your Linux box to mount its whole root file system (the
163 one containing the directory /) from some other computer over the
164 net via NFS and you want the IP address of your computer to be
165 discovered automatically at boot time using the DHCP protocol (a
166 special protocol designed for doing this job), say Y here. In case
167 the boot ROM of your network card was designed for booting Linux and
168 does DHCP itself, providing all necessary information on the kernel
169 command line, you can say N here.
170
171 If unsure, say Y. Note that if you want to use DHCP, a DHCP server
172 must be operating on your network. Read
173 <file:Documentation/nfsroot.txt> for details.
174
175config IP_PNP_BOOTP
176 bool "IP: BOOTP support"
177 depends on IP_PNP
178 ---help---
179 If you want your Linux box to mount its whole root file system (the
180 one containing the directory /) from some other computer over the
181 net via NFS and you want the IP address of your computer to be
182 discovered automatically at boot time using the BOOTP protocol (a
183 special protocol designed for doing this job), say Y here. In case
184 the boot ROM of your network card was designed for booting Linux and
185 does BOOTP itself, providing all necessary information on the kernel
186 command line, you can say N here. If unsure, say Y. Note that if you
187 want to use BOOTP, a BOOTP server must be operating on your network.
188 Read <file:Documentation/nfsroot.txt> for details.
189
190config IP_PNP_RARP
191 bool "IP: RARP support"
192 depends on IP_PNP
193 help
194 If you want your Linux box to mount its whole root file system (the
195 one containing the directory /) from some other computer over the
196 net via NFS and you want the IP address of your computer to be
197 discovered automatically at boot time using the RARP protocol (an
198 older protocol which is being obsoleted by BOOTP and DHCP), say Y
199 here. Note that if you want to use RARP, a RARP server must be
200 operating on your network. Read <file:Documentation/nfsroot.txt> for
201 details.
202
203# not yet ready..
204# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
205config NET_IPIP
206 tristate "IP: tunneling"
207 depends on INET
208 select INET_TUNNEL
209 ---help---
210 Tunneling means encapsulating data of one protocol type within
211 another protocol and sending it over a channel that understands the
212 encapsulating protocol. This particular tunneling driver implements
213 encapsulation of IP within IP, which sounds kind of pointless, but
214 can be useful if you want to make your (or some other) machine
215 appear on a different network than it physically is, or to use
216 mobile-IP facilities (allowing laptops to seamlessly move between
217 networks without changing their IP addresses).
218
219 Saying Y to this option will produce two modules ( = code which can
220 be inserted in and removed from the running kernel whenever you
221 want). Most people won't need this and can say N.
222
223config NET_IPGRE
224 tristate "IP: GRE tunnels over IP"
225 depends on INET
226 select XFRM
227 help
228 Tunneling means encapsulating data of one protocol type within
229 another protocol and sending it over a channel that understands the
230 encapsulating protocol. This particular tunneling driver implements
231 GRE (Generic Routing Encapsulation) and at this time allows
232 encapsulation of IPv4 or IPv6 over existing IPv4 infrastructure.
233 This driver is useful if the other endpoint is a Cisco router: Cisco
234 likes GRE much better than the other Linux tunneling driver ("IP
235 tunneling" above). In addition, GRE allows multicast redistribution
236 through the tunnel.
237
238config NET_IPGRE_BROADCAST
239 bool "IP: broadcast GRE over IP"
240 depends on IP_MULTICAST && NET_IPGRE
241 help
242 One application of GRE/IP is to construct a broadcast WAN (Wide Area
243 Network), which looks like a normal Ethernet LAN (Local Area
244 Network), but can be distributed all over the Internet. If you want
245 to do that, say Y here and to "IP multicast routing" below.
246
247config IP_MROUTE
248 bool "IP: multicast routing"
249 depends on IP_MULTICAST
250 help
251 This is used if you want your machine to act as a router for IP
252 packets that have several destination addresses. It is needed on the
253 MBONE, a high bandwidth network on top of the Internet which carries
254 audio and video broadcasts. In order to do that, you would most
255 likely run the program mrouted. Information about the multicast
256 capabilities of the various network cards is contained in
257 <file:Documentation/networking/multicast.txt>. If you haven't heard
258 about it, you don't need it.
259
260config IP_PIMSM_V1
261 bool "IP: PIM-SM version 1 support"
262 depends on IP_MROUTE
263 help
264 Kernel side support for Sparse Mode PIM (Protocol Independent
265 Multicast) version 1. This multicast routing protocol is used widely
266 because Cisco supports it. You need special software to use it
267 (pimd-v1). Please see <http://netweb.usc.edu/pim/> for more
268 information about PIM.
269
270 Say Y if you want to use PIM-SM v1. Note that you can say N here if
271 you just want to use Dense Mode PIM.
272
273config IP_PIMSM_V2
274 bool "IP: PIM-SM version 2 support"
275 depends on IP_MROUTE
276 help
277 Kernel side support for Sparse Mode PIM version 2. In order to use
278 this, you need an experimental routing daemon supporting it (pimd or
279 gated-5). This routing protocol is not used widely, so say N unless
280 you want to play with it.
281
282config ARPD
283 bool "IP: ARP daemon support (EXPERIMENTAL)"
284 depends on INET && EXPERIMENTAL
285 ---help---
286 Normally, the kernel maintains an internal cache which maps IP
287 addresses to hardware addresses on the local network, so that
288 Ethernet/Token Ring/ etc. frames are sent to the proper address on
289 the physical networking layer. For small networks having a few
290 hundred directly connected hosts or less, keeping this address
291 resolution (ARP) cache inside the kernel works well. However,
292 maintaining an internal ARP cache does not work well for very large
293 switched networks, and will use a lot of kernel memory if TCP/IP
294 connections are made to many machines on the network.
295
296 If you say Y here, the kernel's internal ARP cache will never grow
297 to more than 256 entries (the oldest entries are expired in a LIFO
298 manner) and communication will be attempted with the user space ARP
299 daemon arpd. Arpd then answers the address resolution request either
300 from its own cache or by asking the net.
301
302 This code is experimental and also obsolete. If you want to use it,
303 you need to find a version of the daemon arpd on the net somewhere,
304 and you should also say Y to "Kernel/User network link driver",
305 below. If unsure, say N.
306
307config SYN_COOKIES
308 bool "IP: TCP syncookie support (disabled per default)"
309 depends on INET
310 ---help---
311 Normal TCP/IP networking is open to an attack known as "SYN
312 flooding". This denial-of-service attack prevents legitimate remote
313 users from being able to connect to your computer during an ongoing
314 attack and requires very little work from the attacker, who can
315 operate from anywhere on the Internet.
316
317 SYN cookies provide protection against this type of attack. If you
318 say Y here, the TCP/IP stack will use a cryptographic challenge
319 protocol known as "SYN cookies" to enable legitimate users to
320 continue to connect, even when your machine is under attack. There
321 is no need for the legitimate users to change their TCP/IP software;
322 SYN cookies work transparently to them. For technical information
323 about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
324
325 If you are SYN flooded, the source address reported by the kernel is
326 likely to have been forged by the attacker; it is only reported as
327 an aid in tracing the packets to their actual source and should not
328 be taken as absolute truth.
329
330 SYN cookies may prevent correct error reporting on clients when the
331 server is really overloaded. If this happens frequently, it is better
332 to turn them off.
333
334 If you say Y here, note that SYN cookies aren't enabled by default;
335 you can enable them by saying Y to "/proc file system support" and
336 "Sysctl support" below and executing the command
337
338 echo 1 >/proc/sys/net/ipv4/tcp_syncookies
339
340 at boot time after the /proc file system has been mounted.
341
342 If unsure, say N.
343
344config INET_AH
345 tristate "IP: AH transformation"
346 depends on INET
347 select XFRM
348 select CRYPTO
349 select CRYPTO_HMAC
350 select CRYPTO_MD5
351 select CRYPTO_SHA1
352 ---help---
353 Support for IPsec AH.
354
355 If unsure, say Y.
356
357config INET_ESP
358 tristate "IP: ESP transformation"
359 depends on INET
360 select XFRM
361 select CRYPTO
362 select CRYPTO_HMAC
363 select CRYPTO_MD5
364 select CRYPTO_SHA1
365 select CRYPTO_DES
366 ---help---
367 Support for IPsec ESP.
368
369 If unsure, say Y.
370
371config INET_IPCOMP
372 tristate "IP: IPComp transformation"
373 depends on INET
374 select XFRM
375 select INET_TUNNEL
376 select CRYPTO
377 select CRYPTO_DEFLATE
378 ---help---
379 Support for IP Payload Compression Protocol (IPComp) (RFC3173),
380 typically needed for IPsec.
381
382 If unsure, say Y.
383
384config INET_TUNNEL
385 tristate "IP: tunnel transformation"
386 depends on INET
387 select XFRM
388 ---help---
389 Support for generic IP tunnel transformation, which is required by
390 the IP tunneling module as well as tunnel mode IPComp.
391
392 If unsure, say Y.
393
394config IP_TCPDIAG
395 tristate "IP: TCP socket monitoring interface"
396 depends on INET
397 default y
398 ---help---
399 Support for TCP socket monitoring interface used by native Linux
400 tools such as ss. ss is included in iproute2, currently downloadable
401 at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support
402 and have selected IPv6 as a module, you need to build this as a
403 module too.
404
405 If unsure, say Y.
406
407config IP_TCPDIAG_IPV6
408 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
409
410source "net/ipv4/ipvs/Kconfig"
411
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
new file mode 100644
index 000000000000..8b379627ebb6
--- /dev/null
+++ b/net/ipv4/Makefile
@@ -0,0 +1,33 @@
1#
2# Makefile for the Linux TCP/IP (INET) layer.
3#
4
5obj-y := utils.o route.o inetpeer.o protocol.o \
6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \
7 ip_output.o ip_sockglue.o \
8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o
11
12obj-$(CONFIG_PROC_FS) += proc.o
13obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
14obj-$(CONFIG_IP_MROUTE) += ipmr.o
15obj-$(CONFIG_NET_IPIP) += ipip.o
16obj-$(CONFIG_NET_IPGRE) += ip_gre.o
17obj-$(CONFIG_SYN_COOKIES) += syncookies.o
18obj-$(CONFIG_INET_AH) += ah4.o
19obj-$(CONFIG_INET_ESP) += esp4.o
20obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
21obj-$(CONFIG_INET_TUNNEL) += xfrm4_tunnel.o
22obj-$(CONFIG_IP_PNP) += ipconfig.o
23obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
24obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
25obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
26obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
27obj-$(CONFIG_NETFILTER) += netfilter/
28obj-$(CONFIG_IP_VS) += ipvs/
29obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
30obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
31
32obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
33 xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
new file mode 100644
index 000000000000..c34dab67e461
--- /dev/null
+++ b/net/ipv4/af_inet.c
@@ -0,0 +1,1188 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PF_INET protocol family socket handler.
7 *
8 * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Changes (see also sock.c)
16 *
17 * piggy,
18 * Karl Knutson : Socket protocol table
19 * A.N.Kuznetsov : Socket death error in accept().
20 * John Richardson : Fix non blocking error in connect()
21 * so sockets that fail to connect
22 * don't return -EINPROGRESS.
23 * Alan Cox : Asynchronous I/O support
24 * Alan Cox : Keep correct socket pointer on sock
25 * structures
26 * when accept() ed
27 * Alan Cox : Semantics of SO_LINGER aren't state
28 * moved to close when you look carefully.
29 * With this fixed and the accept bug fixed
30 * some RPC stuff seems happier.
31 * Niibe Yutaka : 4.4BSD style write async I/O
32 * Alan Cox,
33 * Tony Gale : Fixed reuse semantics.
34 * Alan Cox : bind() shouldn't abort existing but dead
35 * sockets. Stops FTP netin:.. I hope.
36 * Alan Cox : bind() works correctly for RAW sockets.
37 * Note that FreeBSD at least was broken
38 * in this respect so be careful with
39 * compatibility tests...
40 * Alan Cox : routing cache support
41 * Alan Cox : memzero the socket structure for
42 * compactness.
43 * Matt Day : nonblock connect error handler
44 * Alan Cox : Allow large numbers of pending sockets
45 * (eg for big web sites), but only if
46 * specifically application requested.
47 * Alan Cox : New buffering throughout IP. Used
48 * dumbly.
49 * Alan Cox : New buffering now used smartly.
50 * Alan Cox : BSD rather than common sense
51 * interpretation of listen.
52 * Germano Caronni : Assorted small races.
53 * Alan Cox : sendmsg/recvmsg basic support.
54 * Alan Cox : Only sendmsg/recvmsg now supported.
55 * Alan Cox : Locked down bind (see security list).
56 * Alan Cox : Loosened bind a little.
57 * Mike McLagan : ADD/DEL DLCI Ioctls
58 * Willy Konynenberg : Transparent proxying support.
59 * David S. Miller : New socket lookup architecture.
60 * Some other random speedups.
61 * Cyrus Durgin : Cleaned up file for kmod hacks.
62 * Andi Kleen : Fix inet_stream_connect TCP race.
63 *
64 * This program is free software; you can redistribute it and/or
65 * modify it under the terms of the GNU General Public License
66 * as published by the Free Software Foundation; either version
67 * 2 of the License, or (at your option) any later version.
68 */
69
70#include <linux/config.h>
71#include <linux/errno.h>
72#include <linux/types.h>
73#include <linux/socket.h>
74#include <linux/in.h>
75#include <linux/kernel.h>
76#include <linux/major.h>
77#include <linux/module.h>
78#include <linux/sched.h>
79#include <linux/timer.h>
80#include <linux/string.h>
81#include <linux/sockios.h>
82#include <linux/net.h>
83#include <linux/fcntl.h>
84#include <linux/mm.h>
85#include <linux/interrupt.h>
86#include <linux/stat.h>
87#include <linux/init.h>
88#include <linux/poll.h>
89#include <linux/netfilter_ipv4.h>
90
91#include <asm/uaccess.h>
92#include <asm/system.h>
93
94#include <linux/smp_lock.h>
95#include <linux/inet.h>
96#include <linux/igmp.h>
97#include <linux/netdevice.h>
98#include <net/ip.h>
99#include <net/protocol.h>
100#include <net/arp.h>
101#include <net/route.h>
102#include <net/ip_fib.h>
103#include <net/tcp.h>
104#include <net/udp.h>
105#include <linux/skbuff.h>
106#include <net/sock.h>
107#include <net/raw.h>
108#include <net/icmp.h>
109#include <net/ipip.h>
110#include <net/inet_common.h>
111#include <net/xfrm.h>
112#ifdef CONFIG_IP_MROUTE
113#include <linux/mroute.h>
114#endif
115
116DEFINE_SNMP_STAT(struct linux_mib, net_statistics);
117
118#ifdef INET_REFCNT_DEBUG
119atomic_t inet_sock_nr;
120#endif
121
122extern void ip_mc_drop_socket(struct sock *sk);
123
124/* The inetsw table contains everything that inet_create needs to
125 * build a new socket.
126 */
127static struct list_head inetsw[SOCK_MAX];
128static DEFINE_SPINLOCK(inetsw_lock);
129
130/* New destruction routine */
131
132void inet_sock_destruct(struct sock *sk)
133{
134 struct inet_sock *inet = inet_sk(sk);
135
136 __skb_queue_purge(&sk->sk_receive_queue);
137 __skb_queue_purge(&sk->sk_error_queue);
138
139 if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
140 printk("Attempt to release TCP socket in state %d %p\n",
141 sk->sk_state, sk);
142 return;
143 }
144 if (!sock_flag(sk, SOCK_DEAD)) {
145 printk("Attempt to release alive inet socket %p\n", sk);
146 return;
147 }
148
149 BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
150 BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
151 BUG_TRAP(!sk->sk_wmem_queued);
152 BUG_TRAP(!sk->sk_forward_alloc);
153
154 if (inet->opt)
155 kfree(inet->opt);
156 dst_release(sk->sk_dst_cache);
157#ifdef INET_REFCNT_DEBUG
158 atomic_dec(&inet_sock_nr);
159 printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
160 sk, atomic_read(&inet_sock_nr));
161#endif
162}
163
164/*
165 * The routines beyond this point handle the behaviour of an AF_INET
166 * socket object. Mostly it punts to the subprotocols of IP to do
167 * the work.
168 */
169
170/*
171 * Automatically bind an unbound socket.
172 */
173
174static int inet_autobind(struct sock *sk)
175{
176 struct inet_sock *inet;
177 /* We may need to bind the socket. */
178 lock_sock(sk);
179 inet = inet_sk(sk);
180 if (!inet->num) {
181 if (sk->sk_prot->get_port(sk, 0)) {
182 release_sock(sk);
183 return -EAGAIN;
184 }
185 inet->sport = htons(inet->num);
186 }
187 release_sock(sk);
188 return 0;
189}
190
191/*
192 * Move a socket into listening state.
193 */
194int inet_listen(struct socket *sock, int backlog)
195{
196 struct sock *sk = sock->sk;
197 unsigned char old_state;
198 int err;
199
200 lock_sock(sk);
201
202 err = -EINVAL;
203 if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
204 goto out;
205
206 old_state = sk->sk_state;
207 if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
208 goto out;
209
210 /* Really, if the socket is already in listen state
211 * we can only allow the backlog to be adjusted.
212 */
213 if (old_state != TCP_LISTEN) {
214 err = tcp_listen_start(sk);
215 if (err)
216 goto out;
217 }
218 sk->sk_max_ack_backlog = backlog;
219 err = 0;
220
221out:
222 release_sock(sk);
223 return err;
224}
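
As the code above shows, calling listen(2) on a socket that is already in TCP_LISTEN is not an error; only sk_max_ack_backlog is adjusted. A small userspace sketch, illustrative only (the port and helper name are made up for the example):

/* Illustrative only: listen(2) may be called again on a listening
 * socket purely to resize the accept backlog. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int listening_socket(unsigned short port)
{
        struct sockaddr_in sa;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;

        memset(&sa, 0, sizeof(sa));
        sa.sin_family = AF_INET;
        sa.sin_addr.s_addr = htonl(INADDR_ANY);
        sa.sin_port = htons(port);

        if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0 ||
            listen(fd, 16) < 0) {       /* enters TCP_LISTEN via tcp_listen_start() */
                close(fd);
                return -1;
        }

        /* Already listening: inet_listen() only updates the backlog. */
        listen(fd, 128);
        return fd;
}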
225
226/*
227 * Create an inet socket.
228 */
229
230static int inet_create(struct socket *sock, int protocol)
231{
232 struct sock *sk;
233 struct list_head *p;
234 struct inet_protosw *answer;
235 struct inet_sock *inet;
236 struct proto *answer_prot;
237 unsigned char answer_flags;
238 char answer_no_check;
239 int err;
240
241 sock->state = SS_UNCONNECTED;
242
243 /* Look for the requested type/protocol pair. */
244 answer = NULL;
245 rcu_read_lock();
246 list_for_each_rcu(p, &inetsw[sock->type]) {
247 answer = list_entry(p, struct inet_protosw, list);
248
249 /* Check the non-wild match. */
250 if (protocol == answer->protocol) {
251 if (protocol != IPPROTO_IP)
252 break;
253 } else {
254 /* Check for the two wild cases. */
255 if (IPPROTO_IP == protocol) {
256 protocol = answer->protocol;
257 break;
258 }
259 if (IPPROTO_IP == answer->protocol)
260 break;
261 }
262 answer = NULL;
263 }
264
265 err = -ESOCKTNOSUPPORT;
266 if (!answer)
267 goto out_rcu_unlock;
268 err = -EPERM;
269 if (answer->capability > 0 && !capable(answer->capability))
270 goto out_rcu_unlock;
271 err = -EPROTONOSUPPORT;
272 if (!protocol)
273 goto out_rcu_unlock;
274
275 sock->ops = answer->ops;
276 answer_prot = answer->prot;
277 answer_no_check = answer->no_check;
278 answer_flags = answer->flags;
279 rcu_read_unlock();
280
281 BUG_TRAP(answer_prot->slab != NULL);
282
283 err = -ENOBUFS;
284 sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
285 if (sk == NULL)
286 goto out;
287
288 err = 0;
289 sk->sk_no_check = answer_no_check;
290 if (INET_PROTOSW_REUSE & answer_flags)
291 sk->sk_reuse = 1;
292
293 inet = inet_sk(sk);
294
295 if (SOCK_RAW == sock->type) {
296 inet->num = protocol;
297 if (IPPROTO_RAW == protocol)
298 inet->hdrincl = 1;
299 }
300
301 if (ipv4_config.no_pmtu_disc)
302 inet->pmtudisc = IP_PMTUDISC_DONT;
303 else
304 inet->pmtudisc = IP_PMTUDISC_WANT;
305
306 inet->id = 0;
307
308 sock_init_data(sock, sk);
309
310 sk->sk_destruct = inet_sock_destruct;
311 sk->sk_family = PF_INET;
312 sk->sk_protocol = protocol;
313 sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
314
315 inet->uc_ttl = -1;
316 inet->mc_loop = 1;
317 inet->mc_ttl = 1;
318 inet->mc_index = 0;
319 inet->mc_list = NULL;
320
321#ifdef INET_REFCNT_DEBUG
322 atomic_inc(&inet_sock_nr);
323#endif
324
325 if (inet->num) {
326 /* It assumes that any protocol which allows
327 * the user to assign a number at socket
328 * creation time automatically
329 * shares.
330 */
331 inet->sport = htons(inet->num);
332 /* Add to protocol hash chains. */
333 sk->sk_prot->hash(sk);
334 }
335
336 if (sk->sk_prot->init) {
337 err = sk->sk_prot->init(sk);
338 if (err)
339 sk_common_release(sk);
340 }
341out:
342 return err;
343out_rcu_unlock:
344 rcu_read_unlock();
345 goto out;
346}
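
The matching loop above is what a plain socket(2) call from userspace ends up exercising: an exact type/protocol pair wins, IPPROTO_IP (0) acts as a wild card on either side, and answer->capability gates privileged entries such as SOCK_RAW. A minimal userspace sketch, illustrative only and not part of this commit, showing the three cases:

/* Illustrative only: how socket(2) arguments map onto the inetsw
 * lookup in inet_create(). Build as an ordinary user program. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
        /* Wild-card protocol: 0 is IPPROTO_IP, so the first SOCK_STREAM
         * entry (tcp_prot) is selected and the protocol becomes TCP. */
        int tcp = socket(AF_INET, SOCK_STREAM, 0);

        /* Exact match against the SOCK_DGRAM/IPPROTO_UDP entry. */
        int udp = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);

        /* The SOCK_RAW entry carries .capability = CAP_NET_RAW, so this
         * fails with EPERM for an unprivileged caller. */
        int raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        if (raw < 0)
                printf("raw socket: %s\n", strerror(errno));

        if (tcp >= 0) close(tcp);
        if (udp >= 0) close(udp);
        if (raw >= 0) close(raw);
        return 0;
}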
347
348
349/*
350 * The peer socket should always be NULL (or else). When we call this
351 * function we are destroying the object and from then on nobody
352 * should refer to it.
353 */
354int inet_release(struct socket *sock)
355{
356 struct sock *sk = sock->sk;
357
358 if (sk) {
359 long timeout;
360
361 /* Applications forget to leave groups before exiting */
362 ip_mc_drop_socket(sk);
363
364 /* If linger is set, we don't return until the close
365 * is complete. Otherwise we return immediately. The
366 * actually closing is done the same either way.
367 *
368 * If the close is due to the process exiting, we never
369 * linger..
370 */
371 timeout = 0;
372 if (sock_flag(sk, SOCK_LINGER) &&
373 !(current->flags & PF_EXITING))
374 timeout = sk->sk_lingertime;
375 sock->sk = NULL;
376 sk->sk_prot->close(sk, timeout);
377 }
378 return 0;
379}
380
381/* It is off by default, see below. */
382int sysctl_ip_nonlocal_bind;
383
384int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
385{
386 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
387 struct sock *sk = sock->sk;
388 struct inet_sock *inet = inet_sk(sk);
389 unsigned short snum;
390 int chk_addr_ret;
391 int err;
392
393 /* If the socket has its own bind function then use it. (RAW) */
394 if (sk->sk_prot->bind) {
395 err = sk->sk_prot->bind(sk, uaddr, addr_len);
396 goto out;
397 }
398 err = -EINVAL;
399 if (addr_len < sizeof(struct sockaddr_in))
400 goto out;
401
402 chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
403
404 /* Not specified by any standard per se, however it breaks too
405 * many applications when removed. It is unfortunate since
406 * allowing applications to make a non-local bind solves
407 * several problems with systems using dynamic addressing.
408 * (ie. your servers still start up even if your ISDN link
409 * is temporarily down)
410 */
411 err = -EADDRNOTAVAIL;
412 if (!sysctl_ip_nonlocal_bind &&
413 !inet->freebind &&
414 addr->sin_addr.s_addr != INADDR_ANY &&
415 chk_addr_ret != RTN_LOCAL &&
416 chk_addr_ret != RTN_MULTICAST &&
417 chk_addr_ret != RTN_BROADCAST)
418 goto out;
419
420 snum = ntohs(addr->sin_port);
421 err = -EACCES;
422 if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
423 goto out;
424
425 /* We keep a pair of addresses. rcv_saddr is the one
426 * used by hash lookups, and saddr is used for transmit.
427 *
428 * In the BSD API these are the same except where it
429 * would be illegal to use them (multicast/broadcast) in
430 * which case the sending device address is used.
431 */
432 lock_sock(sk);
433
434 /* Check these errors (active socket, double bind). */
435 err = -EINVAL;
436 if (sk->sk_state != TCP_CLOSE || inet->num)
437 goto out_release_sock;
438
439 inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
440 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
441 inet->saddr = 0; /* Use device */
442
443 /* Make sure we are allowed to bind here. */
444 if (sk->sk_prot->get_port(sk, snum)) {
445 inet->saddr = inet->rcv_saddr = 0;
446 err = -EADDRINUSE;
447 goto out_release_sock;
448 }
449
450 if (inet->rcv_saddr)
451 sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
452 if (snum)
453 sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
454 inet->sport = htons(inet->num);
455 inet->daddr = 0;
456 inet->dport = 0;
457 sk_dst_reset(sk);
458 err = 0;
459out_release_sock:
460 release_sock(sk);
461out:
462 return err;
463}
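
For orientation, the checks above are what userspace bind(2) callers run into: ports below PROT_SOCK (1024) need CAP_NET_BIND_SERVICE, non-local addresses are refused unless ip_nonlocal_bind or the per-socket freebind flag is set, and a second bind on an already-bound socket fails with EINVAL. A hedged userspace sketch, not part of the patch; the port numbers are just examples:

/* Illustrative only: exercises the checks in inet_bind(). */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
        struct sockaddr_in sa;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        memset(&sa, 0, sizeof(sa));
        sa.sin_family = AF_INET;
        sa.sin_addr.s_addr = htonl(INADDR_ANY);

        /* Port 80 is below PROT_SOCK (1024): without CAP_NET_BIND_SERVICE
         * this fails with EACCES. */
        sa.sin_port = htons(80);
        if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                printf("bind(80): %s\n", strerror(errno));

        /* An unprivileged port is fine for any user. Binding the same
         * socket again after a successful bind would fail with EINVAL
         * (the "double bind" check above). */
        sa.sin_port = htons(8080);
        if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                printf("bind(8080): %s\n", strerror(errno));

        close(fd);
        return 0;
}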
464
465int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
466 int addr_len, int flags)
467{
468 struct sock *sk = sock->sk;
469
470 if (uaddr->sa_family == AF_UNSPEC)
471 return sk->sk_prot->disconnect(sk, flags);
472
473 if (!inet_sk(sk)->num && inet_autobind(sk))
474 return -EAGAIN;
475 return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
476}
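
The AF_UNSPEC branch above implements the documented way to dissolve a datagram association: connect(2) with the family set to AF_UNSPEC routes into the protocol's disconnect handler. A short illustrative sketch (the address 192.0.2.1 and port 53 are only documentation examples):

/* Illustrative only: set and then dissolve a UDP association. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
        struct sockaddr_in dst;
        struct sockaddr unspec;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_port = htons(53);
        dst.sin_addr.s_addr = inet_addr("192.0.2.1");

        /* Sets the default destination; an unbound socket is autobound
         * to a local port first (inet_autobind() above). */
        connect(fd, (struct sockaddr *)&dst, sizeof(dst));

        /* AF_UNSPEC: taken by the sk->sk_prot->disconnect() branch,
         * removing the association again. */
        memset(&unspec, 0, sizeof(unspec));
        unspec.sa_family = AF_UNSPEC;
        connect(fd, &unspec, sizeof(unspec));

        close(fd);
        return 0;
}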
477
478static long inet_wait_for_connect(struct sock *sk, long timeo)
479{
480 DEFINE_WAIT(wait);
481
482 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
483
484 /* Basic assumption: if someone sets sk->sk_err, he _must_
485 * change state of the socket from TCP_SYN_*.
486 * Connect() does not allow to get error notifications
487 * without closing the socket.
488 */
489 while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
490 release_sock(sk);
491 timeo = schedule_timeout(timeo);
492 lock_sock(sk);
493 if (signal_pending(current) || !timeo)
494 break;
495 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
496 }
497 finish_wait(sk->sk_sleep, &wait);
498 return timeo;
499}
500
501/*
502 * Connect to a remote host. There is regrettably still a little
503 * TCP 'magic' in here.
504 */
505int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
506 int addr_len, int flags)
507{
508 struct sock *sk = sock->sk;
509 int err;
510 long timeo;
511
512 lock_sock(sk);
513
514 if (uaddr->sa_family == AF_UNSPEC) {
515 err = sk->sk_prot->disconnect(sk, flags);
516 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
517 goto out;
518 }
519
520 switch (sock->state) {
521 default:
522 err = -EINVAL;
523 goto out;
524 case SS_CONNECTED:
525 err = -EISCONN;
526 goto out;
527 case SS_CONNECTING:
528 err = -EALREADY;
529 /* Fall out of switch with err, set for this state */
530 break;
531 case SS_UNCONNECTED:
532 err = -EISCONN;
533 if (sk->sk_state != TCP_CLOSE)
534 goto out;
535
536 err = sk->sk_prot->connect(sk, uaddr, addr_len);
537 if (err < 0)
538 goto out;
539
540 sock->state = SS_CONNECTING;
541
542 /* Just entered SS_CONNECTING state; the only
543 * difference is that return value in non-blocking
544 * case is EINPROGRESS, rather than EALREADY.
545 */
546 err = -EINPROGRESS;
547 break;
548 }
549
550 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
551
552 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
553 /* Error code is set above */
554 if (!timeo || !inet_wait_for_connect(sk, timeo))
555 goto out;
556
557 err = sock_intr_errno(timeo);
558 if (signal_pending(current))
559 goto out;
560 }
561
562 /* Connection was closed by RST, timeout, ICMP error
563 * or another process disconnected us.
564 */
565 if (sk->sk_state == TCP_CLOSE)
566 goto sock_error;
567
568 /* sk->sk_err may be not zero now, if RECVERR was ordered by user
569 * and error was received after socket entered established state.
570 * Hence, it is handled normally after connect() return successfully.
571 */
572
573 sock->state = SS_CONNECTED;
574 err = 0;
575out:
576 release_sock(sk);
577 return err;
578
579sock_error:
580 err = sock_error(sk) ? : -ECONNABORTED;
581 sock->state = SS_UNCONNECTED;
582 if (sk->sk_prot->disconnect(sk, flags))
583 sock->state = SS_DISCONNECTING;
584 goto out;
585}
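
The SS_UNCONNECTED/SS_CONNECTING handling above is what gives non-blocking connect(2) its familiar shape: the first call returns EINPROGRESS, a repeated call while still connecting returns EALREADY, and completion is observed by waiting for writability and reading SO_ERROR. A userspace sketch of that pattern, illustrative only (the helper name and timeout are arbitrary):

/* Illustrative only: non-blocking connect against the state machine
 * in inet_stream_connect(). */
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int nonblocking_connect(const char *ip, unsigned short port)
{
        struct sockaddr_in sa;
        struct pollfd pfd;
        int fd, flags, err = 0;
        socklen_t len = sizeof(err);

        fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd < 0)
                return -1;
        flags = fcntl(fd, F_GETFL, 0);
        fcntl(fd, F_SETFL, flags | O_NONBLOCK);

        memset(&sa, 0, sizeof(sa));
        sa.sin_family = AF_INET;
        sa.sin_port = htons(port);
        sa.sin_addr.s_addr = inet_addr(ip);

        if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0)
                return fd;              /* connected straight away */
        if (errno != EINPROGRESS) {     /* hard failure */
                close(fd);
                return -1;
        }

        /* SS_CONNECTING: wait for writability, then read SO_ERROR to
         * see how the handshake ended. */
        pfd.fd = fd;
        pfd.events = POLLOUT;
        if (poll(&pfd, 1, 5000) <= 0 ||
            getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0 || err) {
                close(fd);
                return -1;
        }
        return fd;
}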
586
587/*
588 * Accept a pending connection. The TCP layer now gives BSD semantics.
589 */
590
591int inet_accept(struct socket *sock, struct socket *newsock, int flags)
592{
593 struct sock *sk1 = sock->sk;
594 int err = -EINVAL;
595 struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
596
597 if (!sk2)
598 goto do_err;
599
600 lock_sock(sk2);
601
602 BUG_TRAP((1 << sk2->sk_state) &
603 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
604
605 sock_graft(sk2, newsock);
606
607 newsock->state = SS_CONNECTED;
608 err = 0;
609 release_sock(sk2);
610do_err:
611 return err;
612}
613
614
615/*
616 * This does both peername and sockname.
617 */
618int inet_getname(struct socket *sock, struct sockaddr *uaddr,
619 int *uaddr_len, int peer)
620{
621 struct sock *sk = sock->sk;
622 struct inet_sock *inet = inet_sk(sk);
623 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
624
625 sin->sin_family = AF_INET;
626 if (peer) {
627 if (!inet->dport ||
628 (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
629 peer == 1))
630 return -ENOTCONN;
631 sin->sin_port = inet->dport;
632 sin->sin_addr.s_addr = inet->daddr;
633 } else {
634 __u32 addr = inet->rcv_saddr;
635 if (!addr)
636 addr = inet->saddr;
637 sin->sin_port = inet->sport;
638 sin->sin_addr.s_addr = addr;
639 }
640 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
641 *uaddr_len = sizeof(*sin);
642 return 0;
643}
644
645int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
646 size_t size)
647{
648 struct sock *sk = sock->sk;
649
650 /* We may need to bind the socket. */
651 if (!inet_sk(sk)->num && inet_autobind(sk))
652 return -EAGAIN;
653
654 return sk->sk_prot->sendmsg(iocb, sk, msg, size);
655}
656
657
658static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
659{
660 struct sock *sk = sock->sk;
661
662 /* We may need to bind the socket. */
663 if (!inet_sk(sk)->num && inet_autobind(sk))
664 return -EAGAIN;
665
666 if (sk->sk_prot->sendpage)
667 return sk->sk_prot->sendpage(sk, page, offset, size, flags);
668 return sock_no_sendpage(sock, page, offset, size, flags);
669}
670
671
672int inet_shutdown(struct socket *sock, int how)
673{
674 struct sock *sk = sock->sk;
675 int err = 0;
676
677 /* This should really check to make sure
678 * the socket is a TCP socket. (WHY AC...)
679 */
680 how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
681 1->2 bit 2 snds.
682 2->3 */
683 if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */
684 return -EINVAL;
685
686 lock_sock(sk);
687 if (sock->state == SS_CONNECTING) {
688 if ((1 << sk->sk_state) &
689 (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
690 sock->state = SS_DISCONNECTING;
691 else
692 sock->state = SS_CONNECTED;
693 }
694
695 switch (sk->sk_state) {
696 case TCP_CLOSE:
697 err = -ENOTCONN;
698 /* Hack to wake up other listeners, who can poll for
699 POLLHUP, even on eg. unconnected UDP sockets -- RR */
700 default:
701 sk->sk_shutdown |= how;
702 if (sk->sk_prot->shutdown)
703 sk->sk_prot->shutdown(sk, how);
704 break;
705
706 /* Remaining two branches are temporary solution for missing
707 * close() in multithreaded environment. It is _not_ a good idea,
708 * but we have no choice until close() is repaired at VFS level.
709 */
710 case TCP_LISTEN:
711 if (!(how & RCV_SHUTDOWN))
712 break;
713 /* Fall through */
714 case TCP_SYN_SENT:
715 err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
716 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
717 break;
718 }
719
720 /* Wake up anyone sleeping in poll. */
721 sk->sk_state_change(sk);
722 release_sock(sk);
723 return err;
724}
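
The how++ trick above maps the userspace constants SHUT_RD (0), SHUT_WR (1) and SHUT_RDWR (2) onto the RCV_SHUTDOWN/SEND_SHUTDOWN bit masks (1, 2, 3). Its most common use is the TCP half-close; a brief illustrative sketch (the helper name is made up):

/* Illustrative only: half-close a connected TCP socket, then drain
 * whatever the peer still has to send. */
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t send_then_drain(int fd, const void *req, size_t len)
{
        char buf[4096];
        ssize_t n, total = 0;

        if (send(fd, req, len, 0) < 0)
                return -1;

        /* SHUT_WR is 1; after the how++ mapping it becomes SEND_SHUTDOWN:
         * our FIN goes out, the receive side stays open. */
        if (shutdown(fd, SHUT_WR) < 0)
                return -1;

        /* Read until the peer closes its side as well. */
        while ((n = recv(fd, buf, sizeof(buf), 0)) > 0)
                total += n;
        return n < 0 ? -1 : total;
}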
725
726/*
727 * ioctl() calls you can issue on an INET socket. Most of these are
728 * device configuration and stuff and very rarely used. Some ioctls
729 * pass on to the socket itself.
730 *
731 * NOTE: I like the idea of a module for the config stuff. ie ifconfig
732 * loads the devconfigure module does its configuring and unloads it.
733 * There's a good 20K of config code hanging around the kernel.
734 */
735
736int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
737{
738 struct sock *sk = sock->sk;
739 int err = 0;
740
741 switch (cmd) {
742 case SIOCGSTAMP:
743 err = sock_get_timestamp(sk, (struct timeval __user *)arg);
744 break;
745 case SIOCADDRT:
746 case SIOCDELRT:
747 case SIOCRTMSG:
748 err = ip_rt_ioctl(cmd, (void __user *)arg);
749 break;
750 case SIOCDARP:
751 case SIOCGARP:
752 case SIOCSARP:
753 err = arp_ioctl(cmd, (void __user *)arg);
754 break;
755 case SIOCGIFADDR:
756 case SIOCSIFADDR:
757 case SIOCGIFBRDADDR:
758 case SIOCSIFBRDADDR:
759 case SIOCGIFNETMASK:
760 case SIOCSIFNETMASK:
761 case SIOCGIFDSTADDR:
762 case SIOCSIFDSTADDR:
763 case SIOCSIFPFLAGS:
764 case SIOCGIFPFLAGS:
765 case SIOCSIFFLAGS:
766 err = devinet_ioctl(cmd, (void __user *)arg);
767 break;
768 default:
769 if (!sk->sk_prot->ioctl ||
770 (err = sk->sk_prot->ioctl(sk, cmd, arg)) ==
771 -ENOIOCTLCMD)
772 err = dev_ioctl(cmd, (void __user *)arg);
773 break;
774 }
775 return err;
776}
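
Most of the ioctls dispatched above (SIOCGIFADDR and friends) are the classic interface-configuration calls that ifconfig issues on an ordinary INET socket. A small illustrative sketch, not part of this file; the interface name "eth0" is only an example:

/* Illustrative only: fetch an interface's IPv4 address through the
 * devinet_ioctl() path above. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>

int main(void)
{
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);

        if (ioctl(fd, SIOCGIFADDR, &ifr) == 0) {
                struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
                printf("eth0: %s\n", inet_ntoa(sin->sin_addr));
        }
        close(fd);
        return 0;
}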
777
778struct proto_ops inet_stream_ops = {
779 .family = PF_INET,
780 .owner = THIS_MODULE,
781 .release = inet_release,
782 .bind = inet_bind,
783 .connect = inet_stream_connect,
784 .socketpair = sock_no_socketpair,
785 .accept = inet_accept,
786 .getname = inet_getname,
787 .poll = tcp_poll,
788 .ioctl = inet_ioctl,
789 .listen = inet_listen,
790 .shutdown = inet_shutdown,
791 .setsockopt = sock_common_setsockopt,
792 .getsockopt = sock_common_getsockopt,
793 .sendmsg = inet_sendmsg,
794 .recvmsg = sock_common_recvmsg,
795 .mmap = sock_no_mmap,
796 .sendpage = tcp_sendpage
797};
798
799struct proto_ops inet_dgram_ops = {
800 .family = PF_INET,
801 .owner = THIS_MODULE,
802 .release = inet_release,
803 .bind = inet_bind,
804 .connect = inet_dgram_connect,
805 .socketpair = sock_no_socketpair,
806 .accept = sock_no_accept,
807 .getname = inet_getname,
808 .poll = udp_poll,
809 .ioctl = inet_ioctl,
810 .listen = sock_no_listen,
811 .shutdown = inet_shutdown,
812 .setsockopt = sock_common_setsockopt,
813 .getsockopt = sock_common_getsockopt,
814 .sendmsg = inet_sendmsg,
815 .recvmsg = sock_common_recvmsg,
816 .mmap = sock_no_mmap,
817 .sendpage = inet_sendpage,
818};
819
820/*
821 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
822 * udp_poll
823 */
824static struct proto_ops inet_sockraw_ops = {
825 .family = PF_INET,
826 .owner = THIS_MODULE,
827 .release = inet_release,
828 .bind = inet_bind,
829 .connect = inet_dgram_connect,
830 .socketpair = sock_no_socketpair,
831 .accept = sock_no_accept,
832 .getname = inet_getname,
833 .poll = datagram_poll,
834 .ioctl = inet_ioctl,
835 .listen = sock_no_listen,
836 .shutdown = inet_shutdown,
837 .setsockopt = sock_common_setsockopt,
838 .getsockopt = sock_common_getsockopt,
839 .sendmsg = inet_sendmsg,
840 .recvmsg = sock_common_recvmsg,
841 .mmap = sock_no_mmap,
842 .sendpage = inet_sendpage,
843};
844
845static struct net_proto_family inet_family_ops = {
846 .family = PF_INET,
847 .create = inet_create,
848 .owner = THIS_MODULE,
849};
850
851
852extern void tcp_init(void);
853extern void tcp_v4_init(struct net_proto_family *);
854
855/* Upon startup we insert all the elements in inetsw_array[] into
856 * the linked list inetsw.
857 */
858static struct inet_protosw inetsw_array[] =
859{
860 {
861 .type = SOCK_STREAM,
862 .protocol = IPPROTO_TCP,
863 .prot = &tcp_prot,
864 .ops = &inet_stream_ops,
865 .capability = -1,
866 .no_check = 0,
867 .flags = INET_PROTOSW_PERMANENT,
868 },
869
870 {
871 .type = SOCK_DGRAM,
872 .protocol = IPPROTO_UDP,
873 .prot = &udp_prot,
874 .ops = &inet_dgram_ops,
875 .capability = -1,
876 .no_check = UDP_CSUM_DEFAULT,
877 .flags = INET_PROTOSW_PERMANENT,
878 },
879
880
881 {
882 .type = SOCK_RAW,
883 .protocol = IPPROTO_IP, /* wild card */
884 .prot = &raw_prot,
885 .ops = &inet_sockraw_ops,
886 .capability = CAP_NET_RAW,
887 .no_check = UDP_CSUM_DEFAULT,
888 .flags = INET_PROTOSW_REUSE,
889 }
890};
891
892#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw))
893
894void inet_register_protosw(struct inet_protosw *p)
895{
896 struct list_head *lh;
897 struct inet_protosw *answer;
898 int protocol = p->protocol;
899 struct list_head *last_perm;
900
901 spin_lock_bh(&inetsw_lock);
902
903 if (p->type >= SOCK_MAX)
904 goto out_illegal;
905
906 /* If we are trying to override a permanent protocol, bail. */
907 answer = NULL;
908 last_perm = &inetsw[p->type];
909 list_for_each(lh, &inetsw[p->type]) {
910 answer = list_entry(lh, struct inet_protosw, list);
911
912 /* Check only the non-wild match. */
913 if (INET_PROTOSW_PERMANENT & answer->flags) {
914 if (protocol == answer->protocol)
915 break;
916 last_perm = lh;
917 }
918
919 answer = NULL;
920 }
921 if (answer)
922 goto out_permanent;
923
924 /* Add the new entry after the last permanent entry if any, so that
925 * the new entry does not override a permanent entry when matched with
926 * a wild-card protocol. But it is allowed to override any existing
927 * non-permanent entry. This means that when we remove this entry, the
928 * system automatically returns to the old behavior.
929 */
930 list_add_rcu(&p->list, last_perm);
931out:
932 spin_unlock_bh(&inetsw_lock);
933
934 synchronize_net();
935
936 return;
937
938out_permanent:
939 printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
940 protocol);
941 goto out;
942
943out_illegal:
944 printk(KERN_ERR
945 "Ignoring attempt to register invalid socket type %d.\n",
946 p->type);
947 goto out;
948}
949
950void inet_unregister_protosw(struct inet_protosw *p)
951{
952 if (INET_PROTOSW_PERMANENT & p->flags) {
953 printk(KERN_ERR
954 "Attempt to unregister permanent protocol %d.\n",
955 p->protocol);
956 } else {
957 spin_lock_bh(&inetsw_lock);
958 list_del_rcu(&p->list);
959 spin_unlock_bh(&inetsw_lock);
960
961 synchronize_net();
962 }
963}
964
965#ifdef CONFIG_IP_MULTICAST
966static struct net_protocol igmp_protocol = {
967 .handler = igmp_rcv,
968};
969#endif
970
971static struct net_protocol tcp_protocol = {
972 .handler = tcp_v4_rcv,
973 .err_handler = tcp_v4_err,
974 .no_policy = 1,
975};
976
977static struct net_protocol udp_protocol = {
978 .handler = udp_rcv,
979 .err_handler = udp_err,
980 .no_policy = 1,
981};
982
983static struct net_protocol icmp_protocol = {
984 .handler = icmp_rcv,
985};
986
987static int __init init_ipv4_mibs(void)
988{
989 net_statistics[0] = alloc_percpu(struct linux_mib);
990 net_statistics[1] = alloc_percpu(struct linux_mib);
991 ip_statistics[0] = alloc_percpu(struct ipstats_mib);
992 ip_statistics[1] = alloc_percpu(struct ipstats_mib);
993 icmp_statistics[0] = alloc_percpu(struct icmp_mib);
994 icmp_statistics[1] = alloc_percpu(struct icmp_mib);
995 tcp_statistics[0] = alloc_percpu(struct tcp_mib);
996 tcp_statistics[1] = alloc_percpu(struct tcp_mib);
997 udp_statistics[0] = alloc_percpu(struct udp_mib);
998 udp_statistics[1] = alloc_percpu(struct udp_mib);
 999 if (!(net_statistics[0] && net_statistics[1] && ip_statistics[0] &&
1000 ip_statistics[1] && icmp_statistics[0] && icmp_statistics[1] &&
1001 tcp_statistics[0] && tcp_statistics[1] && udp_statistics[0] &&
1002 udp_statistics[1]))
1003 return -ENOMEM;
1004
1005 (void) tcp_mib_init();
1006
1007 return 0;
1008}
1009
1010static int ipv4_proc_init(void);
1011extern void ipfrag_init(void);
1012
1013static int __init inet_init(void)
1014{
1015 struct sk_buff *dummy_skb;
1016 struct inet_protosw *q;
1017 struct list_head *r;
1018 int rc = -EINVAL;
1019
1020 if (sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)) {
1021 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
1022 goto out;
1023 }
1024
1025 rc = proto_register(&tcp_prot, 1);
1026 if (rc)
1027 goto out;
1028
1029 rc = proto_register(&udp_prot, 1);
1030 if (rc)
1031 goto out_unregister_tcp_proto;
1032
1033 rc = proto_register(&raw_prot, 1);
1034 if (rc)
1035 goto out_unregister_udp_proto;
1036
1037 /*
1038 * Tell SOCKET that we are alive...
1039 */
1040
1041 (void)sock_register(&inet_family_ops);
1042
1043 /*
1044 * Add all the base protocols.
1045 */
1046
1047 if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
1048 printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
1049 if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
1050 printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
1051 if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
1052 printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
1053#ifdef CONFIG_IP_MULTICAST
1054 if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
1055 printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
1056#endif
1057
1058 /* Register the socket-side information for inet_create. */
1059 for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
1060 INIT_LIST_HEAD(r);
1061
1062 for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
1063 inet_register_protosw(q);
1064
1065 /*
1066 * Set the ARP module up
1067 */
1068
1069 arp_init();
1070
1071 /*
1072 * Set the IP module up
1073 */
1074
1075 ip_init();
1076
1077 tcp_v4_init(&inet_family_ops);
1078
1079 /* Setup TCP slab cache for open requests. */
1080 tcp_init();
1081
1082
1083 /*
1084 * Set the ICMP layer up
1085 */
1086
1087 icmp_init(&inet_family_ops);
1088
1089 /*
1090 * Initialise the multicast router
1091 */
1092#if defined(CONFIG_IP_MROUTE)
1093 ip_mr_init();
1094#endif
1095 /*
1096 * Initialise per-cpu ipv4 mibs
1097 */
1098
1099 if (init_ipv4_mibs())
1100 printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
1101
1102 ipv4_proc_init();
1103
1104 ipfrag_init();
1105
1106 rc = 0;
1107out:
1108 return rc;
1109out_unregister_tcp_proto:
1110 proto_unregister(&tcp_prot);
1111out_unregister_udp_proto:
1112 proto_unregister(&udp_prot);
1113 goto out;
1114}
1115
1116module_init(inet_init);
1117
1118/* ------------------------------------------------------------------------ */
1119
1120#ifdef CONFIG_PROC_FS
1121extern int fib_proc_init(void);
1122extern void fib_proc_exit(void);
1123extern int ip_misc_proc_init(void);
1124extern int raw_proc_init(void);
1125extern void raw_proc_exit(void);
1126extern int tcp4_proc_init(void);
1127extern void tcp4_proc_exit(void);
1128extern int udp4_proc_init(void);
1129extern void udp4_proc_exit(void);
1130
1131static int __init ipv4_proc_init(void)
1132{
1133 int rc = 0;
1134
1135 if (raw_proc_init())
1136 goto out_raw;
1137 if (tcp4_proc_init())
1138 goto out_tcp;
1139 if (udp4_proc_init())
1140 goto out_udp;
1141 if (fib_proc_init())
1142 goto out_fib;
1143 if (ip_misc_proc_init())
1144 goto out_misc;
1145out:
1146 return rc;
1147out_misc:
1148 fib_proc_exit();
1149out_fib:
1150 udp4_proc_exit();
1151out_udp:
1152 tcp4_proc_exit();
1153out_tcp:
1154 raw_proc_exit();
1155out_raw:
1156 rc = -ENOMEM;
1157 goto out;
1158}
1159
1160#else /* CONFIG_PROC_FS */
1161static int __init ipv4_proc_init(void)
1162{
1163 return 0;
1164}
1165#endif /* CONFIG_PROC_FS */
1166
1167MODULE_ALIAS_NETPROTO(PF_INET);
1168
1169EXPORT_SYMBOL(inet_accept);
1170EXPORT_SYMBOL(inet_bind);
1171EXPORT_SYMBOL(inet_dgram_connect);
1172EXPORT_SYMBOL(inet_dgram_ops);
1173EXPORT_SYMBOL(inet_getname);
1174EXPORT_SYMBOL(inet_ioctl);
1175EXPORT_SYMBOL(inet_listen);
1176EXPORT_SYMBOL(inet_register_protosw);
1177EXPORT_SYMBOL(inet_release);
1178EXPORT_SYMBOL(inet_sendmsg);
1179EXPORT_SYMBOL(inet_shutdown);
1180EXPORT_SYMBOL(inet_sock_destruct);
1181EXPORT_SYMBOL(inet_stream_connect);
1182EXPORT_SYMBOL(inet_stream_ops);
1183EXPORT_SYMBOL(inet_unregister_protosw);
1184EXPORT_SYMBOL(net_statistics);
1185
1186#ifdef INET_REFCNT_DEBUG
1187EXPORT_SYMBOL(inet_sock_nr);
1188#endif
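
The protocol registration sequence in inet_init() above is also the pattern an out-of-tree transport would follow: fill in a struct net_protocol, point .handler at the receive routine, and pass it to inet_add_protocol() with the IP protocol number it should own. A minimal sketch under those assumptions (foo_rcv, foo_protocol and the experimental protocol number 253 are placeholders, not part of this tree):

static int foo_rcv(struct sk_buff *skb)
{
	/* A real handler would parse the payload; this sketch just drops it. */
	kfree_skb(skb);
	return 0;
}

static struct net_protocol foo_protocol = {
	.handler	= foo_rcv,
	.no_policy	= 1,
};

static int __init foo_init(void)
{
	/* inet_add_protocol() fails if another handler already owns the slot. */
	if (inet_add_protocol(&foo_protocol, 253) < 0)
		return -EAGAIN;
	return 0;
}
module_init(foo_init);
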
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
new file mode 100644
index 000000000000..0e98f2235b6e
--- /dev/null
+++ b/net/ipv4/ah4.c
@@ -0,0 +1,335 @@
1#include <linux/config.h>
2#include <linux/module.h>
3#include <net/ip.h>
4#include <net/xfrm.h>
5#include <net/ah.h>
6#include <linux/crypto.h>
7#include <linux/pfkeyv2.h>
8#include <net/icmp.h>
9#include <asm/scatterlist.h>
10
11
12/* Clear mutable options and find final destination to substitute
13 * into IP header for icv calculation. Options are already checked
14 * for validity, so paranoia is not required. */
15
16static int ip_clear_mutable_options(struct iphdr *iph, u32 *daddr)
17{
18 unsigned char * optptr = (unsigned char*)(iph+1);
19 int l = iph->ihl*4 - sizeof(struct iphdr);
20 int optlen;
21
22 while (l > 0) {
23 switch (*optptr) {
24 case IPOPT_END:
25 return 0;
26 case IPOPT_NOOP:
27 l--;
28 optptr++;
29 continue;
30 }
31 optlen = optptr[1];
32 if (optlen<2 || optlen>l)
33 return -EINVAL;
34 switch (*optptr) {
35 case IPOPT_SEC:
36 case 0x85: /* Some "Extended Security" crap. */
37 case 0x86: /* Another "Commercial Security" crap. */
38 case IPOPT_RA:
39 case 0x80|21: /* RFC1770 */
40 break;
41 case IPOPT_LSRR:
42 case IPOPT_SSRR:
43 if (optlen < 6)
44 return -EINVAL;
45 memcpy(daddr, optptr+optlen-4, 4);
46 /* Fall through */
47 default:
48 memset(optptr+2, 0, optlen-2);
49 }
50 l -= optlen;
51 optptr += optlen;
52 }
53 return 0;
54}
55
56static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
57{
58 int err;
59 struct iphdr *iph, *top_iph;
60 struct ip_auth_hdr *ah;
61 struct ah_data *ahp;
62 union {
63 struct iphdr iph;
64 char buf[60];
65 } tmp_iph;
66
67 top_iph = skb->nh.iph;
68 iph = &tmp_iph.iph;
69
70 iph->tos = top_iph->tos;
71 iph->ttl = top_iph->ttl;
72 iph->frag_off = top_iph->frag_off;
73
74 if (top_iph->ihl != 5) {
75 iph->daddr = top_iph->daddr;
76 memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
77 err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
78 if (err)
79 goto error;
80 }
81
82 ah = (struct ip_auth_hdr *)((char *)top_iph+top_iph->ihl*4);
83 ah->nexthdr = top_iph->protocol;
84
85 top_iph->tos = 0;
86 top_iph->tot_len = htons(skb->len);
87 top_iph->frag_off = 0;
88 top_iph->ttl = 0;
89 top_iph->protocol = IPPROTO_AH;
90 top_iph->check = 0;
91
92 ahp = x->data;
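	/* The AH "payload length" field is the header size in 32-bit words
	 * minus 2 (RFC 2402).  For example, assuming HMAC-SHA1-96 (12-byte
	 * base header plus a 12-byte truncated ICV, already 8-byte aligned):
	 * (24 >> 2) - 2 == 4.
	 */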
93 ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
94 ahp->icv_trunc_len) >> 2) - 2;
95
96 ah->reserved = 0;
97 ah->spi = x->id.spi;
98 ah->seq_no = htonl(++x->replay.oseq);
99 ahp->icv(ahp, skb, ah->auth_data);
100
101 top_iph->tos = iph->tos;
102 top_iph->ttl = iph->ttl;
103 top_iph->frag_off = iph->frag_off;
104 if (top_iph->ihl != 5) {
105 top_iph->daddr = iph->daddr;
106 memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
107 }
108
109 ip_send_check(top_iph);
110
111 err = 0;
112
113error:
114 return err;
115}
116
117static int ah_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
118{
119 int ah_hlen;
120 struct iphdr *iph;
121 struct ip_auth_hdr *ah;
122 struct ah_data *ahp;
123 char work_buf[60];
124
125 if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
126 goto out;
127
128 ah = (struct ip_auth_hdr*)skb->data;
129 ahp = x->data;
130 ah_hlen = (ah->hdrlen + 2) << 2;
131
132 if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) &&
133 ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len))
134 goto out;
135
136 if (!pskb_may_pull(skb, ah_hlen))
137 goto out;
138
 139	/* We are going to _remove_ the AH header to keep sockets happy,
 140	 * so... Later this can change. */
141 if (skb_cloned(skb) &&
142 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
143 goto out;
144
145 skb->ip_summed = CHECKSUM_NONE;
146
147 ah = (struct ip_auth_hdr*)skb->data;
148 iph = skb->nh.iph;
149
150 memcpy(work_buf, iph, iph->ihl*4);
151
152 iph->ttl = 0;
153 iph->tos = 0;
154 iph->frag_off = 0;
155 iph->check = 0;
156 if (iph->ihl != 5) {
157 u32 dummy;
158 if (ip_clear_mutable_options(iph, &dummy))
159 goto out;
160 }
161 {
162 u8 auth_data[MAX_AH_AUTH_LEN];
163
164 memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
165 skb_push(skb, skb->data - skb->nh.raw);
166 ahp->icv(ahp, skb, ah->auth_data);
167 if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
168 x->stats.integrity_failed++;
169 goto out;
170 }
171 }
172 ((struct iphdr*)work_buf)->protocol = ah->nexthdr;
173 skb->nh.raw = skb_pull(skb, ah_hlen);
174 memcpy(skb->nh.raw, work_buf, iph->ihl*4);
175 skb->nh.iph->tot_len = htons(skb->len);
176 skb_pull(skb, skb->nh.iph->ihl*4);
177 skb->h.raw = skb->data;
178
179 return 0;
180
181out:
182 return -EINVAL;
183}
184
185static void ah4_err(struct sk_buff *skb, u32 info)
186{
187 struct iphdr *iph = (struct iphdr*)skb->data;
188 struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2));
189 struct xfrm_state *x;
190
191 if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
192 skb->h.icmph->code != ICMP_FRAG_NEEDED)
193 return;
194
195 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
196 if (!x)
197 return;
198 printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
199 ntohl(ah->spi), ntohl(iph->daddr));
200 xfrm_state_put(x);
201}
202
203static int ah_init_state(struct xfrm_state *x, void *args)
204{
205 struct ah_data *ahp = NULL;
206 struct xfrm_algo_desc *aalg_desc;
207
208 if (!x->aalg)
209 goto error;
210
211 /* null auth can use a zero length key */
212 if (x->aalg->alg_key_len > 512)
213 goto error;
214
215 if (x->encap)
216 goto error;
217
218 ahp = kmalloc(sizeof(*ahp), GFP_KERNEL);
219 if (ahp == NULL)
220 return -ENOMEM;
221
222 memset(ahp, 0, sizeof(*ahp));
223
224 ahp->key = x->aalg->alg_key;
225 ahp->key_len = (x->aalg->alg_key_len+7)/8;
226 ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
227 if (!ahp->tfm)
228 goto error;
229 ahp->icv = ah_hmac_digest;
230
231 /*
232 * Lookup the algorithm description maintained by xfrm_algo,
233 * verify crypto transform properties, and store information
234 * we need for AH processing. This lookup cannot fail here
235 * after a successful crypto_alloc_tfm().
236 */
237 aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
238 BUG_ON(!aalg_desc);
239
240 if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
241 crypto_tfm_alg_digestsize(ahp->tfm)) {
242 printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
243 x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm),
244 aalg_desc->uinfo.auth.icv_fullbits/8);
245 goto error;
246 }
247
248 ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
249 ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
250
251 BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
252
253 ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
254 if (!ahp->work_icv)
255 goto error;
256
257 x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len);
258 if (x->props.mode)
259 x->props.header_len += sizeof(struct iphdr);
260 x->data = ahp;
261
262 return 0;
263
264error:
265 if (ahp) {
266 if (ahp->work_icv)
267 kfree(ahp->work_icv);
268 if (ahp->tfm)
269 crypto_free_tfm(ahp->tfm);
270 kfree(ahp);
271 }
272 return -EINVAL;
273}
274
275static void ah_destroy(struct xfrm_state *x)
276{
277 struct ah_data *ahp = x->data;
278
279 if (!ahp)
280 return;
281
282 if (ahp->work_icv) {
283 kfree(ahp->work_icv);
284 ahp->work_icv = NULL;
285 }
286 if (ahp->tfm) {
287 crypto_free_tfm(ahp->tfm);
288 ahp->tfm = NULL;
289 }
290 kfree(ahp);
291}
292
293
294static struct xfrm_type ah_type =
295{
296 .description = "AH4",
297 .owner = THIS_MODULE,
298 .proto = IPPROTO_AH,
299 .init_state = ah_init_state,
300 .destructor = ah_destroy,
301 .input = ah_input,
302 .output = ah_output
303};
304
305static struct net_protocol ah4_protocol = {
306 .handler = xfrm4_rcv,
307 .err_handler = ah4_err,
308 .no_policy = 1,
309};
310
311static int __init ah4_init(void)
312{
313 if (xfrm_register_type(&ah_type, AF_INET) < 0) {
314 printk(KERN_INFO "ip ah init: can't add xfrm type\n");
315 return -EAGAIN;
316 }
317 if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
318 printk(KERN_INFO "ip ah init: can't add protocol\n");
319 xfrm_unregister_type(&ah_type, AF_INET);
320 return -EAGAIN;
321 }
322 return 0;
323}
324
325static void __exit ah4_fini(void)
326{
327 if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
328 printk(KERN_INFO "ip ah close: can't remove protocol\n");
329 if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
330 printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
331}
332
333module_init(ah4_init);
334module_exit(ah4_fini);
335MODULE_LICENSE("GPL");
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
new file mode 100644
index 000000000000..a642fd612853
--- /dev/null
+++ b/net/ipv4/arp.c
@@ -0,0 +1,1425 @@
1/* linux/net/inet/arp.c
2 *
3 * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $
4 *
5 * Copyright (C) 1994 by Florian La Roche
6 *
7 * This module implements the Address Resolution Protocol ARP (RFC 826),
8 * which is used to convert IP addresses (or in the future maybe other
9 * high-level addresses) into a low-level hardware address (like an Ethernet
10 * address).
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * Fixes:
18 * Alan Cox : Removed the Ethernet assumptions in
19 * Florian's code
20 * Alan Cox : Fixed some small errors in the ARP
21 * logic
22 * Alan Cox : Allow >4K in /proc
23 * Alan Cox : Make ARP add its own protocol entry
24 * Ross Martin : Rewrote arp_rcv() and arp_get_info()
25 * Stephen Henson : Add AX25 support to arp_get_info()
26 * Alan Cox : Drop data when a device is downed.
27 * Alan Cox : Use init_timer().
28 * Alan Cox : Double lock fixes.
29 * Martin Seine : Move the arphdr structure
 30 *		to if_arp.h for compatibility
 31 *		with BSD based programs.
32 * Andrew Tridgell : Added ARP netmask code and
33 * re-arranged proxy handling.
34 * Alan Cox : Changed to use notifiers.
35 * Niibe Yutaka : Reply for this device or proxies only.
36 * Alan Cox : Don't proxy across hardware types!
37 * Jonathan Naylor : Added support for NET/ROM.
38 * Mike Shaver : RFC1122 checks.
39 * Jonathan Naylor : Only lookup the hardware address for
40 * the correct hardware type.
41 * Germano Caronni : Assorted subtle races.
42 * Craig Schlenter : Don't modify permanent entry
43 * during arp_rcv.
44 * Russ Nelson : Tidied up a few bits.
45 * Alexey Kuznetsov: Major changes to caching and behaviour,
46 * eg intelligent arp probing and
47 * generation
48 * of host down events.
49 * Alan Cox : Missing unlock in device events.
50 * Eckes : ARP ioctl control errors.
51 * Alexey Kuznetsov: Arp free fix.
52 * Manuel Rodriguez: Gratuitous ARP.
53 * Jonathan Layes : Added arpd support through kerneld
54 * message queue (960314)
55 * Mike Shaver : /proc/sys/net/ipv4/arp_* support
56 * Mike McLagan : Routing by source
57 * Stuart Cheshire : Metricom and grat arp fixes
58 * *** FOR 2.1 clean this up ***
59 * Lawrence V. Stefani: (08/12/96) Added FDDI support.
60 * Alan Cox : Took the AP1000 nasty FDDI hack and
61 * folded into the mainstream FDDI code.
62 * Ack spit, Linus how did you allow that
63 * one in...
64 * Jes Sorensen : Make FDDI work again in 2.1.x and
65 * clean up the APFDDI & gen. FDDI bits.
66 * Alexey Kuznetsov: new arp state machine;
67 * now it is in net/core/neighbour.c.
68 * Krzysztof Halasa: Added Frame Relay ARP support.
69 * Arnaldo C. Melo : convert /proc/net/arp to seq_file
70 * Shmulik Hen: Split arp_send to arp_create and
71 * arp_xmit so intermediate drivers like
72 * bonding can change the skb before
73 * sending (e.g. insert 8021q tag).
74 * Harald Welte : convert to make use of jenkins hash
75 */
76
77#include <linux/module.h>
78#include <linux/types.h>
79#include <linux/string.h>
80#include <linux/kernel.h>
81#include <linux/sched.h>
82#include <linux/config.h>
83#include <linux/socket.h>
84#include <linux/sockios.h>
85#include <linux/errno.h>
86#include <linux/in.h>
87#include <linux/mm.h>
88#include <linux/inet.h>
89#include <linux/netdevice.h>
90#include <linux/etherdevice.h>
91#include <linux/fddidevice.h>
92#include <linux/if_arp.h>
93#include <linux/trdevice.h>
94#include <linux/skbuff.h>
95#include <linux/proc_fs.h>
96#include <linux/seq_file.h>
97#include <linux/stat.h>
98#include <linux/init.h>
99#include <linux/net.h>
100#include <linux/rcupdate.h>
101#include <linux/jhash.h>
102#ifdef CONFIG_SYSCTL
103#include <linux/sysctl.h>
104#endif
105
106#include <net/ip.h>
107#include <net/icmp.h>
108#include <net/route.h>
109#include <net/protocol.h>
110#include <net/tcp.h>
111#include <net/sock.h>
112#include <net/arp.h>
113#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
114#include <net/ax25.h>
115#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
116#include <net/netrom.h>
117#endif
118#endif
119#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
120#include <net/atmclip.h>
121struct neigh_table *clip_tbl_hook;
122#endif
123
124#include <asm/system.h>
125#include <asm/uaccess.h>
126
127#include <linux/netfilter_arp.h>
128
129/*
130 * Interface to generic neighbour cache.
131 */
132static u32 arp_hash(const void *pkey, const struct net_device *dev);
133static int arp_constructor(struct neighbour *neigh);
134static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
135static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
136static void parp_redo(struct sk_buff *skb);
137
138static struct neigh_ops arp_generic_ops = {
139 .family = AF_INET,
140 .solicit = arp_solicit,
141 .error_report = arp_error_report,
142 .output = neigh_resolve_output,
143 .connected_output = neigh_connected_output,
144 .hh_output = dev_queue_xmit,
145 .queue_xmit = dev_queue_xmit,
146};
147
148static struct neigh_ops arp_hh_ops = {
149 .family = AF_INET,
150 .solicit = arp_solicit,
151 .error_report = arp_error_report,
152 .output = neigh_resolve_output,
153 .connected_output = neigh_resolve_output,
154 .hh_output = dev_queue_xmit,
155 .queue_xmit = dev_queue_xmit,
156};
157
158static struct neigh_ops arp_direct_ops = {
159 .family = AF_INET,
160 .output = dev_queue_xmit,
161 .connected_output = dev_queue_xmit,
162 .hh_output = dev_queue_xmit,
163 .queue_xmit = dev_queue_xmit,
164};
165
166struct neigh_ops arp_broken_ops = {
167 .family = AF_INET,
168 .solicit = arp_solicit,
169 .error_report = arp_error_report,
170 .output = neigh_compat_output,
171 .connected_output = neigh_compat_output,
172 .hh_output = dev_queue_xmit,
173 .queue_xmit = dev_queue_xmit,
174};
175
176struct neigh_table arp_tbl = {
177 .family = AF_INET,
178 .entry_size = sizeof(struct neighbour) + 4,
179 .key_len = 4,
180 .hash = arp_hash,
181 .constructor = arp_constructor,
182 .proxy_redo = parp_redo,
183 .id = "arp_cache",
184 .parms = {
185 .tbl = &arp_tbl,
186 .base_reachable_time = 30 * HZ,
187 .retrans_time = 1 * HZ,
188 .gc_staletime = 60 * HZ,
189 .reachable_time = 30 * HZ,
190 .delay_probe_time = 5 * HZ,
191 .queue_len = 3,
192 .ucast_probes = 3,
193 .mcast_probes = 3,
194 .anycast_delay = 1 * HZ,
195 .proxy_delay = (8 * HZ) / 10,
196 .proxy_qlen = 64,
197 .locktime = 1 * HZ,
198 },
199 .gc_interval = 30 * HZ,
200 .gc_thresh1 = 128,
201 .gc_thresh2 = 512,
202 .gc_thresh3 = 1024,
203};
204
205int arp_mc_map(u32 addr, u8 *haddr, struct net_device *dev, int dir)
206{
207 switch (dev->type) {
208 case ARPHRD_ETHER:
209 case ARPHRD_FDDI:
210 case ARPHRD_IEEE802:
211 ip_eth_mc_map(addr, haddr);
212 return 0;
213 case ARPHRD_IEEE802_TR:
214 ip_tr_mc_map(addr, haddr);
215 return 0;
216 case ARPHRD_INFINIBAND:
217 ip_ib_mc_map(addr, haddr);
218 return 0;
219 default:
220 if (dir) {
221 memcpy(haddr, dev->broadcast, dev->addr_len);
222 return 0;
223 }
224 }
225 return -EINVAL;
226}
227
228
229static u32 arp_hash(const void *pkey, const struct net_device *dev)
230{
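	/* Mix the 32-bit IPv4 key with the interface index so the same
	 * address seen through different devices lands in different hash
	 * buckets; hash_rnd randomizes the layout per table. */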
231 return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
232}
233
234static int arp_constructor(struct neighbour *neigh)
235{
236 u32 addr = *(u32*)neigh->primary_key;
237 struct net_device *dev = neigh->dev;
238 struct in_device *in_dev;
239 struct neigh_parms *parms;
240
241 neigh->type = inet_addr_type(addr);
242
243 rcu_read_lock();
244 in_dev = rcu_dereference(__in_dev_get(dev));
245 if (in_dev == NULL) {
246 rcu_read_unlock();
247 return -EINVAL;
248 }
249
250 parms = in_dev->arp_parms;
251 __neigh_parms_put(neigh->parms);
252 neigh->parms = neigh_parms_clone(parms);
253 rcu_read_unlock();
254
255 if (dev->hard_header == NULL) {
256 neigh->nud_state = NUD_NOARP;
257 neigh->ops = &arp_direct_ops;
258 neigh->output = neigh->ops->queue_xmit;
259 } else {
260 /* Good devices (checked by reading texts, but only Ethernet is
261 tested)
262
263 ARPHRD_ETHER: (ethernet, apfddi)
264 ARPHRD_FDDI: (fddi)
265 ARPHRD_IEEE802: (tr)
266 ARPHRD_METRICOM: (strip)
267 ARPHRD_ARCNET:
268 etc. etc. etc.
269
270 ARPHRD_IPDDP will also work, if author repairs it.
 271	   I did not fix it, because this driver does not work even
 272	   in the old paradigm.
273 */
274
275#if 1
276 /* So... these "amateur" devices are hopeless.
 277	   The only thing that I can say now is:
 278	   it is very sad that we need to keep ugly obsolete
 279	   code to make them happy.
 280
 281	   They should be moved to a more reasonable state; right now
 282	   they use rebuild_header INSTEAD OF hard_start_xmit!!!
 283	   Besides that, they are sort of out of date
 284	   (a lot of redundant clones/copies, useless in 2.1),
 285	   and I wonder why people believe that they work.
286 */
287 switch (dev->type) {
288 default:
289 break;
290 case ARPHRD_ROSE:
291#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
292 case ARPHRD_AX25:
293#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
294 case ARPHRD_NETROM:
295#endif
296 neigh->ops = &arp_broken_ops;
297 neigh->output = neigh->ops->output;
298 return 0;
299#endif
300 ;}
301#endif
302 if (neigh->type == RTN_MULTICAST) {
303 neigh->nud_state = NUD_NOARP;
304 arp_mc_map(addr, neigh->ha, dev, 1);
305 } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
306 neigh->nud_state = NUD_NOARP;
307 memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
308 } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
309 neigh->nud_state = NUD_NOARP;
310 memcpy(neigh->ha, dev->broadcast, dev->addr_len);
311 }
312 if (dev->hard_header_cache)
313 neigh->ops = &arp_hh_ops;
314 else
315 neigh->ops = &arp_generic_ops;
316 if (neigh->nud_state&NUD_VALID)
317 neigh->output = neigh->ops->connected_output;
318 else
319 neigh->output = neigh->ops->output;
320 }
321 return 0;
322}
323
324static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
325{
326 dst_link_failure(skb);
327 kfree_skb(skb);
328}
329
330static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
331{
332 u32 saddr = 0;
333 u8 *dst_ha = NULL;
334 struct net_device *dev = neigh->dev;
335 u32 target = *(u32*)neigh->primary_key;
336 int probes = atomic_read(&neigh->probes);
337 struct in_device *in_dev = in_dev_get(dev);
338
339 if (!in_dev)
340 return;
341
342 switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
343 default:
344 case 0: /* By default announce any local IP */
345 if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL)
346 saddr = skb->nh.iph->saddr;
347 break;
348 case 1: /* Restrict announcements of saddr in same subnet */
349 if (!skb)
350 break;
351 saddr = skb->nh.iph->saddr;
352 if (inet_addr_type(saddr) == RTN_LOCAL) {
353 /* saddr should be known to target */
354 if (inet_addr_onlink(in_dev, target, saddr))
355 break;
356 }
357 saddr = 0;
358 break;
359 case 2: /* Avoid secondary IPs, get a primary/preferred one */
360 break;
361 }
362
363 if (in_dev)
364 in_dev_put(in_dev);
365 if (!saddr)
366 saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
367
368 if ((probes -= neigh->parms->ucast_probes) < 0) {
369 if (!(neigh->nud_state&NUD_VALID))
370 printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
371 dst_ha = neigh->ha;
372 read_lock_bh(&neigh->lock);
373 } else if ((probes -= neigh->parms->app_probes) < 0) {
374#ifdef CONFIG_ARPD
375 neigh_app_ns(neigh);
376#endif
377 return;
378 }
379
380 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
381 dst_ha, dev->dev_addr, NULL);
382 if (dst_ha)
383 read_unlock_bh(&neigh->lock);
384}
385
386static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
387 u32 sip, u32 tip)
388{
389 int scope;
390
391 switch (IN_DEV_ARP_IGNORE(in_dev)) {
392 case 0: /* Reply, the tip is already validated */
393 return 0;
394 case 1: /* Reply only if tip is configured on the incoming interface */
395 sip = 0;
396 scope = RT_SCOPE_HOST;
397 break;
398 case 2: /*
399 * Reply only if tip is configured on the incoming interface
400 * and is in same subnet as sip
401 */
402 scope = RT_SCOPE_HOST;
403 break;
404 case 3: /* Do not reply for scope host addresses */
405 sip = 0;
406 scope = RT_SCOPE_LINK;
407 dev = NULL;
408 break;
409 case 4: /* Reserved */
410 case 5:
411 case 6:
412 case 7:
413 return 0;
414 case 8: /* Do not reply */
415 return 1;
416 default:
417 return 0;
418 }
419 return !inet_confirm_addr(dev, sip, tip, scope);
420}
421
422static int arp_filter(__u32 sip, __u32 tip, struct net_device *dev)
423{
424 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
425 .saddr = tip } } };
426 struct rtable *rt;
427 int flag = 0;
428 /*unsigned long now; */
429
430 if (ip_route_output_key(&rt, &fl) < 0)
431 return 1;
432 if (rt->u.dst.dev != dev) {
433 NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
434 flag = 1;
435 }
436 ip_rt_put(rt);
437 return flag;
438}
439
440/* OBSOLETE FUNCTIONS */
441
442/*
443 * Find an arp mapping in the cache. If not found, post a request.
444 *
 445 * This is a very UGLY routine: it DOES NOT use skb->dst->neighbour,
 446 * even if it exists. It is assumed that skb->dev was mangled
 447 * by a virtual device (eql, shaper). Nobody but broken devices
 448 * is allowed to use this function; it is scheduled to be removed. --ANK
449 */
450
451static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct net_device * dev)
452{
453 switch (addr_hint) {
454 case RTN_LOCAL:
455 printk(KERN_DEBUG "ARP: arp called for own IP address\n");
456 memcpy(haddr, dev->dev_addr, dev->addr_len);
457 return 1;
458 case RTN_MULTICAST:
459 arp_mc_map(paddr, haddr, dev, 1);
460 return 1;
461 case RTN_BROADCAST:
462 memcpy(haddr, dev->broadcast, dev->addr_len);
463 return 1;
464 }
465 return 0;
466}
467
468
469int arp_find(unsigned char *haddr, struct sk_buff *skb)
470{
471 struct net_device *dev = skb->dev;
472 u32 paddr;
473 struct neighbour *n;
474
475 if (!skb->dst) {
476 printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
477 kfree_skb(skb);
478 return 1;
479 }
480
481 paddr = ((struct rtable*)skb->dst)->rt_gateway;
482
483 if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev))
484 return 0;
485
486 n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
487
488 if (n) {
489 n->used = jiffies;
490 if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
491 read_lock_bh(&n->lock);
492 memcpy(haddr, n->ha, dev->addr_len);
493 read_unlock_bh(&n->lock);
494 neigh_release(n);
495 return 0;
496 }
497 neigh_release(n);
498 } else
499 kfree_skb(skb);
500 return 1;
501}
502
503/* END OF OBSOLETE FUNCTIONS */
504
505int arp_bind_neighbour(struct dst_entry *dst)
506{
507 struct net_device *dev = dst->dev;
508 struct neighbour *n = dst->neighbour;
509
510 if (dev == NULL)
511 return -EINVAL;
512 if (n == NULL) {
513 u32 nexthop = ((struct rtable*)dst)->rt_gateway;
514 if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
515 nexthop = 0;
516 n = __neigh_lookup_errno(
517#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
518 dev->type == ARPHRD_ATM ? clip_tbl_hook :
519#endif
520 &arp_tbl, &nexthop, dev);
521 if (IS_ERR(n))
522 return PTR_ERR(n);
523 dst->neighbour = n;
524 }
525 return 0;
526}
527
528/*
529 * Check if we can use proxy ARP for this path
530 */
531
532static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
533{
534 struct in_device *out_dev;
535 int imi, omi = -1;
536
537 if (!IN_DEV_PROXY_ARP(in_dev))
538 return 0;
539
540 if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
541 return 1;
542 if (imi == -1)
543 return 0;
544
545 /* place to check for proxy_arp for routes */
546
547 if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) {
548 omi = IN_DEV_MEDIUM_ID(out_dev);
549 in_dev_put(out_dev);
550 }
551 return (omi != imi && omi != -1);
552}
553
554/*
555 * Interface to link layer: send routine and receive handler.
556 */
557
558/*
559 * Create an arp packet. If (dest_hw == NULL), we create a broadcast
560 * message.
561 */
562struct sk_buff *arp_create(int type, int ptype, u32 dest_ip,
563 struct net_device *dev, u32 src_ip,
564 unsigned char *dest_hw, unsigned char *src_hw,
565 unsigned char *target_hw)
566{
567 struct sk_buff *skb;
568 struct arphdr *arp;
569 unsigned char *arp_ptr;
570
571 /*
572 * Allocate a buffer
573 */
574
575 skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4)
576 + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
577 if (skb == NULL)
578 return NULL;
579
580 skb_reserve(skb, LL_RESERVED_SPACE(dev));
581 skb->nh.raw = skb->data;
582 arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4));
583 skb->dev = dev;
584 skb->protocol = htons(ETH_P_ARP);
585 if (src_hw == NULL)
586 src_hw = dev->dev_addr;
587 if (dest_hw == NULL)
588 dest_hw = dev->broadcast;
589
590 /*
591 * Fill the device header for the ARP frame
592 */
593 if (dev->hard_header &&
594 dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0)
595 goto out;
596
597 /*
598 * Fill out the arp protocol part.
599 *
600 * The arp hardware type should match the device type, except for FDDI,
601 * which (according to RFC 1390) should always equal 1 (Ethernet).
602 */
603 /*
604 * Exceptions everywhere. AX.25 uses the AX.25 PID value not the
605 * DIX code for the protocol. Make these device structure fields.
606 */
607 switch (dev->type) {
608 default:
609 arp->ar_hrd = htons(dev->type);
610 arp->ar_pro = htons(ETH_P_IP);
611 break;
612
613#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
614 case ARPHRD_AX25:
615 arp->ar_hrd = htons(ARPHRD_AX25);
616 arp->ar_pro = htons(AX25_P_IP);
617 break;
618
619#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
620 case ARPHRD_NETROM:
621 arp->ar_hrd = htons(ARPHRD_NETROM);
622 arp->ar_pro = htons(AX25_P_IP);
623 break;
624#endif
625#endif
626
627#ifdef CONFIG_FDDI
628 case ARPHRD_FDDI:
629 arp->ar_hrd = htons(ARPHRD_ETHER);
630 arp->ar_pro = htons(ETH_P_IP);
631 break;
632#endif
633#ifdef CONFIG_TR
634 case ARPHRD_IEEE802_TR:
635 arp->ar_hrd = htons(ARPHRD_IEEE802);
636 arp->ar_pro = htons(ETH_P_IP);
637 break;
638#endif
639 }
640
641 arp->ar_hln = dev->addr_len;
642 arp->ar_pln = 4;
643 arp->ar_op = htons(type);
644
645 arp_ptr=(unsigned char *)(arp+1);
646
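	/* The variable-length body filled in below is: sender hw address,
	 * sender IP, target hw address, target IP.  For Ethernet
	 * (addr_len == 6) that is 6+4+6+4 = 20 bytes after the 8-byte
	 * arphdr, i.e. a 28-byte ARP message. */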
647 memcpy(arp_ptr, src_hw, dev->addr_len);
648 arp_ptr+=dev->addr_len;
649 memcpy(arp_ptr, &src_ip,4);
650 arp_ptr+=4;
651 if (target_hw != NULL)
652 memcpy(arp_ptr, target_hw, dev->addr_len);
653 else
654 memset(arp_ptr, 0, dev->addr_len);
655 arp_ptr+=dev->addr_len;
656 memcpy(arp_ptr, &dest_ip, 4);
657
658 return skb;
659
660out:
661 kfree_skb(skb);
662 return NULL;
663}
664
665/*
666 * Send an arp packet.
667 */
668void arp_xmit(struct sk_buff *skb)
669{
670 /* Send it off, maybe filter it using firewalling first. */
671 NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
672}
673
674/*
675 * Create and send an arp packet.
676 */
677void arp_send(int type, int ptype, u32 dest_ip,
678 struct net_device *dev, u32 src_ip,
679 unsigned char *dest_hw, unsigned char *src_hw,
680 unsigned char *target_hw)
681{
682 struct sk_buff *skb;
683
684 /*
685 * No arp on this interface.
686 */
687
688 if (dev->flags&IFF_NOARP)
689 return;
690
691 skb = arp_create(type, ptype, dest_ip, dev, src_ip,
692 dest_hw, src_hw, target_hw);
693 if (skb == NULL) {
694 return;
695 }
696
697 arp_xmit(skb);
698}
699
700static void parp_redo(struct sk_buff *skb)
701{
702 nf_reset(skb);
703 arp_rcv(skb, skb->dev, NULL);
704}
705
706/*
707 * Process an arp request.
708 */
709
710static int arp_process(struct sk_buff *skb)
711{
712 struct net_device *dev = skb->dev;
713 struct in_device *in_dev = in_dev_get(dev);
714 struct arphdr *arp;
715 unsigned char *arp_ptr;
716 struct rtable *rt;
717 unsigned char *sha, *tha;
718 u32 sip, tip;
719 u16 dev_type = dev->type;
720 int addr_type;
721 struct neighbour *n;
722
723 /* arp_rcv below verifies the ARP header and verifies the device
724 * is ARP'able.
725 */
726
727 if (in_dev == NULL)
728 goto out;
729
730 arp = skb->nh.arph;
731
732 switch (dev_type) {
733 default:
734 if (arp->ar_pro != htons(ETH_P_IP) ||
735 htons(dev_type) != arp->ar_hrd)
736 goto out;
737 break;
738#ifdef CONFIG_NET_ETHERNET
739 case ARPHRD_ETHER:
740#endif
741#ifdef CONFIG_TR
742 case ARPHRD_IEEE802_TR:
743#endif
744#ifdef CONFIG_FDDI
745 case ARPHRD_FDDI:
746#endif
747#ifdef CONFIG_NET_FC
748 case ARPHRD_IEEE802:
749#endif
750#if defined(CONFIG_NET_ETHERNET) || defined(CONFIG_TR) || \
751 defined(CONFIG_FDDI) || defined(CONFIG_NET_FC)
752 /*
 753	 * Ethernet, Token Ring and Fibre Channel (which are IEEE 802
 754	 * devices, according to RFC 2625) will accept ARP hardware
 755	 * types of either 1 (Ethernet) or 6 (IEEE 802.2).
 756	 * This is also the case for FDDI, where RFC 1390 says that
 757	 * FDDI devices should accept an ARP hardware type of 1 (Ethernet);
 758	 * however, to be more robust, we accept either 1 (Ethernet)
 759	 * or 6 (IEEE 802.2).
760 */
761 if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
762 arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
763 arp->ar_pro != htons(ETH_P_IP))
764 goto out;
765 break;
766#endif
767#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
768 case ARPHRD_AX25:
769 if (arp->ar_pro != htons(AX25_P_IP) ||
770 arp->ar_hrd != htons(ARPHRD_AX25))
771 goto out;
772 break;
773#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
774 case ARPHRD_NETROM:
775 if (arp->ar_pro != htons(AX25_P_IP) ||
776 arp->ar_hrd != htons(ARPHRD_NETROM))
777 goto out;
778 break;
779#endif
780#endif
781 }
782
783 /* Understand only these message types */
784
785 if (arp->ar_op != htons(ARPOP_REPLY) &&
786 arp->ar_op != htons(ARPOP_REQUEST))
787 goto out;
788
789/*
790 * Extract fields
791 */
792 arp_ptr= (unsigned char *)(arp+1);
793 sha = arp_ptr;
794 arp_ptr += dev->addr_len;
795 memcpy(&sip, arp_ptr, 4);
796 arp_ptr += 4;
797 tha = arp_ptr;
798 arp_ptr += dev->addr_len;
799 memcpy(&tip, arp_ptr, 4);
800/*
801 * Check for bad requests for 127.x.x.x and requests for multicast
802 * addresses. If this is one such, delete it.
803 */
804 if (LOOPBACK(tip) || MULTICAST(tip))
805 goto out;
806
807/*
808 * Special case: We must set Frame Relay source Q.922 address
809 */
810 if (dev_type == ARPHRD_DLCI)
811 sha = dev->broadcast;
812
813/*
814 * Process entry. The idea here is we want to send a reply if it is a
815 * request for us or if it is a request for someone else that we hold
816 * a proxy for. We want to add an entry to our cache if it is a reply
817 * to us or if it is a request for our address.
818 * (The assumption for this last is that if someone is requesting our
819 * address, they are probably intending to talk to us, so it saves time
820 * if we cache their address. Their address is also probably not in
821 * our cache, since ours is not in their cache.)
822 *
823 * Putting this another way, we only care about replies if they are to
824 * us, in which case we add them to the cache. For requests, we care
825 * about those for us and those for our proxies. We reply to both,
826 * and in the case of requests for us we add the requester to the arp
827 * cache.
828 */
829
830 /* Special case: IPv4 duplicate address detection packet (RFC2131) */
831 if (sip == 0) {
832 if (arp->ar_op == htons(ARPOP_REQUEST) &&
833 inet_addr_type(tip) == RTN_LOCAL &&
834 !arp_ignore(in_dev,dev,sip,tip))
835 arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr);
836 goto out;
837 }
838
839 if (arp->ar_op == htons(ARPOP_REQUEST) &&
840 ip_route_input(skb, tip, sip, 0, dev) == 0) {
841
842 rt = (struct rtable*)skb->dst;
843 addr_type = rt->rt_type;
844
845 if (addr_type == RTN_LOCAL) {
846 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
847 if (n) {
848 int dont_send = 0;
849
850 if (!dont_send)
851 dont_send |= arp_ignore(in_dev,dev,sip,tip);
852 if (!dont_send && IN_DEV_ARPFILTER(in_dev))
853 dont_send |= arp_filter(sip,tip,dev);
854 if (!dont_send)
855 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
856
857 neigh_release(n);
858 }
859 goto out;
860 } else if (IN_DEV_FORWARD(in_dev)) {
861 if ((rt->rt_flags&RTCF_DNAT) ||
862 (addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
863 (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) {
864 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
865 if (n)
866 neigh_release(n);
867
868 if (skb->stamp.tv_sec == LOCALLY_ENQUEUED ||
869 skb->pkt_type == PACKET_HOST ||
870 in_dev->arp_parms->proxy_delay == 0) {
871 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
872 } else {
873 pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
874 in_dev_put(in_dev);
875 return 0;
876 }
877 goto out;
878 }
879 }
880 }
881
882 /* Update our ARP tables */
883
884 n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
885
886#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP
887 /* Unsolicited ARP is not accepted by default.
 888	   It is possible that this option should be enabled for some
 889	   devices (strip is a candidate)
890 */
891 if (n == NULL &&
892 arp->ar_op == htons(ARPOP_REPLY) &&
893 inet_addr_type(sip) == RTN_UNICAST)
894 n = __neigh_lookup(&arp_tbl, &sip, dev, -1);
895#endif
896
897 if (n) {
898 int state = NUD_REACHABLE;
899 int override;
900
 901		/* If several different ARP replies follow back-to-back,
 902		   use the FIRST one.  This is possible if several proxy
 903		   agents are active.  Taking the first reply prevents
 904		   ARP thrashing and chooses the fastest router.
905 */
906 override = time_after(jiffies, n->updated + n->parms->locktime);
907
908 /* Broadcast replies and request packets
909 do not assert neighbour reachability.
910 */
911 if (arp->ar_op != htons(ARPOP_REPLY) ||
912 skb->pkt_type != PACKET_HOST)
913 state = NUD_STALE;
914 neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
915 neigh_release(n);
916 }
917
918out:
919 if (in_dev)
920 in_dev_put(in_dev);
921 kfree_skb(skb);
922 return 0;
923}
924
925
926/*
927 * Receive an arp request from the device layer.
928 */
929
930int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
931{
932 struct arphdr *arp;
933
934 /* ARP header, plus 2 device addresses, plus 2 IP addresses. */
935 if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
936 (2 * dev->addr_len) +
937 (2 * sizeof(u32)))))
938 goto freeskb;
939
940 arp = skb->nh.arph;
941 if (arp->ar_hln != dev->addr_len ||
942 dev->flags & IFF_NOARP ||
943 skb->pkt_type == PACKET_OTHERHOST ||
944 skb->pkt_type == PACKET_LOOPBACK ||
945 arp->ar_pln != 4)
946 goto freeskb;
947
948 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
949 goto out_of_mem;
950
951 return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
952
953freeskb:
954 kfree_skb(skb);
955out_of_mem:
956 return 0;
957}
958
959/*
960 * User level interface (ioctl)
961 */
962
963/*
964 * Set (create) an ARP cache entry.
965 */
966
967static int arp_req_set(struct arpreq *r, struct net_device * dev)
968{
969 u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
970 struct neighbour *neigh;
971 int err;
972
973 if (r->arp_flags&ATF_PUBL) {
974 u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr;
975 if (mask && mask != 0xFFFFFFFF)
976 return -EINVAL;
977 if (!dev && (r->arp_flags & ATF_COM)) {
978 dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data);
979 if (!dev)
980 return -ENODEV;
981 }
982 if (mask) {
983 if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL)
984 return -ENOBUFS;
985 return 0;
986 }
987 if (dev == NULL) {
988 ipv4_devconf.proxy_arp = 1;
989 return 0;
990 }
991 if (__in_dev_get(dev)) {
992 __in_dev_get(dev)->cnf.proxy_arp = 1;
993 return 0;
994 }
995 return -ENXIO;
996 }
997
998 if (r->arp_flags & ATF_PERM)
999 r->arp_flags |= ATF_COM;
1000 if (dev == NULL) {
1001 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
1002 .tos = RTO_ONLINK } } };
1003 struct rtable * rt;
1004 if ((err = ip_route_output_key(&rt, &fl)) != 0)
1005 return err;
1006 dev = rt->u.dst.dev;
1007 ip_rt_put(rt);
1008 if (!dev)
1009 return -EINVAL;
1010 }
1011 switch (dev->type) {
1012#ifdef CONFIG_FDDI
1013 case ARPHRD_FDDI:
1014 /*
1015 * According to RFC 1390, FDDI devices should accept ARP
1016 * hardware types of 1 (Ethernet). However, to be more
1017 * robust, we'll accept hardware types of either 1 (Ethernet)
1018 * or 6 (IEEE 802.2).
1019 */
1020 if (r->arp_ha.sa_family != ARPHRD_FDDI &&
1021 r->arp_ha.sa_family != ARPHRD_ETHER &&
1022 r->arp_ha.sa_family != ARPHRD_IEEE802)
1023 return -EINVAL;
1024 break;
1025#endif
1026 default:
1027 if (r->arp_ha.sa_family != dev->type)
1028 return -EINVAL;
1029 break;
1030 }
1031
1032 neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
1033 err = PTR_ERR(neigh);
1034 if (!IS_ERR(neigh)) {
1035 unsigned state = NUD_STALE;
1036 if (r->arp_flags & ATF_PERM)
1037 state = NUD_PERMANENT;
1038 err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
1039 r->arp_ha.sa_data : NULL, state,
1040 NEIGH_UPDATE_F_OVERRIDE|
1041 NEIGH_UPDATE_F_ADMIN);
1042 neigh_release(neigh);
1043 }
1044 return err;
1045}
1046
1047static unsigned arp_state_to_flags(struct neighbour *neigh)
1048{
1049 unsigned flags = 0;
1050 if (neigh->nud_state&NUD_PERMANENT)
1051 flags = ATF_PERM|ATF_COM;
1052 else if (neigh->nud_state&NUD_VALID)
1053 flags = ATF_COM;
1054 return flags;
1055}
1056
1057/*
1058 * Get an ARP cache entry.
1059 */
1060
1061static int arp_req_get(struct arpreq *r, struct net_device *dev)
1062{
1063 u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
1064 struct neighbour *neigh;
1065 int err = -ENXIO;
1066
1067 neigh = neigh_lookup(&arp_tbl, &ip, dev);
1068 if (neigh) {
1069 read_lock_bh(&neigh->lock);
1070 memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
1071 r->arp_flags = arp_state_to_flags(neigh);
1072 read_unlock_bh(&neigh->lock);
1073 r->arp_ha.sa_family = dev->type;
1074 strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
1075 neigh_release(neigh);
1076 err = 0;
1077 }
1078 return err;
1079}
1080
1081static int arp_req_delete(struct arpreq *r, struct net_device * dev)
1082{
1083 int err;
1084 u32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
1085 struct neighbour *neigh;
1086
1087 if (r->arp_flags & ATF_PUBL) {
1088 u32 mask =
1089 ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
1090 if (mask == 0xFFFFFFFF)
1091 return pneigh_delete(&arp_tbl, &ip, dev);
1092 if (mask == 0) {
1093 if (dev == NULL) {
1094 ipv4_devconf.proxy_arp = 0;
1095 return 0;
1096 }
1097 if (__in_dev_get(dev)) {
1098 __in_dev_get(dev)->cnf.proxy_arp = 0;
1099 return 0;
1100 }
1101 return -ENXIO;
1102 }
1103 return -EINVAL;
1104 }
1105
1106 if (dev == NULL) {
1107 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
1108 .tos = RTO_ONLINK } } };
1109 struct rtable * rt;
1110 if ((err = ip_route_output_key(&rt, &fl)) != 0)
1111 return err;
1112 dev = rt->u.dst.dev;
1113 ip_rt_put(rt);
1114 if (!dev)
1115 return -EINVAL;
1116 }
1117 err = -ENXIO;
1118 neigh = neigh_lookup(&arp_tbl, &ip, dev);
1119 if (neigh) {
1120 if (neigh->nud_state&~NUD_NOARP)
1121 err = neigh_update(neigh, NULL, NUD_FAILED,
1122 NEIGH_UPDATE_F_OVERRIDE|
1123 NEIGH_UPDATE_F_ADMIN);
1124 neigh_release(neigh);
1125 }
1126 return err;
1127}
1128
1129/*
1130 * Handle an ARP layer I/O control request.
1131 */
1132
1133int arp_ioctl(unsigned int cmd, void __user *arg)
1134{
1135 int err;
1136 struct arpreq r;
1137 struct net_device *dev = NULL;
1138
1139 switch (cmd) {
1140 case SIOCDARP:
1141 case SIOCSARP:
1142 if (!capable(CAP_NET_ADMIN))
1143 return -EPERM;
1144 case SIOCGARP:
1145 err = copy_from_user(&r, arg, sizeof(struct arpreq));
1146 if (err)
1147 return -EFAULT;
1148 break;
1149 default:
1150 return -EINVAL;
1151 }
1152
1153 if (r.arp_pa.sa_family != AF_INET)
1154 return -EPFNOSUPPORT;
1155
1156 if (!(r.arp_flags & ATF_PUBL) &&
1157 (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
1158 return -EINVAL;
1159 if (!(r.arp_flags & ATF_NETMASK))
1160 ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
1161 htonl(0xFFFFFFFFUL);
1162 rtnl_lock();
1163 if (r.arp_dev[0]) {
1164 err = -ENODEV;
1165 if ((dev = __dev_get_by_name(r.arp_dev)) == NULL)
1166 goto out;
1167
1168 /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
1169 if (!r.arp_ha.sa_family)
1170 r.arp_ha.sa_family = dev->type;
1171 err = -EINVAL;
1172 if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
1173 goto out;
1174 } else if (cmd == SIOCGARP) {
1175 err = -ENODEV;
1176 goto out;
1177 }
1178
1179 switch(cmd) {
1180 case SIOCDARP:
1181 err = arp_req_delete(&r, dev);
1182 break;
1183 case SIOCSARP:
1184 err = arp_req_set(&r, dev);
1185 break;
1186 case SIOCGARP:
1187 err = arp_req_get(&r, dev);
1188 if (!err && copy_to_user(arg, &r, sizeof(r)))
1189 err = -EFAULT;
1190 break;
1191 }
1192out:
1193 rtnl_unlock();
1194 return err;
1195}
1196
1197static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1198{
1199 struct net_device *dev = ptr;
1200
1201 switch (event) {
1202 case NETDEV_CHANGEADDR:
1203 neigh_changeaddr(&arp_tbl, dev);
1204 rt_cache_flush(0);
1205 break;
1206 default:
1207 break;
1208 }
1209
1210 return NOTIFY_DONE;
1211}
1212
1213static struct notifier_block arp_netdev_notifier = {
1214 .notifier_call = arp_netdev_event,
1215};
1216
1217/* Note that this is not on the notifier chain.
1218   It is necessary that this routine be called after the route cache
1219   has been flushed.
1220 */
1221void arp_ifdown(struct net_device *dev)
1222{
1223 neigh_ifdown(&arp_tbl, dev);
1224}
1225
1226
1227/*
1228 * Called once on startup.
1229 */
1230
1231static struct packet_type arp_packet_type = {
1232 .type = __constant_htons(ETH_P_ARP),
1233 .func = arp_rcv,
1234};
1235
1236static int arp_proc_init(void);
1237
1238void __init arp_init(void)
1239{
1240 neigh_table_init(&arp_tbl);
1241
1242 dev_add_pack(&arp_packet_type);
1243 arp_proc_init();
1244#ifdef CONFIG_SYSCTL
1245 neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4,
1246 NET_IPV4_NEIGH, "ipv4", NULL, NULL);
1247#endif
1248 register_netdevice_notifier(&arp_netdev_notifier);
1249}
1250
1251#ifdef CONFIG_PROC_FS
1252#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1253
1254/* ------------------------------------------------------------------------ */
1255/*
1256 * ax25 -> ASCII conversion
1257 */
1258static char *ax2asc2(ax25_address *a, char *buf)
1259{
1260 char c, *s;
1261 int n;
1262
1263 for (n = 0, s = buf; n < 6; n++) {
1264 c = (a->ax25_call[n] >> 1) & 0x7F;
1265
1266 if (c != ' ') *s++ = c;
1267 }
1268
1269 *s++ = '-';
1270
1271 if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
1272 *s++ = '1';
1273 n -= 10;
1274 }
1275
1276 *s++ = n + '0';
1277 *s++ = '\0';
1278
1279 if (*buf == '\0' || *buf == '-')
1280 return "*";
1281
1282 return buf;
1283
1284}
1285#endif /* CONFIG_AX25 */
1286
1287#define HBUFFERLEN 30
1288
1289static void arp_format_neigh_entry(struct seq_file *seq,
1290 struct neighbour *n)
1291{
1292 char hbuffer[HBUFFERLEN];
1293 const char hexbuf[] = "0123456789ABCDEF";
1294 int k, j;
1295 char tbuf[16];
1296 struct net_device *dev = n->dev;
1297 int hatype = dev->type;
1298
1299 read_lock(&n->lock);
1300 /* Convert hardware address to XX:XX:XX:XX ... form. */
1301#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1302 if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
1303 ax2asc2((ax25_address *)n->ha, hbuffer);
1304 else {
1305#endif
1306 for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
1307 hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
1308 hbuffer[k++] = hexbuf[n->ha[j] & 15];
1309 hbuffer[k++] = ':';
1310 }
1311 hbuffer[--k] = 0;
1312#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1313 }
1314#endif
1315 sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
1316 seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
1317 tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
1318 read_unlock(&n->lock);
1319}
1320
1321static void arp_format_pneigh_entry(struct seq_file *seq,
1322 struct pneigh_entry *n)
1323{
1324 struct net_device *dev = n->dev;
1325 int hatype = dev ? dev->type : 0;
1326 char tbuf[16];
1327
1328 sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->key));
1329 seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
1330 tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
1331 dev ? dev->name : "*");
1332}
1333
1334static int arp_seq_show(struct seq_file *seq, void *v)
1335{
1336 if (v == SEQ_START_TOKEN) {
1337 seq_puts(seq, "IP address HW type Flags "
1338 "HW address Mask Device\n");
1339 } else {
1340 struct neigh_seq_state *state = seq->private;
1341
1342 if (state->flags & NEIGH_SEQ_IS_PNEIGH)
1343 arp_format_pneigh_entry(seq, v);
1344 else
1345 arp_format_neigh_entry(seq, v);
1346 }
1347
1348 return 0;
1349}
1350
1351static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
1352{
1353 /* Don't want to confuse "arp -a" w/ magic entries,
1354 * so we tell the generic iterator to skip NUD_NOARP.
1355 */
1356 return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
1357}
1358
1359/* ------------------------------------------------------------------------ */
1360
1361static struct seq_operations arp_seq_ops = {
1362 .start = arp_seq_start,
1363 .next = neigh_seq_next,
1364 .stop = neigh_seq_stop,
1365 .show = arp_seq_show,
1366};
1367
1368static int arp_seq_open(struct inode *inode, struct file *file)
1369{
1370 struct seq_file *seq;
1371 int rc = -ENOMEM;
1372 struct neigh_seq_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
1373
1374 if (!s)
1375 goto out;
1376
1377 memset(s, 0, sizeof(*s));
1378 rc = seq_open(file, &arp_seq_ops);
1379 if (rc)
1380 goto out_kfree;
1381
1382 seq = file->private_data;
1383 seq->private = s;
1384out:
1385 return rc;
1386out_kfree:
1387 kfree(s);
1388 goto out;
1389}
1390
1391static struct file_operations arp_seq_fops = {
1392 .owner = THIS_MODULE,
1393 .open = arp_seq_open,
1394 .read = seq_read,
1395 .llseek = seq_lseek,
1396 .release = seq_release_private,
1397};
1398
1399static int __init arp_proc_init(void)
1400{
1401 if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops))
1402 return -ENOMEM;
1403 return 0;
1404}
1405
1406#else /* CONFIG_PROC_FS */
1407
1408static int __init arp_proc_init(void)
1409{
1410 return 0;
1411}
1412
1413#endif /* CONFIG_PROC_FS */
1414
1415EXPORT_SYMBOL(arp_broken_ops);
1416EXPORT_SYMBOL(arp_find);
1417EXPORT_SYMBOL(arp_rcv);
1418EXPORT_SYMBOL(arp_create);
1419EXPORT_SYMBOL(arp_xmit);
1420EXPORT_SYMBOL(arp_send);
1421EXPORT_SYMBOL(arp_tbl);
1422
1423#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1424EXPORT_SYMBOL(clip_tbl_hook);
1425#endif
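
arp_send() above is the whole transmit-side API: arp_create() builds the frame and arp_xmit() pushes it through the NF_ARP_OUT hook. As a usage sketch only, a helper announcing a device's own address with a gratuitous ARP request might look like this (gratuitous_arp_send() is an illustrative name, not something defined in this file):

static void gratuitous_arp_send(struct net_device *dev, u32 addr)
{
	/* Sender and target IP are both 'addr'; a NULL destination
	 * hardware address makes arp_create() broadcast the frame. */
	arp_send(ARPOP_REQUEST, ETH_P_ARP, addr, dev, addr,
		 NULL, dev->dev_addr, NULL);
}
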
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
new file mode 100644
index 000000000000..b1db561f2542
--- /dev/null
+++ b/net/ipv4/datagram.c
@@ -0,0 +1,73 @@
1/*
2 * common UDP/RAW code
3 * Linux INET implementation
4 *
5 * Authors:
6 * Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/config.h>
15#include <linux/types.h>
16#include <linux/module.h>
17#include <linux/ip.h>
18#include <linux/in.h>
19#include <net/sock.h>
20#include <net/tcp.h>
21#include <net/route.h>
22
23int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
24{
25 struct inet_sock *inet = inet_sk(sk);
26 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
27 struct rtable *rt;
28 u32 saddr;
29 int oif;
30 int err;
31
32
33 if (addr_len < sizeof(*usin))
34 return -EINVAL;
35
36 if (usin->sin_family != AF_INET)
37 return -EAFNOSUPPORT;
38
39 sk_dst_reset(sk);
40
41 oif = sk->sk_bound_dev_if;
42 saddr = inet->saddr;
43 if (MULTICAST(usin->sin_addr.s_addr)) {
44 if (!oif)
45 oif = inet->mc_index;
46 if (!saddr)
47 saddr = inet->mc_addr;
48 }
49 err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
50 RT_CONN_FLAGS(sk), oif,
51 sk->sk_protocol,
52 inet->sport, usin->sin_port, sk);
53 if (err)
54 return err;
55 if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
56 ip_rt_put(rt);
57 return -EACCES;
58 }
59 if (!inet->saddr)
60 inet->saddr = rt->rt_src; /* Update source address */
61 if (!inet->rcv_saddr)
62 inet->rcv_saddr = rt->rt_src;
63 inet->daddr = rt->rt_dst;
64 inet->dport = usin->sin_port;
65 sk->sk_state = TCP_ESTABLISHED;
66 inet->id = jiffies;
67
68 sk_dst_set(sk, &rt->u.dst);
69 return(0);
70}
71
72EXPORT_SYMBOL(ip4_datagram_connect);
73
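ip4_datagram_connect() is what services connect(2) on UDP and raw IPv4 sockets: it resolves the route once, records source, destination and port in the inet_sock, and caches the route with sk_dst_set(), so later sends can reuse that state. A minimal user-space sketch of the path it serves (the address and port are documentation placeholders):

#include <arpa/inet.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9);	/* discard service, placeholder */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

	/* For SOCK_DGRAM this ends up in the kernel's ip4_datagram_connect(). */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) == 0)
		send(fd, "ping", 4, 0);	/* no per-call destination needed now */

	close(fd);
	return 0;
}
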
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
new file mode 100644
index 000000000000..eea7ef010776
--- /dev/null
+++ b/net/ipv4/devinet.c
@@ -0,0 +1,1508 @@
1/*
2 * NET3 IP device support routines.
3 *
4 * Version: $Id: devinet.c,v 1.44 2001/10/31 21:55:54 davem Exp $
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Derived from the IP parts of dev.c 1.0.19
12 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
13 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 * Mark Evans, <evansmp@uhura.aston.ac.uk>
15 *
16 * Additional Authors:
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
19 *
20 * Changes:
21 * Alexey Kuznetsov: pa_* fields are replaced with ifaddr
22 * lists.
23 * Cyrus Durgin: updated for kmod
24 * Matthias Andree: in devinet_ioctl, compare label and
25 * address (4.4BSD alias style support),
26 * fall back to comparing just the label
27 * if no match found.
28 */
29
30#include <linux/config.h>
31
32#include <asm/uaccess.h>
33#include <asm/system.h>
34#include <linux/bitops.h>
35#include <linux/module.h>
36#include <linux/types.h>
37#include <linux/kernel.h>
38#include <linux/sched.h>
39#include <linux/string.h>
40#include <linux/mm.h>
41#include <linux/socket.h>
42#include <linux/sockios.h>
43#include <linux/in.h>
44#include <linux/errno.h>
45#include <linux/interrupt.h>
46#include <linux/if_ether.h>
47#include <linux/inet.h>
48#include <linux/netdevice.h>
49#include <linux/etherdevice.h>
50#include <linux/skbuff.h>
51#include <linux/rtnetlink.h>
52#include <linux/init.h>
53#include <linux/notifier.h>
54#include <linux/inetdevice.h>
55#include <linux/igmp.h>
56#ifdef CONFIG_SYSCTL
57#include <linux/sysctl.h>
58#endif
59#include <linux/kmod.h>
60
61#include <net/ip.h>
62#include <net/route.h>
63#include <net/ip_fib.h>
64
65struct ipv4_devconf ipv4_devconf = {
66 .accept_redirects = 1,
67 .send_redirects = 1,
68 .secure_redirects = 1,
69 .shared_media = 1,
70};
71
72static struct ipv4_devconf ipv4_devconf_dflt = {
73 .accept_redirects = 1,
74 .send_redirects = 1,
75 .secure_redirects = 1,
76 .shared_media = 1,
77 .accept_source_route = 1,
78};
79
80static void rtmsg_ifa(int event, struct in_ifaddr *);
81
82static struct notifier_block *inetaddr_chain;
83static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
84 int destroy);
85#ifdef CONFIG_SYSCTL
86static void devinet_sysctl_register(struct in_device *in_dev,
87 struct ipv4_devconf *p);
88static void devinet_sysctl_unregister(struct ipv4_devconf *p);
89#endif
90
91/* Locks all the inet devices. */
92
93static struct in_ifaddr *inet_alloc_ifa(void)
94{
95 struct in_ifaddr *ifa = kmalloc(sizeof(*ifa), GFP_KERNEL);
96
97 if (ifa) {
98 memset(ifa, 0, sizeof(*ifa));
99 INIT_RCU_HEAD(&ifa->rcu_head);
100 }
101
102 return ifa;
103}
104
105static void inet_rcu_free_ifa(struct rcu_head *head)
106{
107 struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
108 if (ifa->ifa_dev)
109 in_dev_put(ifa->ifa_dev);
110 kfree(ifa);
111}
112
113static inline void inet_free_ifa(struct in_ifaddr *ifa)
114{
115 call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
116}
117
118void in_dev_finish_destroy(struct in_device *idev)
119{
120 struct net_device *dev = idev->dev;
121
122 BUG_TRAP(!idev->ifa_list);
123 BUG_TRAP(!idev->mc_list);
124#ifdef NET_REFCNT_DEBUG
125 printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
126 idev, dev ? dev->name : "NIL");
127#endif
128 dev_put(dev);
129 if (!idev->dead)
130 printk("Freeing alive in_device %p\n", idev);
131 else {
132 kfree(idev);
133 }
134}
135
136struct in_device *inetdev_init(struct net_device *dev)
137{
138 struct in_device *in_dev;
139
140 ASSERT_RTNL();
141
142 in_dev = kmalloc(sizeof(*in_dev), GFP_KERNEL);
143 if (!in_dev)
144 goto out;
145 memset(in_dev, 0, sizeof(*in_dev));
146 INIT_RCU_HEAD(&in_dev->rcu_head);
147 memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf));
148 in_dev->cnf.sysctl = NULL;
149 in_dev->dev = dev;
150 if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
151 goto out_kfree;
152 /* Reference in_dev->dev */
153 dev_hold(dev);
154#ifdef CONFIG_SYSCTL
155 neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
156 NET_IPV4_NEIGH, "ipv4", NULL, NULL);
157#endif
158
159 /* Account for reference dev->ip_ptr */
160 in_dev_hold(in_dev);
161 rcu_assign_pointer(dev->ip_ptr, in_dev);
162
163#ifdef CONFIG_SYSCTL
164 devinet_sysctl_register(in_dev, &in_dev->cnf);
165#endif
166 ip_mc_init_dev(in_dev);
167 if (dev->flags & IFF_UP)
168 ip_mc_up(in_dev);
169out:
170 return in_dev;
171out_kfree:
172 kfree(in_dev);
173 in_dev = NULL;
174 goto out;
175}
176
177static void in_dev_rcu_put(struct rcu_head *head)
178{
179 struct in_device *idev = container_of(head, struct in_device, rcu_head);
180 in_dev_put(idev);
181}
182
183static void inetdev_destroy(struct in_device *in_dev)
184{
185 struct in_ifaddr *ifa;
186 struct net_device *dev;
187
188 ASSERT_RTNL();
189
190 dev = in_dev->dev;
191 if (dev == &loopback_dev)
192 return;
193
194 in_dev->dead = 1;
195
196 ip_mc_destroy_dev(in_dev);
197
198 while ((ifa = in_dev->ifa_list) != NULL) {
199 inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
200 inet_free_ifa(ifa);
201 }
202
203#ifdef CONFIG_SYSCTL
204 devinet_sysctl_unregister(&in_dev->cnf);
205#endif
206
207 dev->ip_ptr = NULL;
208
209#ifdef CONFIG_SYSCTL
210 neigh_sysctl_unregister(in_dev->arp_parms);
211#endif
212 neigh_parms_release(&arp_tbl, in_dev->arp_parms);
213 arp_ifdown(dev);
214
215 call_rcu(&in_dev->rcu_head, in_dev_rcu_put);
216}
217
218int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
219{
220 rcu_read_lock();
221 for_primary_ifa(in_dev) {
222 if (inet_ifa_match(a, ifa)) {
223 if (!b || inet_ifa_match(b, ifa)) {
224 rcu_read_unlock();
225 return 1;
226 }
227 }
228 } endfor_ifa(in_dev);
229 rcu_read_unlock();
230 return 0;
231}
232
233static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
234 int destroy)
235{
236 struct in_ifaddr *ifa1 = *ifap;
237
238 ASSERT_RTNL();
239
 240	/* 1. Deleting the primary ifaddr forces deletion of all secondaries */
241
242 if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
243 struct in_ifaddr *ifa;
244 struct in_ifaddr **ifap1 = &ifa1->ifa_next;
245
246 while ((ifa = *ifap1) != NULL) {
247 if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
248 ifa1->ifa_mask != ifa->ifa_mask ||
249 !inet_ifa_match(ifa1->ifa_address, ifa)) {
250 ifap1 = &ifa->ifa_next;
251 continue;
252 }
253
254 *ifap1 = ifa->ifa_next;
255
256 rtmsg_ifa(RTM_DELADDR, ifa);
257 notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
258 inet_free_ifa(ifa);
259 }
260 }
261
262 /* 2. Unlink it */
263
264 *ifap = ifa1->ifa_next;
265
266 /* 3. Announce address deletion */
267
268 /* Send message first, then call notifier.
 269	   At first sight, the FIB update triggered by the notifier
 270	   will refer to an already deleted ifaddr, which could confuse
 271	   netlink listeners. It is not true: look, gated sees
 272	   that the route was deleted and, if it still thinks the ifaddr
 273	   is valid, it will try to restore the deleted routes... Grr.
 274	   So this order is correct.
275 */
276 rtmsg_ifa(RTM_DELADDR, ifa1);
277 notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
278 if (destroy) {
279 inet_free_ifa(ifa1);
280
281 if (!in_dev->ifa_list)
282 inetdev_destroy(in_dev);
283 }
284}
285
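/* Link an ifaddr into its device's list, deciding whether it is a primary
 * or a secondary address.  Illustrative example of the rules below: with
 * 192.168.1.1/24 already configured on the device, adding 192.168.1.2/24
 * matches the existing mask and subnet, so the new address is flagged
 * IFA_F_SECONDARY; re-adding 192.168.1.1/24 itself returns -EEXIST, and
 * adding it with a different scope returns -EINVAL.
 */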
286static int inet_insert_ifa(struct in_ifaddr *ifa)
287{
288 struct in_device *in_dev = ifa->ifa_dev;
289 struct in_ifaddr *ifa1, **ifap, **last_primary;
290
291 ASSERT_RTNL();
292
293 if (!ifa->ifa_local) {
294 inet_free_ifa(ifa);
295 return 0;
296 }
297
298 ifa->ifa_flags &= ~IFA_F_SECONDARY;
299 last_primary = &in_dev->ifa_list;
300
301 for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
302 ifap = &ifa1->ifa_next) {
303 if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
304 ifa->ifa_scope <= ifa1->ifa_scope)
305 last_primary = &ifa1->ifa_next;
306 if (ifa1->ifa_mask == ifa->ifa_mask &&
307 inet_ifa_match(ifa1->ifa_address, ifa)) {
308 if (ifa1->ifa_local == ifa->ifa_local) {
309 inet_free_ifa(ifa);
310 return -EEXIST;
311 }
312 if (ifa1->ifa_scope != ifa->ifa_scope) {
313 inet_free_ifa(ifa);
314 return -EINVAL;
315 }
316 ifa->ifa_flags |= IFA_F_SECONDARY;
317 }
318 }
319
320 if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
321 net_srandom(ifa->ifa_local);
322 ifap = last_primary;
323 }
324
325 ifa->ifa_next = *ifap;
326 *ifap = ifa;
327
328	/* Send message first, then call notifier.
329	   The notifier will trigger a FIB update, so that
330	   netlink listeners will know about the new ifaddr */
331 rtmsg_ifa(RTM_NEWADDR, ifa);
332 notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
333
334 return 0;
335}
336
337static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
338{
339 struct in_device *in_dev = __in_dev_get(dev);
340
341 ASSERT_RTNL();
342
343 if (!in_dev) {
344 in_dev = inetdev_init(dev);
345 if (!in_dev) {
346 inet_free_ifa(ifa);
347 return -ENOBUFS;
348 }
349 }
350 if (ifa->ifa_dev != in_dev) {
351 BUG_TRAP(!ifa->ifa_dev);
352 in_dev_hold(in_dev);
353 ifa->ifa_dev = in_dev;
354 }
355 if (LOOPBACK(ifa->ifa_local))
356 ifa->ifa_scope = RT_SCOPE_HOST;
357 return inet_insert_ifa(ifa);
358}
359
360struct in_device *inetdev_by_index(int ifindex)
361{
362 struct net_device *dev;
363 struct in_device *in_dev = NULL;
364 read_lock(&dev_base_lock);
365 dev = __dev_get_by_index(ifindex);
366 if (dev)
367 in_dev = in_dev_get(dev);
368 read_unlock(&dev_base_lock);
369 return in_dev;
370}
371
372/* Called only from RTNL semaphored context. No locks. */
373
374struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix,
375 u32 mask)
376{
377 ASSERT_RTNL();
378
379 for_primary_ifa(in_dev) {
380 if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
381 return ifa;
382 } endfor_ifa(in_dev);
383 return NULL;
384}
385
386static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
387{
388 struct rtattr **rta = arg;
389 struct in_device *in_dev;
390 struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
391 struct in_ifaddr *ifa, **ifap;
392
393 ASSERT_RTNL();
394
395 if ((in_dev = inetdev_by_index(ifm->ifa_index)) == NULL)
396 goto out;
397 __in_dev_put(in_dev);
398
399 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
400 ifap = &ifa->ifa_next) {
401 if ((rta[IFA_LOCAL - 1] &&
402 memcmp(RTA_DATA(rta[IFA_LOCAL - 1]),
403 &ifa->ifa_local, 4)) ||
404 (rta[IFA_LABEL - 1] &&
405 rtattr_strcmp(rta[IFA_LABEL - 1], ifa->ifa_label)) ||
406 (rta[IFA_ADDRESS - 1] &&
407 (ifm->ifa_prefixlen != ifa->ifa_prefixlen ||
408 !inet_ifa_match(*(u32*)RTA_DATA(rta[IFA_ADDRESS - 1]),
409 ifa))))
410 continue;
411 inet_del_ifa(in_dev, ifap, 1);
412 return 0;
413 }
414out:
415 return -EADDRNOTAVAIL;
416}
417
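/* rtnetlink RTM_NEWADDR handler.  As an illustration (assuming an
 * iproute2-style caller, which is not part of this file), a command such as
 * "ip addr add 192.168.1.2/24 dev eth0" arrives here as an ifaddrmsg with
 * ifa_prefixlen == 24 and an IFA_LOCAL attribute; when no IFA_ADDRESS
 * (peer) attribute is supplied, the code below reuses IFA_LOCAL for it.
 */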
418static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
419{
420 struct rtattr **rta = arg;
421 struct net_device *dev;
422 struct in_device *in_dev;
423 struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
424 struct in_ifaddr *ifa;
425 int rc = -EINVAL;
426
427 ASSERT_RTNL();
428
429 if (ifm->ifa_prefixlen > 32 || !rta[IFA_LOCAL - 1])
430 goto out;
431
432 rc = -ENODEV;
433 if ((dev = __dev_get_by_index(ifm->ifa_index)) == NULL)
434 goto out;
435
436 rc = -ENOBUFS;
437 if ((in_dev = __in_dev_get(dev)) == NULL) {
438 in_dev = inetdev_init(dev);
439 if (!in_dev)
440 goto out;
441 }
442
443 if ((ifa = inet_alloc_ifa()) == NULL)
444 goto out;
445
446 if (!rta[IFA_ADDRESS - 1])
447 rta[IFA_ADDRESS - 1] = rta[IFA_LOCAL - 1];
448 memcpy(&ifa->ifa_local, RTA_DATA(rta[IFA_LOCAL - 1]), 4);
449 memcpy(&ifa->ifa_address, RTA_DATA(rta[IFA_ADDRESS - 1]), 4);
450 ifa->ifa_prefixlen = ifm->ifa_prefixlen;
451 ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
452 if (rta[IFA_BROADCAST - 1])
453 memcpy(&ifa->ifa_broadcast,
454 RTA_DATA(rta[IFA_BROADCAST - 1]), 4);
455 if (rta[IFA_ANYCAST - 1])
456 memcpy(&ifa->ifa_anycast, RTA_DATA(rta[IFA_ANYCAST - 1]), 4);
457 ifa->ifa_flags = ifm->ifa_flags;
458 ifa->ifa_scope = ifm->ifa_scope;
459 in_dev_hold(in_dev);
460 ifa->ifa_dev = in_dev;
461 if (rta[IFA_LABEL - 1])
462 rtattr_strlcpy(ifa->ifa_label, rta[IFA_LABEL - 1], IFNAMSIZ);
463 else
464 memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
465
466 rc = inet_insert_ifa(ifa);
467out:
468 return rc;
469}
470
471/*
472 * Determine a default network mask, based on the IP address.
473 */
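/* Illustrative values under the classful rules: 10.1.2.3 is class A, so
 * inet_abc_len() returns 8; 172.16.0.1 is class B -> 16; 192.168.1.1 is
 * class C -> 24; 0.0.0.0 (ZERONET) -> 0; a multicast address such as
 * 224.0.0.1 falls through to the default of -1.
 */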
474
475static __inline__ int inet_abc_len(u32 addr)
476{
477 int rc = -1; /* Something else, probably a multicast. */
478
479 if (ZERONET(addr))
480 rc = 0;
481 else {
482 addr = ntohl(addr);
483
484 if (IN_CLASSA(addr))
485 rc = 8;
486 else if (IN_CLASSB(addr))
487 rc = 16;
488 else if (IN_CLASSC(addr))
489 rc = 24;
490 }
491
492 return rc;
493}
494
495
496int devinet_ioctl(unsigned int cmd, void __user *arg)
497{
498 struct ifreq ifr;
499 struct sockaddr_in sin_orig;
500 struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
501 struct in_device *in_dev;
502 struct in_ifaddr **ifap = NULL;
503 struct in_ifaddr *ifa = NULL;
504 struct net_device *dev;
505 char *colon;
506 int ret = -EFAULT;
507 int tryaddrmatch = 0;
508
509 /*
510 * Fetch the caller's info block into kernel space
511 */
512
513 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
514 goto out;
515 ifr.ifr_name[IFNAMSIZ - 1] = 0;
516
517 /* save original address for comparison */
518 memcpy(&sin_orig, sin, sizeof(*sin));
519
520 colon = strchr(ifr.ifr_name, ':');
521 if (colon)
522 *colon = 0;
523
524#ifdef CONFIG_KMOD
525 dev_load(ifr.ifr_name);
526#endif
527
528 switch(cmd) {
529 case SIOCGIFADDR: /* Get interface address */
530 case SIOCGIFBRDADDR: /* Get the broadcast address */
531 case SIOCGIFDSTADDR: /* Get the destination address */
532 case SIOCGIFNETMASK: /* Get the netmask for the interface */
533 /* Note that these ioctls will not sleep,
534 so that we do not impose a lock.
535 One day we will be forced to put shlock here (I mean SMP)
536 */
537 tryaddrmatch = (sin_orig.sin_family == AF_INET);
538 memset(sin, 0, sizeof(*sin));
539 sin->sin_family = AF_INET;
540 break;
541
542 case SIOCSIFFLAGS:
543 ret = -EACCES;
544 if (!capable(CAP_NET_ADMIN))
545 goto out;
546 break;
547 case SIOCSIFADDR: /* Set interface address (and family) */
548 case SIOCSIFBRDADDR: /* Set the broadcast address */
549 case SIOCSIFDSTADDR: /* Set the destination address */
550 case SIOCSIFNETMASK: /* Set the netmask for the interface */
551 ret = -EACCES;
552 if (!capable(CAP_NET_ADMIN))
553 goto out;
554 ret = -EINVAL;
555 if (sin->sin_family != AF_INET)
556 goto out;
557 break;
558 default:
559 ret = -EINVAL;
560 goto out;
561 }
562
563 rtnl_lock();
564
565 ret = -ENODEV;
566 if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL)
567 goto done;
568
569 if (colon)
570 *colon = ':';
571
572 if ((in_dev = __in_dev_get(dev)) != NULL) {
573 if (tryaddrmatch) {
574 /* Matthias Andree */
575 /* compare label and address (4.4BSD style) */
576 /* note: we only do this for a limited set of ioctls
577 and only if the original address family was AF_INET.
578 This is checked above. */
579 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
580 ifap = &ifa->ifa_next) {
581 if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
582 sin_orig.sin_addr.s_addr ==
583 ifa->ifa_address) {
584 break; /* found */
585 }
586 }
587 }
588 /* we didn't get a match, maybe the application is
589 4.3BSD-style and passed in junk so we fall back to
590 comparing just the label */
591 if (!ifa) {
592 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
593 ifap = &ifa->ifa_next)
594 if (!strcmp(ifr.ifr_name, ifa->ifa_label))
595 break;
596 }
597 }
598
599 ret = -EADDRNOTAVAIL;
600 if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
601 goto done;
602
603 switch(cmd) {
604 case SIOCGIFADDR: /* Get interface address */
605 sin->sin_addr.s_addr = ifa->ifa_local;
606 goto rarok;
607
608 case SIOCGIFBRDADDR: /* Get the broadcast address */
609 sin->sin_addr.s_addr = ifa->ifa_broadcast;
610 goto rarok;
611
612 case SIOCGIFDSTADDR: /* Get the destination address */
613 sin->sin_addr.s_addr = ifa->ifa_address;
614 goto rarok;
615
616 case SIOCGIFNETMASK: /* Get the netmask for the interface */
617 sin->sin_addr.s_addr = ifa->ifa_mask;
618 goto rarok;
619
620 case SIOCSIFFLAGS:
621 if (colon) {
622 ret = -EADDRNOTAVAIL;
623 if (!ifa)
624 break;
625 ret = 0;
626 if (!(ifr.ifr_flags & IFF_UP))
627 inet_del_ifa(in_dev, ifap, 1);
628 break;
629 }
630 ret = dev_change_flags(dev, ifr.ifr_flags);
631 break;
632
633 case SIOCSIFADDR: /* Set interface address (and family) */
634 ret = -EINVAL;
635 if (inet_abc_len(sin->sin_addr.s_addr) < 0)
636 break;
637
638 if (!ifa) {
639 ret = -ENOBUFS;
640 if ((ifa = inet_alloc_ifa()) == NULL)
641 break;
642 if (colon)
643 memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
644 else
645 memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
646 } else {
647 ret = 0;
648 if (ifa->ifa_local == sin->sin_addr.s_addr)
649 break;
650 inet_del_ifa(in_dev, ifap, 0);
651 ifa->ifa_broadcast = 0;
652 ifa->ifa_anycast = 0;
653 }
654
655 ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;
656
657 if (!(dev->flags & IFF_POINTOPOINT)) {
658 ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
659 ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
660 if ((dev->flags & IFF_BROADCAST) &&
661 ifa->ifa_prefixlen < 31)
662 ifa->ifa_broadcast = ifa->ifa_address |
663 ~ifa->ifa_mask;
664 } else {
665 ifa->ifa_prefixlen = 32;
666 ifa->ifa_mask = inet_make_mask(32);
667 }
668 ret = inet_set_ifa(dev, ifa);
669 break;
670
671 case SIOCSIFBRDADDR: /* Set the broadcast address */
672 ret = 0;
673 if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
674 inet_del_ifa(in_dev, ifap, 0);
675 ifa->ifa_broadcast = sin->sin_addr.s_addr;
676 inet_insert_ifa(ifa);
677 }
678 break;
679
680 case SIOCSIFDSTADDR: /* Set the destination address */
681 ret = 0;
682 if (ifa->ifa_address == sin->sin_addr.s_addr)
683 break;
684 ret = -EINVAL;
685 if (inet_abc_len(sin->sin_addr.s_addr) < 0)
686 break;
687 ret = 0;
688 inet_del_ifa(in_dev, ifap, 0);
689 ifa->ifa_address = sin->sin_addr.s_addr;
690 inet_insert_ifa(ifa);
691 break;
692
693 case SIOCSIFNETMASK: /* Set the netmask for the interface */
694
695 /*
696 * The mask we set must be legal.
697 */
698 ret = -EINVAL;
699 if (bad_mask(sin->sin_addr.s_addr, 0))
700 break;
701 ret = 0;
702 if (ifa->ifa_mask != sin->sin_addr.s_addr) {
703 inet_del_ifa(in_dev, ifap, 0);
704 ifa->ifa_mask = sin->sin_addr.s_addr;
705 ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
706
707			/* If the current broadcast address matches
708			 * the current netmask, recalculate
709			 * the broadcast address. Otherwise it's a
710			 * funny address, so don't touch it since
711			 * the user seems to know what (s)he's doing...
712			 */
713 if ((dev->flags & IFF_BROADCAST) &&
714 (ifa->ifa_prefixlen < 31) &&
715 (ifa->ifa_broadcast ==
716 (ifa->ifa_local|~ifa->ifa_mask))) {
717 ifa->ifa_broadcast = (ifa->ifa_local |
718 ~sin->sin_addr.s_addr);
719 }
720 inet_insert_ifa(ifa);
721 }
722 break;
723 }
724done:
725 rtnl_unlock();
726out:
727 return ret;
728rarok:
729 rtnl_unlock();
730 ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
731 goto out;
732}
733
734static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
735{
736 struct in_device *in_dev = __in_dev_get(dev);
737 struct in_ifaddr *ifa;
738 struct ifreq ifr;
739 int done = 0;
740
741 if (!in_dev || (ifa = in_dev->ifa_list) == NULL)
742 goto out;
743
744 for (; ifa; ifa = ifa->ifa_next) {
745 if (!buf) {
746 done += sizeof(ifr);
747 continue;
748 }
749 if (len < (int) sizeof(ifr))
750 break;
751 memset(&ifr, 0, sizeof(struct ifreq));
752 if (ifa->ifa_label)
753 strcpy(ifr.ifr_name, ifa->ifa_label);
754 else
755 strcpy(ifr.ifr_name, dev->name);
756
757 (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
758 (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
759 ifa->ifa_local;
760
761 if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) {
762 done = -EFAULT;
763 break;
764 }
765 buf += sizeof(struct ifreq);
766 len -= sizeof(struct ifreq);
767 done += sizeof(struct ifreq);
768 }
769out:
770 return done;
771}
772
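/* Pick a source address for talking to dst: prefer a primary address on
 * dev whose scope is acceptable and whose subnet contains dst, fall back to
 * any acceptable primary on dev, and finally scan the other devices for a
 * non-link-scope address.  E.g. with 192.168.1.1/24 on eth0,
 * inet_select_addr(eth0, 192.168.1.42, RT_SCOPE_UNIVERSE) returns
 * 192.168.1.1 (illustrative addresses).
 */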
773u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope)
774{
775 u32 addr = 0;
776 struct in_device *in_dev;
777
778 rcu_read_lock();
779 in_dev = __in_dev_get(dev);
780 if (!in_dev)
781 goto no_in_dev;
782
783 for_primary_ifa(in_dev) {
784 if (ifa->ifa_scope > scope)
785 continue;
786 if (!dst || inet_ifa_match(dst, ifa)) {
787 addr = ifa->ifa_local;
788 break;
789 }
790 if (!addr)
791 addr = ifa->ifa_local;
792 } endfor_ifa(in_dev);
793no_in_dev:
794 rcu_read_unlock();
795
796 if (addr)
797 goto out;
798
799	/* Non-loopback addresses on loopback should be preferred
800	   in this case. It is important that lo is the first interface
801	   in the dev_base list.
802	 */
803 read_lock(&dev_base_lock);
804 rcu_read_lock();
805 for (dev = dev_base; dev; dev = dev->next) {
806 if ((in_dev = __in_dev_get(dev)) == NULL)
807 continue;
808
809 for_primary_ifa(in_dev) {
810 if (ifa->ifa_scope != RT_SCOPE_LINK &&
811 ifa->ifa_scope <= scope) {
812 addr = ifa->ifa_local;
813 goto out_unlock_both;
814 }
815 } endfor_ifa(in_dev);
816 }
817out_unlock_both:
818 read_unlock(&dev_base_lock);
819 rcu_read_unlock();
820out:
821 return addr;
822}
823
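/* Helper for inet_confirm_addr() below: scan one device's address list and
 * return a confirmed local address for the (dst, local, scope) wildcards,
 * or 0 if this device cannot confirm the combination.  Roughly: when
 * 'local' is non-zero it is returned if it is one of the device's own
 * addresses and the wildcards match; when it is zero an address of
 * acceptable scope is auto-selected, preferably inside the dst subnet.
 */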
824static u32 confirm_addr_indev(struct in_device *in_dev, u32 dst,
825 u32 local, int scope)
826{
827 int same = 0;
828 u32 addr = 0;
829
830 for_ifa(in_dev) {
831 if (!addr &&
832 (local == ifa->ifa_local || !local) &&
833 ifa->ifa_scope <= scope) {
834 addr = ifa->ifa_local;
835 if (same)
836 break;
837 }
838 if (!same) {
839 same = (!local || inet_ifa_match(local, ifa)) &&
840 (!dst || inet_ifa_match(dst, ifa));
841 if (same && addr) {
842 if (local || !dst)
843 break;
844				/* Is the selected addr in the dst subnet? */
845 if (inet_ifa_match(addr, ifa))
846 break;
847 /* No, then can we use new local src? */
848 if (ifa->ifa_scope <= scope) {
849 addr = ifa->ifa_local;
850 break;
851 }
852 /* search for large dst subnet for addr */
853 same = 0;
854 }
855 }
856 } endfor_ifa(in_dev);
857
858	return same ? addr : 0;
859}
860
861/*
862 * Confirm that local IP address exists using wildcards:
863 * - dev: only on this interface, 0=any interface
864 * - dst: only in the same subnet as dst, 0=any dst
865 * - local: address, 0=autoselect the local address
866 * - scope: maximum allowed scope value for the local address
867 */
868u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scope)
869{
870 u32 addr = 0;
871 struct in_device *in_dev;
872
873 if (dev) {
874 rcu_read_lock();
875 if ((in_dev = __in_dev_get(dev)))
876 addr = confirm_addr_indev(in_dev, dst, local, scope);
877 rcu_read_unlock();
878
879 return addr;
880 }
881
882 read_lock(&dev_base_lock);
883 rcu_read_lock();
884 for (dev = dev_base; dev; dev = dev->next) {
885 if ((in_dev = __in_dev_get(dev))) {
886 addr = confirm_addr_indev(in_dev, dst, local, scope);
887 if (addr)
888 break;
889 }
890 }
891 rcu_read_unlock();
892 read_unlock(&dev_base_lock);
893
894 return addr;
895}
896
897/*
898 * Device notifier
899 */
900
901int register_inetaddr_notifier(struct notifier_block *nb)
902{
903 return notifier_chain_register(&inetaddr_chain, nb);
904}
905
906int unregister_inetaddr_notifier(struct notifier_block *nb)
907{
908 return notifier_chain_unregister(&inetaddr_chain, nb);
909}
910
911/* Rename ifa_labels for a device name change. Make some effort to preserve existing
912 * alias numbering and to create unique labels if possible.
913*/
914static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
915{
916 struct in_ifaddr *ifa;
917 int named = 0;
918
919 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
920 char old[IFNAMSIZ], *dot;
921
922 memcpy(old, ifa->ifa_label, IFNAMSIZ);
923 memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
924 if (named++ == 0)
925 continue;
926 dot = strchr(ifa->ifa_label, ':');
927 if (dot == NULL) {
928 sprintf(old, ":%d", named);
929 dot = old;
930 }
931 if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) {
932 strcat(ifa->ifa_label, dot);
933 } else {
934 strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
935 }
936 }
937}
938
939/* Called only under RTNL semaphore */
940
941static int inetdev_event(struct notifier_block *this, unsigned long event,
942 void *ptr)
943{
944 struct net_device *dev = ptr;
945 struct in_device *in_dev = __in_dev_get(dev);
946
947 ASSERT_RTNL();
948
949 if (!in_dev) {
950 if (event == NETDEV_REGISTER && dev == &loopback_dev) {
951 in_dev = inetdev_init(dev);
952 if (!in_dev)
953 panic("devinet: Failed to create loopback\n");
954 in_dev->cnf.no_xfrm = 1;
955 in_dev->cnf.no_policy = 1;
956 }
957 goto out;
958 }
959
960 switch (event) {
961 case NETDEV_REGISTER:
962 printk(KERN_DEBUG "inetdev_event: bug\n");
963 dev->ip_ptr = NULL;
964 break;
965 case NETDEV_UP:
966 if (dev->mtu < 68)
967 break;
968 if (dev == &loopback_dev) {
969 struct in_ifaddr *ifa;
970 if ((ifa = inet_alloc_ifa()) != NULL) {
971 ifa->ifa_local =
972 ifa->ifa_address = htonl(INADDR_LOOPBACK);
973 ifa->ifa_prefixlen = 8;
974 ifa->ifa_mask = inet_make_mask(8);
975 in_dev_hold(in_dev);
976 ifa->ifa_dev = in_dev;
977 ifa->ifa_scope = RT_SCOPE_HOST;
978 memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
979 inet_insert_ifa(ifa);
980 }
981 }
982 ip_mc_up(in_dev);
983 break;
984 case NETDEV_DOWN:
985 ip_mc_down(in_dev);
986 break;
987 case NETDEV_CHANGEMTU:
988 if (dev->mtu >= 68)
989 break;
990		/* MTU fell below 68, disable IP */
991 case NETDEV_UNREGISTER:
992 inetdev_destroy(in_dev);
993 break;
994 case NETDEV_CHANGENAME:
995		/* Do not notify about label change; this event is
996		 * not interesting to applications using netlink.
997		 */
998 inetdev_changename(dev, in_dev);
999
1000#ifdef CONFIG_SYSCTL
1001 devinet_sysctl_unregister(&in_dev->cnf);
1002 neigh_sysctl_unregister(in_dev->arp_parms);
1003 neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
1004 NET_IPV4_NEIGH, "ipv4", NULL, NULL);
1005 devinet_sysctl_register(in_dev, &in_dev->cnf);
1006#endif
1007 break;
1008 }
1009out:
1010 return NOTIFY_DONE;
1011}
1012
1013static struct notifier_block ip_netdev_notifier = {
1014	.notifier_call = inetdev_event,
1015};
1016
1017static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1018 u32 pid, u32 seq, int event)
1019{
1020 struct ifaddrmsg *ifm;
1021 struct nlmsghdr *nlh;
1022 unsigned char *b = skb->tail;
1023
1024 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
1025 if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
1026 ifm = NLMSG_DATA(nlh);
1027 ifm->ifa_family = AF_INET;
1028 ifm->ifa_prefixlen = ifa->ifa_prefixlen;
1029 ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
1030 ifm->ifa_scope = ifa->ifa_scope;
1031 ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
1032 if (ifa->ifa_address)
1033 RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address);
1034 if (ifa->ifa_local)
1035 RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local);
1036 if (ifa->ifa_broadcast)
1037 RTA_PUT(skb, IFA_BROADCAST, 4, &ifa->ifa_broadcast);
1038 if (ifa->ifa_anycast)
1039 RTA_PUT(skb, IFA_ANYCAST, 4, &ifa->ifa_anycast);
1040 if (ifa->ifa_label[0])
1041 RTA_PUT(skb, IFA_LABEL, IFNAMSIZ, &ifa->ifa_label);
1042 nlh->nlmsg_len = skb->tail - b;
1043 return skb->len;
1044
1045nlmsg_failure:
1046rtattr_failure:
1047 skb_trim(skb, b - skb->data);
1048 return -1;
1049}
1050
1051static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
1052{
1053 int idx, ip_idx;
1054 struct net_device *dev;
1055 struct in_device *in_dev;
1056 struct in_ifaddr *ifa;
1057 int s_ip_idx, s_idx = cb->args[0];
1058
1059 s_ip_idx = ip_idx = cb->args[1];
1060 read_lock(&dev_base_lock);
1061 for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
1062 if (idx < s_idx)
1063 continue;
1064 if (idx > s_idx)
1065 s_ip_idx = 0;
1066 rcu_read_lock();
1067 if ((in_dev = __in_dev_get(dev)) == NULL) {
1068 rcu_read_unlock();
1069 continue;
1070 }
1071
1072 for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
1073 ifa = ifa->ifa_next, ip_idx++) {
1074 if (ip_idx < s_ip_idx)
1075 continue;
1076 if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
1077 cb->nlh->nlmsg_seq,
1078 RTM_NEWADDR) <= 0) {
1079 rcu_read_unlock();
1080 goto done;
1081 }
1082 }
1083 rcu_read_unlock();
1084 }
1085
1086done:
1087 read_unlock(&dev_base_lock);
1088 cb->args[0] = idx;
1089 cb->args[1] = ip_idx;
1090
1091 return skb->len;
1092}
1093
1094static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
1095{
1096 int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + 128);
1097 struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
1098
1099 if (!skb)
1100 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
1101 else if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
1102 kfree_skb(skb);
1103 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
1104 } else {
1105 NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR;
1106 netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
1107 }
1108}
1109
1110static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX - RTM_BASE + 1] = {
1111 [4] = { .doit = inet_rtm_newaddr, },
1112 [5] = { .doit = inet_rtm_deladdr, },
1113 [6] = { .dumpit = inet_dump_ifaddr, },
1114 [8] = { .doit = inet_rtm_newroute, },
1115 [9] = { .doit = inet_rtm_delroute, },
1116 [10] = { .doit = inet_rtm_getroute, .dumpit = inet_dump_fib, },
1117#ifdef CONFIG_IP_MULTIPLE_TABLES
1118 [16] = { .doit = inet_rtm_newrule, },
1119 [17] = { .doit = inet_rtm_delrule, },
1120 [18] = { .dumpit = inet_dump_rules, },
1121#endif
1122};
1123
1124#ifdef CONFIG_SYSCTL
1125
1126void inet_forward_change(void)
1127{
1128 struct net_device *dev;
1129 int on = ipv4_devconf.forwarding;
1130
1131 ipv4_devconf.accept_redirects = !on;
1132 ipv4_devconf_dflt.forwarding = on;
1133
1134 read_lock(&dev_base_lock);
1135 for (dev = dev_base; dev; dev = dev->next) {
1136 struct in_device *in_dev;
1137 rcu_read_lock();
1138 in_dev = __in_dev_get(dev);
1139 if (in_dev)
1140 in_dev->cnf.forwarding = on;
1141 rcu_read_unlock();
1142 }
1143 read_unlock(&dev_base_lock);
1144
1145 rt_cache_flush(0);
1146}
1147
1148static int devinet_sysctl_forward(ctl_table *ctl, int write,
1149 struct file* filp, void __user *buffer,
1150 size_t *lenp, loff_t *ppos)
1151{
1152 int *valp = ctl->data;
1153 int val = *valp;
1154 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
1155
1156 if (write && *valp != val) {
1157 if (valp == &ipv4_devconf.forwarding)
1158 inet_forward_change();
1159 else if (valp != &ipv4_devconf_dflt.forwarding)
1160 rt_cache_flush(0);
1161 }
1162
1163 return ret;
1164}
1165
1166int ipv4_doint_and_flush(ctl_table *ctl, int write,
1167 struct file* filp, void __user *buffer,
1168 size_t *lenp, loff_t *ppos)
1169{
1170 int *valp = ctl->data;
1171 int val = *valp;
1172 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
1173
1174 if (write && *valp != val)
1175 rt_cache_flush(0);
1176
1177 return ret;
1178}
1179
1180int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
1181 void __user *oldval, size_t __user *oldlenp,
1182 void __user *newval, size_t newlen,
1183 void **context)
1184{
1185 int *valp = table->data;
1186 int new;
1187
1188 if (!newval || !newlen)
1189 return 0;
1190
1191 if (newlen != sizeof(int))
1192 return -EINVAL;
1193
1194 if (get_user(new, (int __user *)newval))
1195 return -EFAULT;
1196
1197 if (new == *valp)
1198 return 0;
1199
1200 if (oldval && oldlenp) {
1201 size_t len;
1202
1203 if (get_user(len, oldlenp))
1204 return -EFAULT;
1205
1206 if (len) {
1207 if (len > table->maxlen)
1208 len = table->maxlen;
1209 if (copy_to_user(oldval, valp, len))
1210 return -EFAULT;
1211 if (put_user(len, oldlenp))
1212 return -EFAULT;
1213 }
1214 }
1215
1216 *valp = new;
1217 rt_cache_flush(0);
1218 return 1;
1219}
1220
1221
1222static struct devinet_sysctl_table {
1223 struct ctl_table_header *sysctl_header;
1224 ctl_table devinet_vars[__NET_IPV4_CONF_MAX];
1225 ctl_table devinet_dev[2];
1226 ctl_table devinet_conf_dir[2];
1227 ctl_table devinet_proto_dir[2];
1228 ctl_table devinet_root_dir[2];
1229} devinet_sysctl = {
1230 .devinet_vars = {
1231 {
1232 .ctl_name = NET_IPV4_CONF_FORWARDING,
1233 .procname = "forwarding",
1234 .data = &ipv4_devconf.forwarding,
1235 .maxlen = sizeof(int),
1236 .mode = 0644,
1237 .proc_handler = &devinet_sysctl_forward,
1238 },
1239 {
1240 .ctl_name = NET_IPV4_CONF_MC_FORWARDING,
1241 .procname = "mc_forwarding",
1242 .data = &ipv4_devconf.mc_forwarding,
1243 .maxlen = sizeof(int),
1244 .mode = 0444,
1245 .proc_handler = &proc_dointvec,
1246 },
1247 {
1248 .ctl_name = NET_IPV4_CONF_ACCEPT_REDIRECTS,
1249 .procname = "accept_redirects",
1250 .data = &ipv4_devconf.accept_redirects,
1251 .maxlen = sizeof(int),
1252 .mode = 0644,
1253 .proc_handler = &proc_dointvec,
1254 },
1255 {
1256 .ctl_name = NET_IPV4_CONF_SECURE_REDIRECTS,
1257 .procname = "secure_redirects",
1258 .data = &ipv4_devconf.secure_redirects,
1259 .maxlen = sizeof(int),
1260 .mode = 0644,
1261 .proc_handler = &proc_dointvec,
1262 },
1263 {
1264 .ctl_name = NET_IPV4_CONF_SHARED_MEDIA,
1265 .procname = "shared_media",
1266 .data = &ipv4_devconf.shared_media,
1267 .maxlen = sizeof(int),
1268 .mode = 0644,
1269 .proc_handler = &proc_dointvec,
1270 },
1271 {
1272 .ctl_name = NET_IPV4_CONF_RP_FILTER,
1273 .procname = "rp_filter",
1274 .data = &ipv4_devconf.rp_filter,
1275 .maxlen = sizeof(int),
1276 .mode = 0644,
1277 .proc_handler = &proc_dointvec,
1278 },
1279 {
1280 .ctl_name = NET_IPV4_CONF_SEND_REDIRECTS,
1281 .procname = "send_redirects",
1282 .data = &ipv4_devconf.send_redirects,
1283 .maxlen = sizeof(int),
1284 .mode = 0644,
1285 .proc_handler = &proc_dointvec,
1286 },
1287 {
1288 .ctl_name = NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE,
1289 .procname = "accept_source_route",
1290 .data = &ipv4_devconf.accept_source_route,
1291 .maxlen = sizeof(int),
1292 .mode = 0644,
1293 .proc_handler = &proc_dointvec,
1294 },
1295 {
1296 .ctl_name = NET_IPV4_CONF_PROXY_ARP,
1297 .procname = "proxy_arp",
1298 .data = &ipv4_devconf.proxy_arp,
1299 .maxlen = sizeof(int),
1300 .mode = 0644,
1301 .proc_handler = &proc_dointvec,
1302 },
1303 {
1304 .ctl_name = NET_IPV4_CONF_MEDIUM_ID,
1305 .procname = "medium_id",
1306 .data = &ipv4_devconf.medium_id,
1307 .maxlen = sizeof(int),
1308 .mode = 0644,
1309 .proc_handler = &proc_dointvec,
1310 },
1311 {
1312 .ctl_name = NET_IPV4_CONF_BOOTP_RELAY,
1313 .procname = "bootp_relay",
1314 .data = &ipv4_devconf.bootp_relay,
1315 .maxlen = sizeof(int),
1316 .mode = 0644,
1317 .proc_handler = &proc_dointvec,
1318 },
1319 {
1320 .ctl_name = NET_IPV4_CONF_LOG_MARTIANS,
1321 .procname = "log_martians",
1322 .data = &ipv4_devconf.log_martians,
1323 .maxlen = sizeof(int),
1324 .mode = 0644,
1325 .proc_handler = &proc_dointvec,
1326 },
1327 {
1328 .ctl_name = NET_IPV4_CONF_TAG,
1329 .procname = "tag",
1330 .data = &ipv4_devconf.tag,
1331 .maxlen = sizeof(int),
1332 .mode = 0644,
1333 .proc_handler = &proc_dointvec,
1334 },
1335 {
1336 .ctl_name = NET_IPV4_CONF_ARPFILTER,
1337 .procname = "arp_filter",
1338 .data = &ipv4_devconf.arp_filter,
1339 .maxlen = sizeof(int),
1340 .mode = 0644,
1341 .proc_handler = &proc_dointvec,
1342 },
1343 {
1344 .ctl_name = NET_IPV4_CONF_ARP_ANNOUNCE,
1345 .procname = "arp_announce",
1346 .data = &ipv4_devconf.arp_announce,
1347 .maxlen = sizeof(int),
1348 .mode = 0644,
1349 .proc_handler = &proc_dointvec,
1350 },
1351 {
1352 .ctl_name = NET_IPV4_CONF_ARP_IGNORE,
1353 .procname = "arp_ignore",
1354 .data = &ipv4_devconf.arp_ignore,
1355 .maxlen = sizeof(int),
1356 .mode = 0644,
1357 .proc_handler = &proc_dointvec,
1358 },
1359 {
1360 .ctl_name = NET_IPV4_CONF_NOXFRM,
1361 .procname = "disable_xfrm",
1362 .data = &ipv4_devconf.no_xfrm,
1363 .maxlen = sizeof(int),
1364 .mode = 0644,
1365 .proc_handler = &ipv4_doint_and_flush,
1366 .strategy = &ipv4_doint_and_flush_strategy,
1367 },
1368 {
1369 .ctl_name = NET_IPV4_CONF_NOPOLICY,
1370 .procname = "disable_policy",
1371 .data = &ipv4_devconf.no_policy,
1372 .maxlen = sizeof(int),
1373 .mode = 0644,
1374 .proc_handler = &ipv4_doint_and_flush,
1375 .strategy = &ipv4_doint_and_flush_strategy,
1376 },
1377 {
1378 .ctl_name = NET_IPV4_CONF_FORCE_IGMP_VERSION,
1379 .procname = "force_igmp_version",
1380 .data = &ipv4_devconf.force_igmp_version,
1381 .maxlen = sizeof(int),
1382 .mode = 0644,
1383 .proc_handler = &ipv4_doint_and_flush,
1384 .strategy = &ipv4_doint_and_flush_strategy,
1385 },
1386 },
1387 .devinet_dev = {
1388 {
1389 .ctl_name = NET_PROTO_CONF_ALL,
1390 .procname = "all",
1391 .mode = 0555,
1392 .child = devinet_sysctl.devinet_vars,
1393 },
1394 },
1395 .devinet_conf_dir = {
1396 {
1397 .ctl_name = NET_IPV4_CONF,
1398 .procname = "conf",
1399 .mode = 0555,
1400 .child = devinet_sysctl.devinet_dev,
1401 },
1402 },
1403 .devinet_proto_dir = {
1404 {
1405 .ctl_name = NET_IPV4,
1406 .procname = "ipv4",
1407 .mode = 0555,
1408 .child = devinet_sysctl.devinet_conf_dir,
1409 },
1410 },
1411 .devinet_root_dir = {
1412 {
1413 .ctl_name = CTL_NET,
1414 .procname = "net",
1415 .mode = 0555,
1416 .child = devinet_sysctl.devinet_proto_dir,
1417 },
1418 },
1419};
1420
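/* Instantiate a per-device copy of the template above.  The resulting
 * hierarchy is net/ipv4/conf/(device name)/..., so writing to e.g.
 * /proc/sys/net/ipv4/conf/eth0/rp_filter (illustrative path) updates the
 * in_dev->cnf copy for eth0, while the NULL-device call from devinet_init()
 * creates the .../conf/default/ subtree backed by ipv4_devconf_dflt.
 */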
1421static void devinet_sysctl_register(struct in_device *in_dev,
1422 struct ipv4_devconf *p)
1423{
1424 int i;
1425 struct net_device *dev = in_dev ? in_dev->dev : NULL;
1426 struct devinet_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
1427 char *dev_name = NULL;
1428
1429 if (!t)
1430 return;
1431 memcpy(t, &devinet_sysctl, sizeof(*t));
1432 for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
1433 t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
1434 t->devinet_vars[i].de = NULL;
1435 }
1436
1437 if (dev) {
1438 dev_name = dev->name;
1439 t->devinet_dev[0].ctl_name = dev->ifindex;
1440 } else {
1441 dev_name = "default";
1442 t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT;
1443 }
1444
1445 /*
1446 * Make a copy of dev_name, because '.procname' is regarded as const
1447 * by sysctl and we wouldn't want anyone to change it under our feet
1448 * (see SIOCSIFNAME).
1449 */
1450 dev_name = net_sysctl_strdup(dev_name);
1451 if (!dev_name)
1452 goto free;
1453
1454 t->devinet_dev[0].procname = dev_name;
1455 t->devinet_dev[0].child = t->devinet_vars;
1456 t->devinet_dev[0].de = NULL;
1457 t->devinet_conf_dir[0].child = t->devinet_dev;
1458 t->devinet_conf_dir[0].de = NULL;
1459 t->devinet_proto_dir[0].child = t->devinet_conf_dir;
1460 t->devinet_proto_dir[0].de = NULL;
1461 t->devinet_root_dir[0].child = t->devinet_proto_dir;
1462 t->devinet_root_dir[0].de = NULL;
1463
1464 t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0);
1465 if (!t->sysctl_header)
1466 goto free_procname;
1467
1468 p->sysctl = t;
1469 return;
1470
1471 /* error path */
1472 free_procname:
1473 kfree(dev_name);
1474 free:
1475 kfree(t);
1476 return;
1477}
1478
1479static void devinet_sysctl_unregister(struct ipv4_devconf *p)
1480{
1481 if (p->sysctl) {
1482 struct devinet_sysctl_table *t = p->sysctl;
1483 p->sysctl = NULL;
1484 unregister_sysctl_table(t->sysctl_header);
1485 kfree(t->devinet_dev[0].procname);
1486 kfree(t);
1487 }
1488}
1489#endif
1490
1491void __init devinet_init(void)
1492{
1493 register_gifconf(PF_INET, inet_gifconf);
1494 register_netdevice_notifier(&ip_netdev_notifier);
1495 rtnetlink_links[PF_INET] = inet_rtnetlink_table;
1496#ifdef CONFIG_SYSCTL
1497 devinet_sysctl.sysctl_header =
1498 register_sysctl_table(devinet_sysctl.devinet_root_dir, 0);
1499 devinet_sysctl_register(NULL, &ipv4_devconf_dflt);
1500#endif
1501}
1502
1503EXPORT_SYMBOL(devinet_ioctl);
1504EXPORT_SYMBOL(in_dev_finish_destroy);
1505EXPORT_SYMBOL(inet_select_addr);
1506EXPORT_SYMBOL(inetdev_by_index);
1507EXPORT_SYMBOL(register_inetaddr_notifier);
1508EXPORT_SYMBOL(unregister_inetaddr_notifier);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
new file mode 100644
index 000000000000..053a883247ba
--- /dev/null
+++ b/net/ipv4/esp4.c
@@ -0,0 +1,510 @@
1#include <linux/config.h>
2#include <linux/module.h>
3#include <net/ip.h>
4#include <net/xfrm.h>
5#include <net/esp.h>
6#include <asm/scatterlist.h>
7#include <linux/crypto.h>
8#include <linux/pfkeyv2.h>
9#include <linux/random.h>
10#include <net/icmp.h>
11#include <net/udp.h>
12
13/* decapsulation data for use when post-processing */
14struct esp_decap_data {
15 xfrm_address_t saddr;
16 __u16 sport;
17 __u8 proto;
18};
19
20static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
21{
22 int err;
23 struct iphdr *top_iph;
24 struct ip_esp_hdr *esph;
25 struct crypto_tfm *tfm;
26 struct esp_data *esp;
27 struct sk_buff *trailer;
28 int blksize;
29 int clen;
30 int alen;
31 int nfrags;
32
33 /* Strip IP+ESP header. */
34 __skb_pull(skb, skb->h.raw - skb->data);
35 /* Now skb is pure payload to encrypt */
36
37 err = -ENOMEM;
38
39 /* Round to block size */
40 clen = skb->len;
41
42 esp = x->data;
43 alen = esp->auth.icv_trunc_len;
44 tfm = esp->conf.tfm;
45 blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3;
46 clen = (clen + 2 + blksize-1)&~(blksize-1);
47 if (esp->conf.padlen)
48 clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1);
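	/* Worked example of the rounding above (illustrative values, assuming
	 * a 16-byte cipher block and conf.padlen == 0): for a 100-byte
	 * payload, clen = (100 + 2 + 15) & ~15 = 112, so the trailer filled
	 * in below holds 10 self-describing pad bytes (1..10), a pad-length
	 * byte of 10 and the next-header byte, i.e. 12 bytes in total, with
	 * the ICV appended after encryption.
	 */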
49
50 if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0)
51 goto error;
52
53 /* Fill padding... */
54 do {
55 int i;
56 for (i=0; i<clen-skb->len - 2; i++)
57 *(u8*)(trailer->tail + i) = i+1;
58 } while (0);
59 *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2;
60 pskb_put(skb, trailer, clen - skb->len);
61
62 __skb_push(skb, skb->data - skb->nh.raw);
63 top_iph = skb->nh.iph;
64 esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4);
65 top_iph->tot_len = htons(skb->len + alen);
66 *(u8*)(trailer->tail - 1) = top_iph->protocol;
67
68 /* this is non-NULL only with UDP Encapsulation */
69 if (x->encap) {
70 struct xfrm_encap_tmpl *encap = x->encap;
71 struct udphdr *uh;
72 u32 *udpdata32;
73
74 uh = (struct udphdr *)esph;
75 uh->source = encap->encap_sport;
76 uh->dest = encap->encap_dport;
77 uh->len = htons(skb->len + alen - top_iph->ihl*4);
78 uh->check = 0;
79
80 switch (encap->encap_type) {
81 default:
82 case UDP_ENCAP_ESPINUDP:
83 esph = (struct ip_esp_hdr *)(uh + 1);
84 break;
85 case UDP_ENCAP_ESPINUDP_NON_IKE:
86 udpdata32 = (u32 *)(uh + 1);
87 udpdata32[0] = udpdata32[1] = 0;
88 esph = (struct ip_esp_hdr *)(udpdata32 + 2);
89 break;
90 }
91
92 top_iph->protocol = IPPROTO_UDP;
93 } else
94 top_iph->protocol = IPPROTO_ESP;
95
96 esph->spi = x->id.spi;
97 esph->seq_no = htonl(++x->replay.oseq);
98
99 if (esp->conf.ivlen)
100 crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
101
102 do {
103 struct scatterlist *sg = &esp->sgbuf[0];
104
105 if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
106 sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
107 if (!sg)
108 goto error;
109 }
110 skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen);
111 crypto_cipher_encrypt(tfm, sg, sg, clen);
112 if (unlikely(sg != &esp->sgbuf[0]))
113 kfree(sg);
114 } while (0);
115
116 if (esp->conf.ivlen) {
117 memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
118 crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
119 }
120
121 if (esp->auth.icv_full_len) {
122 esp->auth.icv(esp, skb, (u8*)esph-skb->data,
123 sizeof(struct ip_esp_hdr) + esp->conf.ivlen+clen, trailer->tail);
124 pskb_put(skb, trailer, alen);
125 }
126
127 ip_send_check(top_iph);
128
129 err = 0;
130
131error:
132 return err;
133}
134
135/*
136 * Note: detecting truncated vs. non-truncated authentication data is very
137 * expensive, so we only support truncated data, which is the recommended
138 * and common case.
139 */
140static int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
141{
142 struct iphdr *iph;
143 struct ip_esp_hdr *esph;
144 struct esp_data *esp = x->data;
145 struct sk_buff *trailer;
146 int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
147 int alen = esp->auth.icv_trunc_len;
148 int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
149 int nfrags;
150 int encap_len = 0;
151
152 if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr)))
153 goto out;
154
155 if (elen <= 0 || (elen & (blksize-1)))
156 goto out;
157
158 /* If integrity check is required, do this. */
159 if (esp->auth.icv_full_len) {
160 u8 sum[esp->auth.icv_full_len];
161 u8 sum1[alen];
162
163 esp->auth.icv(esp, skb, 0, skb->len-alen, sum);
164
165 if (skb_copy_bits(skb, skb->len-alen, sum1, alen))
166 BUG();
167
168 if (unlikely(memcmp(sum, sum1, alen))) {
169 x->stats.integrity_failed++;
170 goto out;
171 }
172 }
173
174 if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0)
175 goto out;
176
177 skb->ip_summed = CHECKSUM_NONE;
178
179 esph = (struct ip_esp_hdr*)skb->data;
180 iph = skb->nh.iph;
181
182	/* Get ivec. This can be wrong; check against other implementations. */
183 if (esp->conf.ivlen)
184 crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm));
185
186 {
187 u8 nexthdr[2];
188 struct scatterlist *sg = &esp->sgbuf[0];
189 u8 workbuf[60];
190 int padlen;
191
192 if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
193 sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
194 if (!sg)
195 goto out;
196 }
197 skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen);
198 crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen);
199 if (unlikely(sg != &esp->sgbuf[0]))
200 kfree(sg);
201
202 if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
203 BUG();
204
205 padlen = nexthdr[0];
206 if (padlen+2 >= elen)
207 goto out;
208
209 /* ... check padding bits here. Silly. :-) */
210
211 if (x->encap && decap && decap->decap_type) {
212 struct esp_decap_data *encap_data;
213 struct udphdr *uh = (struct udphdr *) (iph+1);
214
215 encap_data = (struct esp_decap_data *) (decap->decap_data);
216 encap_data->proto = 0;
217
218 switch (decap->decap_type) {
219 case UDP_ENCAP_ESPINUDP:
220 case UDP_ENCAP_ESPINUDP_NON_IKE:
221 encap_data->proto = AF_INET;
222 encap_data->saddr.a4 = iph->saddr;
223 encap_data->sport = uh->source;
224 encap_len = (void*)esph - (void*)uh;
225 break;
226
227 default:
228 goto out;
229 }
230 }
231
232 iph->protocol = nexthdr[1];
233 pskb_trim(skb, skb->len - alen - padlen - 2);
234 memcpy(workbuf, skb->nh.raw, iph->ihl*4);
235 skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen);
236 skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
237 memcpy(skb->nh.raw, workbuf, iph->ihl*4);
238 skb->nh.iph->tot_len = htons(skb->len);
239 }
240
241 return 0;
242
243out:
244 return -EINVAL;
245}
246
247static int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
248{
249
250 if (x->encap) {
251 struct xfrm_encap_tmpl *encap;
252 struct esp_decap_data *decap_data;
253
254 encap = x->encap;
255 decap_data = (struct esp_decap_data *)(decap->decap_data);
256
257 /* first, make sure that the decap type == the encap type */
258 if (encap->encap_type != decap->decap_type)
259 return -EINVAL;
260
261 switch (encap->encap_type) {
262 default:
263 case UDP_ENCAP_ESPINUDP:
264 case UDP_ENCAP_ESPINUDP_NON_IKE:
265 /*
266 * 1) if the NAT-T peer's IP or port changed then
267			 *    advertise the change to the keying daemon.
268 * This is an inbound SA, so just compare
269 * SRC ports.
270 */
271 if (decap_data->proto == AF_INET &&
272 (decap_data->saddr.a4 != x->props.saddr.a4 ||
273 decap_data->sport != encap->encap_sport)) {
274 xfrm_address_t ipaddr;
275
276 ipaddr.a4 = decap_data->saddr.a4;
277 km_new_mapping(x, &ipaddr, decap_data->sport);
278
279 /* XXX: perhaps add an extra
280 * policy check here, to see
281 * if we should allow or
282 * reject a packet from a
283 * different source
284 * address/port.
285 */
286 }
287
288 /*
289 * 2) ignore UDP/TCP checksums in case
290 * of NAT-T in Transport Mode, or
291 * perform other post-processing fixes
292			 *    as per draft-ietf-ipsec-udp-encaps-06,
293 * section 3.1.2
294 */
295 if (!x->props.mode)
296 skb->ip_summed = CHECKSUM_UNNECESSARY;
297
298 break;
299 }
300 }
301 return 0;
302}
303
304static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
305{
306 struct esp_data *esp = x->data;
307 u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
308
309 if (x->props.mode) {
310 mtu = (mtu + 2 + blksize-1)&~(blksize-1);
311 } else {
312 /* The worst case. */
313 mtu += 2 + blksize;
314 }
315 if (esp->conf.padlen)
316 mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1);
317
318 return mtu + x->props.header_len + esp->auth.icv_trunc_len;
319}
320
321static void esp4_err(struct sk_buff *skb, u32 info)
322{
323 struct iphdr *iph = (struct iphdr*)skb->data;
324 struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2));
325 struct xfrm_state *x;
326
327 if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
328 skb->h.icmph->code != ICMP_FRAG_NEEDED)
329 return;
330
331 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
332 if (!x)
333 return;
334 NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
335 ntohl(esph->spi), ntohl(iph->daddr)));
336 xfrm_state_put(x);
337}
338
339static void esp_destroy(struct xfrm_state *x)
340{
341 struct esp_data *esp = x->data;
342
343 if (!esp)
344 return;
345
346 if (esp->conf.tfm) {
347 crypto_free_tfm(esp->conf.tfm);
348 esp->conf.tfm = NULL;
349 }
350 if (esp->conf.ivec) {
351 kfree(esp->conf.ivec);
352 esp->conf.ivec = NULL;
353 }
354 if (esp->auth.tfm) {
355 crypto_free_tfm(esp->auth.tfm);
356 esp->auth.tfm = NULL;
357 }
358 if (esp->auth.work_icv) {
359 kfree(esp->auth.work_icv);
360 esp->auth.work_icv = NULL;
361 }
362 kfree(esp);
363}
364
365static int esp_init_state(struct xfrm_state *x, void *args)
366{
367 struct esp_data *esp = NULL;
368
369 /* null auth and encryption can have zero length keys */
370 if (x->aalg) {
371 if (x->aalg->alg_key_len > 512)
372 goto error;
373 }
374 if (x->ealg == NULL)
375 goto error;
376
377 esp = kmalloc(sizeof(*esp), GFP_KERNEL);
378 if (esp == NULL)
379 return -ENOMEM;
380
381 memset(esp, 0, sizeof(*esp));
382
383 if (x->aalg) {
384 struct xfrm_algo_desc *aalg_desc;
385
386 esp->auth.key = x->aalg->alg_key;
387 esp->auth.key_len = (x->aalg->alg_key_len+7)/8;
388 esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
389 if (esp->auth.tfm == NULL)
390 goto error;
391 esp->auth.icv = esp_hmac_digest;
392
393 aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
394 BUG_ON(!aalg_desc);
395
396 if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
397 crypto_tfm_alg_digestsize(esp->auth.tfm)) {
398 NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
399 x->aalg->alg_name,
400 crypto_tfm_alg_digestsize(esp->auth.tfm),
401 aalg_desc->uinfo.auth.icv_fullbits/8));
402 goto error;
403 }
404
405 esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
406 esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
407
408 esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL);
409 if (!esp->auth.work_icv)
410 goto error;
411 }
412 esp->conf.key = x->ealg->alg_key;
413 esp->conf.key_len = (x->ealg->alg_key_len+7)/8;
414 if (x->props.ealgo == SADB_EALG_NULL)
415 esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB);
416 else
417 esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC);
418 if (esp->conf.tfm == NULL)
419 goto error;
420 esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm);
421 esp->conf.padlen = 0;
422 if (esp->conf.ivlen) {
423 esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL);
424 if (unlikely(esp->conf.ivec == NULL))
425 goto error;
426 get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
427 }
428 if (crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len))
429 goto error;
430 x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
431 if (x->props.mode)
432 x->props.header_len += sizeof(struct iphdr);
433 if (x->encap) {
434 struct xfrm_encap_tmpl *encap = x->encap;
435
436 switch (encap->encap_type) {
437 default:
438 goto error;
439 case UDP_ENCAP_ESPINUDP:
440 x->props.header_len += sizeof(struct udphdr);
441 break;
442 case UDP_ENCAP_ESPINUDP_NON_IKE:
443 x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
444 break;
445 }
446 }
447 x->data = esp;
448 x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len;
449 return 0;
450
451error:
452 x->data = esp;
453 esp_destroy(x);
454 x->data = NULL;
455 return -EINVAL;
456}
457
458static struct xfrm_type esp_type =
459{
460 .description = "ESP4",
461 .owner = THIS_MODULE,
462 .proto = IPPROTO_ESP,
463 .init_state = esp_init_state,
464 .destructor = esp_destroy,
465 .get_max_size = esp4_get_max_size,
466 .input = esp_input,
467 .post_input = esp_post_input,
468 .output = esp_output
469};
470
471static struct net_protocol esp4_protocol = {
472 .handler = xfrm4_rcv,
473 .err_handler = esp4_err,
474 .no_policy = 1,
475};
476
477static int __init esp4_init(void)
478{
479 struct xfrm_decap_state decap;
480
481 if (sizeof(struct esp_decap_data) <
482 sizeof(decap.decap_data)) {
483 extern void decap_data_too_small(void);
484
485 decap_data_too_small();
486 }
487
488 if (xfrm_register_type(&esp_type, AF_INET) < 0) {
489 printk(KERN_INFO "ip esp init: can't add xfrm type\n");
490 return -EAGAIN;
491 }
492 if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
493 printk(KERN_INFO "ip esp init: can't add protocol\n");
494 xfrm_unregister_type(&esp_type, AF_INET);
495 return -EAGAIN;
496 }
497 return 0;
498}
499
500static void __exit esp4_fini(void)
501{
502 if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
503 printk(KERN_INFO "ip esp close: can't remove protocol\n");
504 if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
505 printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
506}
507
508module_init(esp4_init);
509module_exit(esp4_fini);
510MODULE_LICENSE("GPL");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
new file mode 100644
index 000000000000..563e7d612706
--- /dev/null
+++ b/net/ipv4/fib_frontend.c
@@ -0,0 +1,611 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 Forwarding Information Base: FIB frontend.
7 *
8 * Version: $Id: fib_frontend.c,v 1.26 2001/10/31 21:55:54 davem Exp $
9 *
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 */
17
18#include <linux/config.h>
19#include <linux/module.h>
20#include <asm/uaccess.h>
21#include <asm/system.h>
22#include <linux/bitops.h>
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/sched.h>
26#include <linux/mm.h>
27#include <linux/string.h>
28#include <linux/socket.h>
29#include <linux/sockios.h>
30#include <linux/errno.h>
31#include <linux/in.h>
32#include <linux/inet.h>
33#include <linux/netdevice.h>
34#include <linux/if_arp.h>
35#include <linux/skbuff.h>
36#include <linux/netlink.h>
37#include <linux/init.h>
38
39#include <net/ip.h>
40#include <net/protocol.h>
41#include <net/route.h>
42#include <net/tcp.h>
43#include <net/sock.h>
44#include <net/icmp.h>
45#include <net/arp.h>
46#include <net/ip_fib.h>
47
48#define FFprint(a...) printk(KERN_DEBUG a)
49
50#ifndef CONFIG_IP_MULTIPLE_TABLES
51
52#define RT_TABLE_MIN RT_TABLE_MAIN
53
54struct fib_table *ip_fib_local_table;
55struct fib_table *ip_fib_main_table;
56
57#else
58
59#define RT_TABLE_MIN 1
60
61struct fib_table *fib_tables[RT_TABLE_MAX+1];
62
63struct fib_table *__fib_new_table(int id)
64{
65 struct fib_table *tb;
66
67 tb = fib_hash_init(id);
68 if (!tb)
69 return NULL;
70 fib_tables[id] = tb;
71 return tb;
72}
73
74
75#endif /* CONFIG_IP_MULTIPLE_TABLES */
76
77
78static void fib_flush(void)
79{
80 int flushed = 0;
81#ifdef CONFIG_IP_MULTIPLE_TABLES
82 struct fib_table *tb;
83 int id;
84
85 for (id = RT_TABLE_MAX; id>0; id--) {
86 if ((tb = fib_get_table(id))==NULL)
87 continue;
88 flushed += tb->tb_flush(tb);
89 }
90#else /* CONFIG_IP_MULTIPLE_TABLES */
91 flushed += ip_fib_main_table->tb_flush(ip_fib_main_table);
92 flushed += ip_fib_local_table->tb_flush(ip_fib_local_table);
93#endif /* CONFIG_IP_MULTIPLE_TABLES */
94
95 if (flushed)
96 rt_cache_flush(-1);
97}
98
99/*
100 * Find the first device with a given source address.
101 */
102
103struct net_device * ip_dev_find(u32 addr)
104{
105 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
106 struct fib_result res;
107 struct net_device *dev = NULL;
108
109#ifdef CONFIG_IP_MULTIPLE_TABLES
110 res.r = NULL;
111#endif
112
113 if (!ip_fib_local_table ||
114 ip_fib_local_table->tb_lookup(ip_fib_local_table, &fl, &res))
115 return NULL;
116 if (res.type != RTN_LOCAL)
117 goto out;
118 dev = FIB_RES_DEV(res);
119
120 if (dev)
121 dev_hold(dev);
122out:
123 fib_res_put(&res);
124 return dev;
125}
126
127unsigned inet_addr_type(u32 addr)
128{
129 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
130 struct fib_result res;
131 unsigned ret = RTN_BROADCAST;
132
133 if (ZERONET(addr) || BADCLASS(addr))
134 return RTN_BROADCAST;
135 if (MULTICAST(addr))
136 return RTN_MULTICAST;
137
138#ifdef CONFIG_IP_MULTIPLE_TABLES
139 res.r = NULL;
140#endif
141
142 if (ip_fib_local_table) {
143 ret = RTN_UNICAST;
144 if (!ip_fib_local_table->tb_lookup(ip_fib_local_table,
145 &fl, &res)) {
146 ret = res.type;
147 fib_res_put(&res);
148 }
149 }
150 return ret;
151}
152
153/* Given (packet source, input interface) and optional (dst, oif, tos):
154   - (main) check that the source is valid, i.e. not broadcast or one of
155     our local addresses.
156   - figure out what "logical" interface this packet arrived on
157     and calculate the "specific destination" address.
158   - check that the packet arrived from the expected physical interface.
159 */
160
161int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
162 struct net_device *dev, u32 *spec_dst, u32 *itag)
163{
164 struct in_device *in_dev;
165 struct flowi fl = { .nl_u = { .ip4_u =
166 { .daddr = src,
167 .saddr = dst,
168 .tos = tos } },
169 .iif = oif };
170 struct fib_result res;
171 int no_addr, rpf;
172 int ret;
173
174 no_addr = rpf = 0;
175 rcu_read_lock();
176 in_dev = __in_dev_get(dev);
177 if (in_dev) {
178 no_addr = in_dev->ifa_list == NULL;
179 rpf = IN_DEV_RPFILTER(in_dev);
180 }
181 rcu_read_unlock();
182
183 if (in_dev == NULL)
184 goto e_inval;
185
186 if (fib_lookup(&fl, &res))
187 goto last_resort;
188 if (res.type != RTN_UNICAST)
189 goto e_inval_res;
190 *spec_dst = FIB_RES_PREFSRC(res);
191 fib_combine_itag(itag, &res);
192#ifdef CONFIG_IP_ROUTE_MULTIPATH
193 if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
194#else
195 if (FIB_RES_DEV(res) == dev)
196#endif
197 {
198 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
199 fib_res_put(&res);
200 return ret;
201 }
202 fib_res_put(&res);
203 if (no_addr)
204 goto last_resort;
205 if (rpf)
206 goto e_inval;
207 fl.oif = dev->ifindex;
208
209 ret = 0;
210 if (fib_lookup(&fl, &res) == 0) {
211 if (res.type == RTN_UNICAST) {
212 *spec_dst = FIB_RES_PREFSRC(res);
213 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
214 }
215 fib_res_put(&res);
216 }
217 return ret;
218
219last_resort:
220 if (rpf)
221 goto e_inval;
222 *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
223 *itag = 0;
224 return 0;
225
226e_inval_res:
227 fib_res_put(&res);
228e_inval:
229 return -EINVAL;
230}
231
232#ifndef CONFIG_IP_NOSIOCRT
233
234/*
235 * Handle IP routing ioctl calls. These are used to manipulate the routing tables
236 */
237
238int ip_rt_ioctl(unsigned int cmd, void __user *arg)
239{
240 int err;
241 struct kern_rta rta;
242 struct rtentry r;
243 struct {
244 struct nlmsghdr nlh;
245 struct rtmsg rtm;
246 } req;
247
248 switch (cmd) {
249 case SIOCADDRT: /* Add a route */
250 case SIOCDELRT: /* Delete a route */
251 if (!capable(CAP_NET_ADMIN))
252 return -EPERM;
253 if (copy_from_user(&r, arg, sizeof(struct rtentry)))
254 return -EFAULT;
255 rtnl_lock();
256 err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, &r);
257 if (err == 0) {
258 if (cmd == SIOCDELRT) {
259 struct fib_table *tb = fib_get_table(req.rtm.rtm_table);
260 err = -ESRCH;
261 if (tb)
262 err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
263 } else {
264 struct fib_table *tb = fib_new_table(req.rtm.rtm_table);
265 err = -ENOBUFS;
266 if (tb)
267 err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
268 }
269 if (rta.rta_mx)
270 kfree(rta.rta_mx);
271 }
272 rtnl_unlock();
273 return err;
274 }
275 return -EINVAL;
276}
277
278#else
279
280int ip_rt_ioctl(unsigned int cmd, void *arg)
281{
282 return -EINVAL;
283}
284
285#endif
286
287static int inet_check_attr(struct rtmsg *r, struct rtattr **rta)
288{
289 int i;
290
291 for (i=1; i<=RTA_MAX; i++) {
292 struct rtattr *attr = rta[i-1];
293 if (attr) {
294 if (RTA_PAYLOAD(attr) < 4)
295 return -EINVAL;
296 if (i != RTA_MULTIPATH && i != RTA_METRICS)
297 rta[i-1] = (struct rtattr*)RTA_DATA(attr);
298 }
299 }
300 return 0;
301}
302
303int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
304{
305 struct fib_table * tb;
306 struct rtattr **rta = arg;
307 struct rtmsg *r = NLMSG_DATA(nlh);
308
309 if (inet_check_attr(r, rta))
310 return -EINVAL;
311
312 tb = fib_get_table(r->rtm_table);
313 if (tb)
314 return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
315 return -ESRCH;
316}
317
318int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
319{
320 struct fib_table * tb;
321 struct rtattr **rta = arg;
322 struct rtmsg *r = NLMSG_DATA(nlh);
323
324 if (inet_check_attr(r, rta))
325 return -EINVAL;
326
327 tb = fib_new_table(r->rtm_table);
328 if (tb)
329 return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
330 return -ENOBUFS;
331}
332
333int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
334{
335 int t;
336 int s_t;
337 struct fib_table *tb;
338
339 if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
340 ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
341 return ip_rt_dump(skb, cb);
342
343 s_t = cb->args[0];
344 if (s_t == 0)
345 s_t = cb->args[0] = RT_TABLE_MIN;
346
347 for (t=s_t; t<=RT_TABLE_MAX; t++) {
348 if (t < s_t) continue;
349 if (t > s_t)
350 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
351 if ((tb = fib_get_table(t))==NULL)
352 continue;
353 if (tb->tb_dump(tb, skb, cb) < 0)
354 break;
355 }
356
357 cb->args[0] = t;
358
359 return skb->len;
360}
361
362/* Prepare and feed an intra-kernel routing request.
363   Really, it should be a netlink message, but :-( netlink
364   may not be configured, so we feed it directly
365   to the fib engine. It is legal, because all events occur
366   only when netlink is already locked.
367 */
368
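/* For example, fib_add_ifaddr() below reacts to 192.168.1.1/24 being
 * configured on an up, non-loopback interface by calling
 * fib_magic(RTM_NEWROUTE, RTN_LOCAL, 192.168.1.1, 32, ...) for the local
 * table and fib_magic(RTM_NEWROUTE, RTN_UNICAST, 192.168.1.0, 24, ...) for
 * the connected subnet, plus RTN_BROADCAST /32 entries for the subnet's
 * broadcast addresses.
 */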
369static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa)
370{
371 struct fib_table * tb;
372 struct {
373 struct nlmsghdr nlh;
374 struct rtmsg rtm;
375 } req;
376 struct kern_rta rta;
377
378 memset(&req.rtm, 0, sizeof(req.rtm));
379 memset(&rta, 0, sizeof(rta));
380
381 if (type == RTN_UNICAST)
382 tb = fib_new_table(RT_TABLE_MAIN);
383 else
384 tb = fib_new_table(RT_TABLE_LOCAL);
385
386 if (tb == NULL)
387 return;
388
389 req.nlh.nlmsg_len = sizeof(req);
390 req.nlh.nlmsg_type = cmd;
391 req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND;
392 req.nlh.nlmsg_pid = 0;
393 req.nlh.nlmsg_seq = 0;
394
395 req.rtm.rtm_dst_len = dst_len;
396 req.rtm.rtm_table = tb->tb_id;
397 req.rtm.rtm_protocol = RTPROT_KERNEL;
398 req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
399 req.rtm.rtm_type = type;
400
401 rta.rta_dst = &dst;
402 rta.rta_prefsrc = &ifa->ifa_local;
403 rta.rta_oif = &ifa->ifa_dev->dev->ifindex;
404
405 if (cmd == RTM_NEWROUTE)
406 tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
407 else
408 tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
409}
410
411static void fib_add_ifaddr(struct in_ifaddr *ifa)
412{
413 struct in_device *in_dev = ifa->ifa_dev;
414 struct net_device *dev = in_dev->dev;
415 struct in_ifaddr *prim = ifa;
416 u32 mask = ifa->ifa_mask;
417 u32 addr = ifa->ifa_local;
418 u32 prefix = ifa->ifa_address&mask;
419
420 if (ifa->ifa_flags&IFA_F_SECONDARY) {
421 prim = inet_ifa_byprefix(in_dev, prefix, mask);
422 if (prim == NULL) {
423 printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n");
424 return;
425 }
426 }
427
428 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
429
430 if (!(dev->flags&IFF_UP))
431 return;
432
433 /* Add broadcast address, if it is explicitly assigned. */
434 if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF)
435 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
436
437 if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
438 (prefix != addr || ifa->ifa_prefixlen < 32)) {
439 fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
440 RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
441
442 /* Add network-specific broadcasts, when it makes sense */
443 if (ifa->ifa_prefixlen < 31) {
444 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
445 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
446 }
447 }
448}
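/* A worked example of the above (addresses from the 192.0.2.0/24
 * documentation range, chosen only for illustration): bringing up a primary
 * address 192.0.2.1/24 with broadcast 192.0.2.255 on a non-loopback device
 * that is IFF_UP produces the following fib_magic() calls:
 *
 *   RTN_LOCAL      192.0.2.1/32    -> local table
 *   RTN_BROADCAST  192.0.2.255/32  -> local table   (explicit broadcast)
 *   RTN_UNICAST    192.0.2.0/24    -> main table    (connected route)
 *   RTN_BROADCAST  192.0.2.0/32    -> local table
 *   RTN_BROADCAST  192.0.2.255/32  -> local table   (prefix | ~mask)
 *
 * fib_magic() picks the table itself: RTN_UNICAST goes to RT_TABLE_MAIN,
 * everything else to RT_TABLE_LOCAL.
 */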
449
450static void fib_del_ifaddr(struct in_ifaddr *ifa)
451{
452 struct in_device *in_dev = ifa->ifa_dev;
453 struct net_device *dev = in_dev->dev;
454 struct in_ifaddr *ifa1;
455 struct in_ifaddr *prim = ifa;
456 u32 brd = ifa->ifa_address|~ifa->ifa_mask;
457 u32 any = ifa->ifa_address&ifa->ifa_mask;
458#define LOCAL_OK 1
459#define BRD_OK 2
460#define BRD0_OK 4
461#define BRD1_OK 8
462 unsigned ok = 0;
463
464 if (!(ifa->ifa_flags&IFA_F_SECONDARY))
465 fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
466 RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
467 else {
468 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
469 if (prim == NULL) {
470 printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n");
471 return;
472 }
473 }
474
475 /* Deletion is more complicated than addition:
476 we must take care not to delete too much. :-)
477
478 Scan the address list to be sure the addresses are really gone.
479 */
480
481 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
482 if (ifa->ifa_local == ifa1->ifa_local)
483 ok |= LOCAL_OK;
484 if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
485 ok |= BRD_OK;
486 if (brd == ifa1->ifa_broadcast)
487 ok |= BRD1_OK;
488 if (any == ifa1->ifa_broadcast)
489 ok |= BRD0_OK;
490 }
491
492 if (!(ok&BRD_OK))
493 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
494 if (!(ok&BRD1_OK))
495 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
496 if (!(ok&BRD0_OK))
497 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
498 if (!(ok&LOCAL_OK)) {
499 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
500
501 /* Check that this local address has finally disappeared. */
502 if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
503 /* And last, but not least:
504 we must flush stray FIB entries.
505
506 First we scan the fib_info list for stray nexthop
507 entries, then trigger fib_flush.
508 */
509 if (fib_sync_down(ifa->ifa_local, NULL, 0))
510 fib_flush();
511 }
512 }
513#undef LOCAL_OK
514#undef BRD_OK
515#undef BRD0_OK
516#undef BRD1_OK
517}
518
519static void fib_disable_ip(struct net_device *dev, int force)
520{
521 if (fib_sync_down(0, dev, force))
522 fib_flush();
523 rt_cache_flush(0);
524 arp_ifdown(dev);
525}
526
527static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
528{
529 struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
530
531 switch (event) {
532 case NETDEV_UP:
533 fib_add_ifaddr(ifa);
534#ifdef CONFIG_IP_ROUTE_MULTIPATH
535 fib_sync_up(ifa->ifa_dev->dev);
536#endif
537 rt_cache_flush(-1);
538 break;
539 case NETDEV_DOWN:
540 fib_del_ifaddr(ifa);
541 if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
542 /* Last address was deleted from this interface.
543 Disable IP.
544 */
545 fib_disable_ip(ifa->ifa_dev->dev, 1);
546 } else {
547 rt_cache_flush(-1);
548 }
549 break;
550 }
551 return NOTIFY_DONE;
552}
553
554static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
555{
556 struct net_device *dev = ptr;
557 struct in_device *in_dev = __in_dev_get(dev);
558
559 if (event == NETDEV_UNREGISTER) {
560 fib_disable_ip(dev, 2);
561 return NOTIFY_DONE;
562 }
563
564 if (!in_dev)
565 return NOTIFY_DONE;
566
567 switch (event) {
568 case NETDEV_UP:
569 for_ifa(in_dev) {
570 fib_add_ifaddr(ifa);
571 } endfor_ifa(in_dev);
572#ifdef CONFIG_IP_ROUTE_MULTIPATH
573 fib_sync_up(dev);
574#endif
575 rt_cache_flush(-1);
576 break;
577 case NETDEV_DOWN:
578 fib_disable_ip(dev, 0);
579 break;
580 case NETDEV_CHANGEMTU:
581 case NETDEV_CHANGE:
582 rt_cache_flush(0);
583 break;
584 }
585 return NOTIFY_DONE;
586}
587
588static struct notifier_block fib_inetaddr_notifier = {
589 .notifier_call = fib_inetaddr_event,
590};
591
592static struct notifier_block fib_netdev_notifier = {
593 .notifier_call = fib_netdev_event,
594};
595
596void __init ip_fib_init(void)
597{
598#ifndef CONFIG_IP_MULTIPLE_TABLES
599 ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
600 ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN);
601#else
602 fib_rules_init();
603#endif
604
605 register_netdevice_notifier(&fib_netdev_notifier);
606 register_inetaddr_notifier(&fib_inetaddr_notifier);
607}
608
609EXPORT_SYMBOL(inet_addr_type);
610EXPORT_SYMBOL(ip_dev_find);
611EXPORT_SYMBOL(ip_rt_ioctl);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
new file mode 100644
index 000000000000..6506dcc01b46
--- /dev/null
+++ b/net/ipv4/fib_hash.c
@@ -0,0 +1,1086 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 FIB: lookup engine and maintenance routines.
7 *
8 * Version: $Id: fib_hash.c,v 1.13 2001/10/31 21:55:54 davem Exp $
9 *
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 */
17
18#include <linux/config.h>
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <linux/bitops.h>
22#include <linux/types.h>
23#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/mm.h>
26#include <linux/string.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/errno.h>
30#include <linux/in.h>
31#include <linux/inet.h>
32#include <linux/netdevice.h>
33#include <linux/if_arp.h>
34#include <linux/proc_fs.h>
35#include <linux/skbuff.h>
36#include <linux/netlink.h>
37#include <linux/init.h>
38
39#include <net/ip.h>
40#include <net/protocol.h>
41#include <net/route.h>
42#include <net/tcp.h>
43#include <net/sock.h>
44#include <net/ip_fib.h>
45
46#include "fib_lookup.h"
47
48static kmem_cache_t *fn_hash_kmem;
49static kmem_cache_t *fn_alias_kmem;
50
51struct fib_node {
52 struct hlist_node fn_hash;
53 struct list_head fn_alias;
54 u32 fn_key;
55};
56
57struct fn_zone {
58 struct fn_zone *fz_next; /* Next not empty zone */
59 struct hlist_head *fz_hash; /* Hash table pointer */
60 int fz_nent; /* Number of entries */
61
62 int fz_divisor; /* Hash divisor */
63 u32 fz_hashmask; /* (fz_divisor - 1) */
64#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
65
66 int fz_order; /* Zone order */
67 u32 fz_mask;
68#define FZ_MASK(fz) ((fz)->fz_mask)
69};
70
71/* NOTE: on fast computers, evaluating fz_hashmask and fz_mask can be
72 * cheaper than a memory lookup, which is why the FZ_* macros are used.
73 */
74
75struct fn_hash {
76 struct fn_zone *fn_zones[33];
77 struct fn_zone *fn_zone_list;
78};
79
80static inline u32 fn_hash(u32 key, struct fn_zone *fz)
81{
82 u32 h = ntohl(key)>>(32 - fz->fz_order);
83 h ^= (h>>20);
84 h ^= (h>>10);
85 h ^= (h>>5);
86 h &= FZ_HASHMASK(fz);
87 return h;
88}
89
90static inline u32 fz_key(u32 dst, struct fn_zone *fz)
91{
92 return dst & FZ_MASK(fz);
93}
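/* A rough userspace sketch of the bucketing above, illustrative only: it
 * re-implements fn_hash()/fz_key() outside the kernel so the arithmetic can
 * be tried by hand. The /24 zone order, the 16-slot divisor and the example
 * address are assumptions picked for the demo (a fresh non-zero zone starts
 * with 16 buckets, see fn_new_zone() below). */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

static uint32_t zone_hash(uint32_t key, int order, uint32_t hashmask)
{
	uint32_t h = ntohl(key) >> (32 - order);	/* top 'order' bits of the key */
	h ^= (h >> 20);
	h ^= (h >> 10);
	h ^= (h >> 5);
	return h & hashmask;
}

int main(void)
{
	uint32_t mask = htonl(~0u << (32 - 24));	/* /24 zone mask, network order */
	uint32_t dst  = inet_addr("192.0.2.7");		/* destination, network order */
	uint32_t key  = dst & mask;			/* what fz_key() would return */

	printf("key 0x%08x -> bucket %u of 16\n",
	       (unsigned)ntohl(key), (unsigned)zone_hash(key, 24, 16 - 1));
	return 0;
}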
94
95static DEFINE_RWLOCK(fib_hash_lock);
96static unsigned int fib_hash_genid;
97
98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
99
100static struct hlist_head *fz_hash_alloc(int divisor)
101{
102 unsigned long size = divisor * sizeof(struct hlist_head);
103
104 if (size <= PAGE_SIZE) {
105 return kmalloc(size, GFP_KERNEL);
106 } else {
107 return (struct hlist_head *)
108 __get_free_pages(GFP_KERNEL, get_order(size));
109 }
110}
111
112/* The fib hash lock must be held when this is called. */
113static inline void fn_rebuild_zone(struct fn_zone *fz,
114 struct hlist_head *old_ht,
115 int old_divisor)
116{
117 int i;
118
119 for (i = 0; i < old_divisor; i++) {
120 struct hlist_node *node, *n;
121 struct fib_node *f;
122
123 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
124 struct hlist_head *new_head;
125
126 hlist_del(&f->fn_hash);
127
128 new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
129 hlist_add_head(&f->fn_hash, new_head);
130 }
131 }
132}
133
134static void fz_hash_free(struct hlist_head *hash, int divisor)
135{
136 unsigned long size = divisor * sizeof(struct hlist_head);
137
138 if (size <= PAGE_SIZE)
139 kfree(hash);
140 else
141 free_pages((unsigned long)hash, get_order(size));
142}
143
144static void fn_rehash_zone(struct fn_zone *fz)
145{
146 struct hlist_head *ht, *old_ht;
147 int old_divisor, new_divisor;
148 u32 new_hashmask;
149
150 old_divisor = fz->fz_divisor;
151
152 switch (old_divisor) {
153 case 16:
154 new_divisor = 256;
155 break;
156 case 256:
157 new_divisor = 1024;
158 break;
159 default:
160 if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
161 printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
162 return;
163 }
164 new_divisor = (old_divisor << 1);
165 break;
166 }
167
168 new_hashmask = (new_divisor - 1);
169
170#if RT_CACHE_DEBUG >= 2
171 printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor);
172#endif
173
174 ht = fz_hash_alloc(new_divisor);
175
176 if (ht) {
177 memset(ht, 0, new_divisor * sizeof(struct hlist_head));
178
179 write_lock_bh(&fib_hash_lock);
180 old_ht = fz->fz_hash;
181 fz->fz_hash = ht;
182 fz->fz_hashmask = new_hashmask;
183 fz->fz_divisor = new_divisor;
184 fn_rebuild_zone(fz, old_ht, old_divisor);
185 fib_hash_genid++;
186 write_unlock_bh(&fib_hash_lock);
187
188 fz_hash_free(old_ht, old_divisor);
189 }
190}
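/* Rehashing policy, as implemented above and triggered from fn_hash_insert()
 * below: once a zone holds more than twice as many nodes as it has buckets,
 * the table grows 16 -> 256 -> 1024 and then keeps doubling, up to
 * FZ_MAX_DIVISOR. The swap and rebuild run under fib_hash_lock, so readers
 * never see a half-built table.
 */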
191
192static inline void fn_free_node(struct fib_node * f)
193{
194 kmem_cache_free(fn_hash_kmem, f);
195}
196
197static inline void fn_free_alias(struct fib_alias *fa)
198{
199 fib_release_info(fa->fa_info);
200 kmem_cache_free(fn_alias_kmem, fa);
201}
202
203static struct fn_zone *
204fn_new_zone(struct fn_hash *table, int z)
205{
206 int i;
207 struct fn_zone *fz = kmalloc(sizeof(struct fn_zone), GFP_KERNEL);
208 if (!fz)
209 return NULL;
210
211 memset(fz, 0, sizeof(struct fn_zone));
212 if (z) {
213 fz->fz_divisor = 16;
214 } else {
215 fz->fz_divisor = 1;
216 }
217 fz->fz_hashmask = (fz->fz_divisor - 1);
218 fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
219 if (!fz->fz_hash) {
220 kfree(fz);
221 return NULL;
222 }
223 memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *));
224 fz->fz_order = z;
225 fz->fz_mask = inet_make_mask(z);
226
227 /* Find the first non-empty zone with a more specific mask */
228 for (i=z+1; i<=32; i++)
229 if (table->fn_zones[i])
230 break;
231 write_lock_bh(&fib_hash_lock);
232 if (i>32) {
233 /* No more specific masks, we are the first. */
234 fz->fz_next = table->fn_zone_list;
235 table->fn_zone_list = fz;
236 } else {
237 fz->fz_next = table->fn_zones[i]->fz_next;
238 table->fn_zones[i]->fz_next = fz;
239 }
240 table->fn_zones[z] = fz;
241 fib_hash_genid++;
242 write_unlock_bh(&fib_hash_lock);
243 return fz;
244}
245
246static int
247fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
248{
249 int err;
250 struct fn_zone *fz;
251 struct fn_hash *t = (struct fn_hash*)tb->tb_data;
252
253 read_lock(&fib_hash_lock);
254 for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
255 struct hlist_head *head;
256 struct hlist_node *node;
257 struct fib_node *f;
258 u32 k = fz_key(flp->fl4_dst, fz);
259
260 head = &fz->fz_hash[fn_hash(k, fz)];
261 hlist_for_each_entry(f, node, head, fn_hash) {
262 if (f->fn_key != k)
263 continue;
264
265 err = fib_semantic_match(&f->fn_alias,
266 flp, res,
267 f->fn_key, fz->fz_mask,
268 fz->fz_order);
269 if (err <= 0)
270 goto out;
271 }
272 }
273 err = 1;
274out:
275 read_unlock(&fib_hash_lock);
276 return err;
277}
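/* The zone list is kept ordered from the most specific mask to the least
 * specific one (see the ordered insert in fn_new_zone() above), so the scan
 * above implements longest-prefix match. As an illustration only: with a
 * 192.0.2.0/24 route and a 0.0.0.0/0 default installed, a lookup for
 * 192.0.2.7 is answered from the /24 zone (assuming its alias matches
 * semantically) before the default zone is ever consulted.
 */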
278
279static int fn_hash_last_dflt=-1;
280
281static void
282fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
283{
284 int order, last_idx;
285 struct hlist_node *node;
286 struct fib_node *f;
287 struct fib_info *fi = NULL;
288 struct fib_info *last_resort;
289 struct fn_hash *t = (struct fn_hash*)tb->tb_data;
290 struct fn_zone *fz = t->fn_zones[0];
291
292 if (fz == NULL)
293 return;
294
295 last_idx = -1;
296 last_resort = NULL;
297 order = -1;
298
299 read_lock(&fib_hash_lock);
300 hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
301 struct fib_alias *fa;
302
303 list_for_each_entry(fa, &f->fn_alias, fa_list) {
304 struct fib_info *next_fi = fa->fa_info;
305
306 if (fa->fa_scope != res->scope ||
307 fa->fa_type != RTN_UNICAST)
308 continue;
309
310 if (next_fi->fib_priority > res->fi->fib_priority)
311 break;
312 if (!next_fi->fib_nh[0].nh_gw ||
313 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
314 continue;
315 fa->fa_state |= FA_S_ACCESSED;
316
317 if (fi == NULL) {
318 if (next_fi != res->fi)
319 break;
320 } else if (!fib_detect_death(fi, order, &last_resort,
321 &last_idx, &fn_hash_last_dflt)) {
322 if (res->fi)
323 fib_info_put(res->fi);
324 res->fi = fi;
325 atomic_inc(&fi->fib_clntref);
326 fn_hash_last_dflt = order;
327 goto out;
328 }
329 fi = next_fi;
330 order++;
331 }
332 }
333
334 if (order <= 0 || fi == NULL) {
335 fn_hash_last_dflt = -1;
336 goto out;
337 }
338
339 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) {
340 if (res->fi)
341 fib_info_put(res->fi);
342 res->fi = fi;
343 atomic_inc(&fi->fib_clntref);
344 fn_hash_last_dflt = order;
345 goto out;
346 }
347
348 if (last_idx >= 0) {
349 if (res->fi)
350 fib_info_put(res->fi);
351 res->fi = last_resort;
352 if (last_resort)
353 atomic_inc(&last_resort->fib_clntref);
354 }
355 fn_hash_last_dflt = last_idx;
356out:
357 read_unlock(&fib_hash_lock);
358}
359
360/* Insert node F into FZ. */
361static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
362{
363 struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
364
365 hlist_add_head(&f->fn_hash, head);
366}
367
368/* Return the node in FZ matching KEY. */
369static struct fib_node *fib_find_node(struct fn_zone *fz, u32 key)
370{
371 struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
372 struct hlist_node *node;
373 struct fib_node *f;
374
375 hlist_for_each_entry(f, node, head, fn_hash) {
376 if (f->fn_key == key)
377 return f;
378 }
379
380 return NULL;
381}
382
383static int
384fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
385 struct nlmsghdr *n, struct netlink_skb_parms *req)
386{
387 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
388 struct fib_node *new_f, *f;
389 struct fib_alias *fa, *new_fa;
390 struct fn_zone *fz;
391 struct fib_info *fi;
392 int z = r->rtm_dst_len;
393 int type = r->rtm_type;
394 u8 tos = r->rtm_tos;
395 u32 key;
396 int err;
397
398 if (z > 32)
399 return -EINVAL;
400 fz = table->fn_zones[z];
401 if (!fz && !(fz = fn_new_zone(table, z)))
402 return -ENOBUFS;
403
404 key = 0;
405 if (rta->rta_dst) {
406 u32 dst;
407 memcpy(&dst, rta->rta_dst, 4);
408 if (dst & ~FZ_MASK(fz))
409 return -EINVAL;
410 key = fz_key(dst, fz);
411 }
412
413 if ((fi = fib_create_info(r, rta, n, &err)) == NULL)
414 return err;
415
416 if (fz->fz_nent > (fz->fz_divisor<<1) &&
417 fz->fz_divisor < FZ_MAX_DIVISOR &&
418 (z==32 || (1<<z) > fz->fz_divisor))
419 fn_rehash_zone(fz);
420
421 f = fib_find_node(fz, key);
422
423 if (!f)
424 fa = NULL;
425 else
426 fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
427
428 /* Now fa, if non-NULL, points to the first fib alias
429 * with the same keys [prefix,tos,priority], if such a key already
430 * exists, or to the node before which we will insert the new one.
431 *
432 * If fa is NULL, we will need to allocate a new alias and
433 * insert it at the head of f.
434 *
435 * If f is NULL, no fib node matched the destination key
436 * and we need to allocate a new one of those as well.
437 */
438
439 if (fa && fa->fa_tos == tos &&
440 fa->fa_info->fib_priority == fi->fib_priority) {
441 struct fib_alias *fa_orig;
442
443 err = -EEXIST;
444 if (n->nlmsg_flags & NLM_F_EXCL)
445 goto out;
446
447 if (n->nlmsg_flags & NLM_F_REPLACE) {
448 struct fib_info *fi_drop;
449 u8 state;
450
451 write_lock_bh(&fib_hash_lock);
452 fi_drop = fa->fa_info;
453 fa->fa_info = fi;
454 fa->fa_type = type;
455 fa->fa_scope = r->rtm_scope;
456 state = fa->fa_state;
457 fa->fa_state &= ~FA_S_ACCESSED;
458 fib_hash_genid++;
459 write_unlock_bh(&fib_hash_lock);
460
461 fib_release_info(fi_drop);
462 if (state & FA_S_ACCESSED)
463 rt_cache_flush(-1);
464 return 0;
465 }
466
467 /* Error if we find a perfect match which
468 * uses the same scope, type, and nexthop
469 * information.
470 */
471 fa_orig = fa;
472 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
473 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
474 if (fa->fa_tos != tos)
475 break;
476 if (fa->fa_info->fib_priority != fi->fib_priority)
477 break;
478 if (fa->fa_type == type &&
479 fa->fa_scope == r->rtm_scope &&
480 fa->fa_info == fi)
481 goto out;
482 }
483 if (!(n->nlmsg_flags & NLM_F_APPEND))
484 fa = fa_orig;
485 }
486
487 err = -ENOENT;
488 if (!(n->nlmsg_flags&NLM_F_CREATE))
489 goto out;
490
491 err = -ENOBUFS;
492 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
493 if (new_fa == NULL)
494 goto out;
495
496 new_f = NULL;
497 if (!f) {
498 new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL);
499 if (new_f == NULL)
500 goto out_free_new_fa;
501
502 INIT_HLIST_NODE(&new_f->fn_hash);
503 INIT_LIST_HEAD(&new_f->fn_alias);
504 new_f->fn_key = key;
505 f = new_f;
506 }
507
508 new_fa->fa_info = fi;
509 new_fa->fa_tos = tos;
510 new_fa->fa_type = type;
511 new_fa->fa_scope = r->rtm_scope;
512 new_fa->fa_state = 0;
513
514 /*
515 * Insert new entry to the list.
516 */
517
518 write_lock_bh(&fib_hash_lock);
519 if (new_f)
520 fib_insert_node(fz, new_f);
521 list_add_tail(&new_fa->fa_list,
522 (fa ? &fa->fa_list : &f->fn_alias));
523 fib_hash_genid++;
524 write_unlock_bh(&fib_hash_lock);
525
526 if (new_f)
527 fz->fz_nent++;
528 rt_cache_flush(-1);
529
530 rtmsg_fib(RTM_NEWROUTE, key, new_fa, z, tb->tb_id, n, req);
531 return 0;
532
533out_free_new_fa:
534 kmem_cache_free(fn_alias_kmem, new_fa);
535out:
536 fib_release_info(fi);
537 return err;
538}
539
540
541static int
542fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
543 struct nlmsghdr *n, struct netlink_skb_parms *req)
544{
545 struct fn_hash *table = (struct fn_hash*)tb->tb_data;
546 struct fib_node *f;
547 struct fib_alias *fa, *fa_to_delete;
548 int z = r->rtm_dst_len;
549 struct fn_zone *fz;
550 u32 key;
551 u8 tos = r->rtm_tos;
552
553 if (z > 32)
554 return -EINVAL;
555 if ((fz = table->fn_zones[z]) == NULL)
556 return -ESRCH;
557
558 key = 0;
559 if (rta->rta_dst) {
560 u32 dst;
561 memcpy(&dst, rta->rta_dst, 4);
562 if (dst & ~FZ_MASK(fz))
563 return -EINVAL;
564 key = fz_key(dst, fz);
565 }
566
567 f = fib_find_node(fz, key);
568
569 if (!f)
570 fa = NULL;
571 else
572 fa = fib_find_alias(&f->fn_alias, tos, 0);
573 if (!fa)
574 return -ESRCH;
575
576 fa_to_delete = NULL;
577 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
578 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
579 struct fib_info *fi = fa->fa_info;
580
581 if (fa->fa_tos != tos)
582 break;
583
584 if ((!r->rtm_type ||
585 fa->fa_type == r->rtm_type) &&
586 (r->rtm_scope == RT_SCOPE_NOWHERE ||
587 fa->fa_scope == r->rtm_scope) &&
588 (!r->rtm_protocol ||
589 fi->fib_protocol == r->rtm_protocol) &&
590 fib_nh_match(r, n, rta, fi) == 0) {
591 fa_to_delete = fa;
592 break;
593 }
594 }
595
596 if (fa_to_delete) {
597 int kill_fn;
598
599 fa = fa_to_delete;
600 rtmsg_fib(RTM_DELROUTE, key, fa, z, tb->tb_id, n, req);
601
602 kill_fn = 0;
603 write_lock_bh(&fib_hash_lock);
604 list_del(&fa->fa_list);
605 if (list_empty(&f->fn_alias)) {
606 hlist_del(&f->fn_hash);
607 kill_fn = 1;
608 }
609 fib_hash_genid++;
610 write_unlock_bh(&fib_hash_lock);
611
612 if (fa->fa_state & FA_S_ACCESSED)
613 rt_cache_flush(-1);
614 fn_free_alias(fa);
615 if (kill_fn) {
616 fn_free_node(f);
617 fz->fz_nent--;
618 }
619
620 return 0;
621 }
622 return -ESRCH;
623}
624
625static int fn_flush_list(struct fn_zone *fz, int idx)
626{
627 struct hlist_head *head = &fz->fz_hash[idx];
628 struct hlist_node *node, *n;
629 struct fib_node *f;
630 int found = 0;
631
632 hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
633 struct fib_alias *fa, *fa_node;
634 int kill_f;
635
636 kill_f = 0;
637 list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
638 struct fib_info *fi = fa->fa_info;
639
640 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
641 write_lock_bh(&fib_hash_lock);
642 list_del(&fa->fa_list);
643 if (list_empty(&f->fn_alias)) {
644 hlist_del(&f->fn_hash);
645 kill_f = 1;
646 }
647 fib_hash_genid++;
648 write_unlock_bh(&fib_hash_lock);
649
650 fn_free_alias(fa);
651 found++;
652 }
653 }
654 if (kill_f) {
655 fn_free_node(f);
656 fz->fz_nent--;
657 }
658 }
659 return found;
660}
661
662static int fn_hash_flush(struct fib_table *tb)
663{
664 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
665 struct fn_zone *fz;
666 int found = 0;
667
668 for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
669 int i;
670
671 for (i = fz->fz_divisor - 1; i >= 0; i--)
672 found += fn_flush_list(fz, i);
673 }
674 return found;
675}
676
677
678static inline int
679fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
680 struct fib_table *tb,
681 struct fn_zone *fz,
682 struct hlist_head *head)
683{
684 struct hlist_node *node;
685 struct fib_node *f;
686 int i, s_i;
687
688 s_i = cb->args[3];
689 i = 0;
690 hlist_for_each_entry(f, node, head, fn_hash) {
691 struct fib_alias *fa;
692
693 list_for_each_entry(fa, &f->fn_alias, fa_list) {
694 if (i < s_i)
695 goto next;
696
697 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
698 cb->nlh->nlmsg_seq,
699 RTM_NEWROUTE,
700 tb->tb_id,
701 fa->fa_type,
702 fa->fa_scope,
703 &f->fn_key,
704 fz->fz_order,
705 fa->fa_tos,
706 fa->fa_info) < 0) {
707 cb->args[3] = i;
708 return -1;
709 }
710 next:
711 i++;
712 }
713 }
714 cb->args[3] = i;
715 return skb->len;
716}
717
718static inline int
719fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
720 struct fib_table *tb,
721 struct fn_zone *fz)
722{
723 int h, s_h;
724
725 s_h = cb->args[2];
726 for (h=0; h < fz->fz_divisor; h++) {
727 if (h < s_h) continue;
728 if (h > s_h)
729 memset(&cb->args[3], 0,
730 sizeof(cb->args) - 3*sizeof(cb->args[0]));
731 if (fz->fz_hash == NULL ||
732 hlist_empty(&fz->fz_hash[h]))
733 continue;
734 if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h])<0) {
735 cb->args[2] = h;
736 return -1;
737 }
738 }
739 cb->args[2] = h;
740 return skb->len;
741}
742
743static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
744{
745 int m, s_m;
746 struct fn_zone *fz;
747 struct fn_hash *table = (struct fn_hash*)tb->tb_data;
748
749 s_m = cb->args[1];
750 read_lock(&fib_hash_lock);
751 for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
752 if (m < s_m) continue;
753 if (m > s_m)
754 memset(&cb->args[2], 0,
755 sizeof(cb->args) - 2*sizeof(cb->args[0]));
756 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
757 cb->args[1] = m;
758 read_unlock(&fib_hash_lock);
759 return -1;
760 }
761 }
762 read_unlock(&fib_hash_lock);
763 cb->args[1] = m;
764 return skb->len;
765}
766
767#ifdef CONFIG_IP_MULTIPLE_TABLES
768struct fib_table * fib_hash_init(int id)
769#else
770struct fib_table * __init fib_hash_init(int id)
771#endif
772{
773 struct fib_table *tb;
774
775 if (fn_hash_kmem == NULL)
776 fn_hash_kmem = kmem_cache_create("ip_fib_hash",
777 sizeof(struct fib_node),
778 0, SLAB_HWCACHE_ALIGN,
779 NULL, NULL);
780
781 if (fn_alias_kmem == NULL)
782 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
783 sizeof(struct fib_alias),
784 0, SLAB_HWCACHE_ALIGN,
785 NULL, NULL);
786
787 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
788 GFP_KERNEL);
789 if (tb == NULL)
790 return NULL;
791
792 tb->tb_id = id;
793 tb->tb_lookup = fn_hash_lookup;
794 tb->tb_insert = fn_hash_insert;
795 tb->tb_delete = fn_hash_delete;
796 tb->tb_flush = fn_hash_flush;
797 tb->tb_select_default = fn_hash_select_default;
798 tb->tb_dump = fn_hash_dump;
799 memset(tb->tb_data, 0, sizeof(struct fn_hash));
800 return tb;
801}
802
803/* ------------------------------------------------------------------------ */
804#ifdef CONFIG_PROC_FS
805
806struct fib_iter_state {
807 struct fn_zone *zone;
808 int bucket;
809 struct hlist_head *hash_head;
810 struct fib_node *fn;
811 struct fib_alias *fa;
812 loff_t pos;
813 unsigned int genid;
814 int valid;
815};
816
817static struct fib_alias *fib_get_first(struct seq_file *seq)
818{
819 struct fib_iter_state *iter = seq->private;
820 struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data;
821
822 iter->bucket = 0;
823 iter->hash_head = NULL;
824 iter->fn = NULL;
825 iter->fa = NULL;
826 iter->pos = 0;
827 iter->genid = fib_hash_genid;
828 iter->valid = 1;
829
830 for (iter->zone = table->fn_zone_list; iter->zone;
831 iter->zone = iter->zone->fz_next) {
832 int maxslot;
833
834 if (!iter->zone->fz_nent)
835 continue;
836
837 iter->hash_head = iter->zone->fz_hash;
838 maxslot = iter->zone->fz_divisor;
839
840 for (iter->bucket = 0; iter->bucket < maxslot;
841 ++iter->bucket, ++iter->hash_head) {
842 struct hlist_node *node;
843 struct fib_node *fn;
844
845 hlist_for_each_entry(fn,node,iter->hash_head,fn_hash) {
846 struct fib_alias *fa;
847
848 list_for_each_entry(fa,&fn->fn_alias,fa_list) {
849 iter->fn = fn;
850 iter->fa = fa;
851 goto out;
852 }
853 }
854 }
855 }
856out:
857 return iter->fa;
858}
859
860static struct fib_alias *fib_get_next(struct seq_file *seq)
861{
862 struct fib_iter_state *iter = seq->private;
863 struct fib_node *fn;
864 struct fib_alias *fa;
865
866 /* Advance FA, if any. */
867 fn = iter->fn;
868 fa = iter->fa;
869 if (fa) {
870 BUG_ON(!fn);
871 list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
872 iter->fa = fa;
873 goto out;
874 }
875 }
876
877 fa = iter->fa = NULL;
878
879 /* Advance FN. */
880 if (fn) {
881 struct hlist_node *node = &fn->fn_hash;
882 hlist_for_each_entry_continue(fn, node, fn_hash) {
883 iter->fn = fn;
884
885 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
886 iter->fa = fa;
887 goto out;
888 }
889 }
890 }
891
892 fn = iter->fn = NULL;
893
894 /* Advance hash chain. */
895 if (!iter->zone)
896 goto out;
897
898 for (;;) {
899 struct hlist_node *node;
900 int maxslot;
901
902 maxslot = iter->zone->fz_divisor;
903
904 while (++iter->bucket < maxslot) {
905 iter->hash_head++;
906
907 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
908 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
909 iter->fn = fn;
910 iter->fa = fa;
911 goto out;
912 }
913 }
914 }
915
916 iter->zone = iter->zone->fz_next;
917
918 if (!iter->zone)
919 goto out;
920
921 iter->bucket = 0;
922 iter->hash_head = iter->zone->fz_hash;
923
924 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
925 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
926 iter->fn = fn;
927 iter->fa = fa;
928 goto out;
929 }
930 }
931 }
932out:
933 iter->pos++;
934 return fa;
935}
936
937static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
938{
939 struct fib_iter_state *iter = seq->private;
940 struct fib_alias *fa;
941
942 if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
943 fa = iter->fa;
944 pos -= iter->pos;
945 } else
946 fa = fib_get_first(seq);
947
948 if (fa)
949 while (pos && (fa = fib_get_next(seq)))
950 --pos;
951 return pos ? NULL : fa;
952}
953
954static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
955{
956 void *v = NULL;
957
958 read_lock(&fib_hash_lock);
959 if (ip_fib_main_table)
960 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
961 return v;
962}
963
964static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
965{
966 ++*pos;
967 return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
968}
969
970static void fib_seq_stop(struct seq_file *seq, void *v)
971{
972 read_unlock(&fib_hash_lock);
973}
974
975static unsigned fib_flag_trans(int type, u32 mask, struct fib_info *fi)
976{
977 static unsigned type2flags[RTN_MAX + 1] = {
978 [7] = RTF_REJECT, [8] = RTF_REJECT,
979 };
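	/* Indices 7 and 8 above are RTN_UNREACHABLE and RTN_PROHIBIT; both
	 * show up as rejecting routes (RTF_REJECT) in /proc/net/route. */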
980 unsigned flags = type2flags[type];
981
982 if (fi && fi->fib_nh->nh_gw)
983 flags |= RTF_GATEWAY;
984 if (mask == 0xFFFFFFFF)
985 flags |= RTF_HOST;
986 flags |= RTF_UP;
987 return flags;
988}
989
990/*
991 * This outputs /proc/net/route.
992 *
993 * It always works in backward compatibility mode.
994 * The format of the file is not supposed to be changed.
995 */
996static int fib_seq_show(struct seq_file *seq, void *v)
997{
998 struct fib_iter_state *iter;
999 char bf[128];
1000 u32 prefix, mask;
1001 unsigned flags;
1002 struct fib_node *f;
1003 struct fib_alias *fa;
1004 struct fib_info *fi;
1005
1006 if (v == SEQ_START_TOKEN) {
1007 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
1008 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
1009 "\tWindow\tIRTT");
1010 goto out;
1011 }
1012
1013 iter = seq->private;
1014 f = iter->fn;
1015 fa = iter->fa;
1016 fi = fa->fa_info;
1017 prefix = f->fn_key;
1018 mask = FZ_MASK(iter->zone);
1019 flags = fib_flag_trans(fa->fa_type, mask, fi);
1020 if (fi)
1021 snprintf(bf, sizeof(bf),
1022 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
1023 fi->fib_dev ? fi->fib_dev->name : "*", prefix,
1024 fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
1025 mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
1026 fi->fib_window,
1027 fi->fib_rtt >> 3);
1028 else
1029 snprintf(bf, sizeof(bf),
1030 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
1031 prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0);
1032 seq_printf(seq, "%-127s\n", bf);
1033out:
1034 return 0;
1035}
1036
1037static struct seq_operations fib_seq_ops = {
1038 .start = fib_seq_start,
1039 .next = fib_seq_next,
1040 .stop = fib_seq_stop,
1041 .show = fib_seq_show,
1042};
1043
1044static int fib_seq_open(struct inode *inode, struct file *file)
1045{
1046 struct seq_file *seq;
1047 int rc = -ENOMEM;
1048 struct fib_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
1049
1050 if (!s)
1051 goto out;
1052
1053 rc = seq_open(file, &fib_seq_ops);
1054 if (rc)
1055 goto out_kfree;
1056
1057 seq = file->private_data;
1058 seq->private = s;
1059 memset(s, 0, sizeof(*s));
1060out:
1061 return rc;
1062out_kfree:
1063 kfree(s);
1064 goto out;
1065}
1066
1067static struct file_operations fib_seq_fops = {
1068 .owner = THIS_MODULE,
1069 .open = fib_seq_open,
1070 .read = seq_read,
1071 .llseek = seq_lseek,
1072 .release = seq_release_private,
1073};
1074
1075int __init fib_proc_init(void)
1076{
1077 if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops))
1078 return -ENOMEM;
1079 return 0;
1080}
1081
1082void __init fib_proc_exit(void)
1083{
1084 proc_net_remove("route");
1085}
1086#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
new file mode 100644
index 000000000000..ac4485f75e97
--- /dev/null
+++ b/net/ipv4/fib_lookup.h
@@ -0,0 +1,43 @@
1#ifndef _FIB_LOOKUP_H
2#define _FIB_LOOKUP_H
3
4#include <linux/types.h>
5#include <linux/list.h>
6#include <net/ip_fib.h>
7
8struct fib_alias {
9 struct list_head fa_list;
10 struct fib_info *fa_info;
11 u8 fa_tos;
12 u8 fa_type;
13 u8 fa_scope;
14 u8 fa_state;
15};
16
17#define FA_S_ACCESSED 0x01
18
19/* Exported by fib_semantics.c */
20extern int fib_semantic_match(struct list_head *head,
21 const struct flowi *flp,
22 struct fib_result *res, __u32 zone, __u32 mask,
23 int prefixlen);
24extern void fib_release_info(struct fib_info *);
25extern struct fib_info *fib_create_info(const struct rtmsg *r,
26 struct kern_rta *rta,
27 const struct nlmsghdr *,
28 int *err);
29extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *,
30 struct kern_rta *rta, struct fib_info *fi);
31extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
32 u8 tb_id, u8 type, u8 scope, void *dst,
33 int dst_len, u8 tos, struct fib_info *fi);
34extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
35 int z, int tb_id,
36 struct nlmsghdr *n, struct netlink_skb_parms *req);
37extern struct fib_alias *fib_find_alias(struct list_head *fah,
38 u8 tos, u32 prio);
39extern int fib_detect_death(struct fib_info *fi, int order,
40 struct fib_info **last_resort,
41 int *last_idx, int *dflt);
42
43#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
new file mode 100644
index 000000000000..39d0aadb9a2a
--- /dev/null
+++ b/net/ipv4/fib_rules.c
@@ -0,0 +1,437 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 Forwarding Information Base: policy rules.
7 *
8 * Version: $Id: fib_rules.c,v 1.17 2001/10/31 21:55:54 davem Exp $
9 *
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * Fixes:
18 * Rani Assaf : local_rule cannot be deleted
19 * Marc Boucher : routing by fwmark
20 */
21
22#include <linux/config.h>
23#include <asm/uaccess.h>
24#include <asm/system.h>
25#include <linux/bitops.h>
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/sched.h>
29#include <linux/mm.h>
30#include <linux/string.h>
31#include <linux/socket.h>
32#include <linux/sockios.h>
33#include <linux/errno.h>
34#include <linux/in.h>
35#include <linux/inet.h>
36#include <linux/netdevice.h>
37#include <linux/if_arp.h>
38#include <linux/proc_fs.h>
39#include <linux/skbuff.h>
40#include <linux/netlink.h>
41#include <linux/init.h>
42
43#include <net/ip.h>
44#include <net/protocol.h>
45#include <net/route.h>
46#include <net/tcp.h>
47#include <net/sock.h>
48#include <net/ip_fib.h>
49
50#define FRprintk(a...)
51
52struct fib_rule
53{
54 struct fib_rule *r_next;
55 atomic_t r_clntref;
56 u32 r_preference;
57 unsigned char r_table;
58 unsigned char r_action;
59 unsigned char r_dst_len;
60 unsigned char r_src_len;
61 u32 r_src;
62 u32 r_srcmask;
63 u32 r_dst;
64 u32 r_dstmask;
65 u32 r_srcmap;
66 u8 r_flags;
67 u8 r_tos;
68#ifdef CONFIG_IP_ROUTE_FWMARK
69 u32 r_fwmark;
70#endif
71 int r_ifindex;
72#ifdef CONFIG_NET_CLS_ROUTE
73 __u32 r_tclassid;
74#endif
75 char r_ifname[IFNAMSIZ];
76 int r_dead;
77};
78
79static struct fib_rule default_rule = {
80 .r_clntref = ATOMIC_INIT(2),
81 .r_preference = 0x7FFF,
82 .r_table = RT_TABLE_DEFAULT,
83 .r_action = RTN_UNICAST,
84};
85
86static struct fib_rule main_rule = {
87 .r_next = &default_rule,
88 .r_clntref = ATOMIC_INIT(2),
89 .r_preference = 0x7FFE,
90 .r_table = RT_TABLE_MAIN,
91 .r_action = RTN_UNICAST,
92};
93
94static struct fib_rule local_rule = {
95 .r_next = &main_rule,
96 .r_clntref = ATOMIC_INIT(2),
97 .r_table = RT_TABLE_LOCAL,
98 .r_action = RTN_UNICAST,
99};
100
101static struct fib_rule *fib_rules = &local_rule;
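/* Together these three static rules form the default policy chain:
 * preference 0 -> the local table, 0x7FFE (32766) -> main, and
 * 0x7FFF (32767) -> default. This is the same ordering that "ip rule"
 * shows on a system with no extra rules configured. */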
102static DEFINE_RWLOCK(fib_rules_lock);
103
104int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
105{
106 struct rtattr **rta = arg;
107 struct rtmsg *rtm = NLMSG_DATA(nlh);
108 struct fib_rule *r, **rp;
109 int err = -ESRCH;
110
111 for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) {
112 if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) &&
113 rtm->rtm_src_len == r->r_src_len &&
114 rtm->rtm_dst_len == r->r_dst_len &&
115 (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) &&
116 rtm->rtm_tos == r->r_tos &&
117#ifdef CONFIG_IP_ROUTE_FWMARK
118 (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) &&
119#endif
120 (!rtm->rtm_type || rtm->rtm_type == r->r_action) &&
121 (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
122 (!rta[RTA_IIF-1] || rtattr_strcmp(rta[RTA_IIF-1], r->r_ifname) == 0) &&
123 (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
124 err = -EPERM;
125 if (r == &local_rule)
126 break;
127
128 write_lock_bh(&fib_rules_lock);
129 *rp = r->r_next;
130 r->r_dead = 1;
131 write_unlock_bh(&fib_rules_lock);
132 fib_rule_put(r);
133 err = 0;
134 break;
135 }
136 }
137 return err;
138}
139
140/* Allocate new unique table id */
141
142static struct fib_table *fib_empty_table(void)
143{
144 int id;
145
146 for (id = 1; id <= RT_TABLE_MAX; id++)
147 if (fib_tables[id] == NULL)
148 return __fib_new_table(id);
149 return NULL;
150}
151
152void fib_rule_put(struct fib_rule *r)
153{
154 if (atomic_dec_and_test(&r->r_clntref)) {
155 if (r->r_dead)
156 kfree(r);
157 else
158 printk("Freeing alive rule %p\n", r);
159 }
160}
161
162int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
163{
164 struct rtattr **rta = arg;
165 struct rtmsg *rtm = NLMSG_DATA(nlh);
166 struct fib_rule *r, *new_r, **rp;
167 unsigned char table_id;
168
169 if (rtm->rtm_src_len > 32 || rtm->rtm_dst_len > 32 ||
170 (rtm->rtm_tos & ~IPTOS_TOS_MASK))
171 return -EINVAL;
172
173 if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ)
174 return -EINVAL;
175
176 table_id = rtm->rtm_table;
177 if (table_id == RT_TABLE_UNSPEC) {
178 struct fib_table *table;
179 if (rtm->rtm_type == RTN_UNICAST) {
180 if ((table = fib_empty_table()) == NULL)
181 return -ENOBUFS;
182 table_id = table->tb_id;
183 }
184 }
185
186 new_r = kmalloc(sizeof(*new_r), GFP_KERNEL);
187 if (!new_r)
188 return -ENOMEM;
189 memset(new_r, 0, sizeof(*new_r));
190 if (rta[RTA_SRC-1])
191 memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 4);
192 if (rta[RTA_DST-1])
193 memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 4);
194 if (rta[RTA_GATEWAY-1])
195 memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 4);
196 new_r->r_src_len = rtm->rtm_src_len;
197 new_r->r_dst_len = rtm->rtm_dst_len;
198 new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len);
199 new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len);
200 new_r->r_tos = rtm->rtm_tos;
201#ifdef CONFIG_IP_ROUTE_FWMARK
202 if (rta[RTA_PROTOINFO-1])
203 memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4);
204#endif
205 new_r->r_action = rtm->rtm_type;
206 new_r->r_flags = rtm->rtm_flags;
207 if (rta[RTA_PRIORITY-1])
208 memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
209 new_r->r_table = table_id;
210 if (rta[RTA_IIF-1]) {
211 struct net_device *dev;
212 rtattr_strlcpy(new_r->r_ifname, rta[RTA_IIF-1], IFNAMSIZ);
213 new_r->r_ifindex = -1;
214 dev = __dev_get_by_name(new_r->r_ifname);
215 if (dev)
216 new_r->r_ifindex = dev->ifindex;
217 }
218#ifdef CONFIG_NET_CLS_ROUTE
219 if (rta[RTA_FLOW-1])
220 memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4);
221#endif
222
223 rp = &fib_rules;
224 if (!new_r->r_preference) {
225 r = fib_rules;
226 if (r && (r = r->r_next) != NULL) {
227 rp = &fib_rules->r_next;
228 if (r->r_preference)
229 new_r->r_preference = r->r_preference - 1;
230 }
231 }
232
233 while ( (r = *rp) != NULL ) {
234 if (r->r_preference > new_r->r_preference)
235 break;
236 rp = &r->r_next;
237 }
238
239 new_r->r_next = r;
240 atomic_inc(&new_r->r_clntref);
241 write_lock_bh(&fib_rules_lock);
242 *rp = new_r;
243 write_unlock_bh(&fib_rules_lock);
244 return 0;
245}
246
247#ifdef CONFIG_NET_CLS_ROUTE
248u32 fib_rules_tclass(struct fib_result *res)
249{
250 if (res->r)
251 return res->r->r_tclassid;
252 return 0;
253}
254#endif
255
256
257static void fib_rules_detach(struct net_device *dev)
258{
259 struct fib_rule *r;
260
261 for (r=fib_rules; r; r=r->r_next) {
262 if (r->r_ifindex == dev->ifindex) {
263 write_lock_bh(&fib_rules_lock);
264 r->r_ifindex = -1;
265 write_unlock_bh(&fib_rules_lock);
266 }
267 }
268}
269
270static void fib_rules_attach(struct net_device *dev)
271{
272 struct fib_rule *r;
273
274 for (r=fib_rules; r; r=r->r_next) {
275 if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) {
276 write_lock_bh(&fib_rules_lock);
277 r->r_ifindex = dev->ifindex;
278 write_unlock_bh(&fib_rules_lock);
279 }
280 }
281}
282
283int fib_lookup(const struct flowi *flp, struct fib_result *res)
284{
285 int err;
286 struct fib_rule *r, *policy;
287 struct fib_table *tb;
288
289 u32 daddr = flp->fl4_dst;
290 u32 saddr = flp->fl4_src;
291
292FRprintk("Lookup: %u.%u.%u.%u <- %u.%u.%u.%u ",
293 NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src));
294 read_lock(&fib_rules_lock);
295 for (r = fib_rules; r; r=r->r_next) {
296 if (((saddr^r->r_src) & r->r_srcmask) ||
297 ((daddr^r->r_dst) & r->r_dstmask) ||
298 (r->r_tos && r->r_tos != flp->fl4_tos) ||
299#ifdef CONFIG_IP_ROUTE_FWMARK
300 (r->r_fwmark && r->r_fwmark != flp->fl4_fwmark) ||
301#endif
302 (r->r_ifindex && r->r_ifindex != flp->iif))
303 continue;
304
305FRprintk("tb %d r %d ", r->r_table, r->r_action);
306 switch (r->r_action) {
307 case RTN_UNICAST:
308 policy = r;
309 break;
310 case RTN_UNREACHABLE:
311 read_unlock(&fib_rules_lock);
312 return -ENETUNREACH;
313 default:
314 case RTN_BLACKHOLE:
315 read_unlock(&fib_rules_lock);
316 return -EINVAL;
317 case RTN_PROHIBIT:
318 read_unlock(&fib_rules_lock);
319 return -EACCES;
320 }
321
322 if ((tb = fib_get_table(r->r_table)) == NULL)
323 continue;
324 err = tb->tb_lookup(tb, flp, res);
325 if (err == 0) {
326 res->r = policy;
327 if (policy)
328 atomic_inc(&policy->r_clntref);
329 read_unlock(&fib_rules_lock);
330 return 0;
331 }
332 if (err < 0 && err != -EAGAIN) {
333 read_unlock(&fib_rules_lock);
334 return err;
335 }
336 }
337FRprintk("FAILURE\n");
338 read_unlock(&fib_rules_lock);
339 return -ENETUNREACH;
340}
341
342void fib_select_default(const struct flowi *flp, struct fib_result *res)
343{
344 if (res->r && res->r->r_action == RTN_UNICAST &&
345 FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
346 struct fib_table *tb;
347 if ((tb = fib_get_table(res->r->r_table)) != NULL)
348 tb->tb_select_default(tb, flp, res);
349 }
350}
351
352static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr)
353{
354 struct net_device *dev = ptr;
355
356 if (event == NETDEV_UNREGISTER)
357 fib_rules_detach(dev);
358 else if (event == NETDEV_REGISTER)
359 fib_rules_attach(dev);
360 return NOTIFY_DONE;
361}
362
363
364static struct notifier_block fib_rules_notifier = {
365 .notifier_call = fib_rules_event,
366};
367
368static __inline__ int inet_fill_rule(struct sk_buff *skb,
369 struct fib_rule *r,
370 struct netlink_callback *cb)
371{
372 struct rtmsg *rtm;
373 struct nlmsghdr *nlh;
374 unsigned char *b = skb->tail;
375
376 nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm));
377 rtm = NLMSG_DATA(nlh);
378 rtm->rtm_family = AF_INET;
379 rtm->rtm_dst_len = r->r_dst_len;
380 rtm->rtm_src_len = r->r_src_len;
381 rtm->rtm_tos = r->r_tos;
382#ifdef CONFIG_IP_ROUTE_FWMARK
383 if (r->r_fwmark)
384 RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark);
385#endif
386 rtm->rtm_table = r->r_table;
387 rtm->rtm_protocol = 0;
388 rtm->rtm_scope = 0;
389 rtm->rtm_type = r->r_action;
390 rtm->rtm_flags = r->r_flags;
391
392 if (r->r_dst_len)
393 RTA_PUT(skb, RTA_DST, 4, &r->r_dst);
394 if (r->r_src_len)
395 RTA_PUT(skb, RTA_SRC, 4, &r->r_src);
396 if (r->r_ifname[0])
397 RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname);
398 if (r->r_preference)
399 RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference);
400 if (r->r_srcmap)
401 RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap);
402#ifdef CONFIG_NET_CLS_ROUTE
403 if (r->r_tclassid)
404 RTA_PUT(skb, RTA_FLOW, 4, &r->r_tclassid);
405#endif
406 nlh->nlmsg_len = skb->tail - b;
407 return skb->len;
408
409nlmsg_failure:
410rtattr_failure:
411 skb_trim(skb, b - skb->data);
412 return -1;
413}
414
415int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
416{
417 int idx;
418 int s_idx = cb->args[0];
419 struct fib_rule *r;
420
421 read_lock(&fib_rules_lock);
422 for (r=fib_rules, idx=0; r; r = r->r_next, idx++) {
423 if (idx < s_idx)
424 continue;
425 if (inet_fill_rule(skb, r, cb) < 0)
426 break;
427 }
428 read_unlock(&fib_rules_lock);
429 cb->args[0] = idx;
430
431 return skb->len;
432}
433
434void __init fib_rules_init(void)
435{
436 register_netdevice_notifier(&fib_rules_notifier);
437}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
new file mode 100644
index 000000000000..029362d66135
--- /dev/null
+++ b/net/ipv4/fib_semantics.c
@@ -0,0 +1,1332 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 Forwarding Information Base: semantics.
7 *
8 * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9 *
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 */
17
18#include <linux/config.h>
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <linux/bitops.h>
22#include <linux/types.h>
23#include <linux/kernel.h>
24#include <linux/jiffies.h>
25#include <linux/mm.h>
26#include <linux/string.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/errno.h>
30#include <linux/in.h>
31#include <linux/inet.h>
32#include <linux/netdevice.h>
33#include <linux/if_arp.h>
34#include <linux/proc_fs.h>
35#include <linux/skbuff.h>
36#include <linux/netlink.h>
37#include <linux/init.h>
38
39#include <net/ip.h>
40#include <net/protocol.h>
41#include <net/route.h>
42#include <net/tcp.h>
43#include <net/sock.h>
44#include <net/ip_fib.h>
45#include <net/ip_mp_alg.h>
46
47#include "fib_lookup.h"
48
49#define FSprintk(a...)
50
51static DEFINE_RWLOCK(fib_info_lock);
52static struct hlist_head *fib_info_hash;
53static struct hlist_head *fib_info_laddrhash;
54static unsigned int fib_hash_size;
55static unsigned int fib_info_cnt;
56
57#define DEVINDEX_HASHBITS 8
58#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
59static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
60
61#ifdef CONFIG_IP_ROUTE_MULTIPATH
62
63static DEFINE_SPINLOCK(fib_multipath_lock);
64
65#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
66for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
67
68#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
69for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
70
71#else /* CONFIG_IP_ROUTE_MULTIPATH */
72
73/* We hope that gcc will optimize away the dummy loop */
74
75#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
76for (nhsel=0; nhsel < 1; nhsel++)
77
78#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
79for (nhsel=0; nhsel < 1; nhsel++)
80
81#endif /* CONFIG_IP_ROUTE_MULTIPATH */
82
83#define endfor_nexthops(fi) }
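/* These macros open a block that iterates over the nexthops of a fib_info
 * and must be closed with endfor_nexthops(), e.g. (as in free_fib_info()
 * below):
 *
 *	change_nexthops(fi) {
 *		if (nh->nh_dev)
 *			dev_put(nh->nh_dev);
 *	} endfor_nexthops(fi);
 *
 * In the non-multipath build the "loop" collapses to a single pass over
 * fib_nh[0].
 */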
84
85
86static struct
87{
88 int error;
89 u8 scope;
90} fib_props[RTA_MAX + 1] = {
91 {
92 .error = 0,
93 .scope = RT_SCOPE_NOWHERE,
94 }, /* RTN_UNSPEC */
95 {
96 .error = 0,
97 .scope = RT_SCOPE_UNIVERSE,
98 }, /* RTN_UNICAST */
99 {
100 .error = 0,
101 .scope = RT_SCOPE_HOST,
102 }, /* RTN_LOCAL */
103 {
104 .error = 0,
105 .scope = RT_SCOPE_LINK,
106 }, /* RTN_BROADCAST */
107 {
108 .error = 0,
109 .scope = RT_SCOPE_LINK,
110 }, /* RTN_ANYCAST */
111 {
112 .error = 0,
113 .scope = RT_SCOPE_UNIVERSE,
114 }, /* RTN_MULTICAST */
115 {
116 .error = -EINVAL,
117 .scope = RT_SCOPE_UNIVERSE,
118 }, /* RTN_BLACKHOLE */
119 {
120 .error = -EHOSTUNREACH,
121 .scope = RT_SCOPE_UNIVERSE,
122 }, /* RTN_UNREACHABLE */
123 {
124 .error = -EACCES,
125 .scope = RT_SCOPE_UNIVERSE,
126 }, /* RTN_PROHIBIT */
127 {
128 .error = -EAGAIN,
129 .scope = RT_SCOPE_UNIVERSE,
130 }, /* RTN_THROW */
131 {
132 .error = -EINVAL,
133 .scope = RT_SCOPE_NOWHERE,
134 }, /* RTN_NAT */
135 {
136 .error = -EINVAL,
137 .scope = RT_SCOPE_NOWHERE,
138 }, /* RTN_XRESOLVE */
139};
140
141
142/* Release a nexthop info record */
143
144void free_fib_info(struct fib_info *fi)
145{
146 if (fi->fib_dead == 0) {
147 printk("Freeing alive fib_info %p\n", fi);
148 return;
149 }
150 change_nexthops(fi) {
151 if (nh->nh_dev)
152 dev_put(nh->nh_dev);
153 nh->nh_dev = NULL;
154 } endfor_nexthops(fi);
155 fib_info_cnt--;
156 kfree(fi);
157}
158
159void fib_release_info(struct fib_info *fi)
160{
161 write_lock(&fib_info_lock);
162 if (fi && --fi->fib_treeref == 0) {
163 hlist_del(&fi->fib_hash);
164 if (fi->fib_prefsrc)
165 hlist_del(&fi->fib_lhash);
166 change_nexthops(fi) {
167 if (!nh->nh_dev)
168 continue;
169 hlist_del(&nh->nh_hash);
170 } endfor_nexthops(fi)
171 fi->fib_dead = 1;
172 fib_info_put(fi);
173 }
174 write_unlock(&fib_info_lock);
175}
176
177static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
178{
179 const struct fib_nh *onh = ofi->fib_nh;
180
181 for_nexthops(fi) {
182 if (nh->nh_oif != onh->nh_oif ||
183 nh->nh_gw != onh->nh_gw ||
184 nh->nh_scope != onh->nh_scope ||
185#ifdef CONFIG_IP_ROUTE_MULTIPATH
186 nh->nh_weight != onh->nh_weight ||
187#endif
188#ifdef CONFIG_NET_CLS_ROUTE
189 nh->nh_tclassid != onh->nh_tclassid ||
190#endif
191 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
192 return -1;
193 onh++;
194 } endfor_nexthops(fi);
195 return 0;
196}
197
198static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
199{
200 unsigned int mask = (fib_hash_size - 1);
201 unsigned int val = fi->fib_nhs;
202
203 val ^= fi->fib_protocol;
204 val ^= fi->fib_prefsrc;
205 val ^= fi->fib_priority;
206
207 return (val ^ (val >> 7) ^ (val >> 12)) & mask;
208}
209
210static struct fib_info *fib_find_info(const struct fib_info *nfi)
211{
212 struct hlist_head *head;
213 struct hlist_node *node;
214 struct fib_info *fi;
215 unsigned int hash;
216
217 hash = fib_info_hashfn(nfi);
218 head = &fib_info_hash[hash];
219
220 hlist_for_each_entry(fi, node, head, fib_hash) {
221 if (fi->fib_nhs != nfi->fib_nhs)
222 continue;
223 if (nfi->fib_protocol == fi->fib_protocol &&
224 nfi->fib_prefsrc == fi->fib_prefsrc &&
225 nfi->fib_priority == fi->fib_priority &&
226 memcmp(nfi->fib_metrics, fi->fib_metrics,
227 sizeof(fi->fib_metrics)) == 0 &&
228 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
229 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
230 return fi;
231 }
232
233 return NULL;
234}
235
236static inline unsigned int fib_devindex_hashfn(unsigned int val)
237{
238 unsigned int mask = DEVINDEX_HASHSIZE - 1;
239
240 return (val ^
241 (val >> DEVINDEX_HASHBITS) ^
242 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
243}
244
245/* Check that the gateway is already configured.
246 Used only by the redirect-accept routine.
247 */
248
249int ip_fib_check_default(u32 gw, struct net_device *dev)
250{
251 struct hlist_head *head;
252 struct hlist_node *node;
253 struct fib_nh *nh;
254 unsigned int hash;
255
256 read_lock(&fib_info_lock);
257
258 hash = fib_devindex_hashfn(dev->ifindex);
259 head = &fib_info_devhash[hash];
260 hlist_for_each_entry(nh, node, head, nh_hash) {
261 if (nh->nh_dev == dev &&
262 nh->nh_gw == gw &&
263 !(nh->nh_flags&RTNH_F_DEAD)) {
264 read_unlock(&fib_info_lock);
265 return 0;
266 }
267 }
268
269 read_unlock(&fib_info_lock);
270
271 return -1;
272}
273
274void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
275 int z, int tb_id,
276 struct nlmsghdr *n, struct netlink_skb_parms *req)
277{
278 struct sk_buff *skb;
279 u32 pid = req ? req->pid : 0;
280 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
281
282 skb = alloc_skb(size, GFP_KERNEL);
283 if (!skb)
284 return;
285
286 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
287 fa->fa_type, fa->fa_scope, &key, z,
288 fa->fa_tos,
289 fa->fa_info) < 0) {
290 kfree_skb(skb);
291 return;
292 }
293 NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
294 if (n->nlmsg_flags&NLM_F_ECHO)
295 atomic_inc(&skb->users);
296 netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
297 if (n->nlmsg_flags&NLM_F_ECHO)
298 netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
299}
300
301/* Return the first fib alias matching TOS with
302 * priority less than or equal to PRIO.
303 */
304struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
305{
306 if (fah) {
307 struct fib_alias *fa;
308 list_for_each_entry(fa, fah, fa_list) {
309 if (fa->fa_tos > tos)
310 continue;
311 if (fa->fa_info->fib_priority >= prio ||
312 fa->fa_tos < tos)
313 return fa;
314 }
315 }
316 return NULL;
317}
318
319int fib_detect_death(struct fib_info *fi, int order,
320 struct fib_info **last_resort, int *last_idx, int *dflt)
321{
322 struct neighbour *n;
323 int state = NUD_NONE;
324
325 n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
326 if (n) {
327 state = n->nud_state;
328 neigh_release(n);
329 }
330 if (state==NUD_REACHABLE)
331 return 0;
332 if ((state&NUD_VALID) && order != *dflt)
333 return 0;
334 if ((state&NUD_VALID) ||
335 (*last_idx<0 && order > *dflt)) {
336 *last_resort = fi;
337 *last_idx = order;
338 }
339 return 1;
340}
341
342#ifdef CONFIG_IP_ROUTE_MULTIPATH
343
344static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
345{
346 while (RTA_OK(attr,attrlen)) {
347 if (attr->rta_type == type)
348 return *(u32*)RTA_DATA(attr);
349 attr = RTA_NEXT(attr, attrlen);
350 }
351 return 0;
352}
353
354static int
355fib_count_nexthops(struct rtattr *rta)
356{
357 int nhs = 0;
358 struct rtnexthop *nhp = RTA_DATA(rta);
359 int nhlen = RTA_PAYLOAD(rta);
360
361 while (nhlen >= (int)sizeof(struct rtnexthop)) {
362 if ((nhlen -= nhp->rtnh_len) < 0)
363 return 0;
364 nhs++;
365 nhp = RTNH_NEXT(nhp);
366 };
367 return nhs;
368}
369
370static int
371fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
372{
373 struct rtnexthop *nhp = RTA_DATA(rta);
374 int nhlen = RTA_PAYLOAD(rta);
375
376 change_nexthops(fi) {
377 int attrlen = nhlen - sizeof(struct rtnexthop);
378 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
379 return -EINVAL;
380 nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
381 nh->nh_oif = nhp->rtnh_ifindex;
382 nh->nh_weight = nhp->rtnh_hops + 1;
383 if (attrlen) {
384 nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
385#ifdef CONFIG_NET_CLS_ROUTE
386 nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
387#endif
388 }
389 nhp = RTNH_NEXT(nhp);
390 } endfor_nexthops(fi);
391 return 0;
392}
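/* Both helpers above walk the RTA_MULTIPATH payload, which is a packed
 * sequence of nexthop records, roughly:
 *
 *	struct rtnexthop { rtnh_len; rtnh_flags; rtnh_hops; rtnh_ifindex; }
 *	[ optional nested attributes: RTA_GATEWAY, RTA_FLOW ]
 *	struct rtnexthop { ... }
 *	...
 *
 * rtnh_len covers each record together with its nested attributes, which is
 * why the loops advance with RTNH_NEXT() and read the nested part through
 * RTNH_DATA().
 */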
393
394#endif
395
396int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
397 struct fib_info *fi)
398{
399#ifdef CONFIG_IP_ROUTE_MULTIPATH
400 struct rtnexthop *nhp;
401 int nhlen;
402#endif
403
404 if (rta->rta_priority &&
405 *rta->rta_priority != fi->fib_priority)
406 return 1;
407
408 if (rta->rta_oif || rta->rta_gw) {
409 if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
410 (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
411 return 0;
412 return 1;
413 }
414
415#ifdef CONFIG_IP_ROUTE_MULTIPATH
416 if (rta->rta_mp == NULL)
417 return 0;
418 nhp = RTA_DATA(rta->rta_mp);
419 nhlen = RTA_PAYLOAD(rta->rta_mp);
420
421 for_nexthops(fi) {
422 int attrlen = nhlen - sizeof(struct rtnexthop);
423 u32 gw;
424
425 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
426 return -EINVAL;
427 if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
428 return 1;
429 if (attrlen) {
430 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
431 if (gw && gw != nh->nh_gw)
432 return 1;
433#ifdef CONFIG_NET_CLS_ROUTE
434 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
435 if (gw && gw != nh->nh_tclassid)
436 return 1;
437#endif
438 }
439 nhp = RTNH_NEXT(nhp);
440 } endfor_nexthops(fi);
441#endif
442 return 0;
443}
444
445
446/*
447 Picture
448 -------
449
450   The semantics of nexthops are messy for historical reasons.
451   We have to take into account that:
452   a) the gateway can actually be a local interface address,
453      so that a gatewayed route is direct.
454   b) the gateway must be an on-link address, possibly
455      described not by an ifaddr but by a direct route.
456   c) if both a gateway and an interface are specified, they must not
457      contradict each other.
458   d) if we use tunnel routes, the gateway may not be on-link.
459
460   Attempting to reconcile all of these (alas, self-contradictory)
461   conditions results in pretty ugly and hairy code with obscure logic.
462
463   I chose to generalize it instead, so that the size
464   of the code barely increases, but it becomes
465   much more general.
466   Every prefix is assigned a "scope" value: "host" is a local address,
467   "link" is a direct route,
468   [ ... "site" ...  "interior" ... ]
469   and "universe" is a true gateway route with global meaning.
470
471   Every prefix refers to a set of "nexthops" (gw, oif),
472   where the gw must have a narrower scope. This recursion stops
473   when the gw has LOCAL scope or when the "nexthop" is declared ONLINK,
474   which means the gw is forced to be on-link.
475
476   The code is still hairy, but now it is apparently logically
477   consistent and very flexible. E.g., as a by-product, it allows
478   independent exterior and interior routing processes to
479   coexist in peace.
480
481   Normally it looks like this:
482
483 {universe prefix} -> (gw, oif) [scope link]
484 |
485 |-> {link prefix} -> (gw, oif) [scope local]
486 |
487 |-> {local prefix} (terminal node)
488 */
489
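/*
 * Illustrative sketch only (addresses are made-up documentation values,
 * not part of the original source): with the scope rules above, a table
 * such as
 *
 *      203.0.113.0/24 via 192.0.2.1       (scope universe)
 *      192.0.2.0/24 dev eth0              (scope link)
 *      local 192.0.2.10 dev eth0          (scope host)
 *
 * resolves the /24 route's gateway through the link-scope route, which in
 * turn terminates at the host-scope local address, exactly as in the
 * picture above.  fib_check_nh() below enforces the "narrower scope" rule
 * by looking the gateway up with scope r->rtm_scope + 1 (clamped to at
 * least RT_SCOPE_LINK).
 */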
490static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
491{
492 int err;
493
494 if (nh->nh_gw) {
495 struct fib_result res;
496
497#ifdef CONFIG_IP_ROUTE_PERVASIVE
498 if (nh->nh_flags&RTNH_F_PERVASIVE)
499 return 0;
500#endif
501 if (nh->nh_flags&RTNH_F_ONLINK) {
502 struct net_device *dev;
503
504 if (r->rtm_scope >= RT_SCOPE_LINK)
505 return -EINVAL;
506 if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
507 return -EINVAL;
508 if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
509 return -ENODEV;
510 if (!(dev->flags&IFF_UP))
511 return -ENETDOWN;
512 nh->nh_dev = dev;
513 dev_hold(dev);
514 nh->nh_scope = RT_SCOPE_LINK;
515 return 0;
516 }
517 {
518 struct flowi fl = { .nl_u = { .ip4_u =
519 { .daddr = nh->nh_gw,
520 .scope = r->rtm_scope + 1 } },
521 .oif = nh->nh_oif };
522
523 /* It is not necessary, but requires a bit of thinking */
524 if (fl.fl4_scope < RT_SCOPE_LINK)
525 fl.fl4_scope = RT_SCOPE_LINK;
526 if ((err = fib_lookup(&fl, &res)) != 0)
527 return err;
528 }
529 err = -EINVAL;
530 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
531 goto out;
532 nh->nh_scope = res.scope;
533 nh->nh_oif = FIB_RES_OIF(res);
534 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
535 goto out;
536 dev_hold(nh->nh_dev);
537 err = -ENETDOWN;
538 if (!(nh->nh_dev->flags & IFF_UP))
539 goto out;
540 err = 0;
541out:
542 fib_res_put(&res);
543 return err;
544 } else {
545 struct in_device *in_dev;
546
547 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
548 return -EINVAL;
549
550 in_dev = inetdev_by_index(nh->nh_oif);
551 if (in_dev == NULL)
552 return -ENODEV;
553 if (!(in_dev->dev->flags&IFF_UP)) {
554 in_dev_put(in_dev);
555 return -ENETDOWN;
556 }
557 nh->nh_dev = in_dev->dev;
558 dev_hold(nh->nh_dev);
559 nh->nh_scope = RT_SCOPE_HOST;
560 in_dev_put(in_dev);
561 }
562 return 0;
563}
564
565static inline unsigned int fib_laddr_hashfn(u32 val)
566{
567 unsigned int mask = (fib_hash_size - 1);
568
569 return (val ^ (val >> 7) ^ (val >> 14)) & mask;
570}
571
572static struct hlist_head *fib_hash_alloc(int bytes)
573{
574 if (bytes <= PAGE_SIZE)
575 return kmalloc(bytes, GFP_KERNEL);
576 else
577 return (struct hlist_head *)
578 __get_free_pages(GFP_KERNEL, get_order(bytes));
579}
580
581static void fib_hash_free(struct hlist_head *hash, int bytes)
582{
583 if (!hash)
584 return;
585
586 if (bytes <= PAGE_SIZE)
587 kfree(hash);
588 else
589 free_pages((unsigned long) hash, get_order(bytes));
590}
591
592static void fib_hash_move(struct hlist_head *new_info_hash,
593 struct hlist_head *new_laddrhash,
594 unsigned int new_size)
595{
596 unsigned int old_size = fib_hash_size;
597 unsigned int i;
598
599 write_lock(&fib_info_lock);
600 fib_hash_size = new_size;
601
602 for (i = 0; i < old_size; i++) {
603 struct hlist_head *head = &fib_info_hash[i];
604 struct hlist_node *node, *n;
605 struct fib_info *fi;
606
607 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
608 struct hlist_head *dest;
609 unsigned int new_hash;
610
611 hlist_del(&fi->fib_hash);
612
613 new_hash = fib_info_hashfn(fi);
614 dest = &new_info_hash[new_hash];
615 hlist_add_head(&fi->fib_hash, dest);
616 }
617 }
618 fib_info_hash = new_info_hash;
619
620 for (i = 0; i < old_size; i++) {
621 struct hlist_head *lhead = &fib_info_laddrhash[i];
622 struct hlist_node *node, *n;
623 struct fib_info *fi;
624
625 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
626 struct hlist_head *ldest;
627 unsigned int new_hash;
628
629 hlist_del(&fi->fib_lhash);
630
631 new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
632 ldest = &new_laddrhash[new_hash];
633 hlist_add_head(&fi->fib_lhash, ldest);
634 }
635 }
636 fib_info_laddrhash = new_laddrhash;
637
638 write_unlock(&fib_info_lock);
639}
640
641struct fib_info *
642fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
643 const struct nlmsghdr *nlh, int *errp)
644{
645 int err;
646 struct fib_info *fi = NULL;
647 struct fib_info *ofi;
648#ifdef CONFIG_IP_ROUTE_MULTIPATH
649 int nhs = 1;
650#else
651 const int nhs = 1;
652#endif
653#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
654 u32 mp_alg = IP_MP_ALG_NONE;
655#endif
656
657 /* Fast check to catch the most weird cases */
658 if (fib_props[r->rtm_type].scope > r->rtm_scope)
659 goto err_inval;
660
661#ifdef CONFIG_IP_ROUTE_MULTIPATH
662 if (rta->rta_mp) {
663 nhs = fib_count_nexthops(rta->rta_mp);
664 if (nhs == 0)
665 goto err_inval;
666 }
667#endif
668#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
669 if (rta->rta_mp_alg) {
670 mp_alg = *rta->rta_mp_alg;
671
672 if (mp_alg < IP_MP_ALG_NONE ||
673 mp_alg > IP_MP_ALG_MAX)
674 goto err_inval;
675 }
676#endif
677
678 err = -ENOBUFS;
679 if (fib_info_cnt >= fib_hash_size) {
680 unsigned int new_size = fib_hash_size << 1;
681 struct hlist_head *new_info_hash;
682 struct hlist_head *new_laddrhash;
683 unsigned int bytes;
684
685 if (!new_size)
686 new_size = 1;
687 bytes = new_size * sizeof(struct hlist_head *);
688 new_info_hash = fib_hash_alloc(bytes);
689 new_laddrhash = fib_hash_alloc(bytes);
690 if (!new_info_hash || !new_laddrhash) {
691 fib_hash_free(new_info_hash, bytes);
692 fib_hash_free(new_laddrhash, bytes);
693 } else {
694 memset(new_info_hash, 0, bytes);
695 memset(new_laddrhash, 0, bytes);
696
697 fib_hash_move(new_info_hash, new_laddrhash, new_size);
698 }
699
700 if (!fib_hash_size)
701 goto failure;
702 }
703
704 fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
705 if (fi == NULL)
706 goto failure;
707 fib_info_cnt++;
708 memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
709
710 fi->fib_protocol = r->rtm_protocol;
711
712 fi->fib_nhs = nhs;
713 change_nexthops(fi) {
714 nh->nh_parent = fi;
715 } endfor_nexthops(fi)
716
717 fi->fib_flags = r->rtm_flags;
718 if (rta->rta_priority)
719 fi->fib_priority = *rta->rta_priority;
720 if (rta->rta_mx) {
721 int attrlen = RTA_PAYLOAD(rta->rta_mx);
722 struct rtattr *attr = RTA_DATA(rta->rta_mx);
723
724 while (RTA_OK(attr, attrlen)) {
725 unsigned flavor = attr->rta_type;
726 if (flavor) {
727 if (flavor > RTAX_MAX)
728 goto err_inval;
729 fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
730 }
731 attr = RTA_NEXT(attr, attrlen);
732 }
733 }
734 if (rta->rta_prefsrc)
735 memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
736
737 if (rta->rta_mp) {
738#ifdef CONFIG_IP_ROUTE_MULTIPATH
739 if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
740 goto failure;
741 if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
742 goto err_inval;
743 if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
744 goto err_inval;
745#ifdef CONFIG_NET_CLS_ROUTE
746 if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
747 goto err_inval;
748#endif
749#else
750 goto err_inval;
751#endif
752 } else {
753 struct fib_nh *nh = fi->fib_nh;
754 if (rta->rta_oif)
755 nh->nh_oif = *rta->rta_oif;
756 if (rta->rta_gw)
757 memcpy(&nh->nh_gw, rta->rta_gw, 4);
758#ifdef CONFIG_NET_CLS_ROUTE
759 if (rta->rta_flow)
760 memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
761#endif
762 nh->nh_flags = r->rtm_flags;
763#ifdef CONFIG_IP_ROUTE_MULTIPATH
764 nh->nh_weight = 1;
765#endif
766 }
767
768#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
769 fi->fib_mp_alg = mp_alg;
770#endif
771
772 if (fib_props[r->rtm_type].error) {
773 if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
774 goto err_inval;
775 goto link_it;
776 }
777
778 if (r->rtm_scope > RT_SCOPE_HOST)
779 goto err_inval;
780
781 if (r->rtm_scope == RT_SCOPE_HOST) {
782 struct fib_nh *nh = fi->fib_nh;
783
784 /* Local address is added. */
785 if (nhs != 1 || nh->nh_gw)
786 goto err_inval;
787 nh->nh_scope = RT_SCOPE_NOWHERE;
788 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
789 err = -ENODEV;
790 if (nh->nh_dev == NULL)
791 goto failure;
792 } else {
793 change_nexthops(fi) {
794 if ((err = fib_check_nh(r, fi, nh)) != 0)
795 goto failure;
796 } endfor_nexthops(fi)
797 }
798
799 if (fi->fib_prefsrc) {
800 if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
801 memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
802 if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
803 goto err_inval;
804 }
805
806link_it:
807 if ((ofi = fib_find_info(fi)) != NULL) {
808 fi->fib_dead = 1;
809 free_fib_info(fi);
810 ofi->fib_treeref++;
811 return ofi;
812 }
813
814 fi->fib_treeref++;
815 atomic_inc(&fi->fib_clntref);
816 write_lock(&fib_info_lock);
817 hlist_add_head(&fi->fib_hash,
818 &fib_info_hash[fib_info_hashfn(fi)]);
819 if (fi->fib_prefsrc) {
820 struct hlist_head *head;
821
822 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
823 hlist_add_head(&fi->fib_lhash, head);
824 }
825 change_nexthops(fi) {
826 struct hlist_head *head;
827 unsigned int hash;
828
829 if (!nh->nh_dev)
830 continue;
831 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
832 head = &fib_info_devhash[hash];
833 hlist_add_head(&nh->nh_hash, head);
834 } endfor_nexthops(fi)
835 write_unlock(&fib_info_lock);
836 return fi;
837
838err_inval:
839 err = -EINVAL;
840
841failure:
842 *errp = err;
843 if (fi) {
844 fi->fib_dead = 1;
845 free_fib_info(fi);
846 }
847 return NULL;
848}
849
850int fib_semantic_match(struct list_head *head, const struct flowi *flp,
851 struct fib_result *res, __u32 zone, __u32 mask,
852 int prefixlen)
853{
854 struct fib_alias *fa;
855 int nh_sel = 0;
856
857 list_for_each_entry(fa, head, fa_list) {
858 int err;
859
860 if (fa->fa_tos &&
861 fa->fa_tos != flp->fl4_tos)
862 continue;
863
864 if (fa->fa_scope < flp->fl4_scope)
865 continue;
866
867 fa->fa_state |= FA_S_ACCESSED;
868
869 err = fib_props[fa->fa_type].error;
870 if (err == 0) {
871 struct fib_info *fi = fa->fa_info;
872
873 if (fi->fib_flags & RTNH_F_DEAD)
874 continue;
875
876 switch (fa->fa_type) {
877 case RTN_UNICAST:
878 case RTN_LOCAL:
879 case RTN_BROADCAST:
880 case RTN_ANYCAST:
881 case RTN_MULTICAST:
882 for_nexthops(fi) {
883 if (nh->nh_flags&RTNH_F_DEAD)
884 continue;
885 if (!flp->oif || flp->oif == nh->nh_oif)
886 break;
887 }
888#ifdef CONFIG_IP_ROUTE_MULTIPATH
889 if (nhsel < fi->fib_nhs) {
890 nh_sel = nhsel;
891 goto out_fill_res;
892 }
893#else
894 if (nhsel < 1) {
895 goto out_fill_res;
896 }
897#endif
898 endfor_nexthops(fi);
899 continue;
900
901 default:
902 printk(KERN_DEBUG "impossible 102\n");
903 return -EINVAL;
904 };
905 }
906 return err;
907 }
908 return 1;
909
910out_fill_res:
911 res->prefixlen = prefixlen;
912 res->nh_sel = nh_sel;
913 res->type = fa->fa_type;
914 res->scope = fa->fa_scope;
915 res->fi = fa->fa_info;
916#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
917 res->netmask = mask;
918 res->network = zone &
919 (0xFFFFFFFF >> (32 - prefixlen));
920#endif
921 atomic_inc(&res->fi->fib_clntref);
922 return 0;
923}
924
925/* Find appropriate source address to this destination */
926
927u32 __fib_res_prefsrc(struct fib_result *res)
928{
929 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
930}
931
932int
933fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
934 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
935 struct fib_info *fi)
936{
937 struct rtmsg *rtm;
938 struct nlmsghdr *nlh;
939 unsigned char *b = skb->tail;
940
941 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
942 rtm = NLMSG_DATA(nlh);
943 rtm->rtm_family = AF_INET;
944 rtm->rtm_dst_len = dst_len;
945 rtm->rtm_src_len = 0;
946 rtm->rtm_tos = tos;
947 rtm->rtm_table = tb_id;
948 rtm->rtm_type = type;
949 rtm->rtm_flags = fi->fib_flags;
950 rtm->rtm_scope = scope;
951 if (rtm->rtm_dst_len)
952 RTA_PUT(skb, RTA_DST, 4, dst);
953 rtm->rtm_protocol = fi->fib_protocol;
954 if (fi->fib_priority)
955 RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
956#ifdef CONFIG_NET_CLS_ROUTE
957 if (fi->fib_nh[0].nh_tclassid)
958 RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
959#endif
960 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
961 goto rtattr_failure;
962 if (fi->fib_prefsrc)
963 RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
964 if (fi->fib_nhs == 1) {
965 if (fi->fib_nh->nh_gw)
966 RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
967 if (fi->fib_nh->nh_oif)
968 RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
969 }
970#ifdef CONFIG_IP_ROUTE_MULTIPATH
971 if (fi->fib_nhs > 1) {
972 struct rtnexthop *nhp;
973 struct rtattr *mp_head;
974 if (skb_tailroom(skb) <= RTA_SPACE(0))
975 goto rtattr_failure;
976 mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
977
978 for_nexthops(fi) {
979 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
980 goto rtattr_failure;
981 nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
982 nhp->rtnh_flags = nh->nh_flags & 0xFF;
983 nhp->rtnh_hops = nh->nh_weight-1;
984 nhp->rtnh_ifindex = nh->nh_oif;
985 if (nh->nh_gw)
986 RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
987 nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
988 } endfor_nexthops(fi);
989 mp_head->rta_type = RTA_MULTIPATH;
990 mp_head->rta_len = skb->tail - (u8*)mp_head;
991 }
992#endif
993 nlh->nlmsg_len = skb->tail - b;
994 return skb->len;
995
996nlmsg_failure:
997rtattr_failure:
998 skb_trim(skb, b - skb->data);
999 return -1;
1000}
1001
1002#ifndef CONFIG_IP_NOSIOCRT
1003
1004int
1005fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
1006 struct kern_rta *rta, struct rtentry *r)
1007{
1008 int plen;
1009 u32 *ptr;
1010
1011 memset(rtm, 0, sizeof(*rtm));
1012 memset(rta, 0, sizeof(*rta));
1013
1014 if (r->rt_dst.sa_family != AF_INET)
1015 return -EAFNOSUPPORT;
1016
1017 /* Check mask for validity:
1018 a) it must be contiguous.
1019 b) destination must have all host bits clear.
1020 c) if application forgot to set correct family (AF_INET),
1021 reject request unless it is absolutely clear i.e.
1022 both family and mask are zero.
1023 */
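	/*
	 * Illustration (made-up values, not part of the original source):
	 * a request with RTF_HOST keeps plen = 32; otherwise a genmask of
	 * 255.255.255.0 gives plen = 24 via inet_mask_len(), while a
	 * non-contiguous mask, or a destination with host bits set under
	 * the mask, is rejected by bad_mask() below.
	 */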
1024 plen = 32;
1025 ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
1026 if (!(r->rt_flags&RTF_HOST)) {
1027 u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
1028 if (r->rt_genmask.sa_family != AF_INET) {
1029 if (mask || r->rt_genmask.sa_family)
1030 return -EAFNOSUPPORT;
1031 }
1032 if (bad_mask(mask, *ptr))
1033 return -EINVAL;
1034 plen = inet_mask_len(mask);
1035 }
1036
1037 nl->nlmsg_flags = NLM_F_REQUEST;
1038 nl->nlmsg_pid = 0;
1039 nl->nlmsg_seq = 0;
1040 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
1041 if (cmd == SIOCDELRT) {
1042 nl->nlmsg_type = RTM_DELROUTE;
1043 nl->nlmsg_flags = 0;
1044 } else {
1045 nl->nlmsg_type = RTM_NEWROUTE;
1046 nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
1047 rtm->rtm_protocol = RTPROT_BOOT;
1048 }
1049
1050 rtm->rtm_dst_len = plen;
1051 rta->rta_dst = ptr;
1052
1053 if (r->rt_metric) {
1054 *(u32*)&r->rt_pad3 = r->rt_metric - 1;
1055 rta->rta_priority = (u32*)&r->rt_pad3;
1056 }
1057 if (r->rt_flags&RTF_REJECT) {
1058 rtm->rtm_scope = RT_SCOPE_HOST;
1059 rtm->rtm_type = RTN_UNREACHABLE;
1060 return 0;
1061 }
1062 rtm->rtm_scope = RT_SCOPE_NOWHERE;
1063 rtm->rtm_type = RTN_UNICAST;
1064
1065 if (r->rt_dev) {
1066 char *colon;
1067 struct net_device *dev;
1068 char devname[IFNAMSIZ];
1069
1070 if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
1071 return -EFAULT;
1072 devname[IFNAMSIZ-1] = 0;
1073 colon = strchr(devname, ':');
1074 if (colon)
1075 *colon = 0;
1076 dev = __dev_get_by_name(devname);
1077 if (!dev)
1078 return -ENODEV;
1079 rta->rta_oif = &dev->ifindex;
1080 if (colon) {
1081 struct in_ifaddr *ifa;
1082 struct in_device *in_dev = __in_dev_get(dev);
1083 if (!in_dev)
1084 return -ENODEV;
1085 *colon = ':';
1086 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
1087 if (strcmp(ifa->ifa_label, devname) == 0)
1088 break;
1089 if (ifa == NULL)
1090 return -ENODEV;
1091 rta->rta_prefsrc = &ifa->ifa_local;
1092 }
1093 }
1094
1095 ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
1096 if (r->rt_gateway.sa_family == AF_INET && *ptr) {
1097 rta->rta_gw = ptr;
1098 if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
1099 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1100 }
1101
1102 if (cmd == SIOCDELRT)
1103 return 0;
1104
1105 if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
1106 return -EINVAL;
1107
1108 if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
1109 rtm->rtm_scope = RT_SCOPE_LINK;
1110
1111 if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
1112 struct rtattr *rec;
1113 struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
1114 if (mx == NULL)
1115 return -ENOMEM;
1116 rta->rta_mx = mx;
1117 mx->rta_type = RTA_METRICS;
1118 mx->rta_len = RTA_LENGTH(0);
1119 if (r->rt_flags&RTF_MTU) {
1120 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1121 rec->rta_type = RTAX_ADVMSS;
1122 rec->rta_len = RTA_LENGTH(4);
1123 mx->rta_len += RTA_LENGTH(4);
1124 *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
1125 }
1126 if (r->rt_flags&RTF_WINDOW) {
1127 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1128 rec->rta_type = RTAX_WINDOW;
1129 rec->rta_len = RTA_LENGTH(4);
1130 mx->rta_len += RTA_LENGTH(4);
1131 *(u32*)RTA_DATA(rec) = r->rt_window;
1132 }
1133 if (r->rt_flags&RTF_IRTT) {
1134 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1135 rec->rta_type = RTAX_RTT;
1136 rec->rta_len = RTA_LENGTH(4);
1137 mx->rta_len += RTA_LENGTH(4);
1138 *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
1139 }
1140 }
1141 return 0;
1142}
1143
1144#endif
1145
1146/*
1147 Update FIB if:
1148 - local address disappeared -> we must delete all the entries
1149 referring to it.
1150   - device went down -> we must shut down all nexthops going via it.
1151 */
1152
1153int fib_sync_down(u32 local, struct net_device *dev, int force)
1154{
1155 int ret = 0;
1156 int scope = RT_SCOPE_NOWHERE;
1157
1158 if (force)
1159 scope = -1;
1160
1161 if (local && fib_info_laddrhash) {
1162 unsigned int hash = fib_laddr_hashfn(local);
1163 struct hlist_head *head = &fib_info_laddrhash[hash];
1164 struct hlist_node *node;
1165 struct fib_info *fi;
1166
1167 hlist_for_each_entry(fi, node, head, fib_lhash) {
1168 if (fi->fib_prefsrc == local) {
1169 fi->fib_flags |= RTNH_F_DEAD;
1170 ret++;
1171 }
1172 }
1173 }
1174
1175 if (dev) {
1176 struct fib_info *prev_fi = NULL;
1177 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1178 struct hlist_head *head = &fib_info_devhash[hash];
1179 struct hlist_node *node;
1180 struct fib_nh *nh;
1181
1182 hlist_for_each_entry(nh, node, head, nh_hash) {
1183 struct fib_info *fi = nh->nh_parent;
1184 int dead;
1185
1186 BUG_ON(!fi->fib_nhs);
1187 if (nh->nh_dev != dev || fi == prev_fi)
1188 continue;
1189 prev_fi = fi;
1190 dead = 0;
1191 change_nexthops(fi) {
1192 if (nh->nh_flags&RTNH_F_DEAD)
1193 dead++;
1194 else if (nh->nh_dev == dev &&
1195 nh->nh_scope != scope) {
1196 nh->nh_flags |= RTNH_F_DEAD;
1197#ifdef CONFIG_IP_ROUTE_MULTIPATH
1198 spin_lock_bh(&fib_multipath_lock);
1199 fi->fib_power -= nh->nh_power;
1200 nh->nh_power = 0;
1201 spin_unlock_bh(&fib_multipath_lock);
1202#endif
1203 dead++;
1204 }
1205#ifdef CONFIG_IP_ROUTE_MULTIPATH
1206 if (force > 1 && nh->nh_dev == dev) {
1207 dead = fi->fib_nhs;
1208 break;
1209 }
1210#endif
1211 } endfor_nexthops(fi)
1212 if (dead == fi->fib_nhs) {
1213 fi->fib_flags |= RTNH_F_DEAD;
1214 ret++;
1215 }
1216 }
1217 }
1218
1219 return ret;
1220}
1221
1222#ifdef CONFIG_IP_ROUTE_MULTIPATH
1223
1224/*
1225 Dead device goes up. We wake up dead nexthops.
1226   This makes sense only on multipath routes.
1227 */
1228
1229int fib_sync_up(struct net_device *dev)
1230{
1231 struct fib_info *prev_fi;
1232 unsigned int hash;
1233 struct hlist_head *head;
1234 struct hlist_node *node;
1235 struct fib_nh *nh;
1236 int ret;
1237
1238 if (!(dev->flags&IFF_UP))
1239 return 0;
1240
1241 prev_fi = NULL;
1242 hash = fib_devindex_hashfn(dev->ifindex);
1243 head = &fib_info_devhash[hash];
1244 ret = 0;
1245
1246 hlist_for_each_entry(nh, node, head, nh_hash) {
1247 struct fib_info *fi = nh->nh_parent;
1248 int alive;
1249
1250 BUG_ON(!fi->fib_nhs);
1251 if (nh->nh_dev != dev || fi == prev_fi)
1252 continue;
1253
1254 prev_fi = fi;
1255 alive = 0;
1256 change_nexthops(fi) {
1257 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1258 alive++;
1259 continue;
1260 }
1261 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1262 continue;
1263 if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
1264 continue;
1265 alive++;
1266 spin_lock_bh(&fib_multipath_lock);
1267 nh->nh_power = 0;
1268 nh->nh_flags &= ~RTNH_F_DEAD;
1269 spin_unlock_bh(&fib_multipath_lock);
1270 } endfor_nexthops(fi)
1271
1272 if (alive > 0) {
1273 fi->fib_flags &= ~RTNH_F_DEAD;
1274 ret++;
1275 }
1276 }
1277
1278 return ret;
1279}
1280
1281/*
1282 The algorithm is suboptimal, but it provides really
1283 fair weighted route distribution.
1284 */
1285
1286void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1287{
1288 struct fib_info *fi = res->fi;
1289 int w;
1290
1291 spin_lock_bh(&fib_multipath_lock);
1292 if (fi->fib_power <= 0) {
1293 int power = 0;
1294 change_nexthops(fi) {
1295 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1296 power += nh->nh_weight;
1297 nh->nh_power = nh->nh_weight;
1298 }
1299 } endfor_nexthops(fi);
1300 fi->fib_power = power;
1301 if (power <= 0) {
1302 spin_unlock_bh(&fib_multipath_lock);
1303 /* Race condition: route has just become dead. */
1304 res->nh_sel = 0;
1305 return;
1306 }
1307 }
1308
1309
1310	/* w should be a random number in [0..fi->fib_power-1];
1311	   jiffies is a pretty bad approximation of one.
1312 */
1313
1314 w = jiffies % fi->fib_power;
1315
1316 change_nexthops(fi) {
1317 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1318 if ((w -= nh->nh_power) <= 0) {
1319 nh->nh_power--;
1320 fi->fib_power--;
1321 res->nh_sel = nhsel;
1322 spin_unlock_bh(&fib_multipath_lock);
1323 return;
1324 }
1325 }
1326 } endfor_nexthops(fi);
1327
1328 /* Race condition: route has just become dead. */
1329 res->nh_sel = 0;
1330 spin_unlock_bh(&fib_multipath_lock);
1331}
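
/*
 * Worked example (illustration only, not part of the original source):
 * with two live nexthops of weight 3 and 1, fib_power starts at 4 and
 * nh_power at {3, 1}.  Each selection subtracts nexthop powers from
 * w = jiffies % fib_power until w drops to zero or below, then decrements
 * that nexthop's power and fib_power.  Over one refill cycle of four
 * selections the first nexthop is chosen three times and the second once,
 * i.e. traffic is split 3:1 as configured via rtnh_hops + 1.
 */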
1332#endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
new file mode 100644
index 000000000000..85bf0d3e294b
--- /dev/null
+++ b/net/ipv4/icmp.c
@@ -0,0 +1,1143 @@
1/*
2 * NET3: Implementation of the ICMP protocol layer.
3 *
4 * Alan Cox, <alan@redhat.com>
5 *
6 * Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Some of the function names and the icmp unreach table for this
14 * module were derived from [icmp.c 1.0.11 06/02/93] by
15 * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
16 * Other than that this module is a complete rewrite.
17 *
18 * Fixes:
19 * Clemens Fruhwirth : introduce global icmp rate limiting
20 * with icmp type masking ability instead
21 * of broken per type icmp timeouts.
22 * Mike Shaver : RFC1122 checks.
23 * Alan Cox : Multicast ping reply as self.
24 * Alan Cox : Fix atomicity lockup in ip_build_xmit
25 * call.
26 * Alan Cox : Added 216,128 byte paths to the MTU
27 * code.
28 * Martin Mares : RFC1812 checks.
29 * Martin Mares : Can be configured to follow redirects
30 * if acting as a router _without_ a
31 * routing protocol (RFC 1812).
32 * Martin Mares : Echo requests may be configured to
33 * be ignored (RFC 1812).
34 * Martin Mares : Limitation of ICMP error message
35 * transmit rate (RFC 1812).
36 * Martin Mares : TOS and Precedence set correctly
37 * (RFC 1812).
38 * Martin Mares : Now copying as much data from the
39 * original packet as we can without
40 * exceeding 576 bytes (RFC 1812).
41 * Willy Konynenberg : Transparent proxying support.
42 * Keith Owens : RFC1191 correction for 4.2BSD based
43 * path MTU bug.
44 * Thomas Quinot : ICMP Dest Unreach codes up to 15 are
45 * valid (RFC 1812).
46 * Andi Kleen : Check all packet lengths properly
47 * and moved all kfree_skb() up to
48 * icmp_rcv.
49 * Andi Kleen : Move the rate limit bookkeeping
50 * into the dest entry and use a token
51 * bucket filter (thanks to ANK). Make
52 * the rates sysctl configurable.
53 * Yu Tianli : Fixed two ugly bugs in icmp_send
54 * - IP option length was accounted wrongly
55 * - ICMP header length was not accounted
56 * at all.
57 * Tristan Greaves : Added sysctl option to ignore bogus
58 * broadcast responses from broken routers.
59 *
60 * To Fix:
61 *
62 * - Should use skb_pull() instead of all the manual checking.
63 *	  This would also greatly simplify some upper layer error handlers. --AK
64 *
65 */
66
67#include <linux/config.h>
68#include <linux/module.h>
69#include <linux/types.h>
70#include <linux/jiffies.h>
71#include <linux/kernel.h>
72#include <linux/fcntl.h>
73#include <linux/socket.h>
74#include <linux/in.h>
75#include <linux/inet.h>
76#include <linux/netdevice.h>
77#include <linux/string.h>
78#include <linux/netfilter_ipv4.h>
79#include <net/snmp.h>
80#include <net/ip.h>
81#include <net/route.h>
82#include <net/protocol.h>
83#include <net/icmp.h>
84#include <net/tcp.h>
85#include <net/udp.h>
86#include <net/raw.h>
87#include <linux/skbuff.h>
88#include <net/sock.h>
89#include <linux/errno.h>
90#include <linux/timer.h>
91#include <linux/init.h>
92#include <asm/system.h>
93#include <asm/uaccess.h>
94#include <net/checksum.h>
95
96/*
97 * Build xmit assembly blocks
98 */
99
100struct icmp_bxm {
101 struct sk_buff *skb;
102 int offset;
103 int data_len;
104
105 struct {
106 struct icmphdr icmph;
107 __u32 times[3];
108 } data;
109 int head_len;
110 struct ip_options replyopts;
111 unsigned char optbuf[40];
112};
113
114/*
115 * Statistics
116 */
117DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
118
119/* An array of errno values for error messages from dest unreach. */
120/* RFC 1122: 3.2.2.1 states that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
121
122struct icmp_err icmp_err_convert[] = {
123 {
124 .errno = ENETUNREACH, /* ICMP_NET_UNREACH */
125 .fatal = 0,
126 },
127 {
128 .errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */
129 .fatal = 0,
130 },
131 {
132 .errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */,
133 .fatal = 1,
134 },
135 {
136 .errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */
137 .fatal = 1,
138 },
139 {
140 .errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */
141 .fatal = 0,
142 },
143 {
144 .errno = EOPNOTSUPP, /* ICMP_SR_FAILED */
145 .fatal = 0,
146 },
147 {
148 .errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */
149 .fatal = 1,
150 },
151 {
152 .errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */
153 .fatal = 1,
154 },
155 {
156 .errno = ENONET, /* ICMP_HOST_ISOLATED */
157 .fatal = 1,
158 },
159 {
160 .errno = ENETUNREACH, /* ICMP_NET_ANO */
161 .fatal = 1,
162 },
163 {
164 .errno = EHOSTUNREACH, /* ICMP_HOST_ANO */
165 .fatal = 1,
166 },
167 {
168 .errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */
169 .fatal = 0,
170 },
171 {
172 .errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */
173 .fatal = 0,
174 },
175 {
176 .errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */
177 .fatal = 1,
178 },
179 {
180 .errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */
181 .fatal = 1,
182 },
183 {
184 .errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */
185 .fatal = 1,
186 },
187};
188
189/* Control parameters for ECHO replies. */
190int sysctl_icmp_echo_ignore_all;
191int sysctl_icmp_echo_ignore_broadcasts;
192
193/* Control parameter - ignore bogus broadcast responses? */
194int sysctl_icmp_ignore_bogus_error_responses;
195
196/*
197 * Configurable global rate limit.
198 *
199 * ratelimit defines tokens/packet consumed for the dst->rate_tokens bucket
200 * ratemask defines which icmp types are rate limited by setting
201 * their bit positions.
202 *
203 * default:
204 * dest unreachable (3), source quench (4),
205 * time exceeded (11), parameter problem (12)
206 */
207
208int sysctl_icmp_ratelimit = 1 * HZ;
209int sysctl_icmp_ratemask = 0x1818;
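/*
 * Worked example (illustration only, not part of the original source):
 * the default mask above is the OR of the bits for the four types listed
 * in the comment,
 *
 *      (1 << 3) | (1 << 4) | (1 << 11) | (1 << 12) == 0x1818,
 *
 * so icmpv4_xrlim_allow() below rate limits exactly those types and lets
 * every other type through unthrottled.
 */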
210
211/*
212 * ICMP control array. This specifies what to do with each ICMP.
213 */
214
215struct icmp_control {
216 int output_entry; /* Field for increment on output */
217 int input_entry; /* Field for increment on input */
218 void (*handler)(struct sk_buff *skb);
219 short error; /* This ICMP is classed as an error message */
220};
221
222static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
223
224/*
225 * The ICMP socket(s). This is the most convenient way to flow control
226 * our ICMP output as well as maintain a clean interface throughout
227 * all layers. All Socketless IP sends will soon be gone.
228 *
229 * On SMP we have one ICMP socket per-cpu.
230 */
231static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL;
232#define icmp_socket __get_cpu_var(__icmp_socket)
233
234static __inline__ int icmp_xmit_lock(void)
235{
236 local_bh_disable();
237
238 if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.slock))) {
239 /* This can happen if the output path signals a
240 * dst_link_failure() for an outgoing ICMP packet.
241 */
242 local_bh_enable();
243 return 1;
244 }
245 return 0;
246}
247
248static void icmp_xmit_unlock(void)
249{
250 spin_unlock_bh(&icmp_socket->sk->sk_lock.slock);
251}
252
253/*
254 * Send an ICMP frame.
255 */
256
257/*
258 * Check transmit rate limitation for given message.
259 * The rate information is held in the destination cache now.
260 * This function is generic and could be used for other purposes
261 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
262 *
263 * Note that the same dst_entry fields are modified by functions in
264 * route.c too, but these work for packet destinations while xrlim_allow
265 * works for icmp destinations. This means the rate limiting information
266 * for one "ip object" is shared - and these ICMPs are twice limited:
267 * by source and by destination.
268 *
269 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
270 * SHOULD allow setting of rate limits
271 *
272 * Shared between ICMPv4 and ICMPv6.
273 */
274#define XRLIM_BURST_FACTOR 6
275int xrlim_allow(struct dst_entry *dst, int timeout)
276{
277 unsigned long now;
278 int rc = 0;
279
280 now = jiffies;
281 dst->rate_tokens += now - dst->rate_last;
282 dst->rate_last = now;
283 if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout)
284 dst->rate_tokens = XRLIM_BURST_FACTOR * timeout;
285 if (dst->rate_tokens >= timeout) {
286 dst->rate_tokens -= timeout;
287 rc = 1;
288 }
289 return rc;
290}
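
/*
 * Worked example (illustration only, not part of the original source):
 * with timeout = sysctl_icmp_ratelimit = 1*HZ, the bucket above holds at
 * most XRLIM_BURST_FACTOR * HZ = 6*HZ tokens (a burst of 6 packets), each
 * transmitted packet consumes HZ tokens, and an idle destination earns one
 * token per jiffy, so the steady-state rate is one rate-limited ICMP per
 * destination per second.
 */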
291
292static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code)
293{
294 struct dst_entry *dst = &rt->u.dst;
295 int rc = 1;
296
297 if (type > NR_ICMP_TYPES)
298 goto out;
299
300 /* Don't limit PMTU discovery. */
301 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
302 goto out;
303
304 /* No rate limit on loopback */
305 if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
306 goto out;
307
308 /* Limit if icmp type is enabled in ratemask. */
309 if ((1 << type) & sysctl_icmp_ratemask)
310 rc = xrlim_allow(dst, sysctl_icmp_ratelimit);
311out:
312 return rc;
313}
314
315/*
316 * Maintain the counters used in the SNMP statistics for outgoing ICMP
317 */
318static void icmp_out_count(int type)
319{
320 if (type <= NR_ICMP_TYPES) {
321 ICMP_INC_STATS(icmp_pointers[type].output_entry);
322 ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
323 }
324}
325
326/*
327 * Checksum each fragment, and on the first include the headers and final
328 * checksum.
329 */
330static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
331 struct sk_buff *skb)
332{
333 struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
334 unsigned int csum;
335
336 csum = skb_copy_and_csum_bits(icmp_param->skb,
337 icmp_param->offset + offset,
338 to, len, 0);
339
340 skb->csum = csum_block_add(skb->csum, csum, odd);
341 if (icmp_pointers[icmp_param->data.icmph.type].error)
342 nf_ct_attach(skb, icmp_param->skb);
343 return 0;
344}
345
346static void icmp_push_reply(struct icmp_bxm *icmp_param,
347 struct ipcm_cookie *ipc, struct rtable *rt)
348{
349 struct sk_buff *skb;
350
351 ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
352 icmp_param->data_len+icmp_param->head_len,
353 icmp_param->head_len,
354 ipc, rt, MSG_DONTWAIT);
355
356 if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
357 struct icmphdr *icmph = skb->h.icmph;
358 unsigned int csum = 0;
359 struct sk_buff *skb1;
360
361 skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) {
362 csum = csum_add(csum, skb1->csum);
363 }
364 csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
365 (char *)icmph,
366 icmp_param->head_len, csum);
367 icmph->checksum = csum_fold(csum);
368 skb->ip_summed = CHECKSUM_NONE;
369 ip_push_pending_frames(icmp_socket->sk);
370 }
371}
372
373/*
374 * Driving logic for building and sending ICMP messages.
375 */
376
377static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
378{
379 struct sock *sk = icmp_socket->sk;
380 struct inet_sock *inet = inet_sk(sk);
381 struct ipcm_cookie ipc;
382 struct rtable *rt = (struct rtable *)skb->dst;
383 u32 daddr;
384
385 if (ip_options_echo(&icmp_param->replyopts, skb))
386 goto out;
387
388 if (icmp_xmit_lock())
389 return;
390
391 icmp_param->data.icmph.checksum = 0;
392 icmp_out_count(icmp_param->data.icmph.type);
393
394 inet->tos = skb->nh.iph->tos;
395 daddr = ipc.addr = rt->rt_src;
396 ipc.opt = NULL;
397 if (icmp_param->replyopts.optlen) {
398 ipc.opt = &icmp_param->replyopts;
399 if (ipc.opt->srr)
400 daddr = icmp_param->replyopts.faddr;
401 }
402 {
403 struct flowi fl = { .nl_u = { .ip4_u =
404 { .daddr = daddr,
405 .saddr = rt->rt_spec_dst,
406 .tos = RT_TOS(skb->nh.iph->tos) } },
407 .proto = IPPROTO_ICMP };
408 if (ip_route_output_key(&rt, &fl))
409 goto out_unlock;
410 }
411 if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type,
412 icmp_param->data.icmph.code))
413 icmp_push_reply(icmp_param, &ipc, rt);
414 ip_rt_put(rt);
415out_unlock:
416 icmp_xmit_unlock();
417out:;
418}
419
420
421/*
422 * Send an ICMP message in response to a situation
423 *
424 * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header.
425 * MAY send more (we do).
426 * MUST NOT change this header information.
427 * MUST NOT reply to a multicast/broadcast IP address.
428 * MUST NOT reply to a multicast/broadcast MAC address.
429 * MUST reply to only the first fragment.
430 */
431
432void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info)
433{
434 struct iphdr *iph;
435 int room;
436 struct icmp_bxm icmp_param;
437 struct rtable *rt = (struct rtable *)skb_in->dst;
438 struct ipcm_cookie ipc;
439 u32 saddr;
440 u8 tos;
441
442 if (!rt)
443 goto out;
444
445 /*
446 * Find the original header. It is expected to be valid, of course.
447	 * Check this anyway; icmp_send is sometimes called from the most
448	 * obscure devices.
449 */
450 iph = skb_in->nh.iph;
451
452 if ((u8 *)iph < skb_in->head || (u8 *)(iph + 1) > skb_in->tail)
453 goto out;
454
455 /*
456 * No replies to physical multicast/broadcast
457 */
458 if (skb_in->pkt_type != PACKET_HOST)
459 goto out;
460
461 /*
462 * Now check at the protocol level
463 */
464 if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
465 goto out;
466
467 /*
468 * Only reply to fragment 0. We byte re-order the constant
469 * mask for efficiency.
470 */
471 if (iph->frag_off & htons(IP_OFFSET))
472 goto out;
473
474 /*
475	 * If we send an ICMP error in response to an ICMP error, a mess would result.
476 */
477 if (icmp_pointers[type].error) {
478 /*
479 * We are an error, check if we are replying to an
480 * ICMP error
481 */
482 if (iph->protocol == IPPROTO_ICMP) {
483 u8 _inner_type, *itp;
484
485 itp = skb_header_pointer(skb_in,
486 skb_in->nh.raw +
487 (iph->ihl << 2) +
488 offsetof(struct icmphdr,
489 type) -
490 skb_in->data,
491 sizeof(_inner_type),
492 &_inner_type);
493 if (itp == NULL)
494 goto out;
495
496 /*
497 * Assume any unknown ICMP type is an error. This
498 * isn't specified by the RFC, but think about it..
499 */
500 if (*itp > NR_ICMP_TYPES ||
501 icmp_pointers[*itp].error)
502 goto out;
503 }
504 }
505
506 if (icmp_xmit_lock())
507 return;
508
509 /*
510 * Construct source address and options.
511 */
512
513 saddr = iph->daddr;
514 if (!(rt->rt_flags & RTCF_LOCAL))
515 saddr = 0;
516
517 tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
518 IPTOS_PREC_INTERNETCONTROL) :
519 iph->tos;
520
521 if (ip_options_echo(&icmp_param.replyopts, skb_in))
522 goto ende;
523
524
525 /*
526 * Prepare data for ICMP header.
527 */
528
529 icmp_param.data.icmph.type = type;
530 icmp_param.data.icmph.code = code;
531 icmp_param.data.icmph.un.gateway = info;
532 icmp_param.data.icmph.checksum = 0;
533 icmp_param.skb = skb_in;
534 icmp_param.offset = skb_in->nh.raw - skb_in->data;
535 icmp_out_count(icmp_param.data.icmph.type);
536 inet_sk(icmp_socket->sk)->tos = tos;
537 ipc.addr = iph->saddr;
538 ipc.opt = &icmp_param.replyopts;
539
540 {
541 struct flowi fl = {
542 .nl_u = {
543 .ip4_u = {
544 .daddr = icmp_param.replyopts.srr ?
545 icmp_param.replyopts.faddr :
546 iph->saddr,
547 .saddr = saddr,
548 .tos = RT_TOS(tos)
549 }
550 },
551 .proto = IPPROTO_ICMP,
552 .uli_u = {
553 .icmpt = {
554 .type = type,
555 .code = code
556 }
557 }
558 };
559 if (ip_route_output_key(&rt, &fl))
560 goto out_unlock;
561 }
562
563 if (!icmpv4_xrlim_allow(rt, type, code))
564 goto ende;
565
566 /* RFC says return as much as we can without exceeding 576 bytes. */
567
568 room = dst_mtu(&rt->u.dst);
569 if (room > 576)
570 room = 576;
571 room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
572 room -= sizeof(struct icmphdr);
573
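	/*
	 * Worked example (illustration only, not part of the original
	 * source): on a path with MTU >= 576 and no echoed IP options,
	 * room = 576 - 20 - 8 = 548, so at most 548 bytes of the
	 * offending datagram are quoted in the ICMP error below.
	 */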
574 icmp_param.data_len = skb_in->len - icmp_param.offset;
575 if (icmp_param.data_len > room)
576 icmp_param.data_len = room;
577 icmp_param.head_len = sizeof(struct icmphdr);
578
579 icmp_push_reply(&icmp_param, &ipc, rt);
580ende:
581 ip_rt_put(rt);
582out_unlock:
583 icmp_xmit_unlock();
584out:;
585}
586
587
588/*
589 * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
590 */
591
592static void icmp_unreach(struct sk_buff *skb)
593{
594 struct iphdr *iph;
595 struct icmphdr *icmph;
596 int hash, protocol;
597 struct net_protocol *ipprot;
598 struct sock *raw_sk;
599 u32 info = 0;
600
601 /*
602 * Incomplete header ?
603	 * Only checks for the IP header; there should be an
604	 * additional check for longer headers at upper levels.
605 */
606
607 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
608 goto out_err;
609
610 icmph = skb->h.icmph;
611 iph = (struct iphdr *)skb->data;
612
613 if (iph->ihl < 5) /* Mangled header, drop. */
614 goto out_err;
615
616 if (icmph->type == ICMP_DEST_UNREACH) {
617 switch (icmph->code & 15) {
618 case ICMP_NET_UNREACH:
619 case ICMP_HOST_UNREACH:
620 case ICMP_PROT_UNREACH:
621 case ICMP_PORT_UNREACH:
622 break;
623 case ICMP_FRAG_NEEDED:
624 if (ipv4_config.no_pmtu_disc) {
625 LIMIT_NETDEBUG(
626 printk(KERN_INFO "ICMP: %u.%u.%u.%u: "
627 "fragmentation needed "
628 "and DF set.\n",
629 NIPQUAD(iph->daddr)));
630 } else {
631 info = ip_rt_frag_needed(iph,
632 ntohs(icmph->un.frag.mtu));
633 if (!info)
634 goto out;
635 }
636 break;
637 case ICMP_SR_FAILED:
638 LIMIT_NETDEBUG(
639 printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
640 "Route Failed.\n",
641 NIPQUAD(iph->daddr)));
642 break;
643 default:
644 break;
645 }
646 if (icmph->code > NR_ICMP_UNREACH)
647 goto out;
648 } else if (icmph->type == ICMP_PARAMETERPROB)
649 info = ntohl(icmph->un.gateway) >> 24;
650
651 /*
652 * Throw it at our lower layers
653 *
654 * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed
655 * header.
656 * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the
657 * transport layer.
658 * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to
659 * transport layer.
660 */
661
662 /*
663	 * Check that the other end isn't violating RFC 1122. Some routers send
664	 * bogus responses to broadcast frames. If you see this message,
665	 * first check that your netmasks match at both ends; if they do,
666	 * get the other vendor to fix their kit.
667 */
668
669 if (!sysctl_icmp_ignore_bogus_error_responses &&
670 inet_addr_type(iph->daddr) == RTN_BROADCAST) {
671 if (net_ratelimit())
672 printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP "
673 "type %u, code %u "
674 "error to a broadcast: %u.%u.%u.%u on %s\n",
675 NIPQUAD(skb->nh.iph->saddr),
676 icmph->type, icmph->code,
677 NIPQUAD(iph->daddr),
678 skb->dev->name);
679 goto out;
680 }
681
682	/* Pull in the full IP header plus 8 bytes of protocol to
683	 * avoid additional coding in the protocol handlers.
684 */
685 if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
686 goto out;
687
688 iph = (struct iphdr *)skb->data;
689 protocol = iph->protocol;
690
691 /*
692 * Deliver ICMP message to raw sockets. Pretty useless feature?
693 */
694
695 /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
696 hash = protocol & (MAX_INET_PROTOS - 1);
697 read_lock(&raw_v4_lock);
698 if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) {
699 while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr,
700 iph->saddr,
701 skb->dev->ifindex)) != NULL) {
702 raw_err(raw_sk, skb, info);
703 raw_sk = sk_next(raw_sk);
704 iph = (struct iphdr *)skb->data;
705 }
706 }
707 read_unlock(&raw_v4_lock);
708
709 rcu_read_lock();
710 ipprot = rcu_dereference(inet_protos[hash]);
711 if (ipprot && ipprot->err_handler)
712 ipprot->err_handler(skb, info);
713 rcu_read_unlock();
714
715out:
716 return;
717out_err:
718 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
719 goto out;
720}
721
722
723/*
724 * Handle ICMP_REDIRECT.
725 */
726
727static void icmp_redirect(struct sk_buff *skb)
728{
729 struct iphdr *iph;
730 unsigned long ip;
731
732 if (skb->len < sizeof(struct iphdr))
733 goto out_err;
734
735 /*
736 * Get the copied header of the packet that caused the redirect
737 */
738 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
739 goto out;
740
741 iph = (struct iphdr *)skb->data;
742 ip = iph->daddr;
743
744 switch (skb->h.icmph->code & 7) {
745 case ICMP_REDIR_NET:
746 case ICMP_REDIR_NETTOS:
747 /*
748 * As per RFC recommendations now handle it as a host redirect.
749 */
750 case ICMP_REDIR_HOST:
751 case ICMP_REDIR_HOSTTOS:
752 ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway,
753 iph->saddr, iph->tos, skb->dev);
754 break;
755 }
756out:
757 return;
758out_err:
759 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
760 goto out;
761}
762
763/*
764 * Handle ICMP_ECHO ("ping") requests.
765 *
766 * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
767 * requests.
768 * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be
769 * included in the reply.
770 * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
771 * echo requests, MUST have default=NOT.
772 * See also WRT handling of options once they are done and working.
773 */
774
775static void icmp_echo(struct sk_buff *skb)
776{
777 if (!sysctl_icmp_echo_ignore_all) {
778 struct icmp_bxm icmp_param;
779
780 icmp_param.data.icmph = *skb->h.icmph;
781 icmp_param.data.icmph.type = ICMP_ECHOREPLY;
782 icmp_param.skb = skb;
783 icmp_param.offset = 0;
784 icmp_param.data_len = skb->len;
785 icmp_param.head_len = sizeof(struct icmphdr);
786 icmp_reply(&icmp_param, skb);
787 }
788}
789
790/*
791 * Handle ICMP Timestamp requests.
792 * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
793 * SHOULD be in the kernel for minimum random latency.
794 * MUST be accurate to a few minutes.
795 * MUST be updated at least at 15Hz.
796 */
797static void icmp_timestamp(struct sk_buff *skb)
798{
799 struct timeval tv;
800 struct icmp_bxm icmp_param;
801 /*
802 * Too short.
803 */
804 if (skb->len < 4)
805 goto out_err;
806
807 /*
808 * Fill in the current time as ms since midnight UT:
809 */
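	/*
	 * Worked example (illustration only, not part of the original
	 * source): at 01:00:00.250 UT, tv.tv_sec % 86400 == 3600, so the
	 * value stored below is 3600 * 1000 + 250 == 3600250 ms.
	 */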
810 do_gettimeofday(&tv);
811 icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 +
812 tv.tv_usec / 1000);
813 icmp_param.data.times[2] = icmp_param.data.times[1];
814 if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
815 BUG();
816 icmp_param.data.icmph = *skb->h.icmph;
817 icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
818 icmp_param.data.icmph.code = 0;
819 icmp_param.skb = skb;
820 icmp_param.offset = 0;
821 icmp_param.data_len = 0;
822 icmp_param.head_len = sizeof(struct icmphdr) + 12;
823 icmp_reply(&icmp_param, skb);
824out:
825 return;
826out_err:
827 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
828 goto out;
829}
830
831
832/*
833 * Handle ICMP_ADDRESS_MASK requests. (RFC950)
834 *
835 * RFC1122 (3.2.2.9). A host MUST only send replies to
836 * ADDRESS_MASK requests if it's been configured as an address mask
837 * agent. Receiving a request doesn't constitute implicit permission to
838 * act as one. Of course, implementing this correctly requires (SHOULD)
839 * a way to turn the functionality on and off. Another one for sysctl(),
840 * I guess. -- MS
841 *
842 * RFC1812 (4.3.3.9). A router MUST implement it.
843 * A router SHOULD have a switch for turning it on/off.
844 * This switch MUST be ON by default.
845 *
846 * Gratuitous replies and zero-source replies are not implemented,
847 * which complies with the RFC. DO NOT implement them!!! The whole idea
848 * of broadcast addrmask replies as specified in RFC950 is broken.
849 * The problem is that it is not uncommon to have several prefixes
850 * on one physical interface. Moreover, the addrmask agent may not
851 * even be aware of the other prefixes.
852 * If the source is zero, the addrmask agent cannot choose the correct prefix.
853 * Gratuitous mask announcements suffer from the same problem.
854 * RFC1812 explains this, but still allows the use of ADDRMASK,
855 * which is pretty silly. --ANK
856 *
857 * All these rules are so bizarre that I removed kernel addrmask
858 * support entirely. It is wrong, it is obsolete, and nobody uses it
859 * in any case. --ANK
860 *
861 * Furthermore you can do it with a usermode address agent program
862 * anyway...
863 */
864
865static void icmp_address(struct sk_buff *skb)
866{
867#if 0
868 if (net_ratelimit())
869 printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
870#endif
871}
872
873/*
874 * RFC1812 (4.3.3.9). A router SHOULD listen to all replies, and complain
875 * loudly if an inconsistency is found.
876 */
877
878static void icmp_address_reply(struct sk_buff *skb)
879{
880 struct rtable *rt = (struct rtable *)skb->dst;
881 struct net_device *dev = skb->dev;
882 struct in_device *in_dev;
883 struct in_ifaddr *ifa;
884
885 if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
886 goto out;
887
888 in_dev = in_dev_get(dev);
889 if (!in_dev)
890 goto out;
891 rcu_read_lock();
892 if (in_dev->ifa_list &&
893 IN_DEV_LOG_MARTIANS(in_dev) &&
894 IN_DEV_FORWARD(in_dev)) {
895 u32 _mask, *mp;
896
897 mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
898 if (mp == NULL)
899 BUG();
900 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
901 if (*mp == ifa->ifa_mask &&
902 inet_ifa_match(rt->rt_src, ifa))
903 break;
904 }
905 if (!ifa && net_ratelimit()) {
906 printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from "
907 "%s/%u.%u.%u.%u\n",
908 NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src));
909 }
910 }
911 rcu_read_unlock();
912 in_dev_put(in_dev);
913out:;
914}
915
916static void icmp_discard(struct sk_buff *skb)
917{
918}
919
920/*
921 * Deal with incoming ICMP packets.
922 */
923int icmp_rcv(struct sk_buff *skb)
924{
925 struct icmphdr *icmph;
926 struct rtable *rt = (struct rtable *)skb->dst;
927
928 ICMP_INC_STATS_BH(ICMP_MIB_INMSGS);
929
930 switch (skb->ip_summed) {
931 case CHECKSUM_HW:
932 if (!(u16)csum_fold(skb->csum))
933 break;
934 NETDEBUG(if (net_ratelimit())
935 printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
936 case CHECKSUM_NONE:
937 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
938 goto error;
939 default:;
940 }
941
942 if (!pskb_pull(skb, sizeof(struct icmphdr)))
943 goto error;
944
945 icmph = skb->h.icmph;
946
947 /*
948 * 18 is the highest 'known' ICMP type. Anything else is a mystery
949 *
950	 * RFC 1122: 3.2.2 Unknown ICMP message types MUST be silently
951 * discarded.
952 */
953 if (icmph->type > NR_ICMP_TYPES)
954 goto error;
955
956
957 /*
958 * Parse the ICMP message
959 */
960
961 if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
962 /*
963 * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
964 * silently ignored (we let user decide with a sysctl).
965 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
966 * discarded if to broadcast/multicast.
967 */
968 if (icmph->type == ICMP_ECHO &&
969 sysctl_icmp_echo_ignore_broadcasts) {
970 goto error;
971 }
972 if (icmph->type != ICMP_ECHO &&
973 icmph->type != ICMP_TIMESTAMP &&
974 icmph->type != ICMP_ADDRESS &&
975 icmph->type != ICMP_ADDRESSREPLY) {
976 goto error;
977 }
978 }
979
980 ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry);
981 icmp_pointers[icmph->type].handler(skb);
982
983drop:
984 kfree_skb(skb);
985 return 0;
986error:
987 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
988 goto drop;
989}
990
991/*
992 * This table is the definition of how we handle ICMP.
993 */
994static struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
995 [ICMP_ECHOREPLY] = {
996 .output_entry = ICMP_MIB_OUTECHOREPS,
997 .input_entry = ICMP_MIB_INECHOREPS,
998 .handler = icmp_discard,
999 },
1000 [1] = {
1001 .output_entry = ICMP_MIB_DUMMY,
1002 .input_entry = ICMP_MIB_INERRORS,
1003 .handler = icmp_discard,
1004 .error = 1,
1005 },
1006 [2] = {
1007 .output_entry = ICMP_MIB_DUMMY,
1008 .input_entry = ICMP_MIB_INERRORS,
1009 .handler = icmp_discard,
1010 .error = 1,
1011 },
1012 [ICMP_DEST_UNREACH] = {
1013 .output_entry = ICMP_MIB_OUTDESTUNREACHS,
1014 .input_entry = ICMP_MIB_INDESTUNREACHS,
1015 .handler = icmp_unreach,
1016 .error = 1,
1017 },
1018 [ICMP_SOURCE_QUENCH] = {
1019 .output_entry = ICMP_MIB_OUTSRCQUENCHS,
1020 .input_entry = ICMP_MIB_INSRCQUENCHS,
1021 .handler = icmp_unreach,
1022 .error = 1,
1023 },
1024 [ICMP_REDIRECT] = {
1025 .output_entry = ICMP_MIB_OUTREDIRECTS,
1026 .input_entry = ICMP_MIB_INREDIRECTS,
1027 .handler = icmp_redirect,
1028 .error = 1,
1029 },
1030 [6] = {
1031 .output_entry = ICMP_MIB_DUMMY,
1032 .input_entry = ICMP_MIB_INERRORS,
1033 .handler = icmp_discard,
1034 .error = 1,
1035 },
1036 [7] = {
1037 .output_entry = ICMP_MIB_DUMMY,
1038 .input_entry = ICMP_MIB_INERRORS,
1039 .handler = icmp_discard,
1040 .error = 1,
1041 },
1042 [ICMP_ECHO] = {
1043 .output_entry = ICMP_MIB_OUTECHOS,
1044 .input_entry = ICMP_MIB_INECHOS,
1045 .handler = icmp_echo,
1046 },
1047 [9] = {
1048 .output_entry = ICMP_MIB_DUMMY,
1049 .input_entry = ICMP_MIB_INERRORS,
1050 .handler = icmp_discard,
1051 .error = 1,
1052 },
1053 [10] = {
1054 .output_entry = ICMP_MIB_DUMMY,
1055 .input_entry = ICMP_MIB_INERRORS,
1056 .handler = icmp_discard,
1057 .error = 1,
1058 },
1059 [ICMP_TIME_EXCEEDED] = {
1060 .output_entry = ICMP_MIB_OUTTIMEEXCDS,
1061 .input_entry = ICMP_MIB_INTIMEEXCDS,
1062 .handler = icmp_unreach,
1063 .error = 1,
1064 },
1065 [ICMP_PARAMETERPROB] = {
1066 .output_entry = ICMP_MIB_OUTPARMPROBS,
1067 .input_entry = ICMP_MIB_INPARMPROBS,
1068 .handler = icmp_unreach,
1069 .error = 1,
1070 },
1071 [ICMP_TIMESTAMP] = {
1072 .output_entry = ICMP_MIB_OUTTIMESTAMPS,
1073 .input_entry = ICMP_MIB_INTIMESTAMPS,
1074 .handler = icmp_timestamp,
1075 },
1076 [ICMP_TIMESTAMPREPLY] = {
1077 .output_entry = ICMP_MIB_OUTTIMESTAMPREPS,
1078 .input_entry = ICMP_MIB_INTIMESTAMPREPS,
1079 .handler = icmp_discard,
1080 },
1081 [ICMP_INFO_REQUEST] = {
1082 .output_entry = ICMP_MIB_DUMMY,
1083 .input_entry = ICMP_MIB_DUMMY,
1084 .handler = icmp_discard,
1085 },
1086 [ICMP_INFO_REPLY] = {
1087 .output_entry = ICMP_MIB_DUMMY,
1088 .input_entry = ICMP_MIB_DUMMY,
1089 .handler = icmp_discard,
1090 },
1091 [ICMP_ADDRESS] = {
1092 .output_entry = ICMP_MIB_OUTADDRMASKS,
1093 .input_entry = ICMP_MIB_INADDRMASKS,
1094 .handler = icmp_address,
1095 },
1096 [ICMP_ADDRESSREPLY] = {
1097 .output_entry = ICMP_MIB_OUTADDRMASKREPS,
1098 .input_entry = ICMP_MIB_INADDRMASKREPS,
1099 .handler = icmp_address_reply,
1100 },
1101};
1102
1103void __init icmp_init(struct net_proto_family *ops)
1104{
1105 struct inet_sock *inet;
1106 int i;
1107
1108 for (i = 0; i < NR_CPUS; i++) {
1109 int err;
1110
1111 if (!cpu_possible(i))
1112 continue;
1113
1114 err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP,
1115 &per_cpu(__icmp_socket, i));
1116
1117 if (err < 0)
1118 panic("Failed to create the ICMP control socket.\n");
1119
1120 per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC;
1121
1122 /* Enough space for 2 64K ICMP packets, including
1123 * sk_buff struct overhead.
1124 */
1125 per_cpu(__icmp_socket, i)->sk->sk_sndbuf =
1126 (2 * ((64 * 1024) + sizeof(struct sk_buff)));
1127
1128 inet = inet_sk(per_cpu(__icmp_socket, i)->sk);
1129 inet->uc_ttl = -1;
1130 inet->pmtudisc = IP_PMTUDISC_DONT;
1131
1132 /* Unhash it so that IP input processing does not even
1133 * see it, we do not wish this socket to see incoming
1134 * packets.
1135 */
1136 per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk);
1137 }
1138}
1139
1140EXPORT_SYMBOL(icmp_err_convert);
1141EXPORT_SYMBOL(icmp_send);
1142EXPORT_SYMBOL(icmp_statistics);
1143EXPORT_SYMBOL(xrlim_allow);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
new file mode 100644
index 000000000000..1f3183168a90
--- /dev/null
+++ b/net/ipv4/igmp.c
@@ -0,0 +1,2473 @@
1/*
2 * Linux NET3: Internet Group Management Protocol [IGMP]
3 *
4 * This code implements the IGMP protocol as defined in RFC1112. There has
5 * been a further revision of this protocol since then, which is also supported.
6 *
7 * If you have trouble with this module, be careful which gcc you have used;
8 * the older version didn't come out right using gcc 2.5.8, and the newer one
9 * seems to fall out with gcc 2.6.2.
10 *
11 * Version: $Id: igmp.c,v 1.47 2002/02/01 22:01:03 davem Exp $
12 *
13 * Authors:
14 * Alan Cox <Alan.Cox@linux.org>
15 *
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version
19 * 2 of the License, or (at your option) any later version.
20 *
21 * Fixes:
22 *
23 * Alan Cox : Added lots of __inline__ to optimise
24 * the memory usage of all the tiny little
25 * functions.
26 * Alan Cox : Dumped the header building experiment.
27 * Alan Cox : Minor tweaks ready for multicast routing
28 * and extended IGMP protocol.
29 * Alan Cox : Removed a load of inline directives. Gcc 2.5.8
30 * writes utterly bogus code otherwise (sigh)
31 * fixed IGMP loopback to behave in the manner
32 * desired by mrouted, fixed the fact it has been
33 * broken since 1.3.6 and cleaned up a few minor
34 * points.
35 *
36 * Chih-Jen Chang : Tried to revise IGMP to Version 2
37 * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
38 * The enhancements are mainly based on Steve Deering's
39 * ipmulti-3.5 source code.
40 * Chih-Jen Chang : Added the igmp_get_mrouter_info and
41 * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of
42 * the mrouted version on that device.
43 * Chih-Jen Chang : Added the max_resp_time parameter to
44 * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter
45 * to identify the multicast router version
46 * and do what the IGMP version 2 specified.
47 * Chih-Jen Chang : Added a timer to revert to IGMP V2 router
48 * Tsu-Sheng Tsao if the specified time expired.
49 * Alan Cox : Stop IGMP from 0.0.0.0 being accepted.
50 * Alan Cox : Use GFP_ATOMIC in the right places.
51 * Christian Daudt : igmp timer wasn't set for local group
52 * memberships but was being deleted,
53 * which caused a "del_timer() called
54 * from %p with timer not initialized\n"
55 * message (960131).
56 * Christian Daudt : removed del_timer from
57 * igmp_timer_expire function (960205).
58 * Christian Daudt : igmp_heard_report now only calls
59 * igmp_timer_expire if tm->running is
60 * true (960216).
61 * Malcolm Beattie : ttl comparison wrong in igmp_rcv made
62 * igmp_heard_query never trigger. Expiry
63 * miscalculation fixed in igmp_heard_query
64 * and random() made to return unsigned to
65 * prevent negative expiry times.
66 * Alexey Kuznetsov: Wrong group leaving behaviour, backport
67 * fix from pending 2.1.x patches.
68 * Alan Cox: Forgot to enable FDDI support earlier.
69 * Alexey Kuznetsov: Fixed leaving groups on device down.
70 * Alexey Kuznetsov: Accordance to igmp-v2-06 draft.
71 * David L Stevens: IGMPv3 support, with help from
72 * Vinay Kulkarni
73 */
74
75#include <linux/config.h>
76#include <linux/module.h>
77#include <asm/uaccess.h>
78#include <asm/system.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
81#include <linux/jiffies.h>
82#include <linux/string.h>
83#include <linux/socket.h>
84#include <linux/sockios.h>
85#include <linux/in.h>
86#include <linux/inet.h>
87#include <linux/netdevice.h>
88#include <linux/skbuff.h>
89#include <linux/inetdevice.h>
90#include <linux/igmp.h>
91#include <linux/if_arp.h>
92#include <linux/rtnetlink.h>
93#include <linux/times.h>
94#include <net/ip.h>
95#include <net/protocol.h>
96#include <net/route.h>
97#include <net/sock.h>
98#include <net/checksum.h>
99#include <linux/netfilter_ipv4.h>
100#ifdef CONFIG_IP_MROUTE
101#include <linux/mroute.h>
102#endif
103#ifdef CONFIG_PROC_FS
104#include <linux/proc_fs.h>
105#include <linux/seq_file.h>
106#endif
107
108#define IP_MAX_MEMBERSHIPS 20
109#define IP_MAX_MSF 10
110
111#ifdef CONFIG_IP_MULTICAST
112/* Parameter names and values are taken from igmp-v2-06 draft */
113
114#define IGMP_V1_Router_Present_Timeout (400*HZ)
115#define IGMP_V2_Router_Present_Timeout (400*HZ)
116#define IGMP_Unsolicited_Report_Interval (10*HZ)
117#define IGMP_Query_Response_Interval (10*HZ)
118#define IGMP_Unsolicited_Report_Count 2
119
120
121#define IGMP_Initial_Report_Delay (1)
122
123/* IGMP_Initial_Report_Delay is not from the IGMP specs!
124 * The IGMP specs require membership to be reported immediately after
125 * joining a group, but we delay the first report by a
126 * small interval. It seems more natural and still does not
127 * contradict the specs, provided this delay is small enough.
128 */
129
130#define IGMP_V1_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 1 || \
131 (in_dev)->cnf.force_igmp_version == 1 || \
132 ((in_dev)->mr_v1_seen && \
133 time_before(jiffies, (in_dev)->mr_v1_seen)))
134#define IGMP_V2_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 2 || \
135 (in_dev)->cnf.force_igmp_version == 2 || \
136 ((in_dev)->mr_v2_seen && \
137 time_before(jiffies, (in_dev)->mr_v2_seen)))
138
139static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
140static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr);
141static void igmpv3_clear_delrec(struct in_device *in_dev);
142static int sf_setstate(struct ip_mc_list *pmc);
143static void sf_markstate(struct ip_mc_list *pmc);
144#endif
145static void ip_mc_clear_src(struct ip_mc_list *pmc);
146static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
147 int sfcount, __u32 *psfsrc, int delta);
148
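/* Release one reference on a multicast group entry; once the last
 * reference is gone, drop the hold on its interface and free it.
 */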
149static void ip_ma_put(struct ip_mc_list *im)
150{
151 if (atomic_dec_and_test(&im->refcnt)) {
152 in_dev_put(im->interface);
153 kfree(im);
154 }
155}
156
157#ifdef CONFIG_IP_MULTICAST
158
159/*
160 * Timer management
161 */
162
163static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
164{
165 spin_lock_bh(&im->lock);
166 if (del_timer(&im->timer))
167 atomic_dec(&im->refcnt);
168 im->tm_running=0;
169 im->reporter = 0;
170 im->unsolicit_count = 0;
171 spin_unlock_bh(&im->lock);
172}
173
174/* It must be called with locked im->lock */
175static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
176{
177 int tv=net_random() % max_delay;
178
179 im->tm_running=1;
180 if (!mod_timer(&im->timer, jiffies+tv+2))
181 atomic_inc(&im->refcnt);
182}
183
184static void igmp_gq_start_timer(struct in_device *in_dev)
185{
186 int tv = net_random() % in_dev->mr_maxdelay;
187
188 in_dev->mr_gq_running = 1;
189 if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
190 in_dev_hold(in_dev);
191}
192
193static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
194{
195 int tv = net_random() % delay;
196
197 if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
198 in_dev_hold(in_dev);
199}
200
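/* (Re)arm the membership report timer without ever lengthening it: if the
 * timer is already due to fire within max_delay the old expiry is kept,
 * otherwise a fresh random delay below max_delay is chosen.
 */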
201static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
202{
203 spin_lock_bh(&im->lock);
204 im->unsolicit_count = 0;
205 if (del_timer(&im->timer)) {
206 if ((long)(im->timer.expires-jiffies) < max_delay) {
207 add_timer(&im->timer);
208 im->tm_running=1;
209 spin_unlock_bh(&im->lock);
210 return;
211 }
212 atomic_dec(&im->refcnt);
213 }
214 igmp_start_timer(im, max_delay);
215 spin_unlock_bh(&im->lock);
216}
217
218
219/*
220 * Send an IGMP report.
221 */
222
223#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
224
225
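/* Decide whether source psf should be listed in an IGMPv3 group record of
 * the given type, taking into account whether the group (gdeleted) or the
 * source (sdeleted) entry sits on a tomb (deleted) list.
 */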
226static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
227 int gdeleted, int sdeleted)
228{
229 switch (type) {
230 case IGMPV3_MODE_IS_INCLUDE:
231 case IGMPV3_MODE_IS_EXCLUDE:
232 if (gdeleted || sdeleted)
233 return 0;
234 return !(pmc->gsquery && !psf->sf_gsresp);
235 case IGMPV3_CHANGE_TO_INCLUDE:
236 if (gdeleted || sdeleted)
237 return 0;
238 return psf->sf_count[MCAST_INCLUDE] != 0;
239 case IGMPV3_CHANGE_TO_EXCLUDE:
240 if (gdeleted || sdeleted)
241 return 0;
242 if (pmc->sfcount[MCAST_EXCLUDE] == 0 ||
243 psf->sf_count[MCAST_INCLUDE])
244 return 0;
245 return pmc->sfcount[MCAST_EXCLUDE] ==
246 psf->sf_count[MCAST_EXCLUDE];
247 case IGMPV3_ALLOW_NEW_SOURCES:
248 if (gdeleted || !psf->sf_crcount)
249 return 0;
250 return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted;
251 case IGMPV3_BLOCK_OLD_SOURCES:
252 if (pmc->sfmode == MCAST_INCLUDE)
253 return gdeleted || (psf->sf_crcount && sdeleted);
254 return psf->sf_crcount && !gdeleted && !sdeleted;
255 }
256 return 0;
257}
258
259static int
260igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
261{
262 struct ip_sf_list *psf;
263 int scount = 0;
264
265 for (psf=pmc->sources; psf; psf=psf->sf_next) {
266 if (!is_in(pmc, psf, type, gdeleted, sdeleted))
267 continue;
268 scount++;
269 }
270 return scount;
271}
272
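/* Allocate a fresh IGMPv3 report packet: an IP header with TTL 1, TOS 0xc0
 * and a Router Alert option, routed towards the all-IGMPv3-routers address,
 * followed by an empty report header whose group records are filled in by
 * the callers below.
 */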
273static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
274{
275 struct sk_buff *skb;
276 struct rtable *rt;
277 struct iphdr *pip;
278 struct igmpv3_report *pig;
279
280 skb = alloc_skb(size + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
281 if (skb == NULL)
282 return NULL;
283
284 {
285 struct flowi fl = { .oif = dev->ifindex,
286 .nl_u = { .ip4_u = {
287 .daddr = IGMPV3_ALL_MCR } },
288 .proto = IPPROTO_IGMP };
289 if (ip_route_output_key(&rt, &fl)) {
290 kfree_skb(skb);
291 return NULL;
292 }
293 }
294 if (rt->rt_src == 0) {
295 kfree_skb(skb);
296 ip_rt_put(rt);
297 return NULL;
298 }
299
300 skb->dst = &rt->u.dst;
301 skb->dev = dev;
302
303 skb_reserve(skb, LL_RESERVED_SPACE(dev));
304
305 skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
306
307 pip->version = 4;
308 pip->ihl = (sizeof(struct iphdr)+4)>>2;
309 pip->tos = 0xc0;
310 pip->frag_off = htons(IP_DF);
311 pip->ttl = 1;
312 pip->daddr = rt->rt_dst;
313 pip->saddr = rt->rt_src;
314 pip->protocol = IPPROTO_IGMP;
315 pip->tot_len = 0; /* filled in later */
316 ip_select_ident(pip, &rt->u.dst, NULL);
317 ((u8*)&pip[1])[0] = IPOPT_RA;
318 ((u8*)&pip[1])[1] = 4;
319 ((u8*)&pip[1])[2] = 0;
320 ((u8*)&pip[1])[3] = 0;
321
322 pig =(struct igmpv3_report *)skb_put(skb, sizeof(*pig));
323 skb->h.igmph = (struct igmphdr *)pig;
324 pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
325 pig->resv1 = 0;
326 pig->csum = 0;
327 pig->resv2 = 0;
328 pig->ngrec = 0;
329 return skb;
330}
331
332static int igmpv3_sendpack(struct sk_buff *skb)
333{
334 struct iphdr *pip = skb->nh.iph;
335 struct igmphdr *pig = skb->h.igmph;
336 int iplen, igmplen;
337
338 iplen = skb->tail - (unsigned char *)skb->nh.iph;
339 pip->tot_len = htons(iplen);
340 ip_send_check(pip);
341
342 igmplen = skb->tail - (unsigned char *)skb->h.igmph;
343 pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen);
344
345 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev,
346 dst_output);
347}
348
349static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
350{
351 return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc,type,gdel,sdel);
352}
353
354static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
355 int type, struct igmpv3_grec **ppgr)
356{
357 struct net_device *dev = pmc->interface->dev;
358 struct igmpv3_report *pih;
359 struct igmpv3_grec *pgr;
360
361 if (!skb)
362 skb = igmpv3_newpack(dev, dev->mtu);
363 if (!skb)
364 return NULL;
365 pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec));
366 pgr->grec_type = type;
367 pgr->grec_auxwords = 0;
368 pgr->grec_nsrcs = 0;
369 pgr->grec_mca = pmc->multiaddr;
370 pih = (struct igmpv3_report *)skb->h.igmph;
371 pih->ngrec = htons(ntohs(pih->ngrec)+1);
372 *ppgr = pgr;
373 return skb;
374}
375
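/* Bytes still usable in the report being built: bounded by the device MTU
 * once the skb is bound to a device, by the allocated tailroom otherwise,
 * and zero when no skb has been allocated yet.
 */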
376#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \
377 skb_tailroom(skb)) : 0)
378
379static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
380 int type, int gdeleted, int sdeleted)
381{
382 struct net_device *dev = pmc->interface->dev;
383 struct igmpv3_report *pih;
384 struct igmpv3_grec *pgr = NULL;
385 struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
386 int scount, first, isquery, truncate;
387
388 if (pmc->multiaddr == IGMP_ALL_HOSTS)
389 return skb;
390
391 isquery = type == IGMPV3_MODE_IS_INCLUDE ||
392 type == IGMPV3_MODE_IS_EXCLUDE;
393 truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
394 type == IGMPV3_CHANGE_TO_EXCLUDE;
395
396 psf_list = sdeleted ? &pmc->tomb : &pmc->sources;
397
398 if (!*psf_list) {
399 if (type == IGMPV3_ALLOW_NEW_SOURCES ||
400 type == IGMPV3_BLOCK_OLD_SOURCES)
401 return skb;
402 if (pmc->crcount || isquery) {
403 /* make sure we have room for group header and at
404 * least one source.
405 */
406 if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)+
407 sizeof(__u32)) {
408 igmpv3_sendpack(skb);
409 skb = NULL; /* add_grhead will get a new one */
410 }
411 skb = add_grhead(skb, pmc, type, &pgr);
412 }
413 return skb;
414 }
415 pih = skb ? (struct igmpv3_report *)skb->h.igmph : NULL;
416
417 /* EX and TO_EX get a fresh packet, if needed */
418 if (truncate) {
419 if (pih && pih->ngrec &&
420 AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
421 if (skb)
422 igmpv3_sendpack(skb);
423 skb = igmpv3_newpack(dev, dev->mtu);
424 }
425 }
426 first = 1;
427 scount = 0;
428 psf_prev = NULL;
429 for (psf=*psf_list; psf; psf=psf_next) {
430 u32 *psrc;
431
432 psf_next = psf->sf_next;
433
434 if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
435 psf_prev = psf;
436 continue;
437 }
438
439 /* clear marks on query responses */
440 if (isquery)
441 psf->sf_gsresp = 0;
442
443 if (AVAILABLE(skb) < sizeof(u32) +
444 first*sizeof(struct igmpv3_grec)) {
445 if (truncate && !first)
446 break; /* truncate these */
447 if (pgr)
448 pgr->grec_nsrcs = htons(scount);
449 if (skb)
450 igmpv3_sendpack(skb);
451 skb = igmpv3_newpack(dev, dev->mtu);
452 first = 1;
453 scount = 0;
454 }
455 if (first) {
456 skb = add_grhead(skb, pmc, type, &pgr);
457 first = 0;
458 }
459 psrc = (u32 *)skb_put(skb, sizeof(u32));
460 *psrc = psf->sf_inaddr;
461 scount++;
462 if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
463 type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
464 psf->sf_crcount--;
465 if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
466 if (psf_prev)
467 psf_prev->sf_next = psf->sf_next;
468 else
469 *psf_list = psf->sf_next;
470 kfree(psf);
471 continue;
472 }
473 }
474 psf_prev = psf;
475 }
476 if (pgr)
477 pgr->grec_nsrcs = htons(scount);
478
479 if (isquery)
480 pmc->gsquery = 0; /* clear query state on report */
481 return skb;
482}
483
484static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
485{
486 struct sk_buff *skb = NULL;
487 int type;
488
489 if (!pmc) {
490 read_lock(&in_dev->mc_list_lock);
491 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
492 if (pmc->multiaddr == IGMP_ALL_HOSTS)
493 continue;
494 spin_lock_bh(&pmc->lock);
495 if (pmc->sfcount[MCAST_EXCLUDE])
496 type = IGMPV3_MODE_IS_EXCLUDE;
497 else
498 type = IGMPV3_MODE_IS_INCLUDE;
499 skb = add_grec(skb, pmc, type, 0, 0);
500 spin_unlock_bh(&pmc->lock);
501 }
502 read_unlock(&in_dev->mc_list_lock);
503 } else {
504 spin_lock_bh(&pmc->lock);
505 if (pmc->sfcount[MCAST_EXCLUDE])
506 type = IGMPV3_MODE_IS_EXCLUDE;
507 else
508 type = IGMPV3_MODE_IS_INCLUDE;
509 skb = add_grec(skb, pmc, type, 0, 0);
510 spin_unlock_bh(&pmc->lock);
511 }
512 if (!skb)
513 return 0;
514 return igmpv3_sendpack(skb);
515}
516
517/*
518 * remove zero-count source records from a source filter list
519 */
520static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
521{
522 struct ip_sf_list *psf_prev, *psf_next, *psf;
523
524 psf_prev = NULL;
525 for (psf=*ppsf; psf; psf = psf_next) {
526 psf_next = psf->sf_next;
527 if (psf->sf_crcount == 0) {
528 if (psf_prev)
529 psf_prev->sf_next = psf->sf_next;
530 else
531 *ppsf = psf->sf_next;
532 kfree(psf);
533 } else
534 psf_prev = psf;
535 }
536}
537
538static void igmpv3_send_cr(struct in_device *in_dev)
539{
540 struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
541 struct sk_buff *skb = NULL;
542 int type, dtype;
543
544 read_lock(&in_dev->mc_list_lock);
545 spin_lock_bh(&in_dev->mc_tomb_lock);
546
547 /* deleted MCA's */
548 pmc_prev = NULL;
549 for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) {
550 pmc_next = pmc->next;
551 if (pmc->sfmode == MCAST_INCLUDE) {
552 type = IGMPV3_BLOCK_OLD_SOURCES;
553 dtype = IGMPV3_BLOCK_OLD_SOURCES;
554 skb = add_grec(skb, pmc, type, 1, 0);
555 skb = add_grec(skb, pmc, dtype, 1, 1);
556 }
557 if (pmc->crcount) {
558 pmc->crcount--;
559 if (pmc->sfmode == MCAST_EXCLUDE) {
560 type = IGMPV3_CHANGE_TO_INCLUDE;
561 skb = add_grec(skb, pmc, type, 1, 0);
562 }
563 if (pmc->crcount == 0) {
564 igmpv3_clear_zeros(&pmc->tomb);
565 igmpv3_clear_zeros(&pmc->sources);
566 }
567 }
568 if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) {
569 if (pmc_prev)
570 pmc_prev->next = pmc_next;
571 else
572 in_dev->mc_tomb = pmc_next;
573 in_dev_put(pmc->interface);
574 kfree(pmc);
575 } else
576 pmc_prev = pmc;
577 }
578 spin_unlock_bh(&in_dev->mc_tomb_lock);
579
580 /* change recs */
581 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
582 spin_lock_bh(&pmc->lock);
583 if (pmc->sfcount[MCAST_EXCLUDE]) {
584 type = IGMPV3_BLOCK_OLD_SOURCES;
585 dtype = IGMPV3_ALLOW_NEW_SOURCES;
586 } else {
587 type = IGMPV3_ALLOW_NEW_SOURCES;
588 dtype = IGMPV3_BLOCK_OLD_SOURCES;
589 }
590 skb = add_grec(skb, pmc, type, 0, 0);
591 skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */
592
593 /* filter mode changes */
594 if (pmc->crcount) {
595 pmc->crcount--;
596 if (pmc->sfmode == MCAST_EXCLUDE)
597 type = IGMPV3_CHANGE_TO_EXCLUDE;
598 else
599 type = IGMPV3_CHANGE_TO_INCLUDE;
600 skb = add_grec(skb, pmc, type, 0, 0);
601 }
602 spin_unlock_bh(&pmc->lock);
603 }
604 read_unlock(&in_dev->mc_list_lock);
605
606 if (!skb)
607 return;
608 (void) igmpv3_sendpack(skb);
609}
610
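/* Send a single IGMPv1/v2 report or leave message for pmc; IGMPv3 reports
 * are delegated to igmpv3_send_report() above. Leave messages go to the
 * all-routers group, plain reports to the group address itself.
 */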
611static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
612 int type)
613{
614 struct sk_buff *skb;
615 struct iphdr *iph;
616 struct igmphdr *ih;
617 struct rtable *rt;
618 struct net_device *dev = in_dev->dev;
619 u32 group = pmc ? pmc->multiaddr : 0;
620 u32 dst;
621
622 if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
623 return igmpv3_send_report(in_dev, pmc);
624 else if (type == IGMP_HOST_LEAVE_MESSAGE)
625 dst = IGMP_ALL_ROUTER;
626 else
627 dst = group;
628
629 {
630 struct flowi fl = { .oif = dev->ifindex,
631 .nl_u = { .ip4_u = { .daddr = dst } },
632 .proto = IPPROTO_IGMP };
633 if (ip_route_output_key(&rt, &fl))
634 return -1;
635 }
636 if (rt->rt_src == 0) {
637 ip_rt_put(rt);
638 return -1;
639 }
640
641 skb=alloc_skb(IGMP_SIZE+LL_RESERVED_SPACE(dev), GFP_ATOMIC);
642 if (skb == NULL) {
643 ip_rt_put(rt);
644 return -1;
645 }
646
647 skb->dst = &rt->u.dst;
648
649 skb_reserve(skb, LL_RESERVED_SPACE(dev));
650
651 skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
652
653 iph->version = 4;
654 iph->ihl = (sizeof(struct iphdr)+4)>>2;
655 iph->tos = 0xc0;
656 iph->frag_off = htons(IP_DF);
657 iph->ttl = 1;
658 iph->daddr = dst;
659 iph->saddr = rt->rt_src;
660 iph->protocol = IPPROTO_IGMP;
661 iph->tot_len = htons(IGMP_SIZE);
662 ip_select_ident(iph, &rt->u.dst, NULL);
663 ((u8*)&iph[1])[0] = IPOPT_RA;
664 ((u8*)&iph[1])[1] = 4;
665 ((u8*)&iph[1])[2] = 0;
666 ((u8*)&iph[1])[3] = 0;
667 ip_send_check(iph);
668
669 ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
670 ih->type=type;
671 ih->code=0;
672 ih->csum=0;
673 ih->group=group;
674 ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
675
676 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
677 dst_output);
678}
679
680static void igmp_gq_timer_expire(unsigned long data)
681{
682 struct in_device *in_dev = (struct in_device *)data;
683
684 in_dev->mr_gq_running = 0;
685 igmpv3_send_report(in_dev, NULL);
686 __in_dev_put(in_dev);
687}
688
689static void igmp_ifc_timer_expire(unsigned long data)
690{
691 struct in_device *in_dev = (struct in_device *)data;
692
693 igmpv3_send_cr(in_dev);
694 if (in_dev->mr_ifc_count) {
695 in_dev->mr_ifc_count--;
696 igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
697 }
698 __in_dev_put(in_dev);
699}
700
701static void igmp_ifc_event(struct in_device *in_dev)
702{
703 if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
704 return;
705 in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv :
706 IGMP_Unsolicited_Report_Count;
707 igmp_ifc_start_timer(in_dev, 1);
708}
709
710
711static void igmp_timer_expire(unsigned long data)
712{
713 struct ip_mc_list *im=(struct ip_mc_list *)data;
714 struct in_device *in_dev = im->interface;
715
716 spin_lock(&im->lock);
717 im->tm_running=0;
718
719 if (im->unsolicit_count) {
720 im->unsolicit_count--;
721 igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
722 }
723 im->reporter = 1;
724 spin_unlock(&im->lock);
725
726 if (IGMP_V1_SEEN(in_dev))
727 igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
728 else if (IGMP_V2_SEEN(in_dev))
729 igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
730 else
731 igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
732
733 ip_ma_put(im);
734}
735
736static void igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __u32 *srcs)
737{
738 struct ip_sf_list *psf;
739 int i, scount;
740
741 scount = 0;
742 for (psf=pmc->sources; psf; psf=psf->sf_next) {
743 if (scount == nsrcs)
744 break;
745 for (i=0; i<nsrcs; i++)
746 if (srcs[i] == psf->sf_inaddr) {
747 psf->sf_gsresp = 1;
748 scount++;
749 break;
750 }
751 }
752}
753
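/* Another member answered the query for this group on our interface, so
 * cancel our own pending report for it.
 */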
754static void igmp_heard_report(struct in_device *in_dev, u32 group)
755{
756 struct ip_mc_list *im;
757
758 /* Timers are only set for non-local groups */
759
760 if (group == IGMP_ALL_HOSTS)
761 return;
762
763 read_lock(&in_dev->mc_list_lock);
764 for (im=in_dev->mc_list; im!=NULL; im=im->next) {
765 if (im->multiaddr == group) {
766 igmp_stop_timer(im);
767 break;
768 }
769 }
770 read_unlock(&in_dev->mc_list_lock);
771}
772
773static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
774 int len)
775{
776 struct igmphdr *ih = skb->h.igmph;
777 struct igmpv3_query *ih3 = (struct igmpv3_query *)ih;
778 struct ip_mc_list *im;
779 u32 group = ih->group;
780 int max_delay;
781 int mark = 0;
782
783
784 if (len == 8) {
785 if (ih->code == 0) {
786			/* Alas, an old v1 router is present here. */
787
788 max_delay = IGMP_Query_Response_Interval;
789 in_dev->mr_v1_seen = jiffies +
790 IGMP_V1_Router_Present_Timeout;
791 group = 0;
792 } else {
793 /* v2 router present */
794 max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
795 in_dev->mr_v2_seen = jiffies +
796 IGMP_V2_Router_Present_Timeout;
797 }
798 /* cancel the interface change timer */
799 in_dev->mr_ifc_count = 0;
800 if (del_timer(&in_dev->mr_ifc_timer))
801 __in_dev_put(in_dev);
802 /* clear deleted report items */
803 igmpv3_clear_delrec(in_dev);
804 } else if (len < 12) {
805 return; /* ignore bogus packet; freed by caller */
806 } else { /* v3 */
807 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
808 return;
809
810 ih3 = (struct igmpv3_query *) skb->h.raw;
811 if (ih3->nsrcs) {
812 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
813 + ntohs(ih3->nsrcs)*sizeof(__u32)))
814 return;
815 ih3 = (struct igmpv3_query *) skb->h.raw;
816 }
817
818 max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
819 if (!max_delay)
820 max_delay = 1; /* can't mod w/ 0 */
821 in_dev->mr_maxdelay = max_delay;
822 if (ih3->qrv)
823 in_dev->mr_qrv = ih3->qrv;
824 if (!group) { /* general query */
825 if (ih3->nsrcs)
826 return; /* no sources allowed */
827 igmp_gq_start_timer(in_dev);
828 return;
829 }
830 /* mark sources to include, if group & source-specific */
831 mark = ih3->nsrcs != 0;
832 }
833
834 /*
835 * - Start the timers in all of our membership records
836 * that the query applies to for the interface on
837 * which the query arrived excl. those that belong
838 * to a "local" group (224.0.0.X)
839 * - For timers already running check if they need to
840 * be reset.
841 * - Use the igmp->igmp_code field as the maximum
842 * delay possible
843 */
844 read_lock(&in_dev->mc_list_lock);
845 for (im=in_dev->mc_list; im!=NULL; im=im->next) {
846 if (group && group != im->multiaddr)
847 continue;
848 if (im->multiaddr == IGMP_ALL_HOSTS)
849 continue;
850 spin_lock_bh(&im->lock);
851 if (im->tm_running)
852 im->gsquery = im->gsquery && mark;
853 else
854 im->gsquery = mark;
855 if (im->gsquery)
856 igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
857 spin_unlock_bh(&im->lock);
858 igmp_mod_timer(im, max_delay);
859 }
860 read_unlock(&in_dev->mc_list_lock);
861}
862
863int igmp_rcv(struct sk_buff *skb)
864{
865 /* This basically follows the spec line by line -- see RFC1112 */
866 struct igmphdr *ih;
867 struct in_device *in_dev = in_dev_get(skb->dev);
868 int len = skb->len;
869
870 if (in_dev==NULL) {
871 kfree_skb(skb);
872 return 0;
873 }
874
875 if (!pskb_may_pull(skb, sizeof(struct igmphdr)) ||
876 (u16)csum_fold(skb_checksum(skb, 0, len, 0))) {
877 in_dev_put(in_dev);
878 kfree_skb(skb);
879 return 0;
880 }
881
882 ih = skb->h.igmph;
883 switch (ih->type) {
884 case IGMP_HOST_MEMBERSHIP_QUERY:
885 igmp_heard_query(in_dev, skb, len);
886 break;
887 case IGMP_HOST_MEMBERSHIP_REPORT:
888 case IGMPV2_HOST_MEMBERSHIP_REPORT:
889 case IGMPV3_HOST_MEMBERSHIP_REPORT:
890 /* Is it our report looped back? */
891 if (((struct rtable*)skb->dst)->fl.iif == 0)
892 break;
893 igmp_heard_report(in_dev, ih->group);
894 break;
895 case IGMP_PIM:
896#ifdef CONFIG_IP_PIMSM_V1
897 in_dev_put(in_dev);
898 return pim_rcv_v1(skb);
899#endif
900 case IGMP_DVMRP:
901 case IGMP_TRACE:
902 case IGMP_HOST_LEAVE_MESSAGE:
903 case IGMP_MTRACE:
904 case IGMP_MTRACE_RESP:
905 break;
906 default:
907		NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why do we not know about it?\n", ih->type));
908 }
909 in_dev_put(in_dev);
910 kfree_skb(skb);
911 return 0;
912}
913
914#endif
915
916
917/*
918 * Add a filter to a device
919 */
920
921static void ip_mc_filter_add(struct in_device *in_dev, u32 addr)
922{
923 char buf[MAX_ADDR_LEN];
924 struct net_device *dev = in_dev->dev;
925
926 /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
927	   We will get multicast token leakage when IFF_MULTICAST
928	   is changed. This check should be done in the dev->set_multicast_list
929	   routine. Something along the lines of:
930 if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
931 --ANK
932 */
933 if (arp_mc_map(addr, buf, dev, 0) == 0)
934 dev_mc_add(dev,buf,dev->addr_len,0);
935}
936
937/*
938 * Remove a filter from a device
939 */
940
941static void ip_mc_filter_del(struct in_device *in_dev, u32 addr)
942{
943 char buf[MAX_ADDR_LEN];
944 struct net_device *dev = in_dev->dev;
945
946 if (arp_mc_map(addr, buf, dev, 0) == 0)
947 dev_mc_delete(dev,buf,dev->addr_len,0);
948}
949
950#ifdef CONFIG_IP_MULTICAST
951/*
952 * deleted ip_mc_list manipulation
953 */
954static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
955{
956 struct ip_mc_list *pmc;
957
958 /* this is an "ip_mc_list" for convenience; only the fields below
959 * are actually used. In particular, the refcnt and users are not
960 * used for management of the delete list. Using the same structure
961 * for deleted items allows change reports to use common code with
962 * non-deleted or query-response MCA's.
963 */
964 pmc = (struct ip_mc_list *)kmalloc(sizeof(*pmc), GFP_KERNEL);
965 if (!pmc)
966 return;
967 memset(pmc, 0, sizeof(*pmc));
968 spin_lock_bh(&im->lock);
969 pmc->interface = im->interface;
970 in_dev_hold(in_dev);
971 pmc->multiaddr = im->multiaddr;
972 pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
973 IGMP_Unsolicited_Report_Count;
974 pmc->sfmode = im->sfmode;
975 if (pmc->sfmode == MCAST_INCLUDE) {
976 struct ip_sf_list *psf;
977
978 pmc->tomb = im->tomb;
979 pmc->sources = im->sources;
980 im->tomb = im->sources = NULL;
981 for (psf=pmc->sources; psf; psf=psf->sf_next)
982 psf->sf_crcount = pmc->crcount;
983 }
984 spin_unlock_bh(&im->lock);
985
986 spin_lock_bh(&in_dev->mc_tomb_lock);
987 pmc->next = in_dev->mc_tomb;
988 in_dev->mc_tomb = pmc;
989 spin_unlock_bh(&in_dev->mc_tomb_lock);
990}
991
992static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr)
993{
994 struct ip_mc_list *pmc, *pmc_prev;
995 struct ip_sf_list *psf, *psf_next;
996
997 spin_lock_bh(&in_dev->mc_tomb_lock);
998 pmc_prev = NULL;
999 for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) {
1000 if (pmc->multiaddr == multiaddr)
1001 break;
1002 pmc_prev = pmc;
1003 }
1004 if (pmc) {
1005 if (pmc_prev)
1006 pmc_prev->next = pmc->next;
1007 else
1008 in_dev->mc_tomb = pmc->next;
1009 }
1010 spin_unlock_bh(&in_dev->mc_tomb_lock);
1011 if (pmc) {
1012 for (psf=pmc->tomb; psf; psf=psf_next) {
1013 psf_next = psf->sf_next;
1014 kfree(psf);
1015 }
1016 in_dev_put(pmc->interface);
1017 kfree(pmc);
1018 }
1019}
1020
1021static void igmpv3_clear_delrec(struct in_device *in_dev)
1022{
1023 struct ip_mc_list *pmc, *nextpmc;
1024
1025 spin_lock_bh(&in_dev->mc_tomb_lock);
1026 pmc = in_dev->mc_tomb;
1027 in_dev->mc_tomb = NULL;
1028 spin_unlock_bh(&in_dev->mc_tomb_lock);
1029
1030 for (; pmc; pmc = nextpmc) {
1031 nextpmc = pmc->next;
1032 ip_mc_clear_src(pmc);
1033 in_dev_put(pmc->interface);
1034 kfree(pmc);
1035 }
1036 /* clear dead sources, too */
1037 read_lock(&in_dev->mc_list_lock);
1038 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
1039 struct ip_sf_list *psf, *psf_next;
1040
1041 spin_lock_bh(&pmc->lock);
1042 psf = pmc->tomb;
1043 pmc->tomb = NULL;
1044 spin_unlock_bh(&pmc->lock);
1045 for (; psf; psf=psf_next) {
1046 psf_next = psf->sf_next;
1047 kfree(psf);
1048 }
1049 }
1050 read_unlock(&in_dev->mc_list_lock);
1051}
1052#endif
1053
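/* The last local user of im is gone: remove the hardware filter, stop any
 * pending report and, depending on the querier version seen, send a v2
 * leave message or queue IGMPv3 change records, then clear the source
 * filter lists.
 */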
1054static void igmp_group_dropped(struct ip_mc_list *im)
1055{
1056 struct in_device *in_dev = im->interface;
1057#ifdef CONFIG_IP_MULTICAST
1058 int reporter;
1059#endif
1060
1061 if (im->loaded) {
1062 im->loaded = 0;
1063 ip_mc_filter_del(in_dev, im->multiaddr);
1064 }
1065
1066#ifdef CONFIG_IP_MULTICAST
1067 if (im->multiaddr == IGMP_ALL_HOSTS)
1068 return;
1069
1070 reporter = im->reporter;
1071 igmp_stop_timer(im);
1072
1073 if (!in_dev->dead) {
1074 if (IGMP_V1_SEEN(in_dev))
1075 goto done;
1076 if (IGMP_V2_SEEN(in_dev)) {
1077 if (reporter)
1078 igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
1079 goto done;
1080 }
1081 /* IGMPv3 */
1082 igmpv3_add_delrec(in_dev, im);
1083
1084 igmp_ifc_event(in_dev);
1085 }
1086done:
1087#endif
1088 ip_mc_clear_src(im);
1089}
1090
1091static void igmp_group_added(struct ip_mc_list *im)
1092{
1093 struct in_device *in_dev = im->interface;
1094
1095 if (im->loaded == 0) {
1096 im->loaded = 1;
1097 ip_mc_filter_add(in_dev, im->multiaddr);
1098 }
1099
1100#ifdef CONFIG_IP_MULTICAST
1101 if (im->multiaddr == IGMP_ALL_HOSTS)
1102 return;
1103
1104 if (in_dev->dead)
1105 return;
1106 if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
1107 spin_lock_bh(&im->lock);
1108 igmp_start_timer(im, IGMP_Initial_Report_Delay);
1109 spin_unlock_bh(&im->lock);
1110 return;
1111 }
1112 /* else, v3 */
1113
1114 im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
1115 IGMP_Unsolicited_Report_Count;
1116 igmp_ifc_event(in_dev);
1117#endif
1118}
1119
1120
1121/*
1122 * Multicast list managers
1123 */
1124
1125
1126/*
1127 * A socket has joined a multicast group on device dev.
1128 */
1129
1130void ip_mc_inc_group(struct in_device *in_dev, u32 addr)
1131{
1132 struct ip_mc_list *im;
1133
1134 ASSERT_RTNL();
1135
1136 for (im=in_dev->mc_list; im; im=im->next) {
1137 if (im->multiaddr == addr) {
1138 im->users++;
1139 ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
1140 goto out;
1141 }
1142 }
1143
1144 im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL);
1145 if (!im)
1146 goto out;
1147
1148 im->users=1;
1149 im->interface=in_dev;
1150 in_dev_hold(in_dev);
1151 im->multiaddr=addr;
1152 /* initial mode is (EX, empty) */
1153 im->sfmode = MCAST_EXCLUDE;
1154 im->sfcount[MCAST_INCLUDE] = 0;
1155 im->sfcount[MCAST_EXCLUDE] = 1;
1156 im->sources = NULL;
1157 im->tomb = NULL;
1158 im->crcount = 0;
1159 atomic_set(&im->refcnt, 1);
1160 spin_lock_init(&im->lock);
1161#ifdef CONFIG_IP_MULTICAST
1162 im->tm_running=0;
1163 init_timer(&im->timer);
1164 im->timer.data=(unsigned long)im;
1165 im->timer.function=&igmp_timer_expire;
1166 im->unsolicit_count = IGMP_Unsolicited_Report_Count;
1167 im->reporter = 0;
1168 im->gsquery = 0;
1169#endif
1170 im->loaded = 0;
1171 write_lock_bh(&in_dev->mc_list_lock);
1172 im->next=in_dev->mc_list;
1173 in_dev->mc_list=im;
1174 write_unlock_bh(&in_dev->mc_list_lock);
1175#ifdef CONFIG_IP_MULTICAST
1176 igmpv3_del_delrec(in_dev, im->multiaddr);
1177#endif
1178 igmp_group_added(im);
1179 if (!in_dev->dead)
1180 ip_rt_multicast_event(in_dev);
1181out:
1182 return;
1183}
1184
1185/*
1186 * A socket has left a multicast group on device dev
1187 */
1188
1189void ip_mc_dec_group(struct in_device *in_dev, u32 addr)
1190{
1191 struct ip_mc_list *i, **ip;
1192
1193 ASSERT_RTNL();
1194
1195 for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
1196 if (i->multiaddr==addr) {
1197 if (--i->users == 0) {
1198 write_lock_bh(&in_dev->mc_list_lock);
1199 *ip = i->next;
1200 write_unlock_bh(&in_dev->mc_list_lock);
1201 igmp_group_dropped(i);
1202
1203 if (!in_dev->dead)
1204 ip_rt_multicast_event(in_dev);
1205
1206 ip_ma_put(i);
1207 return;
1208 }
1209 break;
1210 }
1211 }
1212}
1213
1214/* Device going down */
1215
1216void ip_mc_down(struct in_device *in_dev)
1217{
1218 struct ip_mc_list *i;
1219
1220 ASSERT_RTNL();
1221
1222 for (i=in_dev->mc_list; i; i=i->next)
1223 igmp_group_dropped(i);
1224
1225#ifdef CONFIG_IP_MULTICAST
1226 in_dev->mr_ifc_count = 0;
1227 if (del_timer(&in_dev->mr_ifc_timer))
1228 __in_dev_put(in_dev);
1229 in_dev->mr_gq_running = 0;
1230 if (del_timer(&in_dev->mr_gq_timer))
1231 __in_dev_put(in_dev);
1232 igmpv3_clear_delrec(in_dev);
1233#endif
1234
1235 ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
1236}
1237
1238void ip_mc_init_dev(struct in_device *in_dev)
1239{
1240 ASSERT_RTNL();
1241
1242 in_dev->mc_tomb = NULL;
1243#ifdef CONFIG_IP_MULTICAST
1244 in_dev->mr_gq_running = 0;
1245 init_timer(&in_dev->mr_gq_timer);
1246 in_dev->mr_gq_timer.data=(unsigned long) in_dev;
1247 in_dev->mr_gq_timer.function=&igmp_gq_timer_expire;
1248 in_dev->mr_ifc_count = 0;
1249 init_timer(&in_dev->mr_ifc_timer);
1250 in_dev->mr_ifc_timer.data=(unsigned long) in_dev;
1251 in_dev->mr_ifc_timer.function=&igmp_ifc_timer_expire;
1252 in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
1253#endif
1254
1255 rwlock_init(&in_dev->mc_list_lock);
1256 spin_lock_init(&in_dev->mc_tomb_lock);
1257}
1258
1259/* Device going up */
1260
1261void ip_mc_up(struct in_device *in_dev)
1262{
1263 struct ip_mc_list *i;
1264
1265 ASSERT_RTNL();
1266
1267 ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
1268
1269 for (i=in_dev->mc_list; i; i=i->next)
1270 igmp_group_added(i);
1271}
1272
1273/*
1274 * Device is about to be destroyed: clean up.
1275 */
1276
1277void ip_mc_destroy_dev(struct in_device *in_dev)
1278{
1279 struct ip_mc_list *i;
1280
1281 ASSERT_RTNL();
1282
1283 /* Deactivate timers */
1284 ip_mc_down(in_dev);
1285
1286 write_lock_bh(&in_dev->mc_list_lock);
1287 while ((i = in_dev->mc_list) != NULL) {
1288 in_dev->mc_list = i->next;
1289 write_unlock_bh(&in_dev->mc_list_lock);
1290
1291 igmp_group_dropped(i);
1292 ip_ma_put(i);
1293
1294 write_lock_bh(&in_dev->mc_list_lock);
1295 }
1296 write_unlock_bh(&in_dev->mc_list_lock);
1297}
1298
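/* Work out which in_device a membership request refers to: an explicit
 * ifindex wins, then a match on the local interface address, and as a last
 * resort the device a route to the multicast address would use.
 */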
1299static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
1300{
1301 struct flowi fl = { .nl_u = { .ip4_u =
1302 { .daddr = imr->imr_multiaddr.s_addr } } };
1303 struct rtable *rt;
1304 struct net_device *dev = NULL;
1305 struct in_device *idev = NULL;
1306
1307 if (imr->imr_ifindex) {
1308 idev = inetdev_by_index(imr->imr_ifindex);
1309 if (idev)
1310 __in_dev_put(idev);
1311 return idev;
1312 }
1313 if (imr->imr_address.s_addr) {
1314 dev = ip_dev_find(imr->imr_address.s_addr);
1315 if (!dev)
1316 return NULL;
1317 __dev_put(dev);
1318 }
1319
1320 if (!dev && !ip_route_output_key(&rt, &fl)) {
1321 dev = rt->u.dst.dev;
1322 ip_rt_put(rt);
1323 }
1324 if (dev) {
1325 imr->imr_ifindex = dev->ifindex;
1326 idev = __in_dev_get(dev);
1327 }
1328 return idev;
1329}
1330
1331/*
1332 * Join a socket to a group
1333 */
1334int sysctl_igmp_max_memberships = IP_MAX_MEMBERSHIPS;
1335int sysctl_igmp_max_msf = IP_MAX_MSF;
1336
1337
1338static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
1339 __u32 *psfsrc)
1340{
1341 struct ip_sf_list *psf, *psf_prev;
1342 int rv = 0;
1343
1344 psf_prev = NULL;
1345 for (psf=pmc->sources; psf; psf=psf->sf_next) {
1346 if (psf->sf_inaddr == *psfsrc)
1347 break;
1348 psf_prev = psf;
1349 }
1350 if (!psf || psf->sf_count[sfmode] == 0) {
1351 /* source filter not found, or count wrong => bug */
1352 return -ESRCH;
1353 }
1354 psf->sf_count[sfmode]--;
1355 if (psf->sf_count[sfmode] == 0) {
1356 ip_rt_multicast_event(pmc->interface);
1357 }
1358 if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
1359#ifdef CONFIG_IP_MULTICAST
1360 struct in_device *in_dev = pmc->interface;
1361#endif
1362
1363 /* no more filters for this source */
1364 if (psf_prev)
1365 psf_prev->sf_next = psf->sf_next;
1366 else
1367 pmc->sources = psf->sf_next;
1368#ifdef CONFIG_IP_MULTICAST
1369 if (psf->sf_oldin &&
1370 !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
1371 psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
1372 IGMP_Unsolicited_Report_Count;
1373 psf->sf_next = pmc->tomb;
1374 pmc->tomb = psf;
1375 rv = 1;
1376 } else
1377#endif
1378 kfree(psf);
1379 }
1380 return rv;
1381}
1382
1383#ifndef CONFIG_IP_MULTICAST
1384#define igmp_ifc_event(x) do { } while (0)
1385#endif
1386
1387static int ip_mc_del_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
1388 int sfcount, __u32 *psfsrc, int delta)
1389{
1390 struct ip_mc_list *pmc;
1391 int changerec = 0;
1392 int i, err;
1393
1394 if (!in_dev)
1395 return -ENODEV;
1396 read_lock(&in_dev->mc_list_lock);
1397 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
1398 if (*pmca == pmc->multiaddr)
1399 break;
1400 }
1401 if (!pmc) {
1402 /* MCA not found?? bug */
1403 read_unlock(&in_dev->mc_list_lock);
1404 return -ESRCH;
1405 }
1406 spin_lock_bh(&pmc->lock);
1407 read_unlock(&in_dev->mc_list_lock);
1408#ifdef CONFIG_IP_MULTICAST
1409 sf_markstate(pmc);
1410#endif
1411 if (!delta) {
1412 err = -EINVAL;
1413 if (!pmc->sfcount[sfmode])
1414 goto out_unlock;
1415 pmc->sfcount[sfmode]--;
1416 }
1417 err = 0;
1418 for (i=0; i<sfcount; i++) {
1419 int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
1420
1421 changerec |= rv > 0;
1422 if (!err && rv < 0)
1423 err = rv;
1424 }
1425 if (pmc->sfmode == MCAST_EXCLUDE &&
1426 pmc->sfcount[MCAST_EXCLUDE] == 0 &&
1427 pmc->sfcount[MCAST_INCLUDE]) {
1428#ifdef CONFIG_IP_MULTICAST
1429 struct ip_sf_list *psf;
1430#endif
1431
1432 /* filter mode change */
1433 pmc->sfmode = MCAST_INCLUDE;
1434#ifdef CONFIG_IP_MULTICAST
1435 pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
1436 IGMP_Unsolicited_Report_Count;
1437 in_dev->mr_ifc_count = pmc->crcount;
1438 for (psf=pmc->sources; psf; psf = psf->sf_next)
1439 psf->sf_crcount = 0;
1440 igmp_ifc_event(pmc->interface);
1441 } else if (sf_setstate(pmc) || changerec) {
1442 igmp_ifc_event(pmc->interface);
1443#endif
1444 }
1445out_unlock:
1446 spin_unlock_bh(&pmc->lock);
1447 return err;
1448}
1449
1450/*
1451 * Add multicast single-source filter to the interface list
1452 */
1453static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
1454 __u32 *psfsrc, int delta)
1455{
1456 struct ip_sf_list *psf, *psf_prev;
1457
1458 psf_prev = NULL;
1459 for (psf=pmc->sources; psf; psf=psf->sf_next) {
1460 if (psf->sf_inaddr == *psfsrc)
1461 break;
1462 psf_prev = psf;
1463 }
1464 if (!psf) {
1465 psf = (struct ip_sf_list *)kmalloc(sizeof(*psf), GFP_ATOMIC);
1466 if (!psf)
1467 return -ENOBUFS;
1468 memset(psf, 0, sizeof(*psf));
1469 psf->sf_inaddr = *psfsrc;
1470 if (psf_prev) {
1471 psf_prev->sf_next = psf;
1472 } else
1473 pmc->sources = psf;
1474 }
1475 psf->sf_count[sfmode]++;
1476 if (psf->sf_count[sfmode] == 1) {
1477 ip_rt_multicast_event(pmc->interface);
1478 }
1479 return 0;
1480}
1481
1482#ifdef CONFIG_IP_MULTICAST
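/* Per-source change tracking for IGMPv3: sf_markstate() records whether
 * each source was passing traffic before a filter update, sf_setstate()
 * compares against the state afterwards, arms sf_crcount for every source
 * that changed, and returns the number of changes.
 */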
1483static void sf_markstate(struct ip_mc_list *pmc)
1484{
1485 struct ip_sf_list *psf;
1486 int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
1487
1488 for (psf=pmc->sources; psf; psf=psf->sf_next)
1489 if (pmc->sfcount[MCAST_EXCLUDE]) {
1490 psf->sf_oldin = mca_xcount ==
1491 psf->sf_count[MCAST_EXCLUDE] &&
1492 !psf->sf_count[MCAST_INCLUDE];
1493 } else
1494 psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
1495}
1496
1497static int sf_setstate(struct ip_mc_list *pmc)
1498{
1499 struct ip_sf_list *psf;
1500 int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
1501 int qrv = pmc->interface->mr_qrv;
1502 int new_in, rv;
1503
1504 rv = 0;
1505 for (psf=pmc->sources; psf; psf=psf->sf_next) {
1506 if (pmc->sfcount[MCAST_EXCLUDE]) {
1507 new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
1508 !psf->sf_count[MCAST_INCLUDE];
1509 } else
1510 new_in = psf->sf_count[MCAST_INCLUDE] != 0;
1511 if (new_in != psf->sf_oldin) {
1512 psf->sf_crcount = qrv;
1513 rv++;
1514 }
1515 }
1516 return rv;
1517}
1518#endif
1519
1520/*
1521 * Add multicast source filter list to the interface list
1522 */
1523static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
1524 int sfcount, __u32 *psfsrc, int delta)
1525{
1526 struct ip_mc_list *pmc;
1527 int isexclude;
1528 int i, err;
1529
1530 if (!in_dev)
1531 return -ENODEV;
1532 read_lock(&in_dev->mc_list_lock);
1533 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
1534 if (*pmca == pmc->multiaddr)
1535 break;
1536 }
1537 if (!pmc) {
1538 /* MCA not found?? bug */
1539 read_unlock(&in_dev->mc_list_lock);
1540 return -ESRCH;
1541 }
1542 spin_lock_bh(&pmc->lock);
1543 read_unlock(&in_dev->mc_list_lock);
1544
1545#ifdef CONFIG_IP_MULTICAST
1546 sf_markstate(pmc);
1547#endif
1548 isexclude = pmc->sfmode == MCAST_EXCLUDE;
1549 if (!delta)
1550 pmc->sfcount[sfmode]++;
1551 err = 0;
1552 for (i=0; i<sfcount; i++) {
1553 err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
1554 if (err)
1555 break;
1556 }
1557 if (err) {
1558 int j;
1559
1560 pmc->sfcount[sfmode]--;
1561 for (j=0; j<i; j++)
1562			(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
1563 } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
1564#ifdef CONFIG_IP_MULTICAST
1565 struct in_device *in_dev = pmc->interface;
1566 struct ip_sf_list *psf;
1567#endif
1568
1569 /* filter mode change */
1570 if (pmc->sfcount[MCAST_EXCLUDE])
1571 pmc->sfmode = MCAST_EXCLUDE;
1572 else if (pmc->sfcount[MCAST_INCLUDE])
1573 pmc->sfmode = MCAST_INCLUDE;
1574#ifdef CONFIG_IP_MULTICAST
1575 /* else no filters; keep old mode for reports */
1576
1577 pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
1578 IGMP_Unsolicited_Report_Count;
1579 in_dev->mr_ifc_count = pmc->crcount;
1580 for (psf=pmc->sources; psf; psf = psf->sf_next)
1581 psf->sf_crcount = 0;
1582 igmp_ifc_event(in_dev);
1583 } else if (sf_setstate(pmc)) {
1584 igmp_ifc_event(in_dev);
1585#endif
1586 }
1587 spin_unlock_bh(&pmc->lock);
1588 return err;
1589}
1590
1591static void ip_mc_clear_src(struct ip_mc_list *pmc)
1592{
1593 struct ip_sf_list *psf, *nextpsf;
1594
1595 for (psf=pmc->tomb; psf; psf=nextpsf) {
1596 nextpsf = psf->sf_next;
1597 kfree(psf);
1598 }
1599 pmc->tomb = NULL;
1600 for (psf=pmc->sources; psf; psf=nextpsf) {
1601 nextpsf = psf->sf_next;
1602 kfree(psf);
1603 }
1604 pmc->sources = NULL;
1605 pmc->sfmode = MCAST_EXCLUDE;
1606	pmc->sfcount[MCAST_INCLUDE] = 0;
1607 pmc->sfcount[MCAST_EXCLUDE] = 1;
1608}
1609
1610
1611/*
1612 * Join a multicast group
1613 */
1614int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1615{
1616 int err;
1617 u32 addr = imr->imr_multiaddr.s_addr;
1618 struct ip_mc_socklist *iml, *i;
1619 struct in_device *in_dev;
1620 struct inet_sock *inet = inet_sk(sk);
1621 int count = 0;
1622
1623 if (!MULTICAST(addr))
1624 return -EINVAL;
1625
1626 rtnl_shlock();
1627
1628 in_dev = ip_mc_find_dev(imr);
1629
1630 if (!in_dev) {
1631 iml = NULL;
1632 err = -ENODEV;
1633 goto done;
1634 }
1635
1636 iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
1637
1638 err = -EADDRINUSE;
1639 for (i = inet->mc_list; i; i = i->next) {
1640 if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
1641 /* New style additions are reference counted */
1642 if (imr->imr_address.s_addr == 0) {
1643 i->count++;
1644 err = 0;
1645 }
1646 goto done;
1647 }
1648 count++;
1649 }
1650 err = -ENOBUFS;
1651 if (iml == NULL || count >= sysctl_igmp_max_memberships)
1652 goto done;
1653 memcpy(&iml->multi, imr, sizeof(*imr));
1654 iml->next = inet->mc_list;
1655 iml->count = 1;
1656 iml->sflist = NULL;
1657 iml->sfmode = MCAST_EXCLUDE;
1658 inet->mc_list = iml;
1659 ip_mc_inc_group(in_dev, addr);
1660 iml = NULL;
1661 err = 0;
1662
1663done:
1664 rtnl_shunlock();
1665 if (iml)
1666 sock_kfree_s(sk, iml, sizeof(*iml));
1667 return err;
1668}
1669
1670static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1671 struct in_device *in_dev)
1672{
1673 int err;
1674
1675 if (iml->sflist == 0) {
1676 /* any-source empty exclude case */
1677 return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1678 iml->sfmode, 0, NULL, 0);
1679 }
1680 err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1681 iml->sfmode, iml->sflist->sl_count,
1682 iml->sflist->sl_addr, 0);
1683 sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max));
1684 iml->sflist = NULL;
1685 return err;
1686}
1687
1688/*
1689 * Ask a socket to leave a group.
1690 */
1691
1692int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1693{
1694 struct inet_sock *inet = inet_sk(sk);
1695 struct ip_mc_socklist *iml, **imlp;
1696
1697 rtnl_lock();
1698 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
1699 if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr &&
1700 iml->multi.imr_address.s_addr==imr->imr_address.s_addr &&
1701 (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) {
1702 struct in_device *in_dev;
1703
1704 in_dev = inetdev_by_index(iml->multi.imr_ifindex);
1705 if (in_dev)
1706 (void) ip_mc_leave_src(sk, iml, in_dev);
1707 if (--iml->count) {
1708 rtnl_unlock();
1709 if (in_dev)
1710 in_dev_put(in_dev);
1711 return 0;
1712 }
1713
1714 *imlp = iml->next;
1715
1716 if (in_dev) {
1717 ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
1718 in_dev_put(in_dev);
1719 }
1720 rtnl_unlock();
1721 sock_kfree_s(sk, iml, sizeof(*iml));
1722 return 0;
1723 }
1724 }
1725 rtnl_unlock();
1726 return -EADDRNOTAVAIL;
1727}
1728
1729int ip_mc_source(int add, int omode, struct sock *sk, struct
1730 ip_mreq_source *mreqs, int ifindex)
1731{
1732 int err;
1733 struct ip_mreqn imr;
1734 u32 addr = mreqs->imr_multiaddr;
1735 struct ip_mc_socklist *pmc;
1736 struct in_device *in_dev = NULL;
1737 struct inet_sock *inet = inet_sk(sk);
1738 struct ip_sf_socklist *psl;
1739 int i, j, rv;
1740
1741 if (!MULTICAST(addr))
1742 return -EINVAL;
1743
1744 rtnl_shlock();
1745
1746 imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
1747 imr.imr_address.s_addr = mreqs->imr_interface;
1748 imr.imr_ifindex = ifindex;
1749 in_dev = ip_mc_find_dev(&imr);
1750
1751 if (!in_dev) {
1752 err = -ENODEV;
1753 goto done;
1754 }
1755 err = -EADDRNOTAVAIL;
1756
1757 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1758 if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0)
1759 break;
1760 }
1761 if (!pmc) /* must have a prior join */
1762 goto done;
1763 /* if a source filter was set, must be the same mode as before */
1764 if (pmc->sflist) {
1765 if (pmc->sfmode != omode)
1766 goto done;
1767 } else if (pmc->sfmode != omode) {
1768 /* allow mode switches for empty-set filters */
1769 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
1770 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
1771 NULL, 0);
1772 pmc->sfmode = omode;
1773 }
1774
1775 psl = pmc->sflist;
1776 if (!add) {
1777 if (!psl)
1778 goto done;
1779 rv = !0;
1780 for (i=0; i<psl->sl_count; i++) {
1781 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
1782 sizeof(__u32));
1783 if (rv == 0)
1784 break;
1785 }
1786 if (rv) /* source not found */
1787 goto done;
1788
1789 /* update the interface filter */
1790 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
1791 &mreqs->imr_sourceaddr, 1);
1792
1793 for (j=i+1; j<psl->sl_count; j++)
1794 psl->sl_addr[j-1] = psl->sl_addr[j];
1795 psl->sl_count--;
1796 err = 0;
1797 goto done;
1798 }
1799 /* else, add a new source to the filter */
1800
1801 if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
1802 err = -ENOBUFS;
1803 goto done;
1804 }
1805 if (!psl || psl->sl_count == psl->sl_max) {
1806 struct ip_sf_socklist *newpsl;
1807 int count = IP_SFBLOCK;
1808
1809 if (psl)
1810 count += psl->sl_max;
1811 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
1812 IP_SFLSIZE(count), GFP_KERNEL);
1813 if (!newpsl) {
1814 err = -ENOBUFS;
1815 goto done;
1816 }
1817 newpsl->sl_max = count;
1818 newpsl->sl_count = count - IP_SFBLOCK;
1819 if (psl) {
1820 for (i=0; i<psl->sl_count; i++)
1821 newpsl->sl_addr[i] = psl->sl_addr[i];
1822 sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
1823 }
1824 pmc->sflist = psl = newpsl;
1825 }
1826 rv = 1; /* > 0 for insert logic below if sl_count is 0 */
1827 for (i=0; i<psl->sl_count; i++) {
1828 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
1829 sizeof(__u32));
1830 if (rv == 0)
1831 break;
1832 }
1833 if (rv == 0) /* address already there is an error */
1834 goto done;
1835 for (j=psl->sl_count-1; j>=i; j--)
1836 psl->sl_addr[j+1] = psl->sl_addr[j];
1837 psl->sl_addr[i] = mreqs->imr_sourceaddr;
1838 psl->sl_count++;
1839 err = 0;
1840 /* update the interface list */
1841 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
1842 &mreqs->imr_sourceaddr, 1);
1843done:
1844 rtnl_shunlock();
1845 return err;
1846}
1847
1848int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1849{
1850 int err;
1851 struct ip_mreqn imr;
1852 u32 addr = msf->imsf_multiaddr;
1853 struct ip_mc_socklist *pmc;
1854 struct in_device *in_dev;
1855 struct inet_sock *inet = inet_sk(sk);
1856 struct ip_sf_socklist *newpsl, *psl;
1857
1858 if (!MULTICAST(addr))
1859 return -EINVAL;
1860 if (msf->imsf_fmode != MCAST_INCLUDE &&
1861 msf->imsf_fmode != MCAST_EXCLUDE)
1862 return -EINVAL;
1863
1864 rtnl_shlock();
1865
1866 imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
1867 imr.imr_address.s_addr = msf->imsf_interface;
1868 imr.imr_ifindex = ifindex;
1869 in_dev = ip_mc_find_dev(&imr);
1870
1871 if (!in_dev) {
1872 err = -ENODEV;
1873 goto done;
1874 }
1875 err = -EADDRNOTAVAIL;
1876
1877 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1878 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
1879 pmc->multi.imr_ifindex == imr.imr_ifindex)
1880 break;
1881 }
1882 if (!pmc) /* must have a prior join */
1883 goto done;
1884 if (msf->imsf_numsrc) {
1885 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
1886 IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
1887 if (!newpsl) {
1888 err = -ENOBUFS;
1889 goto done;
1890 }
1891 newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
1892 memcpy(newpsl->sl_addr, msf->imsf_slist,
1893 msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
1894 err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
1895 msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
1896 if (err) {
1897 sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
1898 goto done;
1899 }
1900 } else
1901 newpsl = NULL;
1902 psl = pmc->sflist;
1903 if (psl) {
1904 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
1905 psl->sl_count, psl->sl_addr, 0);
1906 sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
1907 } else
1908 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
1909 0, NULL, 0);
1910 pmc->sflist = newpsl;
1911 pmc->sfmode = msf->imsf_fmode;
1912done:
1913 rtnl_shunlock();
1914 return err;
1915}
1916
1917int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
1918 struct ip_msfilter __user *optval, int __user *optlen)
1919{
1920 int err, len, count, copycount;
1921 struct ip_mreqn imr;
1922 u32 addr = msf->imsf_multiaddr;
1923 struct ip_mc_socklist *pmc;
1924 struct in_device *in_dev;
1925 struct inet_sock *inet = inet_sk(sk);
1926 struct ip_sf_socklist *psl;
1927
1928 if (!MULTICAST(addr))
1929 return -EINVAL;
1930
1931 rtnl_shlock();
1932
1933 imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
1934 imr.imr_address.s_addr = msf->imsf_interface;
1935 imr.imr_ifindex = 0;
1936 in_dev = ip_mc_find_dev(&imr);
1937
1938 if (!in_dev) {
1939 err = -ENODEV;
1940 goto done;
1941 }
1942 err = -EADDRNOTAVAIL;
1943
1944 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1945 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
1946 pmc->multi.imr_ifindex == imr.imr_ifindex)
1947 break;
1948 }
1949 if (!pmc) /* must have a prior join */
1950 goto done;
1951 msf->imsf_fmode = pmc->sfmode;
1952 psl = pmc->sflist;
1953 rtnl_shunlock();
1954 if (!psl) {
1955 len = 0;
1956 count = 0;
1957 } else {
1958 count = psl->sl_count;
1959 }
1960 copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
1961 len = copycount * sizeof(psl->sl_addr[0]);
1962 msf->imsf_numsrc = count;
1963 if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
1964 copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
1965 return -EFAULT;
1966 }
1967 if (len &&
1968 copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len))
1969 return -EFAULT;
1970 return 0;
1971done:
1972 rtnl_shunlock();
1973 return err;
1974}
1975
1976int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
1977 struct group_filter __user *optval, int __user *optlen)
1978{
1979 int err, i, count, copycount;
1980 struct sockaddr_in *psin;
1981 u32 addr;
1982 struct ip_mc_socklist *pmc;
1983 struct inet_sock *inet = inet_sk(sk);
1984 struct ip_sf_socklist *psl;
1985
1986 psin = (struct sockaddr_in *)&gsf->gf_group;
1987 if (psin->sin_family != AF_INET)
1988 return -EINVAL;
1989 addr = psin->sin_addr.s_addr;
1990 if (!MULTICAST(addr))
1991 return -EINVAL;
1992
1993 rtnl_shlock();
1994
1995 err = -EADDRNOTAVAIL;
1996
1997 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1998 if (pmc->multi.imr_multiaddr.s_addr == addr &&
1999 pmc->multi.imr_ifindex == gsf->gf_interface)
2000 break;
2001 }
2002 if (!pmc) /* must have a prior join */
2003 goto done;
2004 gsf->gf_fmode = pmc->sfmode;
2005 psl = pmc->sflist;
2006 rtnl_shunlock();
2007 count = psl ? psl->sl_count : 0;
2008 copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
2009 gsf->gf_numsrc = count;
2010 if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
2011 copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
2012 return -EFAULT;
2013 }
2014 for (i=0; i<copycount; i++) {
2015 struct sockaddr_in *psin;
2016 struct sockaddr_storage ss;
2017
2018 psin = (struct sockaddr_in *)&ss;
2019 memset(&ss, 0, sizeof(ss));
2020 psin->sin_family = AF_INET;
2021 psin->sin_addr.s_addr = psl->sl_addr[i];
2022 if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
2023 return -EFAULT;
2024 }
2025 return 0;
2026done:
2027 rtnl_shunlock();
2028 return err;
2029}
2030
2031/*
2032 * check if a multicast source filter allows delivery for a given <src,dst,intf>
2033 */
2034int ip_mc_sf_allow(struct sock *sk, u32 loc_addr, u32 rmt_addr, int dif)
2035{
2036 struct inet_sock *inet = inet_sk(sk);
2037 struct ip_mc_socklist *pmc;
2038 struct ip_sf_socklist *psl;
2039 int i;
2040
2041 if (!MULTICAST(loc_addr))
2042 return 1;
2043
2044 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
2045 if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
2046 pmc->multi.imr_ifindex == dif)
2047 break;
2048 }
2049 if (!pmc)
2050 return 1;
2051 psl = pmc->sflist;
2052 if (!psl)
2053 return pmc->sfmode == MCAST_EXCLUDE;
2054
2055 for (i=0; i<psl->sl_count; i++) {
2056 if (psl->sl_addr[i] == rmt_addr)
2057 break;
2058 }
2059 if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
2060 return 0;
2061 if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
2062 return 0;
2063 return 1;
2064}
2065
2066/*
2067 * A socket is closing.
2068 */
2069
2070void ip_mc_drop_socket(struct sock *sk)
2071{
2072 struct inet_sock *inet = inet_sk(sk);
2073 struct ip_mc_socklist *iml;
2074
2075 if (inet->mc_list == NULL)
2076 return;
2077
2078 rtnl_lock();
2079 while ((iml = inet->mc_list) != NULL) {
2080 struct in_device *in_dev;
2081 inet->mc_list = iml->next;
2082
2083 if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) {
2084 (void) ip_mc_leave_src(sk, iml, in_dev);
2085 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
2086 in_dev_put(in_dev);
2087 }
2088 sock_kfree_s(sk, iml, sizeof(*iml));
2089
2090 }
2091 rtnl_unlock();
2092}
2093
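/* Check whether a packet for mc_addr from src_addr may be delivered on this
 * interface: the group must be joined and, when a source address is given,
 * the per-source include/exclude counters must allow it. IGMP itself is
 * accepted for any joined group.
 */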
2094int ip_check_mc(struct in_device *in_dev, u32 mc_addr, u32 src_addr, u16 proto)
2095{
2096 struct ip_mc_list *im;
2097 struct ip_sf_list *psf;
2098 int rv = 0;
2099
2100 read_lock(&in_dev->mc_list_lock);
2101 for (im=in_dev->mc_list; im; im=im->next) {
2102 if (im->multiaddr == mc_addr)
2103 break;
2104 }
2105 if (im && proto == IPPROTO_IGMP) {
2106 rv = 1;
2107 } else if (im) {
2108 if (src_addr) {
2109 for (psf=im->sources; psf; psf=psf->sf_next) {
2110 if (psf->sf_inaddr == src_addr)
2111 break;
2112 }
2113 if (psf)
2114 rv = psf->sf_count[MCAST_INCLUDE] ||
2115 psf->sf_count[MCAST_EXCLUDE] !=
2116 im->sfcount[MCAST_EXCLUDE];
2117 else
2118 rv = im->sfcount[MCAST_EXCLUDE] != 0;
2119 } else
2120 rv = 1; /* unspecified source; tentatively allow */
2121 }
2122 read_unlock(&in_dev->mc_list_lock);
2123 return rv;
2124}
2125
2126#if defined(CONFIG_PROC_FS)
2127struct igmp_mc_iter_state {
2128 struct net_device *dev;
2129 struct in_device *in_dev;
2130};
2131
2132#define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private)
2133
2134static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2135{
2136 struct ip_mc_list *im = NULL;
2137 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2138
2139 for (state->dev = dev_base, state->in_dev = NULL;
2140 state->dev;
2141 state->dev = state->dev->next) {
2142 struct in_device *in_dev;
2143 in_dev = in_dev_get(state->dev);
2144 if (!in_dev)
2145 continue;
2146 read_lock(&in_dev->mc_list_lock);
2147 im = in_dev->mc_list;
2148 if (im) {
2149 state->in_dev = in_dev;
2150 break;
2151 }
2152 read_unlock(&in_dev->mc_list_lock);
2153 in_dev_put(in_dev);
2154 }
2155 return im;
2156}
2157
2158static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
2159{
2160 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2161 im = im->next;
2162 while (!im) {
2163 if (likely(state->in_dev != NULL)) {
2164 read_unlock(&state->in_dev->mc_list_lock);
2165 in_dev_put(state->in_dev);
2166 }
2167 state->dev = state->dev->next;
2168 if (!state->dev) {
2169 state->in_dev = NULL;
2170 break;
2171 }
2172 state->in_dev = in_dev_get(state->dev);
2173 if (!state->in_dev)
2174 continue;
2175 read_lock(&state->in_dev->mc_list_lock);
2176 im = state->in_dev->mc_list;
2177 }
2178 return im;
2179}
2180
2181static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
2182{
2183 struct ip_mc_list *im = igmp_mc_get_first(seq);
2184 if (im)
2185 while (pos && (im = igmp_mc_get_next(seq, im)) != NULL)
2186 --pos;
2187 return pos ? NULL : im;
2188}
2189
2190static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
2191{
2192 read_lock(&dev_base_lock);
2193 return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2194}
2195
2196static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2197{
2198 struct ip_mc_list *im;
2199 if (v == SEQ_START_TOKEN)
2200 im = igmp_mc_get_first(seq);
2201 else
2202 im = igmp_mc_get_next(seq, v);
2203 ++*pos;
2204 return im;
2205}
2206
2207static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
2208{
2209 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2210 if (likely(state->in_dev != NULL)) {
2211 read_unlock(&state->in_dev->mc_list_lock);
2212 in_dev_put(state->in_dev);
2213 state->in_dev = NULL;
2214 }
2215 state->dev = NULL;
2216 read_unlock(&dev_base_lock);
2217}
2218
2219static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2220{
2221 if (v == SEQ_START_TOKEN)
2222 seq_puts(seq,
2223 "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
2224 else {
2225 struct ip_mc_list *im = (struct ip_mc_list *)v;
2226 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2227 char *querier;
2228#ifdef CONFIG_IP_MULTICAST
2229 querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
2230 IGMP_V2_SEEN(state->in_dev) ? "V2" :
2231 "V3";
2232#else
2233 querier = "NONE";
2234#endif
2235
2236 if (state->in_dev->mc_list == im) {
2237 seq_printf(seq, "%d\t%-10s: %5d %7s\n",
2238 state->dev->ifindex, state->dev->name, state->dev->mc_count, querier);
2239 }
2240
2241 seq_printf(seq,
2242 "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n",
2243 im->multiaddr, im->users,
2244 im->tm_running, im->tm_running ?
2245 jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
2246 im->reporter);
2247 }
2248 return 0;
2249}
2250
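Taken together, these format strings produce one header line, one line per device (emitted before that device's first group), and one indented line per group. As a purely illustrative example (hypothetical values; little-endian byte order for the %08lX group address), a host whose loopback interface has joined only 224.0.0.1 might show something like:

Idx	Device : Count Querier	Group Users Timer	Reporter
1	lo        :     1      V3
				010000E0     1 0:00000000		0
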
2251static struct seq_operations igmp_mc_seq_ops = {
2252 .start = igmp_mc_seq_start,
2253 .next = igmp_mc_seq_next,
2254 .stop = igmp_mc_seq_stop,
2255 .show = igmp_mc_seq_show,
2256};
2257
2258static int igmp_mc_seq_open(struct inode *inode, struct file *file)
2259{
2260 struct seq_file *seq;
2261 int rc = -ENOMEM;
2262 struct igmp_mc_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
2263
2264 if (!s)
2265 goto out;
2266 rc = seq_open(file, &igmp_mc_seq_ops);
2267 if (rc)
2268 goto out_kfree;
2269
2270 seq = file->private_data;
2271 seq->private = s;
2272 memset(s, 0, sizeof(*s));
2273out:
2274 return rc;
2275out_kfree:
2276 kfree(s);
2277 goto out;
2278}
2279
2280static struct file_operations igmp_mc_seq_fops = {
2281 .owner = THIS_MODULE,
2282 .open = igmp_mc_seq_open,
2283 .read = seq_read,
2284 .llseek = seq_lseek,
2285 .release = seq_release_private,
2286};
2287
2288struct igmp_mcf_iter_state {
2289 struct net_device *dev;
2290 struct in_device *idev;
2291 struct ip_mc_list *im;
2292};
2293
2294#define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private)
2295
2296static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2297{
2298 struct ip_sf_list *psf = NULL;
2299 struct ip_mc_list *im = NULL;
2300 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
2301
2302 for (state->dev = dev_base, state->idev = NULL, state->im = NULL;
2303 state->dev;
2304 state->dev = state->dev->next) {
2305 struct in_device *idev;
2306 idev = in_dev_get(state->dev);
2307 if (unlikely(idev == NULL))
2308 continue;
2309 read_lock(&idev->mc_list_lock);
2310 im = idev->mc_list;
2311 if (likely(im != NULL)) {
2312 spin_lock_bh(&im->lock);
2313 psf = im->sources;
2314 if (likely(psf != NULL)) {
2315 state->im = im;
2316 state->idev = idev;
2317 break;
2318 }
2319 spin_unlock_bh(&im->lock);
2320 }
2321 read_unlock(&idev->mc_list_lock);
2322 in_dev_put(idev);
2323 }
2324 return psf;
2325}
2326
2327static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf)
2328{
2329 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
2330
2331 psf = psf->sf_next;
2332 while (!psf) {
2333 spin_unlock_bh(&state->im->lock);
2334 state->im = state->im->next;
2335 while (!state->im) {
2336 if (likely(state->idev != NULL)) {
2337 read_unlock(&state->idev->mc_list_lock);
2338 in_dev_put(state->idev);
2339 }
2340 state->dev = state->dev->next;
2341 if (!state->dev) {
2342 state->idev = NULL;
2343 goto out;
2344 }
2345 state->idev = in_dev_get(state->dev);
2346 if (!state->idev)
2347 continue;
2348 read_lock(&state->idev->mc_list_lock);
2349 state->im = state->idev->mc_list;
2350 }
2351 if (!state->im)
2352 break;
2353 spin_lock_bh(&state->im->lock);
2354 psf = state->im->sources;
2355 }
2356out:
2357 return psf;
2358}
2359
2360static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
2361{
2362 struct ip_sf_list *psf = igmp_mcf_get_first(seq);
2363 if (psf)
2364 while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL)
2365 --pos;
2366 return pos ? NULL : psf;
2367}
2368
2369static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
2370{
2371 read_lock(&dev_base_lock);
2372 return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2373}
2374
2375static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2376{
2377 struct ip_sf_list *psf;
2378 if (v == SEQ_START_TOKEN)
2379 psf = igmp_mcf_get_first(seq);
2380 else
2381 psf = igmp_mcf_get_next(seq, v);
2382 ++*pos;
2383 return psf;
2384}
2385
2386static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
2387{
2388 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
2389 if (likely(state->im != NULL)) {
2390 spin_unlock_bh(&state->im->lock);
2391 state->im = NULL;
2392 }
2393 if (likely(state->idev != NULL)) {
2394 read_unlock(&state->idev->mc_list_lock);
2395 in_dev_put(state->idev);
2396 state->idev = NULL;
2397 }
2398 state->dev = NULL;
2399 read_unlock(&dev_base_lock);
2400}
2401
2402static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
2403{
2404 struct ip_sf_list *psf = (struct ip_sf_list *)v;
2405 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
2406
2407 if (v == SEQ_START_TOKEN) {
2408 seq_printf(seq,
2409 "%3s %6s "
2410 "%10s %10s %6s %6s\n", "Idx",
2411 "Device", "MCA",
2412 "SRC", "INC", "EXC");
2413 } else {
2414 seq_printf(seq,
2415 "%3d %6.6s 0x%08x "
2416 "0x%08x %6lu %6lu\n",
2417 state->dev->ifindex, state->dev->name,
2418 ntohl(state->im->multiaddr),
2419 ntohl(psf->sf_inaddr),
2420 psf->sf_count[MCAST_INCLUDE],
2421 psf->sf_count[MCAST_EXCLUDE]);
2422 }
2423 return 0;
2424}
2425
2426static struct seq_operations igmp_mcf_seq_ops = {
2427 .start = igmp_mcf_seq_start,
2428 .next = igmp_mcf_seq_next,
2429 .stop = igmp_mcf_seq_stop,
2430 .show = igmp_mcf_seq_show,
2431};
2432
2433static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
2434{
2435 struct seq_file *seq;
2436 int rc = -ENOMEM;
2437 struct igmp_mcf_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
2438
2439 if (!s)
2440 goto out;
2441 rc = seq_open(file, &igmp_mcf_seq_ops);
2442 if (rc)
2443 goto out_kfree;
2444
2445 seq = file->private_data;
2446 seq->private = s;
2447 memset(s, 0, sizeof(*s));
2448out:
2449 return rc;
2450out_kfree:
2451 kfree(s);
2452 goto out;
2453}
2454
2455static struct file_operations igmp_mcf_seq_fops = {
2456 .owner = THIS_MODULE,
2457 .open = igmp_mcf_seq_open,
2458 .read = seq_read,
2459 .llseek = seq_lseek,
2460 .release = seq_release_private,
2461};
2462
2463int __init igmp_mc_proc_init(void)
2464{
2465 proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops);
2466 proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
2467 return 0;
2468}
2469#endif
2470
2471EXPORT_SYMBOL(ip_mc_dec_group);
2472EXPORT_SYMBOL(ip_mc_inc_group);
2473EXPORT_SYMBOL(ip_mc_join_group);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
new file mode 100644
index 000000000000..95473953c406
--- /dev/null
+++ b/net/ipv4/inetpeer.c
@@ -0,0 +1,460 @@
1/*
2 * INETPEER - A storage for permanent information about peers
3 *
4 * This source is covered by the GNU GPL, the same as all kernel sources.
5 *
6 * Version: $Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $
7 *
8 * Authors: Andrey V. Savochkin <saw@msu.ru>
9 */
10
11#include <linux/module.h>
12#include <linux/types.h>
13#include <linux/slab.h>
14#include <linux/interrupt.h>
15#include <linux/spinlock.h>
16#include <linux/random.h>
17#include <linux/sched.h>
18#include <linux/timer.h>
19#include <linux/time.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/net.h>
23#include <net/inetpeer.h>
24
25/*
26 * Theory of operations.
 27 * We keep one entry for each peer IP address. Each node contains long-living
 28 * information about the peer which doesn't depend on routes.
 29 * At the moment this information consists only of the ID field for the next
 30 * outgoing IP packet. This field is incremented with each packet, as encoded
 31 * in the inet_getid() function (include/net/inetpeer.h).
 32 * At the time of writing, the identifier of IP packets is made unpredictable
 33 * by this code only for packets subjected (actually or potentially) to
 34 * defragmentation. I.e. DF packets smaller than the PMTU use a constant ID
 35 * and do not use this code (see
 36 * ip_select_ident() in include/net/ip.h).
37 *
38 * Route cache entries hold references to our nodes.
39 * New cache entries get references via lookup by destination IP address in
 40 * the avl tree. The reference is grabbed only when it's needed, i.e. only
 41 * when we try to output an IP packet which needs an unpredictable ID (see
 42 * __ip_select_ident() in net/ipv4/route.c).
 43 * Nodes are removed only when the reference counter goes to 0.
 44 * When that happens, the node is actually removed only after a sufficient
 45 * amount of time has passed since its last use. The least-recently-used
 46 * entry can also be removed if the pool is overloaded, i.e. if the total
 47 * number of entries is greater than or equal to the threshold.
48 *
49 * Node pool is organised as an AVL tree.
50 * Such an implementation has been chosen not just for fun. It's a way to
51 * prevent easy and efficient DoS attacks by creating hash collisions. A huge
 52 * number of long-living nodes in a single hash slot would significantly delay
 53 * lookups performed with BHs disabled.
54 *
55 * Serialisation issues.
56 * 1. Nodes may appear in the tree only with the pool write lock held.
57 * 2. Nodes may disappear from the tree only with the pool write lock held
58 * AND reference count being 0.
 59 * 3. Nodes appear and disappear from the unused node list only under
60 * "inet_peer_unused_lock".
61 * 4. Global variable peer_total is modified under the pool lock.
62 * 5. struct inet_peer fields modification:
63 * avl_left, avl_right, avl_parent, avl_height: pool lock
64 * unused_next, unused_prevp: unused node list lock
65 * refcnt: atomically against modifications on other CPU;
66 * usually under some other lock to prevent node disappearing
67 * dtime: unused node list lock
68 * v4daddr: unchangeable
69 * ip_id_count: idlock
70 */
71
72/* Exported for inet_getid inline function. */
73DEFINE_SPINLOCK(inet_peer_idlock);
74
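The idlock above serialises the per-peer packet ID counter that the theory-of-operations comment refers to. As a rough sketch of that use (the real helper is inet_getid() in include/net/inetpeer.h and may differ in detail; the function name below is purely illustrative):

/* Hedged sketch only: shows how inet_peer_idlock guards ip_id_count,
 * assuming ip_id_count is the peer's 16-bit packet ID counter. */
static inline __u16 peer_next_ip_id(struct inet_peer *p)
{
	__u16 id;

	spin_lock_bh(&inet_peer_idlock);	/* serialise against other CPUs/BH */
	id = p->ip_id_count++;			/* one ID per outgoing packet */
	spin_unlock_bh(&inet_peer_idlock);
	return id;
}
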
75static kmem_cache_t *peer_cachep;
76
77#define node_height(x) x->avl_height
78static struct inet_peer peer_fake_node = {
79 .avl_left = &peer_fake_node,
80 .avl_right = &peer_fake_node,
81 .avl_height = 0
82};
83#define peer_avl_empty (&peer_fake_node)
84static struct inet_peer *peer_root = peer_avl_empty;
85static DEFINE_RWLOCK(peer_pool_lock);
86#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
87
88static volatile int peer_total;
89/* Exported for sysctl_net_ipv4. */
90int inet_peer_threshold = 65536 + 128; /* start to throw entries more
91 * aggressively at this stage */
92int inet_peer_minttl = 120 * HZ; /* TTL under high load: 120 sec */
93int inet_peer_maxttl = 10 * 60 * HZ; /* usual time to live: 10 min */
94
95static struct inet_peer *inet_peer_unused_head;
96/* Exported for inet_putpeer inline function. */
97struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head;
98DEFINE_SPINLOCK(inet_peer_unused_lock);
99#define PEER_MAX_CLEANUP_WORK 30
100
101static void peer_check_expire(unsigned long dummy);
102static struct timer_list peer_periodic_timer =
103 TIMER_INITIALIZER(peer_check_expire, 0, 0);
104
105/* Exported for sysctl_net_ipv4. */
106int inet_peer_gc_mintime = 10 * HZ,
107 inet_peer_gc_maxtime = 120 * HZ;
108
109/* Called from ip_output.c:ip_init */
110void __init inet_initpeers(void)
111{
112 struct sysinfo si;
113
114 /* Use the straight interface to information about memory. */
115 si_meminfo(&si);
116 /* The values below were suggested by Alexey Kuznetsov
117 * <kuznet@ms2.inr.ac.ru>. I don't have any opinion about the values
118 * myself. --SAW
119 */
120 if (si.totalram <= (32768*1024)/PAGE_SIZE)
121 inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
122 if (si.totalram <= (16384*1024)/PAGE_SIZE)
123 inet_peer_threshold >>= 1; /* about 512KB */
124 if (si.totalram <= (8192*1024)/PAGE_SIZE)
125 inet_peer_threshold >>= 2; /* about 128KB */
126
127 peer_cachep = kmem_cache_create("inet_peer_cache",
128 sizeof(struct inet_peer),
129 0, SLAB_HWCACHE_ALIGN,
130 NULL, NULL);
131
132 if (!peer_cachep)
133 panic("cannot create inet_peer_cache");
134
135 /* All the timers, started at system startup tend
136 to synchronize. Perturb it a bit.
137 */
138 peer_periodic_timer.expires = jiffies
139 + net_random() % inet_peer_gc_maxtime
140 + inet_peer_gc_maxtime;
141 add_timer(&peer_periodic_timer);
142}
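To make the scaling above concrete: with 4 KB pages the three tests translate to 32 MB, 16 MB and 8 MB of total RAM, and they apply cumulatively. An 8 MB machine therefore ends up with ((65664 >> 1) >> 1) >> 2 = 4104 peer entries, a 16 MB machine with 16416, a 32 MB machine with 32832, and anything larger keeps the default 65664.
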
143
144/* Called with or without local BH being disabled. */
145static void unlink_from_unused(struct inet_peer *p)
146{
147 spin_lock_bh(&inet_peer_unused_lock);
148 if (p->unused_prevp != NULL) {
149 /* On unused list. */
150 *p->unused_prevp = p->unused_next;
151 if (p->unused_next != NULL)
152 p->unused_next->unused_prevp = p->unused_prevp;
153 else
154 inet_peer_unused_tailp = p->unused_prevp;
155 p->unused_prevp = NULL; /* mark it as removed */
156 }
157 spin_unlock_bh(&inet_peer_unused_lock);
158}
159
160/* Called with local BH disabled and the pool lock held. */
161#define lookup(daddr) \
162({ \
163 struct inet_peer *u, **v; \
164 stackptr = stack; \
165 *stackptr++ = &peer_root; \
166 for (u = peer_root; u != peer_avl_empty; ) { \
167 if (daddr == u->v4daddr) \
168 break; \
169 if (daddr < u->v4daddr) \
170 v = &u->avl_left; \
171 else \
172 v = &u->avl_right; \
173 *stackptr++ = v; \
174 u = *v; \
175 } \
176 u; \
177})
178
179/* Called with local BH disabled and the pool write lock held. */
180#define lookup_rightempty(start) \
181({ \
182 struct inet_peer *u, **v; \
183 *stackptr++ = &start->avl_left; \
184 v = &start->avl_left; \
185 for (u = *v; u->avl_right != peer_avl_empty; ) { \
186 v = &u->avl_right; \
187 *stackptr++ = v; \
188 u = *v; \
189 } \
190 u; \
191})
192
193/* Called with local BH disabled and the pool write lock held.
194 * Variable names are the proof of operation correctness.
195 * Look into mm/map_avl.c for a more detailed description of the ideas. */
196static void peer_avl_rebalance(struct inet_peer **stack[],
197 struct inet_peer ***stackend)
198{
199 struct inet_peer **nodep, *node, *l, *r;
200 int lh, rh;
201
202 while (stackend > stack) {
203 nodep = *--stackend;
204 node = *nodep;
205 l = node->avl_left;
206 r = node->avl_right;
207 lh = node_height(l);
208 rh = node_height(r);
209 if (lh > rh + 1) { /* l: RH+2 */
210 struct inet_peer *ll, *lr, *lrl, *lrr;
211 int lrh;
212 ll = l->avl_left;
213 lr = l->avl_right;
214 lrh = node_height(lr);
215 if (lrh <= node_height(ll)) { /* ll: RH+1 */
216 node->avl_left = lr; /* lr: RH or RH+1 */
217 node->avl_right = r; /* r: RH */
218 node->avl_height = lrh + 1; /* RH+1 or RH+2 */
219 l->avl_left = ll; /* ll: RH+1 */
220 l->avl_right = node; /* node: RH+1 or RH+2 */
221 l->avl_height = node->avl_height + 1;
222 *nodep = l;
223 } else { /* ll: RH, lr: RH+1 */
224 lrl = lr->avl_left; /* lrl: RH or RH-1 */
225 lrr = lr->avl_right; /* lrr: RH or RH-1 */
226 node->avl_left = lrr; /* lrr: RH or RH-1 */
227 node->avl_right = r; /* r: RH */
228 node->avl_height = rh + 1; /* node: RH+1 */
229 l->avl_left = ll; /* ll: RH */
230 l->avl_right = lrl; /* lrl: RH or RH-1 */
231 l->avl_height = rh + 1; /* l: RH+1 */
232 lr->avl_left = l; /* l: RH+1 */
233 lr->avl_right = node; /* node: RH+1 */
234 lr->avl_height = rh + 2;
235 *nodep = lr;
236 }
237 } else if (rh > lh + 1) { /* r: LH+2 */
238 struct inet_peer *rr, *rl, *rlr, *rll;
239 int rlh;
240 rr = r->avl_right;
241 rl = r->avl_left;
242 rlh = node_height(rl);
243 if (rlh <= node_height(rr)) { /* rr: LH+1 */
244 node->avl_right = rl; /* rl: LH or LH+1 */
245 node->avl_left = l; /* l: LH */
246 node->avl_height = rlh + 1; /* LH+1 or LH+2 */
247 r->avl_right = rr; /* rr: LH+1 */
248 r->avl_left = node; /* node: LH+1 or LH+2 */
249 r->avl_height = node->avl_height + 1;
250 *nodep = r;
251 } else { /* rr: RH, rl: RH+1 */
252 rlr = rl->avl_right; /* rlr: LH or LH-1 */
253 rll = rl->avl_left; /* rll: LH or LH-1 */
254 node->avl_right = rll; /* rll: LH or LH-1 */
255 node->avl_left = l; /* l: LH */
256 node->avl_height = lh + 1; /* node: LH+1 */
257 r->avl_right = rr; /* rr: LH */
258 r->avl_left = rlr; /* rlr: LH or LH-1 */
259 r->avl_height = lh + 1; /* r: LH+1 */
260 rl->avl_right = r; /* r: LH+1 */
261 rl->avl_left = node; /* node: LH+1 */
262 rl->avl_height = lh + 2;
263 *nodep = rl;
264 }
265 } else {
266 node->avl_height = (lh > rh ? lh : rh) + 1;
267 }
268 }
269}
270
271/* Called with local BH disabled and the pool write lock held. */
272#define link_to_pool(n) \
273do { \
274 n->avl_height = 1; \
275 n->avl_left = peer_avl_empty; \
276 n->avl_right = peer_avl_empty; \
277 **--stackptr = n; \
278 peer_avl_rebalance(stack, stackptr); \
279} while(0)
280
281/* May be called with local BH enabled. */
282static void unlink_from_pool(struct inet_peer *p)
283{
284 int do_free;
285
286 do_free = 0;
287
288 write_lock_bh(&peer_pool_lock);
289 /* Check the reference counter. It was artificially incremented by 1
290	 * in the cleanup() function to prevent it from suddenly disappearing. If the
291 * reference count is still 1 then the node is referenced only as `p'
292 * here and from the pool. So under the exclusive pool lock it's safe
293 * to remove the node and free it later. */
294 if (atomic_read(&p->refcnt) == 1) {
295 struct inet_peer **stack[PEER_MAXDEPTH];
296 struct inet_peer ***stackptr, ***delp;
297 if (lookup(p->v4daddr) != p)
298 BUG();
299 delp = stackptr - 1; /* *delp[0] == p */
300 if (p->avl_left == peer_avl_empty) {
301 *delp[0] = p->avl_right;
302 --stackptr;
303 } else {
304 /* look for a node to insert instead of p */
305 struct inet_peer *t;
306 t = lookup_rightempty(p);
307 if (*stackptr[-1] != t)
308 BUG();
309 **--stackptr = t->avl_left;
310 /* t is removed, t->v4daddr > x->v4daddr for any
311 * x in p->avl_left subtree.
312 * Put t in the old place of p. */
313 *delp[0] = t;
314 t->avl_left = p->avl_left;
315 t->avl_right = p->avl_right;
316 t->avl_height = p->avl_height;
317 if (delp[1] != &p->avl_left)
318 BUG();
319 delp[1] = &t->avl_left; /* was &p->avl_left */
320 }
321 peer_avl_rebalance(stack, stackptr);
322 peer_total--;
323 do_free = 1;
324 }
325 write_unlock_bh(&peer_pool_lock);
326
327 if (do_free)
328 kmem_cache_free(peer_cachep, p);
329 else
330 /* The node is used again. Decrease the reference counter
331 * back. The loop "cleanup -> unlink_from_unused
332 * -> unlink_from_pool -> putpeer -> link_to_unused
333 * -> cleanup (for the same node)"
334 * doesn't really exist because the entry will have a
335 * recent deletion time and will not be cleaned again soon. */
336 inet_putpeer(p);
337}
338
339/* May be called with local BH enabled. */
340static int cleanup_once(unsigned long ttl)
341{
342 struct inet_peer *p;
343
344 /* Remove the first entry from the list of unused nodes. */
345 spin_lock_bh(&inet_peer_unused_lock);
346 p = inet_peer_unused_head;
347 if (p != NULL) {
348 if (time_after(p->dtime + ttl, jiffies)) {
349 /* Do not prune fresh entries. */
350 spin_unlock_bh(&inet_peer_unused_lock);
351 return -1;
352 }
353 inet_peer_unused_head = p->unused_next;
354 if (p->unused_next != NULL)
355 p->unused_next->unused_prevp = p->unused_prevp;
356 else
357 inet_peer_unused_tailp = p->unused_prevp;
358 p->unused_prevp = NULL; /* mark as not on the list */
359 /* Grab an extra reference to prevent node disappearing
360 * before unlink_from_pool() call. */
361 atomic_inc(&p->refcnt);
362 }
363 spin_unlock_bh(&inet_peer_unused_lock);
364
365 if (p == NULL)
366 /* It means that the total number of USED entries has
367 * grown over inet_peer_threshold. It shouldn't really
368 * happen because of entry limits in route cache. */
369 return -1;
370
371 unlink_from_pool(p);
372 return 0;
373}
374
375/* Called with or without local BH being disabled. */
376struct inet_peer *inet_getpeer(__u32 daddr, int create)
377{
378 struct inet_peer *p, *n;
379 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
380
381	/* Look up the address quickly. */
382 read_lock_bh(&peer_pool_lock);
383 p = lookup(daddr);
384 if (p != peer_avl_empty)
385 atomic_inc(&p->refcnt);
386 read_unlock_bh(&peer_pool_lock);
387
388 if (p != peer_avl_empty) {
389 /* The existing node has been found. */
390 /* Remove the entry from unused list if it was there. */
391 unlink_from_unused(p);
392 return p;
393 }
394
395 if (!create)
396 return NULL;
397
398 /* Allocate the space outside the locked region. */
399 n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
400 if (n == NULL)
401 return NULL;
402 n->v4daddr = daddr;
403 atomic_set(&n->refcnt, 1);
404 n->ip_id_count = secure_ip_id(daddr);
405 n->tcp_ts_stamp = 0;
406
407 write_lock_bh(&peer_pool_lock);
408 /* Check if an entry has suddenly appeared. */
409 p = lookup(daddr);
410 if (p != peer_avl_empty)
411 goto out_free;
412
413 /* Link the node. */
414 link_to_pool(n);
415 n->unused_prevp = NULL; /* not on the list */
416 peer_total++;
417 write_unlock_bh(&peer_pool_lock);
418
419 if (peer_total >= inet_peer_threshold)
420 /* Remove one less-recently-used entry. */
421 cleanup_once(0);
422
423 return n;
424
425out_free:
426 /* The appropriate node is already in the pool. */
427 atomic_inc(&p->refcnt);
428 write_unlock_bh(&peer_pool_lock);
429 /* Remove the entry from unused list if it was there. */
430 unlink_from_unused(p);
431	/* Free the preallocated node. */
432 kmem_cache_free(peer_cachep, n);
433 return p;
434}
435
436/* Called with local BH disabled. */
437static void peer_check_expire(unsigned long dummy)
438{
439 int i;
440 int ttl;
441
442 if (peer_total >= inet_peer_threshold)
443 ttl = inet_peer_minttl;
444 else
445 ttl = inet_peer_maxttl
446 - (inet_peer_maxttl - inet_peer_minttl) / HZ *
447 peer_total / inet_peer_threshold * HZ;
448 for (i = 0; i < PEER_MAX_CLEANUP_WORK && !cleanup_once(ttl); i++);
449
450 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
451 * interval depending on the total number of entries (more entries,
452 * less interval). */
453 peer_periodic_timer.expires = jiffies
454 + inet_peer_gc_maxtime
455 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
456 peer_total / inet_peer_threshold * HZ;
457 add_timer(&peer_periodic_timer);
458}
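Both expressions are linear interpolations between the min and max values, written so that the integer division by HZ happens first and the result is scaled back to jiffies at the end. For example, with the default threshold of 65664 and peer_total = 32832 (pool half full), the eviction TTL works out to 600 - (480 * 32832 / 65664) = 360 seconds, and the next garbage-collection run is scheduled 120 - (110 * 32832 / 65664) = 65 seconds ahead; an empty pool uses the full 600 s / 120 s, a full one drops to 120 s / 10 s.
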
459
460EXPORT_SYMBOL(inet_peer_idlock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
new file mode 100644
index 000000000000..77094aac6c28
--- /dev/null
+++ b/net/ipv4/ip_forward.c
@@ -0,0 +1,127 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The IP forwarding functionality.
7 *
8 * Version: $Id: ip_forward.c,v 1.48 2000/12/13 18:31:48 davem Exp $
9 *
10 * Authors: see ip.c
11 *
12 * Fixes:
13 * Many : Split from ip.c , see ip_input.c for
14 * history.
15 * Dave Gregorich : NULL ip_rt_put fix for multicast
16 * routing.
17 * Jos Vos : Add call_out_firewall before sending,
18 * use output device for accounting.
19 * Jos Vos : Call forward firewall after routing
20 * (always use output device).
21 * Mike McLagan : Routing by source
22 */
23
24#include <linux/config.h>
25#include <linux/types.h>
26#include <linux/mm.h>
27#include <linux/sched.h>
28#include <linux/skbuff.h>
29#include <linux/ip.h>
30#include <linux/icmp.h>
31#include <linux/netdevice.h>
32#include <net/sock.h>
33#include <net/ip.h>
34#include <net/tcp.h>
35#include <net/udp.h>
36#include <net/icmp.h>
37#include <linux/tcp.h>
38#include <linux/udp.h>
39#include <linux/netfilter_ipv4.h>
40#include <net/checksum.h>
41#include <linux/route.h>
42#include <net/route.h>
43#include <net/xfrm.h>
44
45static inline int ip_forward_finish(struct sk_buff *skb)
46{
47 struct ip_options * opt = &(IPCB(skb)->opt);
48
49 IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
50
51 if (unlikely(opt->optlen))
52 ip_forward_options(skb);
53
54 return dst_output(skb);
55}
56
57int ip_forward(struct sk_buff *skb)
58{
59 struct iphdr *iph; /* Our header */
60 struct rtable *rt; /* Route we use */
61 struct ip_options * opt = &(IPCB(skb)->opt);
62
63 if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
64 goto drop;
65
66 if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
67 return NET_RX_SUCCESS;
68
69 if (skb->pkt_type != PACKET_HOST)
70 goto drop;
71
72 skb->ip_summed = CHECKSUM_NONE;
73
74 /*
75 * According to the RFC, we must first decrease the TTL field. If
 76	 * that reaches zero, we must reply with an ICMP control message telling
77 * that the packet's lifetime expired.
78 */
79
80 iph = skb->nh.iph;
81
82 if (iph->ttl <= 1)
83 goto too_many_hops;
84
85 if (!xfrm4_route_forward(skb))
86 goto drop;
87
88 iph = skb->nh.iph;
89 rt = (struct rtable*)skb->dst;
90
91 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
92 goto sr_failed;
93
94 /* We are about to mangle packet. Copy it! */
95 if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
96 goto drop;
97 iph = skb->nh.iph;
98
99 /* Decrease ttl after skb cow done */
100 ip_decrease_ttl(iph);
101
102 /*
103 * We now generate an ICMP HOST REDIRECT giving the route
104 * we calculated.
105 */
106 if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)
107 ip_rt_send_redirect(skb);
108
109 skb->priority = rt_tos2priority(iph->tos);
110
111 return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev,
112 ip_forward_finish);
113
114sr_failed:
115 /*
116 * Strict routing permits no gatewaying
117 */
118 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
119 goto drop;
120
121too_many_hops:
122 /* Tell the sender its packet died... */
123 icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
124drop:
125 kfree_skb(skb);
126 return NET_RX_DROP;
127}
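The "Decrease ttl after skb cow done" step above works without recomputing the whole header checksum because the checksum can be patched incrementally (RFC 1624): lowering TTL by one lowers the 16-bit word holding TTL and protocol by 0x0100, so the stored one's-complement checksum must rise by the same amount, with the carry folded back in. A hedged sketch of that update, in the spirit of ip_decrease_ttl() from include/net/ip.h (the in-tree helper may differ in detail; the name below is illustrative):

/* Sketch only: incremental TTL decrement with RFC 1624 checksum fix-up. */
static inline void ip_ttl_dec_sketch(struct iphdr *iph)
{
	u32 check = iph->check;

	check += htons(0x0100);			/* header sum dropped by 0x0100, so ~sum rises */
	iph->check = check + (check >= 0xFFFF);	/* fold the end-around carry */
	iph->ttl--;
}
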
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
new file mode 100644
index 000000000000..7f68e27eb4ea
--- /dev/null
+++ b/net/ipv4/ip_fragment.c
@@ -0,0 +1,691 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The IP fragmentation functionality.
7 *
8 * Version: $Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $
9 *
10 * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox <Alan.Cox@linux.org>
12 *
13 * Fixes:
14 * Alan Cox : Split from ip.c , see ip_input.c for history.
15 * David S. Miller : Begin massive cleanup...
16 * Andi Kleen : Add sysctls.
17 * xxxx : Overlapfrag bug.
18 * Ultima : ip_expire() kernel panic.
19 * Bill Hawes : Frag accounting and evictor fixes.
20 * John McDonald : 0 length frag bug.
21 * Alexey Kuznetsov: SMP races, threading, cleanup.
22 * Patrick McHardy : LRU queue of frag heads for evictor.
23 */
24
25#include <linux/config.h>
26#include <linux/module.h>
27#include <linux/types.h>
28#include <linux/mm.h>
29#include <linux/jiffies.h>
30#include <linux/skbuff.h>
31#include <linux/list.h>
32#include <linux/ip.h>
33#include <linux/icmp.h>
34#include <linux/netdevice.h>
35#include <linux/jhash.h>
36#include <linux/random.h>
37#include <net/sock.h>
38#include <net/ip.h>
39#include <net/icmp.h>
40#include <net/checksum.h>
41#include <linux/tcp.h>
42#include <linux/udp.h>
43#include <linux/inet.h>
44#include <linux/netfilter_ipv4.h>
45
46/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
47 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
48 * as well. Or notify me, at least. --ANK
49 */
50
51/* Fragment cache limits. We will commit 256K at one time. Should we
52 * cross that limit we will prune down to 192K. This should cope with
53 * even the most extreme cases without allowing an attacker to measurably
54 * harm machine performance.
55 */
56int sysctl_ipfrag_high_thresh = 256*1024;
57int sysctl_ipfrag_low_thresh = 192*1024;
58
59/* Important NOTE! Fragment queue must be destroyed before MSL expires.
 60 * RFC 791 is wrong in proposing to prolong the timer by the TTL on each fragment arrival.
61 */
62int sysctl_ipfrag_time = IP_FRAG_TIME;
63
64struct ipfrag_skb_cb
65{
66 struct inet_skb_parm h;
67 int offset;
68};
69
70#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb))
71
72/* Describe an entry in the "incomplete datagrams" queue. */
73struct ipq {
74 struct ipq *next; /* linked list pointers */
75 struct list_head lru_list; /* lru list member */
76 u32 user;
77 u32 saddr;
78 u32 daddr;
79 u16 id;
80 u8 protocol;
81 u8 last_in;
82#define COMPLETE 4
83#define FIRST_IN 2
84#define LAST_IN 1
85
86 struct sk_buff *fragments; /* linked list of received fragments */
87 int len; /* total length of original datagram */
88 int meat;
89 spinlock_t lock;
90 atomic_t refcnt;
91 struct timer_list timer; /* when will this queue expire? */
92 struct ipq **pprev;
93 int iif;
94 struct timeval stamp;
95};
96
97/* Hash table. */
98
99#define IPQ_HASHSZ 64
100
101/* Per-bucket lock is easy to add now. */
102static struct ipq *ipq_hash[IPQ_HASHSZ];
103static DEFINE_RWLOCK(ipfrag_lock);
104static u32 ipfrag_hash_rnd;
105static LIST_HEAD(ipq_lru_list);
106int ip_frag_nqueues = 0;
107
108static __inline__ void __ipq_unlink(struct ipq *qp)
109{
110 if(qp->next)
111 qp->next->pprev = qp->pprev;
112 *qp->pprev = qp->next;
113 list_del(&qp->lru_list);
114 ip_frag_nqueues--;
115}
116
117static __inline__ void ipq_unlink(struct ipq *ipq)
118{
119 write_lock(&ipfrag_lock);
120 __ipq_unlink(ipq);
121 write_unlock(&ipfrag_lock);
122}
123
124static unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot)
125{
126 return jhash_3words((u32)id << 16 | prot, saddr, daddr,
127 ipfrag_hash_rnd) & (IPQ_HASHSZ - 1);
128}
129
130static struct timer_list ipfrag_secret_timer;
131int sysctl_ipfrag_secret_interval = 10 * 60 * HZ;
132
133static void ipfrag_secret_rebuild(unsigned long dummy)
134{
135 unsigned long now = jiffies;
136 int i;
137
138 write_lock(&ipfrag_lock);
139 get_random_bytes(&ipfrag_hash_rnd, sizeof(u32));
140 for (i = 0; i < IPQ_HASHSZ; i++) {
141 struct ipq *q;
142
143 q = ipq_hash[i];
144 while (q) {
145 struct ipq *next = q->next;
146 unsigned int hval = ipqhashfn(q->id, q->saddr,
147 q->daddr, q->protocol);
148
149 if (hval != i) {
150 /* Unlink. */
151 if (q->next)
152 q->next->pprev = q->pprev;
153 *q->pprev = q->next;
154
155 /* Relink to new hash chain. */
156 if ((q->next = ipq_hash[hval]) != NULL)
157 q->next->pprev = &q->next;
158 ipq_hash[hval] = q;
159 q->pprev = &ipq_hash[hval];
160 }
161
162 q = next;
163 }
164 }
165 write_unlock(&ipfrag_lock);
166
167 mod_timer(&ipfrag_secret_timer, now + sysctl_ipfrag_secret_interval);
168}
169
170atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */
171
172/* Memory Tracking Functions. */
173static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work)
174{
175 if (work)
176 *work -= skb->truesize;
177 atomic_sub(skb->truesize, &ip_frag_mem);
178 kfree_skb(skb);
179}
180
181static __inline__ void frag_free_queue(struct ipq *qp, int *work)
182{
183 if (work)
184 *work -= sizeof(struct ipq);
185 atomic_sub(sizeof(struct ipq), &ip_frag_mem);
186 kfree(qp);
187}
188
189static __inline__ struct ipq *frag_alloc_queue(void)
190{
191 struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC);
192
193 if(!qp)
194 return NULL;
195 atomic_add(sizeof(struct ipq), &ip_frag_mem);
196 return qp;
197}
198
199
200/* Destruction primitives. */
201
202/* Complete destruction of ipq. */
203static void ip_frag_destroy(struct ipq *qp, int *work)
204{
205 struct sk_buff *fp;
206
207 BUG_TRAP(qp->last_in&COMPLETE);
208 BUG_TRAP(del_timer(&qp->timer) == 0);
209
210 /* Release all fragment data. */
211 fp = qp->fragments;
212 while (fp) {
213 struct sk_buff *xp = fp->next;
214
215 frag_kfree_skb(fp, work);
216 fp = xp;
217 }
218
219 /* Finally, release the queue descriptor itself. */
220 frag_free_queue(qp, work);
221}
222
223static __inline__ void ipq_put(struct ipq *ipq, int *work)
224{
225 if (atomic_dec_and_test(&ipq->refcnt))
226 ip_frag_destroy(ipq, work);
227}
228
229/* Kill ipq entry. It is not destroyed immediately,
230 * because caller (and someone more) holds reference count.
231 */
232static void ipq_kill(struct ipq *ipq)
233{
234 if (del_timer(&ipq->timer))
235 atomic_dec(&ipq->refcnt);
236
237 if (!(ipq->last_in & COMPLETE)) {
238 ipq_unlink(ipq);
239 atomic_dec(&ipq->refcnt);
240 ipq->last_in |= COMPLETE;
241 }
242}
243
244/* Memory limiting on fragments. Evictor trashes the oldest
245 * fragment queue until we are back under the threshold.
246 */
247static void ip_evictor(void)
248{
249 struct ipq *qp;
250 struct list_head *tmp;
251 int work;
252
253 work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh;
254 if (work <= 0)
255 return;
256
257 while (work > 0) {
258 read_lock(&ipfrag_lock);
259 if (list_empty(&ipq_lru_list)) {
260 read_unlock(&ipfrag_lock);
261 return;
262 }
263 tmp = ipq_lru_list.next;
264 qp = list_entry(tmp, struct ipq, lru_list);
265 atomic_inc(&qp->refcnt);
266 read_unlock(&ipfrag_lock);
267
268 spin_lock(&qp->lock);
269 if (!(qp->last_in&COMPLETE))
270 ipq_kill(qp);
271 spin_unlock(&qp->lock);
272
273 ipq_put(qp, &work);
274 IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
275 }
276}
277
278/*
279 * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
280 */
281static void ip_expire(unsigned long arg)
282{
283 struct ipq *qp = (struct ipq *) arg;
284
285 spin_lock(&qp->lock);
286
287 if (qp->last_in & COMPLETE)
288 goto out;
289
290 ipq_kill(qp);
291
292 IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT);
293 IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
294
295 if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) {
296 struct sk_buff *head = qp->fragments;
297 /* Send an ICMP "Fragment Reassembly Timeout" message. */
298 if ((head->dev = dev_get_by_index(qp->iif)) != NULL) {
299 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
300 dev_put(head->dev);
301 }
302 }
303out:
304 spin_unlock(&qp->lock);
305 ipq_put(qp, NULL);
306}
307
308/* Creation primitives. */
309
310static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
311{
312 struct ipq *qp;
313
314 write_lock(&ipfrag_lock);
315#ifdef CONFIG_SMP
316	/* With SMP we have to recheck the hash table, because such an
317	 * entry could have been created on another cpu while we switched
318	 * from the read lock to the write lock.
319 */
320 for(qp = ipq_hash[hash]; qp; qp = qp->next) {
321 if(qp->id == qp_in->id &&
322 qp->saddr == qp_in->saddr &&
323 qp->daddr == qp_in->daddr &&
324 qp->protocol == qp_in->protocol &&
325 qp->user == qp_in->user) {
326 atomic_inc(&qp->refcnt);
327 write_unlock(&ipfrag_lock);
328 qp_in->last_in |= COMPLETE;
329 ipq_put(qp_in, NULL);
330 return qp;
331 }
332 }
333#endif
334 qp = qp_in;
335
336 if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time))
337 atomic_inc(&qp->refcnt);
338
339 atomic_inc(&qp->refcnt);
340 if((qp->next = ipq_hash[hash]) != NULL)
341 qp->next->pprev = &qp->next;
342 ipq_hash[hash] = qp;
343 qp->pprev = &ipq_hash[hash];
344 INIT_LIST_HEAD(&qp->lru_list);
345 list_add_tail(&qp->lru_list, &ipq_lru_list);
346 ip_frag_nqueues++;
347 write_unlock(&ipfrag_lock);
348 return qp;
349}
350
351/* Add an entry to the 'ipq' queue for a newly received IP datagram. */
352static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
353{
354 struct ipq *qp;
355
356 if ((qp = frag_alloc_queue()) == NULL)
357 goto out_nomem;
358
359 qp->protocol = iph->protocol;
360 qp->last_in = 0;
361 qp->id = iph->id;
362 qp->saddr = iph->saddr;
363 qp->daddr = iph->daddr;
364 qp->user = user;
365 qp->len = 0;
366 qp->meat = 0;
367 qp->fragments = NULL;
368 qp->iif = 0;
369
370 /* Initialize a timer for this entry. */
371 init_timer(&qp->timer);
372 qp->timer.data = (unsigned long) qp; /* pointer to queue */
373 qp->timer.function = ip_expire; /* expire function */
374 spin_lock_init(&qp->lock);
375 atomic_set(&qp->refcnt, 1);
376
377 return ip_frag_intern(hash, qp);
378
379out_nomem:
380 NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n"));
381 return NULL;
382}
383
384/* Find the correct entry in the "incomplete datagrams" queue for
385 * this IP datagram, and create a new one if nothing is found.
386 */
387static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
388{
389 __u16 id = iph->id;
390 __u32 saddr = iph->saddr;
391 __u32 daddr = iph->daddr;
392 __u8 protocol = iph->protocol;
393 unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
394 struct ipq *qp;
395
396 read_lock(&ipfrag_lock);
397 for(qp = ipq_hash[hash]; qp; qp = qp->next) {
398 if(qp->id == id &&
399 qp->saddr == saddr &&
400 qp->daddr == daddr &&
401 qp->protocol == protocol &&
402 qp->user == user) {
403 atomic_inc(&qp->refcnt);
404 read_unlock(&ipfrag_lock);
405 return qp;
406 }
407 }
408 read_unlock(&ipfrag_lock);
409
410 return ip_frag_create(hash, iph, user);
411}
412
413/* Add new segment to existing queue. */
414static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
415{
416 struct sk_buff *prev, *next;
417 int flags, offset;
418 int ihl, end;
419
420 if (qp->last_in & COMPLETE)
421 goto err;
422
423 offset = ntohs(skb->nh.iph->frag_off);
424 flags = offset & ~IP_OFFSET;
425 offset &= IP_OFFSET;
426 offset <<= 3; /* offset is in 8-byte chunks */
427 ihl = skb->nh.iph->ihl * 4;
428
429 /* Determine the position of this fragment. */
430 end = offset + skb->len - ihl;
431
432 /* Is this the final fragment? */
433 if ((flags & IP_MF) == 0) {
434 /* If we already have some bits beyond end
435 * or have different end, the segment is corrrupted.
436 */
437 if (end < qp->len ||
438 ((qp->last_in & LAST_IN) && end != qp->len))
439 goto err;
440 qp->last_in |= LAST_IN;
441 qp->len = end;
442 } else {
443 if (end&7) {
444 end &= ~7;
445 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
446 skb->ip_summed = CHECKSUM_NONE;
447 }
448 if (end > qp->len) {
449 /* Some bits beyond end -> corruption. */
450 if (qp->last_in & LAST_IN)
451 goto err;
452 qp->len = end;
453 }
454 }
455 if (end == offset)
456 goto err;
457
458 if (pskb_pull(skb, ihl) == NULL)
459 goto err;
460 if (pskb_trim(skb, end-offset))
461 goto err;
462
463 /* Find out which fragments are in front and at the back of us
464 * in the chain of fragments so far. We must know where to put
465 * this fragment, right?
466 */
467 prev = NULL;
468 for(next = qp->fragments; next != NULL; next = next->next) {
469 if (FRAG_CB(next)->offset >= offset)
470 break; /* bingo! */
471 prev = next;
472 }
473
474 /* We found where to put this one. Check for overlap with
475 * preceding fragment, and, if needed, align things so that
476 * any overlaps are eliminated.
477 */
478 if (prev) {
479 int i = (FRAG_CB(prev)->offset + prev->len) - offset;
480
481 if (i > 0) {
482 offset += i;
483 if (end <= offset)
484 goto err;
485 if (!pskb_pull(skb, i))
486 goto err;
487 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
488 skb->ip_summed = CHECKSUM_NONE;
489 }
490 }
491
492 while (next && FRAG_CB(next)->offset < end) {
493 int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
494
495 if (i < next->len) {
496 /* Eat head of the next overlapped fragment
497 * and leave the loop. The next ones cannot overlap.
498 */
499 if (!pskb_pull(next, i))
500 goto err;
501 FRAG_CB(next)->offset += i;
502 qp->meat -= i;
503 if (next->ip_summed != CHECKSUM_UNNECESSARY)
504 next->ip_summed = CHECKSUM_NONE;
505 break;
506 } else {
507 struct sk_buff *free_it = next;
508
509			/* Old fragment is completely overridden by the
510			 * new one; drop it.
511 */
512 next = next->next;
513
514 if (prev)
515 prev->next = next;
516 else
517 qp->fragments = next;
518
519 qp->meat -= free_it->len;
520 frag_kfree_skb(free_it, NULL);
521 }
522 }
523
524 FRAG_CB(skb)->offset = offset;
525
526 /* Insert this fragment in the chain of fragments. */
527 skb->next = next;
528 if (prev)
529 prev->next = skb;
530 else
531 qp->fragments = skb;
532
533 if (skb->dev)
534 qp->iif = skb->dev->ifindex;
535 skb->dev = NULL;
536 qp->stamp = skb->stamp;
537 qp->meat += skb->len;
538 atomic_add(skb->truesize, &ip_frag_mem);
539 if (offset == 0)
540 qp->last_in |= FIRST_IN;
541
542 write_lock(&ipfrag_lock);
543 list_move_tail(&qp->lru_list, &ipq_lru_list);
544 write_unlock(&ipfrag_lock);
545
546 return;
547
548err:
549 kfree_skb(skb);
550}
551
552
553/* Build a new IP datagram from all its fragments. */
554
555static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
556{
557 struct iphdr *iph;
558 struct sk_buff *fp, *head = qp->fragments;
559 int len;
560 int ihlen;
561
562 ipq_kill(qp);
563
564 BUG_TRAP(head != NULL);
565 BUG_TRAP(FRAG_CB(head)->offset == 0);
566
567 /* Allocate a new buffer for the datagram. */
568 ihlen = head->nh.iph->ihl*4;
569 len = ihlen + qp->len;
570
571 if(len > 65535)
572 goto out_oversize;
573
574 /* Head of list must not be cloned. */
575 if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
576 goto out_nomem;
577
578 /* If the first fragment is fragmented itself, we split
579	 * it into two chunks: the first with the data and paged part
580	 * and the second holding only fragments. */
581 if (skb_shinfo(head)->frag_list) {
582 struct sk_buff *clone;
583 int i, plen = 0;
584
585 if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
586 goto out_nomem;
587 clone->next = head->next;
588 head->next = clone;
589 skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
590 skb_shinfo(head)->frag_list = NULL;
591 for (i=0; i<skb_shinfo(head)->nr_frags; i++)
592 plen += skb_shinfo(head)->frags[i].size;
593 clone->len = clone->data_len = head->data_len - plen;
594 head->data_len -= clone->len;
595 head->len -= clone->len;
596 clone->csum = 0;
597 clone->ip_summed = head->ip_summed;
598 atomic_add(clone->truesize, &ip_frag_mem);
599 }
600
601 skb_shinfo(head)->frag_list = head->next;
602 skb_push(head, head->data - head->nh.raw);
603 atomic_sub(head->truesize, &ip_frag_mem);
604
605 for (fp=head->next; fp; fp = fp->next) {
606 head->data_len += fp->len;
607 head->len += fp->len;
608 if (head->ip_summed != fp->ip_summed)
609 head->ip_summed = CHECKSUM_NONE;
610 else if (head->ip_summed == CHECKSUM_HW)
611 head->csum = csum_add(head->csum, fp->csum);
612 head->truesize += fp->truesize;
613 atomic_sub(fp->truesize, &ip_frag_mem);
614 }
615
616 head->next = NULL;
617 head->dev = dev;
618 head->stamp = qp->stamp;
619
620 iph = head->nh.iph;
621 iph->frag_off = 0;
622 iph->tot_len = htons(len);
623 IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
624 qp->fragments = NULL;
625 return head;
626
627out_nomem:
628 NETDEBUG(if (net_ratelimit())
629 printk(KERN_ERR
630 "IP: queue_glue: no memory for gluing queue %p\n",
631 qp));
632 goto out_fail;
633out_oversize:
634 if (net_ratelimit())
635 printk(KERN_INFO
636 "Oversized IP packet from %d.%d.%d.%d.\n",
637 NIPQUAD(qp->saddr));
638out_fail:
639 IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
640 return NULL;
641}
642
643/* Process an incoming IP datagram fragment. */
644struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user)
645{
646 struct iphdr *iph = skb->nh.iph;
647 struct ipq *qp;
648 struct net_device *dev;
649
650 IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS);
651
652 /* Start by cleaning up the memory. */
653 if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
654 ip_evictor();
655
656 dev = skb->dev;
657
658 /* Lookup (or create) queue header */
659 if ((qp = ip_find(iph, user)) != NULL) {
660 struct sk_buff *ret = NULL;
661
662 spin_lock(&qp->lock);
663
664 ip_frag_queue(qp, skb);
665
666 if (qp->last_in == (FIRST_IN|LAST_IN) &&
667 qp->meat == qp->len)
668 ret = ip_frag_reasm(qp, dev);
669
670 spin_unlock(&qp->lock);
671 ipq_put(qp, NULL);
672 return ret;
673 }
674
675 IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
676 kfree_skb(skb);
677 return NULL;
678}
679
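A quick usage note: callers hand every fragment to ip_defrag() and only keep processing when it returns a reassembled skb; a NULL return means the fragment was queued (or dropped) and there is nothing further to do. A hedged sketch of a typical call site, modelled on ip_local_deliver() in ip_input.c (the IP_DEFRAG_LOCAL_DELIVER tag stands in for the `user` argument above and is assumed here):

	/* Sketch of a caller: reassemble before local delivery. */
	if (skb->nh.iph->frag_off & htons(IP_MF | IP_OFFSET)) {
		skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
		if (!skb)
			return 0;	/* queued or dropped; reassembly not finished */
	}
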
680void ipfrag_init(void)
681{
682 ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
683 (jiffies ^ (jiffies >> 6)));
684
685 init_timer(&ipfrag_secret_timer);
686 ipfrag_secret_timer.function = ipfrag_secret_rebuild;
687 ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval;
688 add_timer(&ipfrag_secret_timer);
689}
690
691EXPORT_SYMBOL(ip_defrag);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
new file mode 100644
index 000000000000..884835522224
--- /dev/null
+++ b/net/ipv4/ip_gre.c
@@ -0,0 +1,1290 @@
1/*
2 * Linux NET3: GRE over IP protocol decoder.
3 *
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <linux/types.h>
16#include <linux/sched.h>
17#include <linux/kernel.h>
18#include <asm/uaccess.h>
19#include <linux/skbuff.h>
20#include <linux/netdevice.h>
21#include <linux/in.h>
22#include <linux/tcp.h>
23#include <linux/udp.h>
24#include <linux/if_arp.h>
25#include <linux/mroute.h>
26#include <linux/init.h>
27#include <linux/in6.h>
28#include <linux/inetdevice.h>
29#include <linux/igmp.h>
30#include <linux/netfilter_ipv4.h>
31
32#include <net/sock.h>
33#include <net/ip.h>
34#include <net/icmp.h>
35#include <net/protocol.h>
36#include <net/ipip.h>
37#include <net/arp.h>
38#include <net/checksum.h>
39#include <net/dsfield.h>
40#include <net/inet_ecn.h>
41#include <net/xfrm.h>
42
43#ifdef CONFIG_IPV6
44#include <net/ipv6.h>
45#include <net/ip6_fib.h>
46#include <net/ip6_route.h>
47#endif
48
49/*
50 Problems & solutions
51 --------------------
52
53 1. The most important issue is detecting local dead loops.
54 They would cause complete host lockup in transmit, which
55 would be "resolved" by stack overflow or, if queueing is enabled,
56 with infinite looping in net_bh.
57
 58	 We cannot track such dead loops during route installation,
 59	 it is an infeasible task. The most general solution would be
 60	 to keep an skb->encapsulation counter (a sort of local ttl),
 61	 and silently drop the packet when it expires. It is the best
 62	 solution, but it supposes maintaining a new variable in ALL
 63	 skbs, even if no tunneling is used.
64
 65	 Current solution: t->recursion lock breaks dead loops. It looks
 66	 like the dev->tbusy flag, but I preferred a new variable, because
 67	 the semantics are different. One day, when hard_start_xmit
 68	 becomes multithreaded, we will have to use skb->encapsulation.
69
70
71
 72	 2. Networking dead loops would not kill routers, but would really
 73	 kill the network. The IP hop limit plays the role of "t->recursion" in this
 74	 case, if we copy it from the packet being encapsulated to the upper header.
 75	 It is a very good solution, but it introduces two problems:
76
77 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
78 do not work over tunnels.
 79	 - traceroute does not work. I planned to relay ICMP from the tunnel,
 80	 so that this problem would be solved and traceroute output
 81	 would be even more informative. This idea appeared to be wrong:
 82	 only Linux complies with rfc1812 now (yes, guys, Linux is the only
 83	 true router now :-)), all routers (at least, in my neighbourhood)
 84	 return only 8 bytes of payload. It is the end.
85
 86	 Hence, if we want OSPF to work or traceroute to say something reasonable,
 87	 we should search for another solution.
 88
 89	 One of them is to parse the packet, trying to detect inner encapsulation
 90	 made by our node. It is difficult or even impossible, especially
 91	 taking fragmentation into account. In short, it is not a solution at all.
92
93 Current solution: The solution was UNEXPECTEDLY SIMPLE.
94 We force DF flag on tunnels with preconfigured hop limit,
 95	 that is ALL. :-) Well, it does not remove the problem completely,
 96	 but the exponential growth of network traffic is changed to linear
 97	 (branches that exceed the pmtu are pruned) and the tunnel mtu
 98	 quickly degrades to a value <68, where looping stops.
 99	 Yes, it is not good if there is a router in the loop
100	 which does not force DF, even when encapsulating packets have DF set.
101	 But it is not our problem! Nobody could accuse us, we did
102	 all that we could do. Even if it was your gated that injected a
103	 fatal route into the network, even if it was you who configured a
104	 fatal static route: you are innocent. :-)
105
106
107
108	 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
109	 practically identical code. It would be good to glue them
110	 together, but it is not very evident how to make them modular.
111	 sit is an integral part of IPv6, while ipip and gre are naturally modular.
112	 We could extract the common parts (hash table, ioctl etc.)
113	 to a separate module (ip_tunnel.c).
114
115 Alexey Kuznetsov.
116 */
117
118static int ipgre_tunnel_init(struct net_device *dev);
119static void ipgre_tunnel_setup(struct net_device *dev);
120
121/* Fallback tunnel: no source, no destination, no key, no options */
122
123static int ipgre_fb_tunnel_init(struct net_device *dev);
124
125static struct net_device *ipgre_fb_tunnel_dev;
126
127/* Tunnel hash table */
128
129/*
130 4 hash tables:
131
132 3: (remote,local)
133 2: (remote,*)
134 1: (*,local)
135 0: (*,*)
136
137	 We require an exact key match, i.e. if a key is present in the packet
138	 it will match only a tunnel with the same key; if it is not present,
139	 it will match only a keyless tunnel.
140
141	 All keyless packets that do not match a configured keyless tunnel
142	 will match the fallback tunnel.
143 */
144
145#define HASH_SIZE 16
146#define HASH(addr) ((addr^(addr>>4))&0xF)
147
148static struct ip_tunnel *tunnels[4][HASH_SIZE];
149
150#define tunnels_r_l (tunnels[3])
151#define tunnels_r (tunnels[2])
152#define tunnels_l (tunnels[1])
153#define tunnels_wc (tunnels[0])
154
155static DEFINE_RWLOCK(ipgre_lock);
156
157/* Given src, dst and key, find the appropriate tunnel for input. */
158
159static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
160{
161 unsigned h0 = HASH(remote);
162 unsigned h1 = HASH(key);
163 struct ip_tunnel *t;
164
165 for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
166 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
167 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
168 return t;
169 }
170 }
171 for (t = tunnels_r[h0^h1]; t; t = t->next) {
172 if (remote == t->parms.iph.daddr) {
173 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
174 return t;
175 }
176 }
177 for (t = tunnels_l[h1]; t; t = t->next) {
178 if (local == t->parms.iph.saddr ||
179 (local == t->parms.iph.daddr && MULTICAST(local))) {
180 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
181 return t;
182 }
183 }
184 for (t = tunnels_wc[h1]; t; t = t->next) {
185 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
186 return t;
187 }
188
189 if (ipgre_fb_tunnel_dev->flags&IFF_UP)
190 return ipgre_fb_tunnel_dev->priv;
191 return NULL;
192}
193
194static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
195{
196 u32 remote = t->parms.iph.daddr;
197 u32 local = t->parms.iph.saddr;
198 u32 key = t->parms.i_key;
199 unsigned h = HASH(key);
200 int prio = 0;
201
202 if (local)
203 prio |= 1;
204 if (remote && !MULTICAST(remote)) {
205 prio |= 2;
206 h ^= HASH(remote);
207 }
208
209 return &tunnels[prio][h];
210}
211
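Concretely, ipgre_bucket() sends a tunnel with both endpoints configured and no key to tunnels[3] (prio 1|2); since HASH(0) is 0, its bucket is simply HASH(remote). A tunnel with only a key and wildcard addresses stays in tunnels[0] at bucket HASH(key). This matches the lookup order used by ipgre_tunnel_lookup() above, which tries the most specific table first.
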
212static void ipgre_tunnel_link(struct ip_tunnel *t)
213{
214 struct ip_tunnel **tp = ipgre_bucket(t);
215
216 t->next = *tp;
217 write_lock_bh(&ipgre_lock);
218 *tp = t;
219 write_unlock_bh(&ipgre_lock);
220}
221
222static void ipgre_tunnel_unlink(struct ip_tunnel *t)
223{
224 struct ip_tunnel **tp;
225
226 for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
227 if (t == *tp) {
228 write_lock_bh(&ipgre_lock);
229 *tp = t->next;
230 write_unlock_bh(&ipgre_lock);
231 break;
232 }
233 }
234}
235
236static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
237{
238 u32 remote = parms->iph.daddr;
239 u32 local = parms->iph.saddr;
240 u32 key = parms->i_key;
241 struct ip_tunnel *t, **tp, *nt;
242 struct net_device *dev;
243 unsigned h = HASH(key);
244 int prio = 0;
245 char name[IFNAMSIZ];
246
247 if (local)
248 prio |= 1;
249 if (remote && !MULTICAST(remote)) {
250 prio |= 2;
251 h ^= HASH(remote);
252 }
253 for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
254 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
255 if (key == t->parms.i_key)
256 return t;
257 }
258 }
259 if (!create)
260 return NULL;
261
262 if (parms->name[0])
263 strlcpy(name, parms->name, IFNAMSIZ);
264 else {
265 int i;
266 for (i=1; i<100; i++) {
267 sprintf(name, "gre%d", i);
268 if (__dev_get_by_name(name) == NULL)
269 break;
270 }
271 if (i==100)
272 goto failed;
273 }
274
275 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
276 if (!dev)
277 return NULL;
278
279 dev->init = ipgre_tunnel_init;
280 nt = dev->priv;
281 nt->parms = *parms;
282
283 if (register_netdevice(dev) < 0) {
284 free_netdev(dev);
285 goto failed;
286 }
287
288 nt = dev->priv;
289 nt->parms = *parms;
290
291 dev_hold(dev);
292 ipgre_tunnel_link(nt);
293 /* Do not decrement MOD_USE_COUNT here. */
294 return nt;
295
296failed:
297 return NULL;
298}
299
300static void ipgre_tunnel_uninit(struct net_device *dev)
301{
302 ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
303 dev_put(dev);
304}
305
306
307static void ipgre_err(struct sk_buff *skb, u32 info)
308{
309#ifndef I_WISH_WORLD_WERE_PERFECT
310
311/* It is not :-( All the routers (except for Linux) return only
312   8 bytes of packet payload. It means that precise relaying of
313   ICMP in the real Internet is absolutely infeasible.
314
315   Moreover, Cisco "wise men" put the GRE key in the third word
316   of the GRE header. That makes it impossible to maintain even soft state
317   for keyed GRE tunnels with checksum enabled. Tell them "thank you".
318
319   Well, I wonder, rfc1812 was written by a Cisco employee, so what the
320   hell are these idiots doing breaking standards established
321   by themselves???
322 */
323
324 struct iphdr *iph = (struct iphdr*)skb->data;
325 u16 *p = (u16*)(skb->data+(iph->ihl<<2));
326 int grehlen = (iph->ihl<<2) + 4;
327 int type = skb->h.icmph->type;
328 int code = skb->h.icmph->code;
329 struct ip_tunnel *t;
330 u16 flags;
331
332 flags = p[0];
333 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
334 if (flags&(GRE_VERSION|GRE_ROUTING))
335 return;
336 if (flags&GRE_KEY) {
337 grehlen += 4;
338 if (flags&GRE_CSUM)
339 grehlen += 4;
340 }
341 }
342
343	/* If only 8 bytes are returned, a keyed message will be dropped here */
344 if (skb_headlen(skb) < grehlen)
345 return;
346
347 switch (type) {
348 default:
349 case ICMP_PARAMETERPROB:
350 return;
351
352 case ICMP_DEST_UNREACH:
353 switch (code) {
354 case ICMP_SR_FAILED:
355 case ICMP_PORT_UNREACH:
356 /* Impossible event. */
357 return;
358 case ICMP_FRAG_NEEDED:
359 /* Soft state for pmtu is maintained by IP core. */
360 return;
361 default:
362 /* All others are translated to HOST_UNREACH.
363 rfc2003 contains "deep thoughts" about NET_UNREACH,
364 I believe they are just ether pollution. --ANK
365 */
366 break;
367 }
368 break;
369 case ICMP_TIME_EXCEEDED:
370 if (code != ICMP_EXC_TTL)
371 return;
372 break;
373 }
374
375 read_lock(&ipgre_lock);
376 t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
377 if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
378 goto out;
379
380 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
381 goto out;
382
383 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
384 t->err_count++;
385 else
386 t->err_count = 1;
387 t->err_time = jiffies;
388out:
389 read_unlock(&ipgre_lock);
390 return;
391#else
392 struct iphdr *iph = (struct iphdr*)dp;
393 struct iphdr *eiph;
394 u16 *p = (u16*)(dp+(iph->ihl<<2));
395 int type = skb->h.icmph->type;
396 int code = skb->h.icmph->code;
397 int rel_type = 0;
398 int rel_code = 0;
399 int rel_info = 0;
400 u16 flags;
401 int grehlen = (iph->ihl<<2) + 4;
402 struct sk_buff *skb2;
403 struct flowi fl;
404 struct rtable *rt;
405
406 if (p[1] != htons(ETH_P_IP))
407 return;
408
409 flags = p[0];
410 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
411 if (flags&(GRE_VERSION|GRE_ROUTING))
412 return;
413 if (flags&GRE_CSUM)
414 grehlen += 4;
415 if (flags&GRE_KEY)
416 grehlen += 4;
417 if (flags&GRE_SEQ)
418 grehlen += 4;
419 }
420 if (len < grehlen + sizeof(struct iphdr))
421 return;
422 eiph = (struct iphdr*)(dp + grehlen);
423
424 switch (type) {
425 default:
426 return;
427 case ICMP_PARAMETERPROB:
428 if (skb->h.icmph->un.gateway < (iph->ihl<<2))
429 return;
430
431		/* So... This guy found something strange INSIDE the encapsulated
432		   packet. Well, he is a fool, but what can we do?
433		 */
434 rel_type = ICMP_PARAMETERPROB;
435 rel_info = skb->h.icmph->un.gateway - grehlen;
436 break;
437
438 case ICMP_DEST_UNREACH:
439 switch (code) {
440 case ICMP_SR_FAILED:
441 case ICMP_PORT_UNREACH:
442 /* Impossible event. */
443 return;
444 case ICMP_FRAG_NEEDED:
445 /* And it is the only really necessary thing :-) */
446 rel_info = ntohs(skb->h.icmph->un.frag.mtu);
447 if (rel_info < grehlen+68)
448 return;
449 rel_info -= grehlen;
450 /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
451 if (rel_info > ntohs(eiph->tot_len))
452 return;
453 break;
454 default:
455 /* All others are translated to HOST_UNREACH.
456 rfc2003 contains "deep thoughts" about NET_UNREACH,
457 I believe, it is just ether pollution. --ANK
458 */
459 rel_type = ICMP_DEST_UNREACH;
460 rel_code = ICMP_HOST_UNREACH;
461 break;
462 }
463 break;
464 case ICMP_TIME_EXCEEDED:
465 if (code != ICMP_EXC_TTL)
466 return;
467 break;
468 }
469
470 /* Prepare fake skb to feed it to icmp_send */
471 skb2 = skb_clone(skb, GFP_ATOMIC);
472 if (skb2 == NULL)
473 return;
474 dst_release(skb2->dst);
475 skb2->dst = NULL;
476 skb_pull(skb2, skb->data - (u8*)eiph);
477 skb2->nh.raw = skb2->data;
478
479 /* Try to guess incoming interface */
480 memset(&fl, 0, sizeof(fl));
481 fl.fl4_dst = eiph->saddr;
482 fl.fl4_tos = RT_TOS(eiph->tos);
483 fl.proto = IPPROTO_GRE;
484 if (ip_route_output_key(&rt, &fl)) {
485 kfree_skb(skb2);
486 return;
487 }
488 skb2->dev = rt->u.dst.dev;
489
490 /* route "incoming" packet */
491 if (rt->rt_flags&RTCF_LOCAL) {
492 ip_rt_put(rt);
493 rt = NULL;
494 fl.fl4_dst = eiph->daddr;
495 fl.fl4_src = eiph->saddr;
496 fl.fl4_tos = eiph->tos;
497 if (ip_route_output_key(&rt, &fl) ||
498 rt->u.dst.dev->type != ARPHRD_IPGRE) {
499 ip_rt_put(rt);
500 kfree_skb(skb2);
501 return;
502 }
503 } else {
504 ip_rt_put(rt);
505 if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
506 skb2->dst->dev->type != ARPHRD_IPGRE) {
507 kfree_skb(skb2);
508 return;
509 }
510 }
511
512 /* change mtu on this route */
513 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
514 if (rel_info > dst_mtu(skb2->dst)) {
515 kfree_skb(skb2);
516 return;
517 }
518 skb2->dst->ops->update_pmtu(skb2->dst, rel_info);
519 rel_info = htonl(rel_info);
520 } else if (type == ICMP_TIME_EXCEEDED) {
521 struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
522 if (t->parms.iph.ttl) {
523 rel_type = ICMP_DEST_UNREACH;
524 rel_code = ICMP_HOST_UNREACH;
525 }
526 }
527
528 icmp_send(skb2, rel_type, rel_code, rel_info);
529 kfree_skb(skb2);
530#endif
531}
532
533static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
534{
535 if (INET_ECN_is_ce(iph->tos)) {
536 if (skb->protocol == htons(ETH_P_IP)) {
537 IP_ECN_set_ce(skb->nh.iph);
538 } else if (skb->protocol == htons(ETH_P_IPV6)) {
539 IP6_ECN_set_ce(skb->nh.ipv6h);
540 }
541 }
542}
543
544static inline u8
545ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
546{
547 u8 inner = 0;
548 if (skb->protocol == htons(ETH_P_IP))
549 inner = old_iph->tos;
550 else if (skb->protocol == htons(ETH_P_IPV6))
551 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
552 return INET_ECN_encapsulate(tos, inner);
553}
554
555static int ipgre_rcv(struct sk_buff *skb)
556{
557 struct iphdr *iph;
558 u8 *h;
559 u16 flags;
560 u16 csum = 0;
561 u32 key = 0;
562 u32 seqno = 0;
563 struct ip_tunnel *tunnel;
564 int offset = 4;
565
566 if (!pskb_may_pull(skb, 16))
567 goto drop_nolock;
568
569 iph = skb->nh.iph;
570 h = skb->data;
571 flags = *(u16*)h;
572
573 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
574 /* - Version must be 0.
575 - We do not support routing headers.
576 */
577 if (flags&(GRE_VERSION|GRE_ROUTING))
578 goto drop_nolock;
579
580 if (flags&GRE_CSUM) {
581 if (skb->ip_summed == CHECKSUM_HW) {
582 csum = (u16)csum_fold(skb->csum);
583 if (csum)
584 skb->ip_summed = CHECKSUM_NONE;
585 }
586 if (skb->ip_summed == CHECKSUM_NONE) {
587 skb->csum = skb_checksum(skb, 0, skb->len, 0);
588 skb->ip_summed = CHECKSUM_HW;
589 csum = (u16)csum_fold(skb->csum);
590 }
591 offset += 4;
592 }
593 if (flags&GRE_KEY) {
594 key = *(u32*)(h + offset);
595 offset += 4;
596 }
597 if (flags&GRE_SEQ) {
598 seqno = ntohl(*(u32*)(h + offset));
599 offset += 4;
600 }
601 }
602
603 read_lock(&ipgre_lock);
604 if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
605 secpath_reset(skb);
606
607 skb->protocol = *(u16*)(h + 2);
608		/* WCCP version 1 and 2 protocol decoding.
609		 * - Change the protocol to IP
610		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
611		 */
612 if (flags == 0 &&
613 skb->protocol == __constant_htons(ETH_P_WCCP)) {
614 skb->protocol = __constant_htons(ETH_P_IP);
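			/* A first payload byte whose high nibble is not 4
			   cannot be an IPv4 version/ihl octet, so it is
			   assumed to be the extra 4-byte WCCPv2 redirect
			   header and skipped. */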
615 if ((*(h + offset) & 0xF0) != 0x40)
616 offset += 4;
617 }
618
619 skb->mac.raw = skb->nh.raw;
620 skb->nh.raw = __pskb_pull(skb, offset);
621 skb_postpull_rcsum(skb, skb->mac.raw, offset);
622 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
623 skb->pkt_type = PACKET_HOST;
624#ifdef CONFIG_NET_IPGRE_BROADCAST
625 if (MULTICAST(iph->daddr)) {
626 /* Looped back packet, drop it! */
627 if (((struct rtable*)skb->dst)->fl.iif == 0)
628 goto drop;
629 tunnel->stat.multicast++;
630 skb->pkt_type = PACKET_BROADCAST;
631 }
632#endif
633
634 if (((flags&GRE_CSUM) && csum) ||
635 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
636 tunnel->stat.rx_crc_errors++;
637 tunnel->stat.rx_errors++;
638 goto drop;
639 }
640 if (tunnel->parms.i_flags&GRE_SEQ) {
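			/* Wraparound-safe signed comparison: a datagram whose
			   sequence number is at or behind the last one accepted
			   is treated as out of order and dropped. */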
641 if (!(flags&GRE_SEQ) ||
642 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
643 tunnel->stat.rx_fifo_errors++;
644 tunnel->stat.rx_errors++;
645 goto drop;
646 }
647 tunnel->i_seqno = seqno + 1;
648 }
649 tunnel->stat.rx_packets++;
650 tunnel->stat.rx_bytes += skb->len;
651 skb->dev = tunnel->dev;
652 dst_release(skb->dst);
653 skb->dst = NULL;
654 nf_reset(skb);
655 ipgre_ecn_decapsulate(iph, skb);
656 netif_rx(skb);
657 read_unlock(&ipgre_lock);
658 return(0);
659 }
660 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
661
662drop:
663 read_unlock(&ipgre_lock);
664drop_nolock:
665 kfree_skb(skb);
666 return(0);
667}
668
669static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
670{
671 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
672 struct net_device_stats *stats = &tunnel->stat;
673 struct iphdr *old_iph = skb->nh.iph;
674 struct iphdr *tiph;
675 u8 tos;
676 u16 df;
677 struct rtable *rt; /* Route to the other host */
678 struct net_device *tdev; /* Device to other host */
679 struct iphdr *iph; /* Our new IP header */
680 int max_headroom; /* The extra header space needed */
681 int gre_hlen;
682 u32 dst;
683 int mtu;
684
685 if (tunnel->recursion++) {
686 tunnel->stat.collisions++;
687 goto tx_error;
688 }
689
690 if (dev->hard_header) {
691 gre_hlen = 0;
692 tiph = (struct iphdr*)skb->data;
693 } else {
694 gre_hlen = tunnel->hlen;
695 tiph = &tunnel->parms.iph;
696 }
697
698 if ((dst = tiph->daddr) == 0) {
699 /* NBMA tunnel */
700
701 if (skb->dst == NULL) {
702 tunnel->stat.tx_fifo_errors++;
703 goto tx_error;
704 }
705
706 if (skb->protocol == htons(ETH_P_IP)) {
707 rt = (struct rtable*)skb->dst;
708 if ((dst = rt->rt_gateway) == 0)
709 goto tx_error_icmp;
710 }
711#ifdef CONFIG_IPV6
712 else if (skb->protocol == htons(ETH_P_IPV6)) {
713 struct in6_addr *addr6;
714 int addr_type;
715 struct neighbour *neigh = skb->dst->neighbour;
716
717 if (neigh == NULL)
718 goto tx_error;
719
720 addr6 = (struct in6_addr*)&neigh->primary_key;
721 addr_type = ipv6_addr_type(addr6);
722
723 if (addr_type == IPV6_ADDR_ANY) {
724 addr6 = &skb->nh.ipv6h->daddr;
725 addr_type = ipv6_addr_type(addr6);
726 }
727
728 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
729 goto tx_error_icmp;
730
731 dst = addr6->s6_addr32[3];
732 }
733#endif
734 else
735 goto tx_error;
736 }
737
738 tos = tiph->tos;
739 if (tos&1) {
740 if (skb->protocol == htons(ETH_P_IP))
741 tos = old_iph->tos;
742 tos &= ~1;
743 }
744
745 {
746 struct flowi fl = { .oif = tunnel->parms.link,
747 .nl_u = { .ip4_u =
748 { .daddr = dst,
749 .saddr = tiph->saddr,
750 .tos = RT_TOS(tos) } },
751 .proto = IPPROTO_GRE };
752 if (ip_route_output_key(&rt, &fl)) {
753 tunnel->stat.tx_carrier_errors++;
754 goto tx_error;
755 }
756 }
757 tdev = rt->u.dst.dev;
758
759 if (tdev == dev) {
760 ip_rt_put(rt);
761 tunnel->stat.collisions++;
762 goto tx_error;
763 }
764
765 df = tiph->frag_off;
766 if (df)
767 mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
768 else
769 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
770
771 if (skb->dst)
772 skb->dst->ops->update_pmtu(skb->dst, mtu);
773
774 if (skb->protocol == htons(ETH_P_IP)) {
775 df |= (old_iph->frag_off&htons(IP_DF));
776
777 if ((old_iph->frag_off&htons(IP_DF)) &&
778 mtu < ntohs(old_iph->tot_len)) {
779 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
780 ip_rt_put(rt);
781 goto tx_error;
782 }
783 }
784#ifdef CONFIG_IPV6
785 else if (skb->protocol == htons(ETH_P_IPV6)) {
786 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
787
788 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
789 if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
790 rt6->rt6i_dst.plen == 128) {
791 rt6->rt6i_flags |= RTF_MODIFIED;
792 skb->dst->metrics[RTAX_MTU-1] = mtu;
793 }
794 }
795
796 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
797 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
798 ip_rt_put(rt);
799 goto tx_error;
800 }
801 }
802#endif
803
804 if (tunnel->err_count > 0) {
805 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
806 tunnel->err_count--;
807
808 dst_link_failure(skb);
809 } else
810 tunnel->err_count = 0;
811 }
812
813 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
814
815 if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
816 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
817 if (!new_skb) {
818 ip_rt_put(rt);
819 stats->tx_dropped++;
820 dev_kfree_skb(skb);
821 tunnel->recursion--;
822 return 0;
823 }
824 if (skb->sk)
825 skb_set_owner_w(new_skb, skb->sk);
826 dev_kfree_skb(skb);
827 skb = new_skb;
828 old_iph = skb->nh.iph;
829 }
830
831 skb->h.raw = skb->nh.raw;
832 skb->nh.raw = skb_push(skb, gre_hlen);
833 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
834 dst_release(skb->dst);
835 skb->dst = &rt->u.dst;
836
837 /*
838 * Push down and install the IPIP header.
839 */
840
841 iph = skb->nh.iph;
842 iph->version = 4;
843 iph->ihl = sizeof(struct iphdr) >> 2;
844 iph->frag_off = df;
845 iph->protocol = IPPROTO_GRE;
846 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
847 iph->daddr = rt->rt_dst;
848 iph->saddr = rt->rt_src;
849
850 if ((iph->ttl = tiph->ttl) == 0) {
851 if (skb->protocol == htons(ETH_P_IP))
852 iph->ttl = old_iph->ttl;
853#ifdef CONFIG_IPV6
854 else if (skb->protocol == htons(ETH_P_IPV6))
855 iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
856#endif
857 else
858 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
859 }
860
861 ((u16*)(iph+1))[0] = tunnel->parms.o_flags;
862 ((u16*)(iph+1))[1] = skb->protocol;
863
864 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
865 u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4);
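		/* ptr starts at the last optional 32-bit word of the GRE
		   header and walks backwards, so the fields land in on-wire
		   order: checksum, key, sequence number. */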
866
867 if (tunnel->parms.o_flags&GRE_SEQ) {
868 ++tunnel->o_seqno;
869 *ptr = htonl(tunnel->o_seqno);
870 ptr--;
871 }
872 if (tunnel->parms.o_flags&GRE_KEY) {
873 *ptr = tunnel->parms.o_key;
874 ptr--;
875 }
876 if (tunnel->parms.o_flags&GRE_CSUM) {
877 *ptr = 0;
878 *(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
879 }
880 }
881
882 nf_reset(skb);
883
884 IPTUNNEL_XMIT();
885 tunnel->recursion--;
886 return 0;
887
888tx_error_icmp:
889 dst_link_failure(skb);
890
891tx_error:
892 stats->tx_errors++;
893 dev_kfree_skb(skb);
894 tunnel->recursion--;
895 return 0;
896}
897
898static int
899ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
900{
901 int err = 0;
902 struct ip_tunnel_parm p;
903 struct ip_tunnel *t;
904
905 switch (cmd) {
906 case SIOCGETTUNNEL:
907 t = NULL;
908 if (dev == ipgre_fb_tunnel_dev) {
909 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
910 err = -EFAULT;
911 break;
912 }
913 t = ipgre_tunnel_locate(&p, 0);
914 }
915 if (t == NULL)
916 t = (struct ip_tunnel*)dev->priv;
917 memcpy(&p, &t->parms, sizeof(p));
918 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
919 err = -EFAULT;
920 break;
921
922 case SIOCADDTUNNEL:
923 case SIOCCHGTUNNEL:
924 err = -EPERM;
925 if (!capable(CAP_NET_ADMIN))
926 goto done;
927
928 err = -EFAULT;
929 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
930 goto done;
931
932 err = -EINVAL;
933 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
934 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
935 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
936 goto done;
937 if (p.iph.ttl)
938 p.iph.frag_off |= htons(IP_DF);
939
940 if (!(p.i_flags&GRE_KEY))
941 p.i_key = 0;
942 if (!(p.o_flags&GRE_KEY))
943 p.o_key = 0;
944
945 t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
946
947 if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
948 if (t != NULL) {
949 if (t->dev != dev) {
950 err = -EEXIST;
951 break;
952 }
953 } else {
954 unsigned nflags=0;
955
956 t = (struct ip_tunnel*)dev->priv;
957
958 if (MULTICAST(p.iph.daddr))
959 nflags = IFF_BROADCAST;
960 else if (p.iph.daddr)
961 nflags = IFF_POINTOPOINT;
962
963 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
964 err = -EINVAL;
965 break;
966 }
967 ipgre_tunnel_unlink(t);
968 t->parms.iph.saddr = p.iph.saddr;
969 t->parms.iph.daddr = p.iph.daddr;
970 t->parms.i_key = p.i_key;
971 t->parms.o_key = p.o_key;
972 memcpy(dev->dev_addr, &p.iph.saddr, 4);
973 memcpy(dev->broadcast, &p.iph.daddr, 4);
974 ipgre_tunnel_link(t);
975 netdev_state_change(dev);
976 }
977 }
978
979 if (t) {
980 err = 0;
981 if (cmd == SIOCCHGTUNNEL) {
982 t->parms.iph.ttl = p.iph.ttl;
983 t->parms.iph.tos = p.iph.tos;
984 t->parms.iph.frag_off = p.iph.frag_off;
985 }
986 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
987 err = -EFAULT;
988 } else
989 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
990 break;
991
992 case SIOCDELTUNNEL:
993 err = -EPERM;
994 if (!capable(CAP_NET_ADMIN))
995 goto done;
996
997 if (dev == ipgre_fb_tunnel_dev) {
998 err = -EFAULT;
999 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1000 goto done;
1001 err = -ENOENT;
1002 if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
1003 goto done;
1004 err = -EPERM;
1005 if (t == ipgre_fb_tunnel_dev->priv)
1006 goto done;
1007 dev = t->dev;
1008 }
1009 err = unregister_netdevice(dev);
1010 break;
1011
1012 default:
1013 err = -EINVAL;
1014 }
1015
1016done:
1017 return err;
1018}
1019
1020static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
1021{
1022 return &(((struct ip_tunnel*)dev->priv)->stat);
1023}
1024
1025static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1026{
1027 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
1028 if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
1029 return -EINVAL;
1030 dev->mtu = new_mtu;
1031 return 0;
1032}
1033
1034#ifdef CONFIG_NET_IPGRE_BROADCAST
1035/* Nice toy. Unfortunately, useless in real life :-)
1036 It allows constructing a virtual multiprotocol broadcast "LAN"
1037 over the Internet, provided multicast routing is set up.
1038
1039
1040 I have no idea whether this bicycle was invented before me,
1041 so I had to set ARPHRD_IPGRE to a random value.
1042 I have the impression that Cisco could do something similar,
1043 but this feature is apparently missing in IOS<=11.2(8).
1044
1045 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1046 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1047
1048 ping -t 255 224.66.66.66
1049
1050 If nobody answers, mbone does not work.
1051
1052 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1053 ip addr add 10.66.66.<somewhat>/24 dev Universe
1054 ifconfig Universe up
1055 ifconfig Universe add fe80::<Your_real_addr>/10
1056 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1057 ftp 10.66.66.66
1058 ...
1059 ftp fec0:6666:6666::193.233.7.65
1060 ...
1061
1062 */
1063
1064static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
1065 void *daddr, void *saddr, unsigned len)
1066{
1067 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1068 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1069 u16 *p = (u16*)(iph+1);
1070
1071 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1072 p[0] = t->parms.o_flags;
1073 p[1] = htons(type);
1074
1075 /*
1076 * Set the source hardware address.
1077 */
1078
1079 if (saddr)
1080 memcpy(&iph->saddr, saddr, 4);
1081
1082 if (daddr) {
1083 memcpy(&iph->daddr, daddr, 4);
1084 return t->hlen;
1085 }
1086 if (iph->daddr && !MULTICAST(iph->daddr))
1087 return t->hlen;
1088
1089 return -t->hlen;
1090}
1091
1092static int ipgre_open(struct net_device *dev)
1093{
1094 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1095
1096 if (MULTICAST(t->parms.iph.daddr)) {
1097 struct flowi fl = { .oif = t->parms.link,
1098 .nl_u = { .ip4_u =
1099 { .daddr = t->parms.iph.daddr,
1100 .saddr = t->parms.iph.saddr,
1101 .tos = RT_TOS(t->parms.iph.tos) } },
1102 .proto = IPPROTO_GRE };
1103 struct rtable *rt;
1104 if (ip_route_output_key(&rt, &fl))
1105 return -EADDRNOTAVAIL;
1106 dev = rt->u.dst.dev;
1107 ip_rt_put(rt);
1108 if (__in_dev_get(dev) == NULL)
1109 return -EADDRNOTAVAIL;
1110 t->mlink = dev->ifindex;
1111 ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
1112 }
1113 return 0;
1114}
1115
1116static int ipgre_close(struct net_device *dev)
1117{
1118 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1119 if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
1120 struct in_device *in_dev = inetdev_by_index(t->mlink);
1121 if (in_dev) {
1122 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1123 in_dev_put(in_dev);
1124 }
1125 }
1126 return 0;
1127}
1128
1129#endif
1130
1131static void ipgre_tunnel_setup(struct net_device *dev)
1132{
1133 SET_MODULE_OWNER(dev);
1134 dev->uninit = ipgre_tunnel_uninit;
1135 dev->destructor = free_netdev;
1136 dev->hard_start_xmit = ipgre_tunnel_xmit;
1137 dev->get_stats = ipgre_tunnel_get_stats;
1138 dev->do_ioctl = ipgre_tunnel_ioctl;
1139 dev->change_mtu = ipgre_tunnel_change_mtu;
1140
1141 dev->type = ARPHRD_IPGRE;
1142 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1143 dev->mtu = 1500 - sizeof(struct iphdr) - 4;
1144 dev->flags = IFF_NOARP;
1145 dev->iflink = 0;
1146 dev->addr_len = 4;
1147}
1148
1149static int ipgre_tunnel_init(struct net_device *dev)
1150{
1151 struct net_device *tdev = NULL;
1152 struct ip_tunnel *tunnel;
1153 struct iphdr *iph;
1154 int hlen = LL_MAX_HEADER;
1155 int mtu = 1500;
1156 int addend = sizeof(struct iphdr) + 4;
1157
1158 tunnel = (struct ip_tunnel*)dev->priv;
1159 iph = &tunnel->parms.iph;
1160
1161 tunnel->dev = dev;
1162 strcpy(tunnel->parms.name, dev->name);
1163
1164 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1165 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1166
1167 /* Guess output device to choose reasonable mtu and hard_header_len */
1168
1169 if (iph->daddr) {
1170 struct flowi fl = { .oif = tunnel->parms.link,
1171 .nl_u = { .ip4_u =
1172 { .daddr = iph->daddr,
1173 .saddr = iph->saddr,
1174 .tos = RT_TOS(iph->tos) } },
1175 .proto = IPPROTO_GRE };
1176 struct rtable *rt;
1177 if (!ip_route_output_key(&rt, &fl)) {
1178 tdev = rt->u.dst.dev;
1179 ip_rt_put(rt);
1180 }
1181
1182 dev->flags |= IFF_POINTOPOINT;
1183
1184#ifdef CONFIG_NET_IPGRE_BROADCAST
1185 if (MULTICAST(iph->daddr)) {
1186 if (!iph->saddr)
1187 return -EINVAL;
1188 dev->flags = IFF_BROADCAST;
1189 dev->hard_header = ipgre_header;
1190 dev->open = ipgre_open;
1191 dev->stop = ipgre_close;
1192 }
1193#endif
1194 }
1195
1196 if (!tdev && tunnel->parms.link)
1197 tdev = __dev_get_by_index(tunnel->parms.link);
1198
1199 if (tdev) {
1200 hlen = tdev->hard_header_len;
1201 mtu = tdev->mtu;
1202 }
1203 dev->iflink = tunnel->parms.link;
1204
1205 /* Precalculate GRE options length */
1206 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
1207 if (tunnel->parms.o_flags&GRE_CSUM)
1208 addend += 4;
1209 if (tunnel->parms.o_flags&GRE_KEY)
1210 addend += 4;
1211 if (tunnel->parms.o_flags&GRE_SEQ)
1212 addend += 4;
1213 }
1214 dev->hard_header_len = hlen + addend;
1215 dev->mtu = mtu - addend;
1216 tunnel->hlen = addend;
1217 return 0;
1218}
1219
1220int __init ipgre_fb_tunnel_init(struct net_device *dev)
1221{
1222 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
1223 struct iphdr *iph = &tunnel->parms.iph;
1224
1225 tunnel->dev = dev;
1226 strcpy(tunnel->parms.name, dev->name);
1227
1228 iph->version = 4;
1229 iph->protocol = IPPROTO_GRE;
1230 iph->ihl = 5;
1231 tunnel->hlen = sizeof(struct iphdr) + 4;
1232
1233 dev_hold(dev);
1234 tunnels_wc[0] = tunnel;
1235 return 0;
1236}
1237
1238
1239static struct net_protocol ipgre_protocol = {
1240 .handler = ipgre_rcv,
1241 .err_handler = ipgre_err,
1242};
1243
1244
1245/*
1246 * And now the module code and kernel interface.
1247 */
1248
1249static int __init ipgre_init(void)
1250{
1251 int err;
1252
1253 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1254
1255 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1256 printk(KERN_INFO "ipgre init: can't add protocol\n");
1257 return -EAGAIN;
1258 }
1259
1260 ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1261 ipgre_tunnel_setup);
1262 if (!ipgre_fb_tunnel_dev) {
1263 err = -ENOMEM;
1264 goto err1;
1265 }
1266
1267 ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1268
1269 if ((err = register_netdev(ipgre_fb_tunnel_dev)))
1270 goto err2;
1271out:
1272 return err;
1273err2:
1274 free_netdev(ipgre_fb_tunnel_dev);
1275err1:
1276 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1277 goto out;
1278}
1279
1280static void ipgre_fini(void)
1281{
1282 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1283 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1284
1285 unregister_netdev(ipgre_fb_tunnel_dev);
1286}
1287
1288module_init(ipgre_init);
1289module_exit(ipgre_fini);
1290MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
new file mode 100644
index 000000000000..a0d0833034be
--- /dev/null
+++ b/net/ipv4/ip_input.c
@@ -0,0 +1,431 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) module.
7 *
8 * Version: $Id: ip_input.c,v 1.55 2002/01/12 07:39:45 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Donald Becker, <becker@super.org>
13 * Alan Cox, <Alan.Cox@linux.org>
14 * Richard Underwood
15 * Stefan Becker, <stefanb@yello.ping.de>
16 * Jorge Cwik, <jorge@laser.satlink.net>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 *
19 *
20 * Fixes:
21 * Alan Cox : Commented a couple of minor bits of surplus code
22 * Alan Cox : Undefining IP_FORWARD doesn't include the code
23 * (just stops a compiler warning).
24 * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes
25 * are junked rather than corrupting things.
26 * Alan Cox : Frames to bad broadcast subnets are dumped
27 * We used to process them non broadcast and
28 * boy could that cause havoc.
29 * Alan Cox : ip_forward sets the free flag on the
30 * new frame it queues. Still crap because
31 * it copies the frame but at least it
32 * doesn't eat memory too.
33 * Alan Cox : Generic queue code and memory fixes.
34 * Fred Van Kempen : IP fragment support (borrowed from NET2E)
35 * Gerhard Koerting: Forward fragmented frames correctly.
36 * Gerhard Koerting: Fixes to my fix of the above 8-).
37 * Gerhard Koerting: IP interface addressing fix.
38 * Linus Torvalds : More robustness checks
39 * Alan Cox : Even more checks: Still not as robust as it ought to be
40 * Alan Cox : Save IP header pointer for later
41 * Alan Cox : ip option setting
42 * Alan Cox : Use ip_tos/ip_ttl settings
43 * Alan Cox : Fragmentation bogosity removed
44 * (Thanks to Mark.Bush@prg.ox.ac.uk)
45 * Dmitry Gorodchanin : Send of a raw packet crash fix.
46 * Alan Cox : Silly ip bug when an overlength
47 * fragment turns up. Now frees the
48 * queue.
49 * Linus Torvalds/ : Memory leakage on fragmentation
50 * Alan Cox : handling.
51 * Gerhard Koerting: Forwarding uses IP priority hints
52 * Teemu Rantanen : Fragment problems.
53 * Alan Cox : General cleanup, comments and reformat
54 * Alan Cox : SNMP statistics
55 * Alan Cox : BSD address rule semantics. Also see
56 * UDP as there is a nasty checksum issue
57 * if you do things the wrong way.
58 * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file
59 * Alan Cox : IP options adjust sk->priority.
60 * Pedro Roque : Fix mtu/length error in ip_forward.
61 * Alan Cox : Avoid ip_chk_addr when possible.
62 * Richard Underwood : IP multicasting.
63 * Alan Cox : Cleaned up multicast handlers.
64 * Alan Cox : RAW sockets demultiplex in the BSD style.
65 * Gunther Mayer : Fix the SNMP reporting typo
66 * Alan Cox : Always in group 224.0.0.1
67 * Pauline Middelink : Fast ip_checksum update when forwarding
68 * Masquerading support.
69 * Alan Cox : Multicast loopback error for 224.0.0.1
70 * Alan Cox : IP_MULTICAST_LOOP option.
71 * Alan Cox : Use notifiers.
72 * Bjorn Ekwall : Removed ip_csum (from slhc.c too)
73 * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!)
74 * Stefan Becker : Send out ICMP HOST REDIRECT
75 * Arnt Gulbrandsen : ip_build_xmit
76 * Alan Cox : Per socket routing cache
77 * Alan Cox : Fixed routing cache, added header cache.
78 * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it.
79 * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net.
80 * Alan Cox : Incoming IP option handling.
81 * Alan Cox : Set saddr on raw output frames as per BSD.
82 * Alan Cox : Stopped broadcast source route explosions.
83 * Alan Cox : Can disable source routing
84 * Takeshi Sone : Masquerading didn't work.
85 * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible.
86 * Alan Cox : Memory leaks, tramples, misc debugging.
87 * Alan Cox : Fixed multicast (by popular demand 8))
88 * Alan Cox : Fixed forwarding (by even more popular demand 8))
89 * Alan Cox : Fixed SNMP statistics [I think]
90 * Gerhard Koerting : IP fragmentation forwarding fix
91 * Alan Cox : Device lock against page fault.
92 * Alan Cox : IP_HDRINCL facility.
93 * Werner Almesberger : Zero fragment bug
94 * Alan Cox : RAW IP frame length bug
95 * Alan Cox : Outgoing firewall on build_xmit
96 * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel
97 * Alan Cox : Multicast routing hooks
98 * Jos Vos : Do accounting *before* call_in_firewall
99 * Willy Konynenberg : Transparent proxying support
100 *
101 *
102 *
103 * To Fix:
104 * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
105 * and could be made very efficient with the addition of some virtual memory hacks to permit
106 * the allocation of a buffer that can then be 'grown' by twiddling page tables.
107 * Output fragmentation wants updating along with the buffer management to use a single
108 * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
109 * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
110 * fragmentation anyway.
111 *
112 * This program is free software; you can redistribute it and/or
113 * modify it under the terms of the GNU General Public License
114 * as published by the Free Software Foundation; either version
115 * 2 of the License, or (at your option) any later version.
116 */
117
118#include <asm/system.h>
119#include <linux/module.h>
120#include <linux/types.h>
121#include <linux/kernel.h>
122#include <linux/string.h>
123#include <linux/errno.h>
124#include <linux/config.h>
125
126#include <linux/net.h>
127#include <linux/socket.h>
128#include <linux/sockios.h>
129#include <linux/in.h>
130#include <linux/inet.h>
131#include <linux/netdevice.h>
132#include <linux/etherdevice.h>
133
134#include <net/snmp.h>
135#include <net/ip.h>
136#include <net/protocol.h>
137#include <net/route.h>
138#include <linux/skbuff.h>
139#include <net/sock.h>
140#include <net/arp.h>
141#include <net/icmp.h>
142#include <net/raw.h>
143#include <net/checksum.h>
144#include <linux/netfilter_ipv4.h>
145#include <net/xfrm.h>
146#include <linux/mroute.h>
147#include <linux/netlink.h>
148
149/*
150 * SNMP management statistics
151 */
152
153DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics);
154
155/*
156 * Process Router Attention IP option
157 */
158int ip_call_ra_chain(struct sk_buff *skb)
159{
160 struct ip_ra_chain *ra;
161 u8 protocol = skb->nh.iph->protocol;
162 struct sock *last = NULL;
163
164 read_lock(&ip_ra_lock);
165 for (ra = ip_ra_chain; ra; ra = ra->next) {
166 struct sock *sk = ra->sk;
167
168 /* If socket is bound to an interface, only report
169 * the packet if it came from that interface.
170 */
171 if (sk && inet_sk(sk)->num == protocol &&
172 (!sk->sk_bound_dev_if ||
173 sk->sk_bound_dev_if == skb->dev->ifindex)) {
174 if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
175 skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN);
176 if (skb == NULL) {
177 read_unlock(&ip_ra_lock);
178 return 1;
179 }
180 }
181 if (last) {
182 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
183 if (skb2)
184 raw_rcv(last, skb2);
185 }
186 last = sk;
187 }
188 }
189
190 if (last) {
191 raw_rcv(last, skb);
192 read_unlock(&ip_ra_lock);
193 return 1;
194 }
195 read_unlock(&ip_ra_lock);
196 return 0;
197}
198
199static inline int ip_local_deliver_finish(struct sk_buff *skb)
200{
201 int ihl = skb->nh.iph->ihl*4;
202
203#ifdef CONFIG_NETFILTER_DEBUG
204 nf_debug_ip_local_deliver(skb);
205#endif /*CONFIG_NETFILTER_DEBUG*/
206
207 __skb_pull(skb, ihl);
208
209        /* Free the reference early: we don't need it any more, and it may
210           keep the ip_conntrack module loaded indefinitely. */
211 nf_reset(skb);
212
213 /* Point into the IP datagram, just past the header. */
214 skb->h.raw = skb->data;
215
216 rcu_read_lock();
217 {
218 /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
219 int protocol = skb->nh.iph->protocol;
220 int hash;
221 struct sock *raw_sk;
222 struct net_protocol *ipprot;
223
224 resubmit:
225 hash = protocol & (MAX_INET_PROTOS - 1);
226 raw_sk = sk_head(&raw_v4_htable[hash]);
227
228		/* If there may be a raw socket we must check; if not, we
229		 * couldn't care less
230		 */
231 if (raw_sk)
232 raw_v4_input(skb, skb->nh.iph, hash);
233
234 if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
235 int ret;
236
237 if (!ipprot->no_policy &&
238 !xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
239 kfree_skb(skb);
240 goto out;
241 }
242 ret = ipprot->handler(skb);
243 if (ret < 0) {
244 protocol = -ret;
245 goto resubmit;
246 }
247 IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
248 } else {
249 if (!raw_sk) {
250 if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
251 IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
252 icmp_send(skb, ICMP_DEST_UNREACH,
253 ICMP_PROT_UNREACH, 0);
254 }
255 } else
256 IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
257 kfree_skb(skb);
258 }
259 }
260 out:
261 rcu_read_unlock();
262
263 return 0;
264}
265
266/*
267 * Deliver IP Packets to the higher protocol layers.
268 */
269int ip_local_deliver(struct sk_buff *skb)
270{
271 /*
272 * Reassemble IP fragments.
273 */
274
275 if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
276 skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
277 if (!skb)
278 return 0;
279 }
280
281 return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
282 ip_local_deliver_finish);
283}
284
285static inline int ip_rcv_finish(struct sk_buff *skb)
286{
287 struct net_device *dev = skb->dev;
288 struct iphdr *iph = skb->nh.iph;
289
290 /*
291 * Initialise the virtual path cache for the packet. It describes
292 * how the packet travels inside Linux networking.
293 */
294 if (skb->dst == NULL) {
295 if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
296 goto drop;
297 }
298
299#ifdef CONFIG_NET_CLS_ROUTE
300 if (skb->dst->tclassid) {
301 struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
302 u32 idx = skb->dst->tclassid;
303 st[idx&0xFF].o_packets++;
304 st[idx&0xFF].o_bytes+=skb->len;
305 st[(idx>>16)&0xFF].i_packets++;
306 st[(idx>>16)&0xFF].i_bytes+=skb->len;
307 }
308#endif
309
310 if (iph->ihl > 5) {
311 struct ip_options *opt;
312
313		/* It looks like overkill, because not all
314		   IP options require packet mangling.
315		   But it is the easiest way for now, especially taking
316		   into account that the combination of IP options
317		   and a running sniffer is an extremely rare condition.
318		                                      --ANK (980813)
319		*/
320
321 if (skb_cow(skb, skb_headroom(skb))) {
322 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
323 goto drop;
324 }
325 iph = skb->nh.iph;
326
327 if (ip_options_compile(NULL, skb))
328 goto inhdr_error;
329
330 opt = &(IPCB(skb)->opt);
331 if (opt->srr) {
332 struct in_device *in_dev = in_dev_get(dev);
333 if (in_dev) {
334 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
335 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
336 printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
337 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
338 in_dev_put(in_dev);
339 goto drop;
340 }
341 in_dev_put(in_dev);
342 }
343 if (ip_options_rcv_srr(skb))
344 goto drop;
345 }
346 }
347
348 return dst_input(skb);
349
350inhdr_error:
351 IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
352drop:
353 kfree_skb(skb);
354 return NET_RX_DROP;
355}
356
357/*
358 * Main IP Receive routine.
359 */
360int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
361{
362 struct iphdr *iph;
363
364 /* When the interface is in promisc. mode, drop all the crap
365 * that it receives, do not try to analyse it.
366 */
367 if (skb->pkt_type == PACKET_OTHERHOST)
368 goto drop;
369
370 IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
371
372 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
373 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
374 goto out;
375 }
376
377 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
378 goto inhdr_error;
379
380 iph = skb->nh.iph;
381
382 /*
383 * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
384 *
385 * Is the datagram acceptable?
386 *
387 * 1. Length at least the size of an ip header
388 * 2. Version of 4
389 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
390 * 4. Doesn't have a bogus length
391 */
392
393 if (iph->ihl < 5 || iph->version != 4)
394 goto inhdr_error;
395
396 if (!pskb_may_pull(skb, iph->ihl*4))
397 goto inhdr_error;
398
399 iph = skb->nh.iph;
400
401 if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
402 goto inhdr_error;
403
404 {
405 __u32 len = ntohs(iph->tot_len);
406 if (skb->len < len || len < (iph->ihl<<2))
407 goto inhdr_error;
408
409		/* Our transport medium may have padded the buffer out. Now that we know it
410		 * is IP, we can trim to the true length of the frame.
411 * Note this now means skb->len holds ntohs(iph->tot_len).
412 */
413 if (pskb_trim_rcsum(skb, len)) {
414 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
415 goto drop;
416 }
417 }
418
419 return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
420 ip_rcv_finish);
421
422inhdr_error:
423 IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
424drop:
425 kfree_skb(skb);
426out:
427 return NET_RX_DROP;
428}
429
430EXPORT_SYMBOL(ip_rcv);
431EXPORT_SYMBOL(ip_statistics);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
new file mode 100644
index 000000000000..6d89f3f3e701
--- /dev/null
+++ b/net/ipv4/ip_options.c
@@ -0,0 +1,625 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The options processing module for ip.c
7 *
8 * Version: $Id: ip_options.c,v 1.21 2001/09/01 00:31:50 davem Exp $
9 *
10 * Authors: A.N.Kuznetsov
11 *
12 */
13
14#include <linux/module.h>
15#include <linux/types.h>
16#include <asm/uaccess.h>
17#include <linux/skbuff.h>
18#include <linux/ip.h>
19#include <linux/icmp.h>
20#include <linux/netdevice.h>
21#include <linux/rtnetlink.h>
22#include <net/sock.h>
23#include <net/ip.h>
24#include <net/icmp.h>
25
26/*
27 * Write options to the IP header, record the destination address in the
28 * source route option and the address of the outgoing interface
29 * (we should already know it, so this function may be
30 * called only after the routing decision), and the timestamp,
31 * if we originate this datagram.
32 *
33 * daddr is the real destination address; the next hop is recorded in the IP header.
34 * saddr is the address of the outgoing interface.
35 */
36
37void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
38 u32 daddr, struct rtable *rt, int is_frag)
39{
40 unsigned char * iph = skb->nh.raw;
41
42 memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
43 memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
44 opt = &(IPCB(skb)->opt);
45 opt->is_data = 0;
46
47 if (opt->srr)
48 memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
49
50 if (!is_frag) {
51 if (opt->rr_needaddr)
52 ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt);
53 if (opt->ts_needaddr)
54 ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt);
55 if (opt->ts_needtime) {
56 struct timeval tv;
57 __u32 midtime;
58 do_gettimeofday(&tv);
59 midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
60 memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
61 }
62 return;
63 }
64 if (opt->rr) {
65 memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
66 opt->rr = 0;
67 opt->rr_needaddr = 0;
68 }
69 if (opt->ts) {
70 memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
71 opt->ts = 0;
72 opt->ts_needaddr = opt->ts_needtime = 0;
73 }
74}
75
76/*
77 * Provided (sopt, skb) points to the received options,
78 * build in dopt a compiled option set appropriate for answering,
79 * i.e. invert the SRR option, copy the others,
80 * and grab room in the RR/TS options.
81 *
82 * NOTE: dopt cannot point to skb.
83 */
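/*
 * Rough idea: the hop addresses recorded in the incoming source route
 * are copied out in reverse order so that the reply retraces the path,
 * and dopt->faddr is set to the last recorded hop, which becomes the
 * first hop of the reply.
 */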
84
85int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
86{
87 struct ip_options *sopt;
88 unsigned char *sptr, *dptr;
89 int soffset, doffset;
90 int optlen;
91 u32 daddr;
92
93 memset(dopt, 0, sizeof(struct ip_options));
94
95 dopt->is_data = 1;
96
97 sopt = &(IPCB(skb)->opt);
98
99 if (sopt->optlen == 0) {
100 dopt->optlen = 0;
101 return 0;
102 }
103
104 sptr = skb->nh.raw;
105 dptr = dopt->__data;
106
107 if (skb->dst)
108 daddr = ((struct rtable*)skb->dst)->rt_spec_dst;
109 else
110 daddr = skb->nh.iph->daddr;
111
112 if (sopt->rr) {
113 optlen = sptr[sopt->rr+1];
114 soffset = sptr[sopt->rr+2];
115 dopt->rr = dopt->optlen + sizeof(struct iphdr);
116 memcpy(dptr, sptr+sopt->rr, optlen);
117 if (sopt->rr_needaddr && soffset <= optlen) {
118 if (soffset + 3 > optlen)
119 return -EINVAL;
120 dptr[2] = soffset + 4;
121 dopt->rr_needaddr = 1;
122 }
123 dptr += optlen;
124 dopt->optlen += optlen;
125 }
126 if (sopt->ts) {
127 optlen = sptr[sopt->ts+1];
128 soffset = sptr[sopt->ts+2];
129 dopt->ts = dopt->optlen + sizeof(struct iphdr);
130 memcpy(dptr, sptr+sopt->ts, optlen);
131 if (soffset <= optlen) {
132 if (sopt->ts_needaddr) {
133 if (soffset + 3 > optlen)
134 return -EINVAL;
135 dopt->ts_needaddr = 1;
136 soffset += 4;
137 }
138 if (sopt->ts_needtime) {
139 if (soffset + 3 > optlen)
140 return -EINVAL;
141 if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
142 dopt->ts_needtime = 1;
143 soffset += 4;
144 } else {
145 dopt->ts_needtime = 0;
146
147 if (soffset + 8 <= optlen) {
148 __u32 addr;
149
150 memcpy(&addr, sptr+soffset-1, 4);
151 if (inet_addr_type(addr) != RTN_LOCAL) {
152 dopt->ts_needtime = 1;
153 soffset += 8;
154 }
155 }
156 }
157 }
158 dptr[2] = soffset;
159 }
160 dptr += optlen;
161 dopt->optlen += optlen;
162 }
163 if (sopt->srr) {
164 unsigned char * start = sptr+sopt->srr;
165 u32 faddr;
166
167 optlen = start[1];
168 soffset = start[2];
169 doffset = 0;
170 if (soffset > optlen)
171 soffset = optlen + 1;
172 soffset -= 4;
173 if (soffset > 3) {
174 memcpy(&faddr, &start[soffset-1], 4);
175 for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4)
176 memcpy(&dptr[doffset-1], &start[soffset-1], 4);
177 /*
178			 *	   RFC1812 requires fixing illegal source routes.
179 */
180 if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0)
181 doffset -= 4;
182 }
183 if (doffset > 3) {
184 memcpy(&start[doffset-1], &daddr, 4);
185 dopt->faddr = faddr;
186 dptr[0] = start[0];
187 dptr[1] = doffset+3;
188 dptr[2] = 4;
189 dptr += doffset+3;
190 dopt->srr = dopt->optlen + sizeof(struct iphdr);
191 dopt->optlen += doffset+3;
192 dopt->is_strictroute = sopt->is_strictroute;
193 }
194 }
195 while (dopt->optlen & 3) {
196 *dptr++ = IPOPT_END;
197 dopt->optlen++;
198 }
199 return 0;
200}
201
202/*
203 * Options "fragmenting", just fill options not
204 * allowed in fragments with NOOPs.
205 * Simple and stupid 8), but the most efficient way.
206 */
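/*
 * For example, a Record Route option (copy bit clear) gets overwritten
 * with NOOPs here, while a Strict Source Route option (copy bit set)
 * is left intact, matching the RFC 791 rule that only "copied" options
 * are replicated into fragments.
 */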
207
208void ip_options_fragment(struct sk_buff * skb)
209{
210 unsigned char * optptr = skb->nh.raw;
211 struct ip_options * opt = &(IPCB(skb)->opt);
212 int l = opt->optlen;
213 int optlen;
214
215 while (l > 0) {
216 switch (*optptr) {
217 case IPOPT_END:
218 return;
219 case IPOPT_NOOP:
220 l--;
221 optptr++;
222 continue;
223 }
224 optlen = optptr[1];
225 if (optlen<2 || optlen>l)
226 return;
227 if (!IPOPT_COPIED(*optptr))
228 memset(optptr, IPOPT_NOOP, optlen);
229 l -= optlen;
230 optptr += optlen;
231 }
232 opt->ts = 0;
233 opt->rr = 0;
234 opt->rr_needaddr = 0;
235 opt->ts_needaddr = 0;
236 opt->ts_needtime = 0;
237 return;
238}
239
240/*
241 * Verify options and fill pointers in struct options.
242 * Caller should clear *opt, and set opt->data.
243 * If opt == NULL, then skb->data should point to IP header.
244 */
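/*
 * Rough layout of the multi-byte options parsed below (RFC 791):
 *
 *	optptr[0]	type octet (copy flag, class, number)
 *	optptr[1]	total option length in octets
 *	optptr[2]	1-based pointer to the next free slot (SRR/RR/TS)
 *	optptr[3]	overflow count and flags (timestamp option only)
 *
 * The pointer starts at 4 for SRR/RR and at 5 for TS, which is what the
 * sanity checks below enforce.
 */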
245
246int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
247{
248 int l;
249 unsigned char * iph;
250 unsigned char * optptr;
251 int optlen;
252 unsigned char * pp_ptr = NULL;
253 struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL;
254
255 if (!opt) {
256 opt = &(IPCB(skb)->opt);
257 memset(opt, 0, sizeof(struct ip_options));
258 iph = skb->nh.raw;
259 opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr);
260 optptr = iph + sizeof(struct iphdr);
261 opt->is_data = 0;
262 } else {
263 optptr = opt->is_data ? opt->__data : (unsigned char*)&(skb->nh.iph[1]);
264 iph = optptr - sizeof(struct iphdr);
265 }
266
267 for (l = opt->optlen; l > 0; ) {
268 switch (*optptr) {
269 case IPOPT_END:
270 for (optptr++, l--; l>0; optptr++, l--) {
271 if (*optptr != IPOPT_END) {
272 *optptr = IPOPT_END;
273 opt->is_changed = 1;
274 }
275 }
276 goto eol;
277 case IPOPT_NOOP:
278 l--;
279 optptr++;
280 continue;
281 }
282 optlen = optptr[1];
283 if (optlen<2 || optlen>l) {
284 pp_ptr = optptr;
285 goto error;
286 }
287 switch (*optptr) {
288 case IPOPT_SSRR:
289 case IPOPT_LSRR:
290 if (optlen < 3) {
291 pp_ptr = optptr + 1;
292 goto error;
293 }
294 if (optptr[2] < 4) {
295 pp_ptr = optptr + 2;
296 goto error;
297 }
298 /* NB: cf RFC-1812 5.2.4.1 */
299 if (opt->srr) {
300 pp_ptr = optptr;
301 goto error;
302 }
303 if (!skb) {
304 if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) {
305 pp_ptr = optptr + 1;
306 goto error;
307 }
308 memcpy(&opt->faddr, &optptr[3], 4);
309 if (optlen > 7)
310 memmove(&optptr[3], &optptr[7], optlen-7);
311 }
312 opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
313 opt->srr = optptr - iph;
314 break;
315 case IPOPT_RR:
316 if (opt->rr) {
317 pp_ptr = optptr;
318 goto error;
319 }
320 if (optlen < 3) {
321 pp_ptr = optptr + 1;
322 goto error;
323 }
324 if (optptr[2] < 4) {
325 pp_ptr = optptr + 2;
326 goto error;
327 }
328 if (optptr[2] <= optlen) {
329 if (optptr[2]+3 > optlen) {
330 pp_ptr = optptr + 2;
331 goto error;
332 }
333 if (skb) {
334 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
335 opt->is_changed = 1;
336 }
337 optptr[2] += 4;
338 opt->rr_needaddr = 1;
339 }
340 opt->rr = optptr - iph;
341 break;
342 case IPOPT_TIMESTAMP:
343 if (opt->ts) {
344 pp_ptr = optptr;
345 goto error;
346 }
347 if (optlen < 4) {
348 pp_ptr = optptr + 1;
349 goto error;
350 }
351 if (optptr[2] < 5) {
352 pp_ptr = optptr + 2;
353 goto error;
354 }
355 if (optptr[2] <= optlen) {
356 __u32 * timeptr = NULL;
357 if (optptr[2]+3 > optptr[1]) {
358 pp_ptr = optptr + 2;
359 goto error;
360 }
361 switch (optptr[3]&0xF) {
362 case IPOPT_TS_TSONLY:
363 opt->ts = optptr - iph;
364 if (skb)
365 timeptr = (__u32*)&optptr[optptr[2]-1];
366 opt->ts_needtime = 1;
367 optptr[2] += 4;
368 break;
369 case IPOPT_TS_TSANDADDR:
370 if (optptr[2]+7 > optptr[1]) {
371 pp_ptr = optptr + 2;
372 goto error;
373 }
374 opt->ts = optptr - iph;
375 if (skb) {
376 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
377 timeptr = (__u32*)&optptr[optptr[2]+3];
378 }
379 opt->ts_needaddr = 1;
380 opt->ts_needtime = 1;
381 optptr[2] += 8;
382 break;
383 case IPOPT_TS_PRESPEC:
384 if (optptr[2]+7 > optptr[1]) {
385 pp_ptr = optptr + 2;
386 goto error;
387 }
388 opt->ts = optptr - iph;
389 {
390 u32 addr;
391 memcpy(&addr, &optptr[optptr[2]-1], 4);
392 if (inet_addr_type(addr) == RTN_UNICAST)
393 break;
394 if (skb)
395 timeptr = (__u32*)&optptr[optptr[2]+3];
396 }
397 opt->ts_needtime = 1;
398 optptr[2] += 8;
399 break;
400 default:
401 if (!skb && !capable(CAP_NET_RAW)) {
402 pp_ptr = optptr + 3;
403 goto error;
404 }
405 break;
406 }
407 if (timeptr) {
408 struct timeval tv;
409 __u32 midtime;
410 do_gettimeofday(&tv);
411 midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
412 memcpy(timeptr, &midtime, sizeof(__u32));
413 opt->is_changed = 1;
414 }
415 } else {
416 unsigned overflow = optptr[3]>>4;
417 if (overflow == 15) {
418 pp_ptr = optptr + 3;
419 goto error;
420 }
421 opt->ts = optptr - iph;
422 if (skb) {
423 optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
424 opt->is_changed = 1;
425 }
426 }
427 break;
428 case IPOPT_RA:
429 if (optlen < 4) {
430 pp_ptr = optptr + 1;
431 goto error;
432 }
433 if (optptr[2] == 0 && optptr[3] == 0)
434 opt->router_alert = optptr - iph;
435 break;
436 case IPOPT_SEC:
437 case IPOPT_SID:
438 default:
439 if (!skb && !capable(CAP_NET_RAW)) {
440 pp_ptr = optptr;
441 goto error;
442 }
443 break;
444 }
445 l -= optlen;
446 optptr += optlen;
447 }
448
449eol:
450 if (!pp_ptr)
451 return 0;
452
453error:
454 if (skb) {
455 icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
456 }
457 return -EINVAL;
458}
459
460
461/*
462 * Undo all the changes done by ip_options_compile().
463 */
464
465void ip_options_undo(struct ip_options * opt)
466{
467 if (opt->srr) {
468 unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr);
469 memmove(optptr+7, optptr+3, optptr[1]-7);
470 memcpy(optptr+3, &opt->faddr, 4);
471 }
472 if (opt->rr_needaddr) {
473 unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr);
474 optptr[2] -= 4;
475 memset(&optptr[optptr[2]-1], 0, 4);
476 }
477 if (opt->ts) {
478 unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr);
479 if (opt->ts_needtime) {
480 optptr[2] -= 4;
481 memset(&optptr[optptr[2]-1], 0, 4);
482 if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
483 optptr[2] -= 4;
484 }
485 if (opt->ts_needaddr) {
486 optptr[2] -= 4;
487 memset(&optptr[optptr[2]-1], 0, 4);
488 }
489 }
490}
491
492int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user)
493{
494 struct ip_options *opt;
495
496 opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL);
497 if (!opt)
498 return -ENOMEM;
499 memset(opt, 0, sizeof(struct ip_options));
500 if (optlen) {
501 if (user) {
502 if (copy_from_user(opt->__data, data, optlen)) {
503 kfree(opt);
504 return -EFAULT;
505 }
506 } else
507 memcpy(opt->__data, data, optlen);
508 }
509 while (optlen & 3)
510 opt->__data[optlen++] = IPOPT_END;
511 opt->optlen = optlen;
512 opt->is_data = 1;
513 opt->is_setbyuser = 1;
514 if (optlen && ip_options_compile(opt, NULL)) {
515 kfree(opt);
516 return -EINVAL;
517 }
518 if (*optp)
519 kfree(*optp);
520 *optp = opt;
521 return 0;
522}
523
524void ip_forward_options(struct sk_buff *skb)
525{
526 struct ip_options * opt = &(IPCB(skb)->opt);
527 unsigned char * optptr;
528 struct rtable *rt = (struct rtable*)skb->dst;
529 unsigned char *raw = skb->nh.raw;
530
531 if (opt->rr_needaddr) {
532 optptr = (unsigned char *)raw + opt->rr;
533 ip_rt_get_source(&optptr[optptr[2]-5], rt);
534 opt->is_changed = 1;
535 }
536 if (opt->srr_is_hit) {
537 int srrptr, srrspace;
538
539 optptr = raw + opt->srr;
540
541 for ( srrptr=optptr[2], srrspace = optptr[1];
542 srrptr <= srrspace;
543 srrptr += 4
544 ) {
545 if (srrptr + 3 > srrspace)
546 break;
547 if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0)
548 break;
549 }
550 if (srrptr + 3 <= srrspace) {
551 opt->is_changed = 1;
552 ip_rt_get_source(&optptr[srrptr-1], rt);
553 skb->nh.iph->daddr = rt->rt_dst;
554 optptr[2] = srrptr+4;
555 } else if (net_ratelimit())
556 printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
557 if (opt->ts_needaddr) {
558 optptr = raw + opt->ts;
559 ip_rt_get_source(&optptr[optptr[2]-9], rt);
560 opt->is_changed = 1;
561 }
562 }
563 if (opt->is_changed) {
564 opt->is_changed = 0;
565 ip_send_check(skb->nh.iph);
566 }
567}
568
569int ip_options_rcv_srr(struct sk_buff *skb)
570{
571 struct ip_options *opt = &(IPCB(skb)->opt);
572 int srrspace, srrptr;
573 u32 nexthop;
574 struct iphdr *iph = skb->nh.iph;
575 unsigned char * optptr = skb->nh.raw + opt->srr;
576 struct rtable *rt = (struct rtable*)skb->dst;
577 struct rtable *rt2;
578 int err;
579
580 if (!opt->srr)
581 return 0;
582
583 if (skb->pkt_type != PACKET_HOST)
584 return -EINVAL;
585 if (rt->rt_type == RTN_UNICAST) {
586 if (!opt->is_strictroute)
587 return 0;
588 icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
589 return -EINVAL;
590 }
591 if (rt->rt_type != RTN_LOCAL)
592 return -EINVAL;
593
594 for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
595 if (srrptr + 3 > srrspace) {
596 icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
597 return -EINVAL;
598 }
599 memcpy(&nexthop, &optptr[srrptr-1], 4);
600
601 rt = (struct rtable*)skb->dst;
602 skb->dst = NULL;
603 err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
604 rt2 = (struct rtable*)skb->dst;
605 if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
606 ip_rt_put(rt2);
607 skb->dst = &rt->u.dst;
608 return -EINVAL;
609 }
610 ip_rt_put(rt);
611 if (rt2->rt_type != RTN_LOCAL)
612 break;
613 /* Superfast 8) loopback forward */
614 memcpy(&iph->daddr, &optptr[srrptr-1], 4);
615 opt->is_changed = 1;
616 }
617 if (srrptr <= srrspace) {
618 opt->srr_is_hit = 1;
619 opt->is_changed = 1;
620 }
621 return 0;
622}
623
624EXPORT_SYMBOL(ip_options_compile);
625EXPORT_SYMBOL(ip_options_undo);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
new file mode 100644
index 000000000000..30ab7b6ab761
--- /dev/null
+++ b/net/ipv4/ip_output.c
@@ -0,0 +1,1359 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Donald Becker, <becker@super.org>
13 * Alan Cox, <Alan.Cox@linux.org>
14 * Richard Underwood
15 * Stefan Becker, <stefanb@yello.ping.de>
16 * Jorge Cwik, <jorge@laser.satlink.net>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 * Hirokazu Takahashi, <taka@valinux.co.jp>
19 *
20 * See ip_input.c for original log
21 *
22 * Fixes:
23 * Alan Cox : Missing nonblock feature in ip_build_xmit.
24 * Mike Kilburn : htons() missing in ip_build_xmit.
25 * Bradford Johnson: Fix faulty handling of some frames when
26 * no route is found.
27 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
28 * (in case if packet not accepted by
29 * output firewall rules)
30 * Mike McLagan : Routing by source
31 * Alexey Kuznetsov: use new route cache
32 * Andi Kleen: Fix broken PMTU recovery and remove
33 * some redundant tests.
34 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
35 * Andi Kleen : Replace ip_reply with ip_send_reply.
36 * Andi Kleen : Split fast and slow ip_build_xmit path
37 * for decreased register pressure on x86
38 * and more readibility.
39 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
40 * silently drop skb instead of failing with -EPERM.
41 * Detlev Wengorz : Copy protocol for fragments.
42 * Hirokazu Takahashi: HW checksumming for outgoing UDP
43 * datagrams.
44 * Hirokazu Takahashi: sendfile() on UDP works now.
45 */
46
47#include <asm/uaccess.h>
48#include <asm/system.h>
49#include <linux/module.h>
50#include <linux/types.h>
51#include <linux/kernel.h>
52#include <linux/sched.h>
53#include <linux/mm.h>
54#include <linux/string.h>
55#include <linux/errno.h>
56#include <linux/config.h>
57
58#include <linux/socket.h>
59#include <linux/sockios.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/etherdevice.h>
64#include <linux/proc_fs.h>
65#include <linux/stat.h>
66#include <linux/init.h>
67
68#include <net/snmp.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/udp.h>
74#include <linux/skbuff.h>
75#include <net/sock.h>
76#include <net/arp.h>
77#include <net/icmp.h>
78#include <net/raw.h>
79#include <net/checksum.h>
80#include <net/inetpeer.h>
81#include <net/checksum.h>
82#include <linux/igmp.h>
83#include <linux/netfilter_ipv4.h>
84#include <linux/netfilter_bridge.h>
85#include <linux/mroute.h>
86#include <linux/netlink.h>
87
88/*
89 * Shall we try to damage output packets if routing dev changes?
90 */
91
92int sysctl_ip_dynaddr;
93int sysctl_ip_default_ttl = IPDEFTTL;
94
95/* Generate a checksum for an outgoing IP datagram. */
96__inline__ void ip_send_check(struct iphdr *iph)
97{
98 iph->check = 0;
99 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
100}
101
102/* dev_loopback_xmit for use with netfilter. */
103static int ip_dev_loopback_xmit(struct sk_buff *newskb)
104{
105 newskb->mac.raw = newskb->data;
106 __skb_pull(newskb, newskb->nh.raw - newskb->data);
107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst);
110
111#ifdef CONFIG_NETFILTER_DEBUG
112 nf_debug_ip_loopback_xmit(newskb);
113#endif
114 netif_rx(newskb);
115 return 0;
116}
117
118static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
119{
120 int ttl = inet->uc_ttl;
121
122 if (ttl < 0)
123 ttl = dst_metric(dst, RTAX_HOPLIMIT);
124 return ttl;
125}
126
127/*
128 * Add an ip header to a skbuff and send it out.
129 *
130 */
131int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
132 u32 saddr, u32 daddr, struct ip_options *opt)
133{
134 struct inet_sock *inet = inet_sk(sk);
135 struct rtable *rt = (struct rtable *)skb->dst;
136 struct iphdr *iph;
137
138 /* Build the IP header. */
139 if (opt)
140 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
141 else
142 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
143
144 iph->version = 4;
145 iph->ihl = 5;
146 iph->tos = inet->tos;
147 if (ip_dont_fragment(sk, &rt->u.dst))
148 iph->frag_off = htons(IP_DF);
149 else
150 iph->frag_off = 0;
151 iph->ttl = ip_select_ttl(inet, &rt->u.dst);
152 iph->daddr = rt->rt_dst;
153 iph->saddr = rt->rt_src;
154 iph->protocol = sk->sk_protocol;
155 iph->tot_len = htons(skb->len);
156 ip_select_ident(iph, &rt->u.dst, sk);
157 skb->nh.iph = iph;
158
159 if (opt && opt->optlen) {
160 iph->ihl += opt->optlen>>2;
161 ip_options_build(skb, opt, daddr, rt, 0);
162 }
163 ip_send_check(iph);
164
165 skb->priority = sk->sk_priority;
166
167 /* Send it out. */
168 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
169 dst_output);
170}
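/*
 * A small numeric illustration of the header-length handling above
 * (figures are examples only): ihl counts 32-bit words, so the bare
 * 20 byte header is ihl = 5; appending, say, 8 bytes of options gives
 * optlen >> 2 == 2 and hence ihl = 7, i.e. a 28 byte header, which is
 * exactly the region ip_fast_csum() covers when ip_send_check() runs.
 */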
171
172static inline int ip_finish_output2(struct sk_buff *skb)
173{
174 struct dst_entry *dst = skb->dst;
175 struct hh_cache *hh = dst->hh;
176 struct net_device *dev = dst->dev;
177 int hh_len = LL_RESERVED_SPACE(dev);
178
179 /* Be paranoid, rather than too clever. */
180 if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
181 struct sk_buff *skb2;
182
183 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
184 if (skb2 == NULL) {
185 kfree_skb(skb);
186 return -ENOMEM;
187 }
188 if (skb->sk)
189 skb_set_owner_w(skb2, skb->sk);
190 kfree_skb(skb);
191 skb = skb2;
192 }
193
194#ifdef CONFIG_NETFILTER_DEBUG
195 nf_debug_ip_finish_output2(skb);
196#endif /*CONFIG_NETFILTER_DEBUG*/
197
198 if (hh) {
199 int hh_alen;
200
201 read_lock_bh(&hh->hh_lock);
202 hh_alen = HH_DATA_ALIGN(hh->hh_len);
203 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
204 read_unlock_bh(&hh->hh_lock);
205 skb_push(skb, hh->hh_len);
206 return hh->hh_output(skb);
207 } else if (dst->neighbour)
208 return dst->neighbour->output(skb);
209
210 if (net_ratelimit())
211 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
212 kfree_skb(skb);
213 return -EINVAL;
214}
215
216int ip_finish_output(struct sk_buff *skb)
217{
218 struct net_device *dev = skb->dst->dev;
219
220 skb->dev = dev;
221 skb->protocol = htons(ETH_P_IP);
222
223 return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
224 ip_finish_output2);
225}
226
227int ip_mc_output(struct sk_buff *skb)
228{
229 struct sock *sk = skb->sk;
230 struct rtable *rt = (struct rtable*)skb->dst;
231 struct net_device *dev = rt->u.dst.dev;
232
233 /*
234 * If the indicated interface is up and running, send the packet.
235 */
236 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
237
238 skb->dev = dev;
239 skb->protocol = htons(ETH_P_IP);
240
241 /*
242 * Multicasts are looped back for other local users
243 */
244
245 if (rt->rt_flags&RTCF_MULTICAST) {
246 if ((!sk || inet_sk(sk)->mc_loop)
247#ifdef CONFIG_IP_MROUTE
 248		    /* Small optimization: do not loop back non-local frames
 249		       that were returned after forwarding; they will be dropped
 250		       by ip_mr_input in any case.
 251		       Note that local frames are looped back to be delivered
 252		       to local recipients.
 253
 254		       This check is duplicated in ip_mr_input at the moment.
 255		     */
256 && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
257#endif
258 ) {
259 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
260 if (newskb)
261 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
262 newskb->dev,
263 ip_dev_loopback_xmit);
264 }
265
266 /* Multicasts with ttl 0 must not go beyond the host */
267
268 if (skb->nh.iph->ttl == 0) {
269 kfree_skb(skb);
270 return 0;
271 }
272 }
273
274 if (rt->rt_flags&RTCF_BROADCAST) {
275 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
276 if (newskb)
277 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
278 newskb->dev, ip_dev_loopback_xmit);
279 }
280
281 if (skb->len > dst_mtu(&rt->u.dst))
282 return ip_fragment(skb, ip_finish_output);
283 else
284 return ip_finish_output(skb);
285}
286
287int ip_output(struct sk_buff *skb)
288{
289 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
290
291 if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
292 return ip_fragment(skb, ip_finish_output);
293 else
294 return ip_finish_output(skb);
295}
296
297int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
298{
299 struct sock *sk = skb->sk;
300 struct inet_sock *inet = inet_sk(sk);
301 struct ip_options *opt = inet->opt;
302 struct rtable *rt;
303 struct iphdr *iph;
304
305 /* Skip all of this if the packet is already routed,
 306	 * e.g. by something like SCTP.
307 */
308 rt = (struct rtable *) skb->dst;
309 if (rt != NULL)
310 goto packet_routed;
311
312 /* Make sure we can route this packet. */
313 rt = (struct rtable *)__sk_dst_check(sk, 0);
314 if (rt == NULL) {
315 u32 daddr;
316
317 /* Use correct destination address if we have options. */
318 daddr = inet->daddr;
319 if(opt && opt->srr)
320 daddr = opt->faddr;
321
322 {
323 struct flowi fl = { .oif = sk->sk_bound_dev_if,
324 .nl_u = { .ip4_u =
325 { .daddr = daddr,
326 .saddr = inet->saddr,
327 .tos = RT_CONN_FLAGS(sk) } },
328 .proto = sk->sk_protocol,
329 .uli_u = { .ports =
330 { .sport = inet->sport,
331 .dport = inet->dport } } };
332
 333			/* If this fails, the retransmit mechanism of the transport layer will
 334			 * keep trying until a route appears or the connection times
335 * itself out.
336 */
337 if (ip_route_output_flow(&rt, &fl, sk, 0))
338 goto no_route;
339 }
340 __sk_dst_set(sk, &rt->u.dst);
341 tcp_v4_setup_caps(sk, &rt->u.dst);
342 }
343 skb->dst = dst_clone(&rt->u.dst);
344
345packet_routed:
346 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
347 goto no_route;
348
349 /* OK, we know where to send it, allocate and build IP header. */
350 iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
351 *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
352 iph->tot_len = htons(skb->len);
353 if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
354 iph->frag_off = htons(IP_DF);
355 else
356 iph->frag_off = 0;
357 iph->ttl = ip_select_ttl(inet, &rt->u.dst);
358 iph->protocol = sk->sk_protocol;
359 iph->saddr = rt->rt_src;
360 iph->daddr = rt->rt_dst;
361 skb->nh.iph = iph;
 362	/* The transport layer sets skb->h.foo itself. */
363
364 if (opt && opt->optlen) {
365 iph->ihl += opt->optlen >> 2;
366 ip_options_build(skb, opt, inet->daddr, rt, 0);
367 }
368
369 ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
370
371 /* Add an IP checksum. */
372 ip_send_check(iph);
373
374 skb->priority = sk->sk_priority;
375
376 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
377 dst_output);
378
379no_route:
380 IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
381 kfree_skb(skb);
382 return -EHOSTUNREACH;
383}
384
385
386static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
387{
388 to->pkt_type = from->pkt_type;
389 to->priority = from->priority;
390 to->protocol = from->protocol;
391 to->security = from->security;
392 dst_release(to->dst);
393 to->dst = dst_clone(from->dst);
394 to->dev = from->dev;
395
396 /* Copy the flags to each fragment. */
397 IPCB(to)->flags = IPCB(from)->flags;
398
399#ifdef CONFIG_NET_SCHED
400 to->tc_index = from->tc_index;
401#endif
402#ifdef CONFIG_NETFILTER
403 to->nfmark = from->nfmark;
404 to->nfcache = from->nfcache;
405 /* Connection association is same as pre-frag packet */
406 nf_conntrack_put(to->nfct);
407 to->nfct = from->nfct;
408 nf_conntrack_get(to->nfct);
409 to->nfctinfo = from->nfctinfo;
410#ifdef CONFIG_BRIDGE_NETFILTER
411 nf_bridge_put(to->nf_bridge);
412 to->nf_bridge = from->nf_bridge;
413 nf_bridge_get(to->nf_bridge);
414#endif
415#ifdef CONFIG_NETFILTER_DEBUG
416 to->nf_debug = from->nf_debug;
417#endif
418#endif
419}
420
 421/*
 422 *	This IP datagram is too large to be sent in one piece.  Break it up into
 423 *	smaller pieces (each of a size equal to the IP header plus a block of
 424 *	the data of the original IP datagram) that will still fit in a
 425 *	single device frame, and queue such frames for sending.
 426 */
427
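/*
 * A worked example of the arithmetic below (figures are illustrative):
 * with a 1500 byte MTU and a 20 byte header each fragment carries at most
 * 1480 bytes of payload, and non-final fragments are trimmed to a multiple
 * of 8 (1480 already is). A 4000 byte payload therefore goes out as
 * 1480 + 1480 + 1040 bytes with fragment offsets 0, 185 and 370 (the
 * offset field counts 8-byte units) and IP_MF set on all but the last.
 */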
428int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
429{
430 struct iphdr *iph;
431 int raw = 0;
432 int ptr;
433 struct net_device *dev;
434 struct sk_buff *skb2;
435 unsigned int mtu, hlen, left, len, ll_rs;
436 int offset;
437 int not_last_frag;
438 struct rtable *rt = (struct rtable*)skb->dst;
439 int err = 0;
440
441 dev = rt->u.dst.dev;
442
443 /*
444 * Point into the IP datagram header.
445 */
446
447 iph = skb->nh.iph;
448
449 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
450 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
451 htonl(dst_mtu(&rt->u.dst)));
452 kfree_skb(skb);
453 return -EMSGSIZE;
454 }
455
456 /*
457 * Setup starting values.
458 */
459
460 hlen = iph->ihl * 4;
461 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
462
 463	/* When a frag_list is given, use it. First, check its validity:
 464	 * some transformers could create a wrong frag_list or break an existing
 465	 * one; that is not prohibited. In this case fall back to copying.
 466	 *
 467	 * LATER: this step can be merged into the real generation of fragments;
 468	 * we can switch to copying when we see the first bad fragment.
 469	 */
470 if (skb_shinfo(skb)->frag_list) {
471 struct sk_buff *frag;
472 int first_len = skb_pagelen(skb);
473
474 if (first_len - hlen > mtu ||
475 ((first_len - hlen) & 7) ||
476 (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
477 skb_cloned(skb))
478 goto slow_path;
479
480 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
481 /* Correct geometry. */
482 if (frag->len > mtu ||
483 ((frag->len & 7) && frag->next) ||
484 skb_headroom(frag) < hlen)
485 goto slow_path;
486
487 /* Partially cloned skb? */
488 if (skb_shared(frag))
489 goto slow_path;
490 }
491
492 /* Everything is OK. Generate! */
493
494 err = 0;
495 offset = 0;
496 frag = skb_shinfo(skb)->frag_list;
497 skb_shinfo(skb)->frag_list = NULL;
498 skb->data_len = first_len - skb_headlen(skb);
499 skb->len = first_len;
500 iph->tot_len = htons(first_len);
501 iph->frag_off = htons(IP_MF);
502 ip_send_check(iph);
503
504 for (;;) {
 505			/* Prepare the header of the next frame
 506			 * before the previous one goes down. */
507 if (frag) {
508 frag->ip_summed = CHECKSUM_NONE;
509 frag->h.raw = frag->data;
510 frag->nh.raw = __skb_push(frag, hlen);
511 memcpy(frag->nh.raw, iph, hlen);
512 iph = frag->nh.iph;
513 iph->tot_len = htons(frag->len);
514 ip_copy_metadata(frag, skb);
515 if (offset == 0)
516 ip_options_fragment(frag);
517 offset += skb->len - hlen;
518 iph->frag_off = htons(offset>>3);
519 if (frag->next != NULL)
520 iph->frag_off |= htons(IP_MF);
521 /* Ready, complete checksum */
522 ip_send_check(iph);
523 }
524
525 err = output(skb);
526
527 if (err || !frag)
528 break;
529
530 skb = frag;
531 frag = skb->next;
532 skb->next = NULL;
533 }
534
535 if (err == 0) {
536 IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
537 return 0;
538 }
539
540 while (frag) {
541 skb = frag->next;
542 kfree_skb(frag);
543 frag = skb;
544 }
545 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
546 return err;
547 }
548
549slow_path:
550 left = skb->len - hlen; /* Space per frame */
551 ptr = raw + hlen; /* Where to start from */
552
553#ifdef CONFIG_BRIDGE_NETFILTER
 554	/* for bridged IP traffic encapsulated inside e.g. a VLAN header,
 555	 * we need to make room for the encapsulating header */
556 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
557 mtu -= nf_bridge_pad(skb);
558#else
559 ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
560#endif
561 /*
562 * Fragment the datagram.
563 */
564
565 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
566 not_last_frag = iph->frag_off & htons(IP_MF);
567
568 /*
569 * Keep copying data until we run out.
570 */
571
572 while(left > 0) {
573 len = left;
574 /* IF: it doesn't fit, use 'mtu' - the data space left */
575 if (len > mtu)
576 len = mtu;
 577		/* IF: we are not sending up to and including the packet end
 578		   then align the next start on an eight byte boundary */
579 if (len < left) {
580 len &= ~7;
581 }
582 /*
583 * Allocate buffer.
584 */
585
586 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
587 NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
588 err = -ENOMEM;
589 goto fail;
590 }
591
592 /*
593 * Set up data on packet
594 */
595
596 ip_copy_metadata(skb2, skb);
597 skb_reserve(skb2, ll_rs);
598 skb_put(skb2, len + hlen);
599 skb2->nh.raw = skb2->data;
600 skb2->h.raw = skb2->data + hlen;
601
602 /*
603 * Charge the memory for the fragment to any owner
604 * it might possess
605 */
606
607 if (skb->sk)
608 skb_set_owner_w(skb2, skb->sk);
609
610 /*
611 * Copy the packet header into the new buffer.
612 */
613
614 memcpy(skb2->nh.raw, skb->data, hlen);
615
616 /*
617 * Copy a block of the IP datagram.
618 */
619 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
620 BUG();
621 left -= len;
622
623 /*
624 * Fill in the new header fields.
625 */
626 iph = skb2->nh.iph;
627 iph->frag_off = htons((offset >> 3));
628
 629		/* ANK: dirty, but effective trick. Upgrade the options only if
 630		 * the segment to be fragmented was THE FIRST (otherwise the
 631		 * options are already fixed) and do it ONCE
 632		 * on the initial skb, so that all the following fragments
 633		 * will inherit the fixed options.
 634		 */
635 if (offset == 0)
636 ip_options_fragment(skb);
637
638 /*
639 * Added AC : If we are fragmenting a fragment that's not the
 640		 *	last fragment then keep the MF bit set on each fragment
641 */
642 if (left > 0 || not_last_frag)
643 iph->frag_off |= htons(IP_MF);
644 ptr += len;
645 offset += len;
646
647 /*
648 * Put this fragment into the sending queue.
649 */
650
651 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
652
653 iph->tot_len = htons(len + hlen);
654
655 ip_send_check(iph);
656
657 err = output(skb2);
658 if (err)
659 goto fail;
660 }
661 kfree_skb(skb);
662 IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
663 return err;
664
665fail:
666 kfree_skb(skb);
667 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
668 return err;
669}
670
671int
672ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
673{
674 struct iovec *iov = from;
675
676 if (skb->ip_summed == CHECKSUM_HW) {
677 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
678 return -EFAULT;
679 } else {
680 unsigned int csum = 0;
681 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
682 return -EFAULT;
683 skb->csum = csum_block_add(skb->csum, csum, odd);
684 }
685 return 0;
686}
687
688static inline unsigned int
689csum_page(struct page *page, int offset, int copy)
690{
691 char *kaddr;
692 unsigned int csum;
693 kaddr = kmap(page);
694 csum = csum_partial(kaddr + offset, copy, 0);
695 kunmap(page);
696 return csum;
697}
698
 699/*
 700 *	ip_append_data() and ip_append_page() can make one large IP datagram
 701 *	from many pieces of data. Each piece will be held on the socket
 702 *	until ip_push_pending_frames() is called. Each piece can be a page
 703 *	or non-page data.
 704 *
 705 *	Transport protocols other than UDP - e.g. raw sockets - can
 706 *	potentially use this interface as well.
 707 *
 708 *	LATER: length must be adjusted by the pad at the tail, when required.
 709 */
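/*
 * A rough sketch of how a datagram sender can drive this interface (the
 * sequence is illustrative and not lifted from any particular protocol;
 * msg, ipc and rt stand for the caller's msghdr, ipcm_cookie and route):
 *
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *			     sizeof(struct udphdr), &ipc, rt,
 *			     msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 */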
710int ip_append_data(struct sock *sk,
711 int getfrag(void *from, char *to, int offset, int len,
712 int odd, struct sk_buff *skb),
713 void *from, int length, int transhdrlen,
714 struct ipcm_cookie *ipc, struct rtable *rt,
715 unsigned int flags)
716{
717 struct inet_sock *inet = inet_sk(sk);
718 struct sk_buff *skb;
719
720 struct ip_options *opt = NULL;
721 int hh_len;
722 int exthdrlen;
723 int mtu;
724 int copy;
725 int err;
726 int offset = 0;
727 unsigned int maxfraglen, fragheaderlen;
728 int csummode = CHECKSUM_NONE;
729
730 if (flags&MSG_PROBE)
731 return 0;
732
733 if (skb_queue_empty(&sk->sk_write_queue)) {
734 /*
735 * setup for corking.
736 */
737 opt = ipc->opt;
738 if (opt) {
739 if (inet->cork.opt == NULL) {
740 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
741 if (unlikely(inet->cork.opt == NULL))
742 return -ENOBUFS;
743 }
744 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
745 inet->cork.flags |= IPCORK_OPT;
746 inet->cork.addr = ipc->addr;
747 }
748 dst_hold(&rt->u.dst);
749 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
750 inet->cork.rt = rt;
751 inet->cork.length = 0;
752 sk->sk_sndmsg_page = NULL;
753 sk->sk_sndmsg_off = 0;
754 if ((exthdrlen = rt->u.dst.header_len) != 0) {
755 length += exthdrlen;
756 transhdrlen += exthdrlen;
757 }
758 } else {
759 rt = inet->cork.rt;
760 if (inet->cork.flags & IPCORK_OPT)
761 opt = inet->cork.opt;
762
763 transhdrlen = 0;
764 exthdrlen = 0;
765 mtu = inet->cork.fragsize;
766 }
767 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
768
769 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
770 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
771
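	/*
	 * Numeric illustration (example figures only): with mtu = 1500 and
	 * no IP options, fragheaderlen = 20 and maxfraglen = 1500; with
	 * 12 bytes of options fragheaderlen = 32 and maxfraglen =
	 * ((1500 - 32) & ~7) + 32 = 1496, so every fragment's payload stays
	 * a multiple of 8 bytes.
	 */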
772 if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
773 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
774 return -EMSGSIZE;
775 }
776
777 /*
 778	 * transhdrlen > 0 means that this is the first fragment and we wish
 779	 * it not to be fragmented in the future.
780 */
781 if (transhdrlen &&
782 length + fragheaderlen <= mtu &&
783 rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
784 !exthdrlen)
785 csummode = CHECKSUM_HW;
786
787 inet->cork.length += length;
788
789 /* So, what's going on in the loop below?
790 *
 791	 * We use the calculated fragment length to generate a chained skb;
 792	 * each of its segments is an IP fragment ready for sending to the
 793	 * network after adding the appropriate IP header.
794 */
795
796 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
797 goto alloc_new_skb;
798
799 while (length > 0) {
800 /* Check if the remaining data fits into current packet. */
801 copy = mtu - skb->len;
802 if (copy < length)
803 copy = maxfraglen - skb->len;
804 if (copy <= 0) {
805 char *data;
806 unsigned int datalen;
807 unsigned int fraglen;
808 unsigned int fraggap;
809 unsigned int alloclen;
810 struct sk_buff *skb_prev;
811alloc_new_skb:
812 skb_prev = skb;
813 if (skb_prev)
814 fraggap = skb_prev->len - maxfraglen;
815 else
816 fraggap = 0;
817
818 /*
819 * If remaining data exceeds the mtu,
820 * we know we need more fragment(s).
821 */
822 datalen = length + fraggap;
823 if (datalen > mtu - fragheaderlen)
824 datalen = maxfraglen - fragheaderlen;
825 fraglen = datalen + fragheaderlen;
826
827 if ((flags & MSG_MORE) &&
828 !(rt->u.dst.dev->features&NETIF_F_SG))
829 alloclen = mtu;
830 else
831 alloclen = datalen + fragheaderlen;
832
 833			/* The last fragment gets additional space at the tail.
 834			 * Note that with MSG_MORE we overallocate on fragments,
 835			 * because we have no idea which fragment will be
 836			 * the last.
837 */
838 if (datalen == length)
839 alloclen += rt->u.dst.trailer_len;
840
841 if (transhdrlen) {
842 skb = sock_alloc_send_skb(sk,
843 alloclen + hh_len + 15,
844 (flags & MSG_DONTWAIT), &err);
845 } else {
846 skb = NULL;
847 if (atomic_read(&sk->sk_wmem_alloc) <=
848 2 * sk->sk_sndbuf)
849 skb = sock_wmalloc(sk,
850 alloclen + hh_len + 15, 1,
851 sk->sk_allocation);
852 if (unlikely(skb == NULL))
853 err = -ENOBUFS;
854 }
855 if (skb == NULL)
856 goto error;
857
858 /*
859 * Fill in the control structures
860 */
861 skb->ip_summed = csummode;
862 skb->csum = 0;
863 skb_reserve(skb, hh_len);
864
865 /*
866 * Find where to start putting bytes.
867 */
868 data = skb_put(skb, fraglen);
869 skb->nh.raw = data + exthdrlen;
870 data += fragheaderlen;
871 skb->h.raw = data + exthdrlen;
872
873 if (fraggap) {
874 skb->csum = skb_copy_and_csum_bits(
875 skb_prev, maxfraglen,
876 data + transhdrlen, fraggap, 0);
877 skb_prev->csum = csum_sub(skb_prev->csum,
878 skb->csum);
879 data += fraggap;
880 skb_trim(skb_prev, maxfraglen);
881 }
882
883 copy = datalen - transhdrlen - fraggap;
884 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
885 err = -EFAULT;
886 kfree_skb(skb);
887 goto error;
888 }
889
890 offset += copy;
891 length -= datalen - fraggap;
892 transhdrlen = 0;
893 exthdrlen = 0;
894 csummode = CHECKSUM_NONE;
895
896 /*
897 * Put the packet on the pending queue.
898 */
899 __skb_queue_tail(&sk->sk_write_queue, skb);
900 continue;
901 }
902
903 if (copy > length)
904 copy = length;
905
906 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
907 unsigned int off;
908
909 off = skb->len;
910 if (getfrag(from, skb_put(skb, copy),
911 offset, copy, off, skb) < 0) {
912 __skb_trim(skb, off);
913 err = -EFAULT;
914 goto error;
915 }
916 } else {
917 int i = skb_shinfo(skb)->nr_frags;
918 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
919 struct page *page = sk->sk_sndmsg_page;
920 int off = sk->sk_sndmsg_off;
921 unsigned int left;
922
923 if (page && (left = PAGE_SIZE - off) > 0) {
924 if (copy >= left)
925 copy = left;
926 if (page != frag->page) {
927 if (i == MAX_SKB_FRAGS) {
928 err = -EMSGSIZE;
929 goto error;
930 }
931 get_page(page);
932 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
933 frag = &skb_shinfo(skb)->frags[i];
934 }
935 } else if (i < MAX_SKB_FRAGS) {
936 if (copy > PAGE_SIZE)
937 copy = PAGE_SIZE;
938 page = alloc_pages(sk->sk_allocation, 0);
939 if (page == NULL) {
940 err = -ENOMEM;
941 goto error;
942 }
943 sk->sk_sndmsg_page = page;
944 sk->sk_sndmsg_off = 0;
945
946 skb_fill_page_desc(skb, i, page, 0, 0);
947 frag = &skb_shinfo(skb)->frags[i];
948 skb->truesize += PAGE_SIZE;
949 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
950 } else {
951 err = -EMSGSIZE;
952 goto error;
953 }
954 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
955 err = -EFAULT;
956 goto error;
957 }
958 sk->sk_sndmsg_off += copy;
959 frag->size += copy;
960 skb->len += copy;
961 skb->data_len += copy;
962 }
963 offset += copy;
964 length -= copy;
965 }
966
967 return 0;
968
969error:
970 inet->cork.length -= length;
971 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
972 return err;
973}
974
975ssize_t ip_append_page(struct sock *sk, struct page *page,
976 int offset, size_t size, int flags)
977{
978 struct inet_sock *inet = inet_sk(sk);
979 struct sk_buff *skb;
980 struct rtable *rt;
981 struct ip_options *opt = NULL;
982 int hh_len;
983 int mtu;
984 int len;
985 int err;
986 unsigned int maxfraglen, fragheaderlen, fraggap;
987
988 if (inet->hdrincl)
989 return -EPERM;
990
991 if (flags&MSG_PROBE)
992 return 0;
993
994 if (skb_queue_empty(&sk->sk_write_queue))
995 return -EINVAL;
996
997 rt = inet->cork.rt;
998 if (inet->cork.flags & IPCORK_OPT)
999 opt = inet->cork.opt;
1000
1001 if (!(rt->u.dst.dev->features&NETIF_F_SG))
1002 return -EOPNOTSUPP;
1003
1004 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1005 mtu = inet->cork.fragsize;
1006
1007 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1008 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1009
1010 if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1011 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1012 return -EMSGSIZE;
1013 }
1014
1015 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1016 return -EINVAL;
1017
1018 inet->cork.length += size;
1019
1020 while (size > 0) {
1021 int i;
1022
1023 /* Check if the remaining data fits into current packet. */
1024 len = mtu - skb->len;
1025 if (len < size)
1026 len = maxfraglen - skb->len;
1027 if (len <= 0) {
1028 struct sk_buff *skb_prev;
1029 char *data;
1030 struct iphdr *iph;
1031 int alloclen;
1032
1033 skb_prev = skb;
1034 if (skb_prev)
1035 fraggap = skb_prev->len - maxfraglen;
1036 else
1037 fraggap = 0;
1038
1039 alloclen = fragheaderlen + hh_len + fraggap + 15;
1040 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1041 if (unlikely(!skb)) {
1042 err = -ENOBUFS;
1043 goto error;
1044 }
1045
1046 /*
1047 * Fill in the control structures
1048 */
1049 skb->ip_summed = CHECKSUM_NONE;
1050 skb->csum = 0;
1051 skb_reserve(skb, hh_len);
1052
1053 /*
1054 * Find where to start putting bytes.
1055 */
1056 data = skb_put(skb, fragheaderlen + fraggap);
1057 skb->nh.iph = iph = (struct iphdr *)data;
1058 data += fragheaderlen;
1059 skb->h.raw = data;
1060
1061 if (fraggap) {
1062 skb->csum = skb_copy_and_csum_bits(
1063 skb_prev, maxfraglen,
1064 data, fraggap, 0);
1065 skb_prev->csum = csum_sub(skb_prev->csum,
1066 skb->csum);
1067 skb_trim(skb_prev, maxfraglen);
1068 }
1069
1070 /*
1071 * Put the packet on the pending queue.
1072 */
1073 __skb_queue_tail(&sk->sk_write_queue, skb);
1074 continue;
1075 }
1076
1077 i = skb_shinfo(skb)->nr_frags;
1078 if (len > size)
1079 len = size;
1080 if (skb_can_coalesce(skb, i, page, offset)) {
1081 skb_shinfo(skb)->frags[i-1].size += len;
1082 } else if (i < MAX_SKB_FRAGS) {
1083 get_page(page);
1084 skb_fill_page_desc(skb, i, page, offset, len);
1085 } else {
1086 err = -EMSGSIZE;
1087 goto error;
1088 }
1089
1090 if (skb->ip_summed == CHECKSUM_NONE) {
1091 unsigned int csum;
1092 csum = csum_page(page, offset, len);
1093 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1094 }
1095
1096 skb->len += len;
1097 skb->data_len += len;
1098 offset += len;
1099 size -= len;
1100 }
1101 return 0;
1102
1103error:
1104 inet->cork.length -= size;
1105 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1106 return err;
1107}
1108
1109/*
 1110 *	Combine all pending IP fragments on the socket into one IP datagram
1111 * and push them out.
1112 */
1113int ip_push_pending_frames(struct sock *sk)
1114{
1115 struct sk_buff *skb, *tmp_skb;
1116 struct sk_buff **tail_skb;
1117 struct inet_sock *inet = inet_sk(sk);
1118 struct ip_options *opt = NULL;
1119 struct rtable *rt = inet->cork.rt;
1120 struct iphdr *iph;
1121 int df = 0;
1122 __u8 ttl;
1123 int err = 0;
1124
1125 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1126 goto out;
1127 tail_skb = &(skb_shinfo(skb)->frag_list);
1128
1129 /* move skb->data to ip header from ext header */
1130 if (skb->data < skb->nh.raw)
1131 __skb_pull(skb, skb->nh.raw - skb->data);
1132 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1133 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1134 *tail_skb = tmp_skb;
1135 tail_skb = &(tmp_skb->next);
1136 skb->len += tmp_skb->len;
1137 skb->data_len += tmp_skb->len;
1138 skb->truesize += tmp_skb->truesize;
1139 __sock_put(tmp_skb->sk);
1140 tmp_skb->destructor = NULL;
1141 tmp_skb->sk = NULL;
1142 }
1143
 1144	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
 1145	 * allow the frame generated here to be fragmented. No matter how
 1146	 * transforms change the size of the packet, it will come out.
1147 */
1148 if (inet->pmtudisc != IP_PMTUDISC_DO)
1149 skb->local_df = 1;
1150
 1151	/* The DF bit is set when we want to see DF on outgoing frames.
 1152	 * If local_df is set too, we still allow this frame to be fragmented
 1153	 * locally. */
1154 if (inet->pmtudisc == IP_PMTUDISC_DO ||
1155 (skb->len <= dst_mtu(&rt->u.dst) &&
1156 ip_dont_fragment(sk, &rt->u.dst)))
1157 df = htons(IP_DF);
1158
1159 if (inet->cork.flags & IPCORK_OPT)
1160 opt = inet->cork.opt;
1161
1162 if (rt->rt_type == RTN_MULTICAST)
1163 ttl = inet->mc_ttl;
1164 else
1165 ttl = ip_select_ttl(inet, &rt->u.dst);
1166
1167 iph = (struct iphdr *)skb->data;
1168 iph->version = 4;
1169 iph->ihl = 5;
1170 if (opt) {
1171 iph->ihl += opt->optlen>>2;
1172 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1173 }
1174 iph->tos = inet->tos;
1175 iph->tot_len = htons(skb->len);
1176 iph->frag_off = df;
1177 if (!df) {
1178 __ip_select_ident(iph, &rt->u.dst, 0);
1179 } else {
1180 iph->id = htons(inet->id++);
1181 }
1182 iph->ttl = ttl;
1183 iph->protocol = sk->sk_protocol;
1184 iph->saddr = rt->rt_src;
1185 iph->daddr = rt->rt_dst;
1186 ip_send_check(iph);
1187
1188 skb->priority = sk->sk_priority;
1189 skb->dst = dst_clone(&rt->u.dst);
1190
 1191	/* Netfilter gets the whole, not yet fragmented skb. */
1192 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1193 skb->dst->dev, dst_output);
1194 if (err) {
1195 if (err > 0)
1196 err = inet->recverr ? net_xmit_errno(err) : 0;
1197 if (err)
1198 goto error;
1199 }
1200
1201out:
1202 inet->cork.flags &= ~IPCORK_OPT;
1203 if (inet->cork.opt) {
1204 kfree(inet->cork.opt);
1205 inet->cork.opt = NULL;
1206 }
1207 if (inet->cork.rt) {
1208 ip_rt_put(inet->cork.rt);
1209 inet->cork.rt = NULL;
1210 }
1211 return err;
1212
1213error:
1214 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1215 goto out;
1216}
1217
1218/*
1219 * Throw away all pending data on the socket.
1220 */
1221void ip_flush_pending_frames(struct sock *sk)
1222{
1223 struct inet_sock *inet = inet_sk(sk);
1224 struct sk_buff *skb;
1225
1226 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1227 kfree_skb(skb);
1228
1229 inet->cork.flags &= ~IPCORK_OPT;
1230 if (inet->cork.opt) {
1231 kfree(inet->cork.opt);
1232 inet->cork.opt = NULL;
1233 }
1234 if (inet->cork.rt) {
1235 ip_rt_put(inet->cork.rt);
1236 inet->cork.rt = NULL;
1237 }
1238}
1239
1240
1241/*
1242 * Fetch data from kernel space and fill in checksum if needed.
1243 */
1244static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1245 int len, int odd, struct sk_buff *skb)
1246{
1247 unsigned int csum;
1248
1249 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1250 skb->csum = csum_block_add(skb->csum, csum, odd);
1251 return 0;
1252}
1253
1254/*
 1255 *	Generic function to send a packet as a reply to another packet.
 1256 *	Used to send TCP resets so far. ICMP should use this function too.
 1257 *
 1258 *	Should run single-threaded per socket because it uses the sock
1259 * structure to pass arguments.
1260 *
1261 * LATER: switch from ip_build_xmit to ip_append_*
1262 */
1263void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1264 unsigned int len)
1265{
1266 struct inet_sock *inet = inet_sk(sk);
1267 struct {
1268 struct ip_options opt;
1269 char data[40];
1270 } replyopts;
1271 struct ipcm_cookie ipc;
1272 u32 daddr;
1273 struct rtable *rt = (struct rtable*)skb->dst;
1274
1275 if (ip_options_echo(&replyopts.opt, skb))
1276 return;
1277
1278 daddr = ipc.addr = rt->rt_src;
1279 ipc.opt = NULL;
1280
1281 if (replyopts.opt.optlen) {
1282 ipc.opt = &replyopts.opt;
1283
1284 if (ipc.opt->srr)
1285 daddr = replyopts.opt.faddr;
1286 }
1287
1288 {
1289 struct flowi fl = { .nl_u = { .ip4_u =
1290 { .daddr = daddr,
1291 .saddr = rt->rt_spec_dst,
1292 .tos = RT_TOS(skb->nh.iph->tos) } },
1293 /* Not quite clean, but right. */
1294 .uli_u = { .ports =
1295 { .sport = skb->h.th->dest,
1296 .dport = skb->h.th->source } },
1297 .proto = sk->sk_protocol };
1298 if (ip_route_output_key(&rt, &fl))
1299 return;
1300 }
1301
1302 /* And let IP do all the hard work.
1303
 1304	   This chunk is not reentrant, hence the spinlock.
 1305	   Note that it relies on the fact that this function is called
 1306	   with BH locally disabled and that sk cannot already be spinlocked.
1307 */
1308 bh_lock_sock(sk);
1309 inet->tos = skb->nh.iph->tos;
1310 sk->sk_priority = skb->priority;
1311 sk->sk_protocol = skb->nh.iph->protocol;
1312 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1313 &ipc, rt, MSG_DONTWAIT);
1314 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1315 if (arg->csumoffset >= 0)
1316 *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1317 skb->ip_summed = CHECKSUM_NONE;
1318 ip_push_pending_frames(sk);
1319 }
1320
1321 bh_unlock_sock(sk);
1322
1323 ip_rt_put(rt);
1324}
1325
1326/*
1327 * IP protocol layer initialiser
1328 */
1329
1330static struct packet_type ip_packet_type = {
1331 .type = __constant_htons(ETH_P_IP),
1332 .func = ip_rcv,
1333};
1334
1335/*
1336 * IP registers the packet type and then calls the subprotocol initialisers
1337 */
1338
1339void __init ip_init(void)
1340{
1341 dev_add_pack(&ip_packet_type);
1342
1343 ip_rt_init();
1344 inet_initpeers();
1345
1346#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1347 igmp_mc_proc_init();
1348#endif
1349}
1350
1351EXPORT_SYMBOL(ip_finish_output);
1352EXPORT_SYMBOL(ip_fragment);
1353EXPORT_SYMBOL(ip_generic_getfrag);
1354EXPORT_SYMBOL(ip_queue_xmit);
1355EXPORT_SYMBOL(ip_send_check);
1356
1357#ifdef CONFIG_SYSCTL
1358EXPORT_SYMBOL(sysctl_ip_default_ttl);
1359#endif
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
new file mode 100644
index 000000000000..47012b93cad2
--- /dev/null
+++ b/net/ipv4/ip_sockglue.c
@@ -0,0 +1,1093 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The IP to API glue.
7 *
8 * Version: $Id: ip_sockglue.c,v 1.62 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: see ip.c
11 *
12 * Fixes:
13 * Many : Split from ip.c , see ip.c for history.
14 * Martin Mares : TOS setting fixed.
15 * Alan Cox : Fixed a couple of oopses in Martin's
16 * TOS tweaks.
17 * Mike McLagan : Routing by source
18 */
19
20#include <linux/config.h>
21#include <linux/module.h>
22#include <linux/types.h>
23#include <linux/mm.h>
24#include <linux/sched.h>
25#include <linux/skbuff.h>
26#include <linux/ip.h>
27#include <linux/icmp.h>
28#include <linux/netdevice.h>
29#include <net/sock.h>
30#include <net/ip.h>
31#include <net/icmp.h>
32#include <net/tcp.h>
33#include <linux/tcp.h>
34#include <linux/udp.h>
35#include <linux/igmp.h>
36#include <linux/netfilter.h>
37#include <linux/route.h>
38#include <linux/mroute.h>
39#include <net/route.h>
40#include <net/xfrm.h>
41#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
42#include <net/transp_v6.h>
43#endif
44
45#include <linux/errqueue.h>
46#include <asm/uaccess.h>
47
48#define IP_CMSG_PKTINFO 1
49#define IP_CMSG_TTL 2
50#define IP_CMSG_TOS 4
51#define IP_CMSG_RECVOPTS 8
52#define IP_CMSG_RETOPTS 16
53
54/*
55 * SOL_IP control messages.
56 */
57
58static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
59{
60 struct in_pktinfo info;
61 struct rtable *rt = (struct rtable *)skb->dst;
62
63 info.ipi_addr.s_addr = skb->nh.iph->daddr;
64 if (rt) {
65 info.ipi_ifindex = rt->rt_iif;
66 info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
67 } else {
68 info.ipi_ifindex = 0;
69 info.ipi_spec_dst.s_addr = 0;
70 }
71
72 put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
73}
74
75static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
76{
77 int ttl = skb->nh.iph->ttl;
78 put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl);
79}
80
81static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
82{
83 put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos);
84}
85
86static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
87{
88 if (IPCB(skb)->opt.optlen == 0)
89 return;
90
91 put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1);
92}
93
94
95static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
96{
97 unsigned char optbuf[sizeof(struct ip_options) + 40];
98 struct ip_options * opt = (struct ip_options*)optbuf;
99
100 if (IPCB(skb)->opt.optlen == 0)
101 return;
102
103 if (ip_options_echo(opt, skb)) {
104 msg->msg_flags |= MSG_CTRUNC;
105 return;
106 }
107 ip_options_undo(opt);
108
109 put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
110}
111
112
113void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
114{
115 struct inet_sock *inet = inet_sk(skb->sk);
116 unsigned flags = inet->cmsg_flags;
117
118 /* Ordered by supposed usage frequency */
119 if (flags & 1)
120 ip_cmsg_recv_pktinfo(msg, skb);
121 if ((flags>>=1) == 0)
122 return;
123
124 if (flags & 1)
125 ip_cmsg_recv_ttl(msg, skb);
126 if ((flags>>=1) == 0)
127 return;
128
129 if (flags & 1)
130 ip_cmsg_recv_tos(msg, skb);
131 if ((flags>>=1) == 0)
132 return;
133
134 if (flags & 1)
135 ip_cmsg_recv_opts(msg, skb);
136 if ((flags>>=1) == 0)
137 return;
138
139 if (flags & 1)
140 ip_cmsg_recv_retopts(msg, skb);
141}
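/*
 * Userspace view of the dispatch above (a rough sketch, fd is just a
 * placeholder descriptor): after
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_IP, IP_PKTINFO, &on, sizeof(on));
 *
 * every recvmsg() on the socket carries a SOL_IP/IP_PKTINFO control
 * message whose payload is the struct in_pktinfo filled in by
 * ip_cmsg_recv_pktinfo() above.
 */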
142
143int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
144{
145 int err;
146 struct cmsghdr *cmsg;
147
148 for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
149 if (!CMSG_OK(msg, cmsg))
150 return -EINVAL;
151 if (cmsg->cmsg_level != SOL_IP)
152 continue;
153 switch (cmsg->cmsg_type) {
154 case IP_RETOPTS:
155 err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
156 err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0);
157 if (err)
158 return err;
159 break;
160 case IP_PKTINFO:
161 {
162 struct in_pktinfo *info;
163 if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
164 return -EINVAL;
165 info = (struct in_pktinfo *)CMSG_DATA(cmsg);
166 ipc->oif = info->ipi_ifindex;
167 ipc->addr = info->ipi_spec_dst.s_addr;
168 break;
169 }
170 default:
171 return -EINVAL;
172 }
173 }
174 return 0;
175}
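/*
 * The sending-side counterpart (descriptive note, not tied to any
 * particular caller): a sender may attach IP_PKTINFO or IP_RETOPTS
 * control messages to sendmsg(); ip_cmsg_send() above copies the
 * requested interface index, source address hint and options into the
 * ipcm_cookie that the output path (e.g. ip_append_data()) then uses.
 */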
176
177
 178/* Special input handler for packets caught by the router alert option.
 179   They are selected only by the protocol field, and then processed like
 180   local ones; but only if someone wants them! Otherwise, a router
 181   not running rsvpd would kill RSVP.
 182
 183   What user level does with them is its own problem.
 184   I have no idea how it will masquerade or NAT them (it is a joke, joke :-)),
 185   but the receiver should be clever enough, e.g., to forward mtrace requests
 186   sent to a multicast group towards the destination's designated router.
 187 */
188struct ip_ra_chain *ip_ra_chain;
189DEFINE_RWLOCK(ip_ra_lock);
190
191int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *))
192{
193 struct ip_ra_chain *ra, *new_ra, **rap;
194
195 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW)
196 return -EINVAL;
197
198 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
199
200 write_lock_bh(&ip_ra_lock);
201 for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) {
202 if (ra->sk == sk) {
203 if (on) {
204 write_unlock_bh(&ip_ra_lock);
205 if (new_ra)
206 kfree(new_ra);
207 return -EADDRINUSE;
208 }
209 *rap = ra->next;
210 write_unlock_bh(&ip_ra_lock);
211
212 if (ra->destructor)
213 ra->destructor(sk);
214 sock_put(sk);
215 kfree(ra);
216 return 0;
217 }
218 }
219 if (new_ra == NULL) {
220 write_unlock_bh(&ip_ra_lock);
221 return -ENOBUFS;
222 }
223 new_ra->sk = sk;
224 new_ra->destructor = destructor;
225
226 new_ra->next = ra;
227 *rap = new_ra;
228 sock_hold(sk);
229 write_unlock_bh(&ip_ra_lock);
230
231 return 0;
232}
233
234void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
235 u16 port, u32 info, u8 *payload)
236{
237 struct inet_sock *inet = inet_sk(sk);
238 struct sock_exterr_skb *serr;
239
240 if (!inet->recverr)
241 return;
242
243 skb = skb_clone(skb, GFP_ATOMIC);
244 if (!skb)
245 return;
246
247 serr = SKB_EXT_ERR(skb);
248 serr->ee.ee_errno = err;
249 serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
250 serr->ee.ee_type = skb->h.icmph->type;
251 serr->ee.ee_code = skb->h.icmph->code;
252 serr->ee.ee_pad = 0;
253 serr->ee.ee_info = info;
254 serr->ee.ee_data = 0;
255 serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw;
256 serr->port = port;
257
258 skb->h.raw = payload;
259 if (!skb_pull(skb, payload - skb->data) ||
260 sock_queue_err_skb(sk, skb))
261 kfree_skb(skb);
262}
263
264void ip_local_error(struct sock *sk, int err, u32 daddr, u16 port, u32 info)
265{
266 struct inet_sock *inet = inet_sk(sk);
267 struct sock_exterr_skb *serr;
268 struct iphdr *iph;
269 struct sk_buff *skb;
270
271 if (!inet->recverr)
272 return;
273
274 skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
275 if (!skb)
276 return;
277
278 iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr));
279 skb->nh.iph = iph;
280 iph->daddr = daddr;
281
282 serr = SKB_EXT_ERR(skb);
283 serr->ee.ee_errno = err;
284 serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
285 serr->ee.ee_type = 0;
286 serr->ee.ee_code = 0;
287 serr->ee.ee_pad = 0;
288 serr->ee.ee_info = info;
289 serr->ee.ee_data = 0;
290 serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw;
291 serr->port = port;
292
293 skb->h.raw = skb->tail;
294 __skb_pull(skb, skb->tail - skb->data);
295
296 if (sock_queue_err_skb(sk, skb))
297 kfree_skb(skb);
298}
299
300/*
301 * Handle MSG_ERRQUEUE
302 */
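/*
 * Userspace sketch of the flow this serves (illustrative only, fd is a
 * placeholder descriptor): after
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_IP, IP_RECVERR, &on, sizeof(on));
 *
 * a queued error is later collected with recvmsg(fd, &msg, MSG_ERRQUEUE),
 * which returns the offending payload together with an IP_RECVERR control
 * message holding the struct sock_extended_err that ip_icmp_error() or
 * ip_local_error() built above.
 */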
303int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
304{
305 struct sock_exterr_skb *serr;
306 struct sk_buff *skb, *skb2;
307 struct sockaddr_in *sin;
308 struct {
309 struct sock_extended_err ee;
310 struct sockaddr_in offender;
311 } errhdr;
312 int err;
313 int copied;
314
315 err = -EAGAIN;
316 skb = skb_dequeue(&sk->sk_error_queue);
317 if (skb == NULL)
318 goto out;
319
320 copied = skb->len;
321 if (copied > len) {
322 msg->msg_flags |= MSG_TRUNC;
323 copied = len;
324 }
325 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
326 if (err)
327 goto out_free_skb;
328
329 sock_recv_timestamp(msg, sk, skb);
330
331 serr = SKB_EXT_ERR(skb);
332
333 sin = (struct sockaddr_in *)msg->msg_name;
334 if (sin) {
335 sin->sin_family = AF_INET;
336 sin->sin_addr.s_addr = *(u32*)(skb->nh.raw + serr->addr_offset);
337 sin->sin_port = serr->port;
338 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
339 }
340
341 memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
342 sin = &errhdr.offender;
343 sin->sin_family = AF_UNSPEC;
344 if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) {
345 struct inet_sock *inet = inet_sk(sk);
346
347 sin->sin_family = AF_INET;
348 sin->sin_addr.s_addr = skb->nh.iph->saddr;
349 sin->sin_port = 0;
350 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
351 if (inet->cmsg_flags)
352 ip_cmsg_recv(msg, skb);
353 }
354
355 put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr);
356
357 /* Now we could try to dump offended packet options */
358
359 msg->msg_flags |= MSG_ERRQUEUE;
360 err = copied;
361
362 /* Reset and regenerate socket error */
363 spin_lock_irq(&sk->sk_error_queue.lock);
364 sk->sk_err = 0;
365 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
366 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
367 spin_unlock_irq(&sk->sk_error_queue.lock);
368 sk->sk_error_report(sk);
369 } else
370 spin_unlock_irq(&sk->sk_error_queue.lock);
371
372out_free_skb:
373 kfree_skb(skb);
374out:
375 return err;
376}
377
378
379/*
 380 *	Socket option code for IP. This is the end of the line after any TCP, UDP, etc. options on
381 * an IP socket.
382 */
383
384int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen)
385{
386 struct inet_sock *inet = inet_sk(sk);
387 int val=0,err;
388
389 if (level != SOL_IP)
390 return -ENOPROTOOPT;
391
392 if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) |
393 (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
394 (1<<IP_RETOPTS) | (1<<IP_TOS) |
395 (1<<IP_TTL) | (1<<IP_HDRINCL) |
396 (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
397 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND))) ||
398 optname == IP_MULTICAST_TTL ||
399 optname == IP_MULTICAST_LOOP) {
400 if (optlen >= sizeof(int)) {
401 if (get_user(val, (int __user *) optval))
402 return -EFAULT;
403 } else if (optlen >= sizeof(char)) {
404 unsigned char ucval;
405
406 if (get_user(ucval, (unsigned char __user *) optval))
407 return -EFAULT;
408 val = (int) ucval;
409 }
410 }
411
412 /* If optlen==0, it is equivalent to val == 0 */
413
414#ifdef CONFIG_IP_MROUTE
415 if (optname >= MRT_BASE && optname <= (MRT_BASE + 10))
416 return ip_mroute_setsockopt(sk,optname,optval,optlen);
417#endif
418
419 err = 0;
420 lock_sock(sk);
421
422 switch (optname) {
423 case IP_OPTIONS:
424 {
425 struct ip_options * opt = NULL;
426 if (optlen > 40 || optlen < 0)
427 goto e_inval;
428 err = ip_options_get(&opt, optval, optlen, 1);
429 if (err)
430 break;
431 if (sk->sk_type == SOCK_STREAM) {
432 struct tcp_sock *tp = tcp_sk(sk);
433#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
434 if (sk->sk_family == PF_INET ||
435 (!((1 << sk->sk_state) &
436 (TCPF_LISTEN | TCPF_CLOSE)) &&
437 inet->daddr != LOOPBACK4_IPV6)) {
438#endif
439 if (inet->opt)
440 tp->ext_header_len -= inet->opt->optlen;
441 if (opt)
442 tp->ext_header_len += opt->optlen;
443 tcp_sync_mss(sk, tp->pmtu_cookie);
444#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
445 }
446#endif
447 }
448 opt = xchg(&inet->opt, opt);
449 if (opt)
450 kfree(opt);
451 break;
452 }
453 case IP_PKTINFO:
454 if (val)
455 inet->cmsg_flags |= IP_CMSG_PKTINFO;
456 else
457 inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
458 break;
459 case IP_RECVTTL:
460 if (val)
461 inet->cmsg_flags |= IP_CMSG_TTL;
462 else
463 inet->cmsg_flags &= ~IP_CMSG_TTL;
464 break;
465 case IP_RECVTOS:
466 if (val)
467 inet->cmsg_flags |= IP_CMSG_TOS;
468 else
469 inet->cmsg_flags &= ~IP_CMSG_TOS;
470 break;
471 case IP_RECVOPTS:
472 if (val)
473 inet->cmsg_flags |= IP_CMSG_RECVOPTS;
474 else
475 inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
476 break;
477 case IP_RETOPTS:
478 if (val)
479 inet->cmsg_flags |= IP_CMSG_RETOPTS;
480 else
481 inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
482 break;
483 case IP_TOS: /* This sets both TOS and Precedence */
484 if (sk->sk_type == SOCK_STREAM) {
485 val &= ~3;
486 val |= inet->tos & 3;
487 }
488 if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP &&
489 !capable(CAP_NET_ADMIN)) {
490 err = -EPERM;
491 break;
492 }
493 if (inet->tos != val) {
494 inet->tos = val;
495 sk->sk_priority = rt_tos2priority(val);
496 sk_dst_reset(sk);
497 }
498 break;
499 case IP_TTL:
500 if (optlen<1)
501 goto e_inval;
502 if (val != -1 && (val < 1 || val>255))
503 goto e_inval;
504 inet->uc_ttl = val;
505 break;
506 case IP_HDRINCL:
507 if (sk->sk_type != SOCK_RAW) {
508 err = -ENOPROTOOPT;
509 break;
510 }
511 inet->hdrincl = val ? 1 : 0;
512 break;
513 case IP_MTU_DISCOVER:
514 if (val<0 || val>2)
515 goto e_inval;
516 inet->pmtudisc = val;
517 break;
518 case IP_RECVERR:
519 inet->recverr = !!val;
520 if (!val)
521 skb_queue_purge(&sk->sk_error_queue);
522 break;
523 case IP_MULTICAST_TTL:
524 if (sk->sk_type == SOCK_STREAM)
525 goto e_inval;
526 if (optlen<1)
527 goto e_inval;
528 if (val==-1)
529 val = 1;
530 if (val < 0 || val > 255)
531 goto e_inval;
532 inet->mc_ttl = val;
533 break;
534 case IP_MULTICAST_LOOP:
535 if (optlen<1)
536 goto e_inval;
537 inet->mc_loop = !!val;
538 break;
539 case IP_MULTICAST_IF:
540 {
541 struct ip_mreqn mreq;
542 struct net_device *dev = NULL;
543
544 if (sk->sk_type == SOCK_STREAM)
545 goto e_inval;
546 /*
547 * Check the arguments are allowable
548 */
549
550 err = -EFAULT;
551 if (optlen >= sizeof(struct ip_mreqn)) {
552 if (copy_from_user(&mreq,optval,sizeof(mreq)))
553 break;
554 } else {
555 memset(&mreq, 0, sizeof(mreq));
556 if (optlen >= sizeof(struct in_addr) &&
557 copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr)))
558 break;
559 }
560
561 if (!mreq.imr_ifindex) {
562 if (mreq.imr_address.s_addr == INADDR_ANY) {
563 inet->mc_index = 0;
564 inet->mc_addr = 0;
565 err = 0;
566 break;
567 }
568 dev = ip_dev_find(mreq.imr_address.s_addr);
569 if (dev) {
570 mreq.imr_ifindex = dev->ifindex;
571 dev_put(dev);
572 }
573 } else
574 dev = __dev_get_by_index(mreq.imr_ifindex);
575
576
577 err = -EADDRNOTAVAIL;
578 if (!dev)
579 break;
580
581 err = -EINVAL;
582 if (sk->sk_bound_dev_if &&
583 mreq.imr_ifindex != sk->sk_bound_dev_if)
584 break;
585
586 inet->mc_index = mreq.imr_ifindex;
587 inet->mc_addr = mreq.imr_address.s_addr;
588 err = 0;
589 break;
590 }
591
592 case IP_ADD_MEMBERSHIP:
593 case IP_DROP_MEMBERSHIP:
594 {
595 struct ip_mreqn mreq;
596
597 if (optlen < sizeof(struct ip_mreq))
598 goto e_inval;
599 err = -EFAULT;
600 if (optlen >= sizeof(struct ip_mreqn)) {
601 if(copy_from_user(&mreq,optval,sizeof(mreq)))
602 break;
603 } else {
604 memset(&mreq, 0, sizeof(mreq));
605 if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq)))
606 break;
607 }
608
609 if (optname == IP_ADD_MEMBERSHIP)
610 err = ip_mc_join_group(sk, &mreq);
611 else
612 err = ip_mc_leave_group(sk, &mreq);
613 break;
614 }
615 case IP_MSFILTER:
616 {
617 extern int sysctl_optmem_max;
618 extern int sysctl_igmp_max_msf;
619 struct ip_msfilter *msf;
620
621 if (optlen < IP_MSFILTER_SIZE(0))
622 goto e_inval;
623 if (optlen > sysctl_optmem_max) {
624 err = -ENOBUFS;
625 break;
626 }
627 msf = (struct ip_msfilter *)kmalloc(optlen, GFP_KERNEL);
628 if (msf == 0) {
629 err = -ENOBUFS;
630 break;
631 }
632 err = -EFAULT;
633 if (copy_from_user(msf, optval, optlen)) {
634 kfree(msf);
635 break;
636 }
637 /* numsrc >= (1G-4) overflow in 32 bits */
638 if (msf->imsf_numsrc >= 0x3ffffffcU ||
639 msf->imsf_numsrc > sysctl_igmp_max_msf) {
640 kfree(msf);
641 err = -ENOBUFS;
642 break;
643 }
644 if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
645 kfree(msf);
646 err = -EINVAL;
647 break;
648 }
649 err = ip_mc_msfilter(sk, msf, 0);
650 kfree(msf);
651 break;
652 }
653 case IP_BLOCK_SOURCE:
654 case IP_UNBLOCK_SOURCE:
655 case IP_ADD_SOURCE_MEMBERSHIP:
656 case IP_DROP_SOURCE_MEMBERSHIP:
657 {
658 struct ip_mreq_source mreqs;
659 int omode, add;
660
661 if (optlen != sizeof(struct ip_mreq_source))
662 goto e_inval;
663 if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
664 err = -EFAULT;
665 break;
666 }
667 if (optname == IP_BLOCK_SOURCE) {
668 omode = MCAST_EXCLUDE;
669 add = 1;
670 } else if (optname == IP_UNBLOCK_SOURCE) {
671 omode = MCAST_EXCLUDE;
672 add = 0;
673 } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
674 struct ip_mreqn mreq;
675
676 mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
677 mreq.imr_address.s_addr = mreqs.imr_interface;
678 mreq.imr_ifindex = 0;
679 err = ip_mc_join_group(sk, &mreq);
680 if (err)
681 break;
682 omode = MCAST_INCLUDE;
683 add = 1;
684 } else /*IP_DROP_SOURCE_MEMBERSHIP */ {
685 omode = MCAST_INCLUDE;
686 add = 0;
687 }
688 err = ip_mc_source(add, omode, sk, &mreqs, 0);
689 break;
690 }
691 case MCAST_JOIN_GROUP:
692 case MCAST_LEAVE_GROUP:
693 {
694 struct group_req greq;
695 struct sockaddr_in *psin;
696 struct ip_mreqn mreq;
697
698 if (optlen < sizeof(struct group_req))
699 goto e_inval;
700 err = -EFAULT;
701 if(copy_from_user(&greq, optval, sizeof(greq)))
702 break;
703 psin = (struct sockaddr_in *)&greq.gr_group;
704 if (psin->sin_family != AF_INET)
705 goto e_inval;
706 memset(&mreq, 0, sizeof(mreq));
707 mreq.imr_multiaddr = psin->sin_addr;
708 mreq.imr_ifindex = greq.gr_interface;
709
710 if (optname == MCAST_JOIN_GROUP)
711 err = ip_mc_join_group(sk, &mreq);
712 else
713 err = ip_mc_leave_group(sk, &mreq);
714 break;
715 }
716 case MCAST_JOIN_SOURCE_GROUP:
717 case MCAST_LEAVE_SOURCE_GROUP:
718 case MCAST_BLOCK_SOURCE:
719 case MCAST_UNBLOCK_SOURCE:
720 {
721 struct group_source_req greqs;
722 struct ip_mreq_source mreqs;
723 struct sockaddr_in *psin;
724 int omode, add;
725
726 if (optlen != sizeof(struct group_source_req))
727 goto e_inval;
728 if (copy_from_user(&greqs, optval, sizeof(greqs))) {
729 err = -EFAULT;
730 break;
731 }
732 if (greqs.gsr_group.ss_family != AF_INET ||
733 greqs.gsr_source.ss_family != AF_INET) {
734 err = -EADDRNOTAVAIL;
735 break;
736 }
737 psin = (struct sockaddr_in *)&greqs.gsr_group;
738 mreqs.imr_multiaddr = psin->sin_addr.s_addr;
739 psin = (struct sockaddr_in *)&greqs.gsr_source;
740 mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
741 mreqs.imr_interface = 0; /* use index for mc_source */
742
743 if (optname == MCAST_BLOCK_SOURCE) {
744 omode = MCAST_EXCLUDE;
745 add = 1;
746 } else if (optname == MCAST_UNBLOCK_SOURCE) {
747 omode = MCAST_EXCLUDE;
748 add = 0;
749 } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
750 struct ip_mreqn mreq;
751
752 psin = (struct sockaddr_in *)&greqs.gsr_group;
753 mreq.imr_multiaddr = psin->sin_addr;
754 mreq.imr_address.s_addr = 0;
755 mreq.imr_ifindex = greqs.gsr_interface;
756 err = ip_mc_join_group(sk, &mreq);
757 if (err)
758 break;
759 greqs.gsr_interface = mreq.imr_ifindex;
760 omode = MCAST_INCLUDE;
761 add = 1;
762 } else /* MCAST_LEAVE_SOURCE_GROUP */ {
763 omode = MCAST_INCLUDE;
764 add = 0;
765 }
766 err = ip_mc_source(add, omode, sk, &mreqs,
767 greqs.gsr_interface);
768 break;
769 }
770 case MCAST_MSFILTER:
771 {
772 extern int sysctl_optmem_max;
773 extern int sysctl_igmp_max_msf;
774 struct sockaddr_in *psin;
775 struct ip_msfilter *msf = NULL;
776 struct group_filter *gsf = NULL;
777 int msize, i, ifindex;
778
779 if (optlen < GROUP_FILTER_SIZE(0))
780 goto e_inval;
781 if (optlen > sysctl_optmem_max) {
782 err = -ENOBUFS;
783 break;
784 }
785 gsf = (struct group_filter *)kmalloc(optlen,GFP_KERNEL);
786 if (gsf == 0) {
787 err = -ENOBUFS;
788 break;
789 }
790 err = -EFAULT;
791 if (copy_from_user(gsf, optval, optlen)) {
792 goto mc_msf_out;
793 }
794 /* numsrc >= (4G-140)/128 overflow in 32 bits */
795 if (gsf->gf_numsrc >= 0x1ffffff ||
796 gsf->gf_numsrc > sysctl_igmp_max_msf) {
797 err = -ENOBUFS;
798 goto mc_msf_out;
799 }
800 if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
801 err = -EINVAL;
802 goto mc_msf_out;
803 }
804 msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
805 msf = (struct ip_msfilter *)kmalloc(msize,GFP_KERNEL);
806 if (msf == 0) {
807 err = -ENOBUFS;
808 goto mc_msf_out;
809 }
810 ifindex = gsf->gf_interface;
811 psin = (struct sockaddr_in *)&gsf->gf_group;
812 if (psin->sin_family != AF_INET) {
813 err = -EADDRNOTAVAIL;
814 goto mc_msf_out;
815 }
816 msf->imsf_multiaddr = psin->sin_addr.s_addr;
817 msf->imsf_interface = 0;
818 msf->imsf_fmode = gsf->gf_fmode;
819 msf->imsf_numsrc = gsf->gf_numsrc;
820 err = -EADDRNOTAVAIL;
821 for (i=0; i<gsf->gf_numsrc; ++i) {
822 psin = (struct sockaddr_in *)&gsf->gf_slist[i];
823
824 if (psin->sin_family != AF_INET)
825 goto mc_msf_out;
826 msf->imsf_slist[i] = psin->sin_addr.s_addr;
827 }
828 kfree(gsf);
829 gsf = NULL;
830
831 err = ip_mc_msfilter(sk, msf, ifindex);
832mc_msf_out:
833 if (msf)
834 kfree(msf);
835 if (gsf)
836 kfree(gsf);
837 break;
838 }
839 case IP_ROUTER_ALERT:
840 err = ip_ra_control(sk, val ? 1 : 0, NULL);
841 break;
842
843 case IP_FREEBIND:
844 if (optlen<1)
845 goto e_inval;
846 inet->freebind = !!val;
847 break;
848
849 case IP_IPSEC_POLICY:
850 case IP_XFRM_POLICY:
851 err = xfrm_user_policy(sk, optname, optval, optlen);
852 break;
853
854 default:
855#ifdef CONFIG_NETFILTER
856 err = nf_setsockopt(sk, PF_INET, optname, optval,
857 optlen);
858#else
859 err = -ENOPROTOOPT;
860#endif
861 break;
862 }
863 release_sock(sk);
864 return err;
865
866e_inval:
867 release_sock(sk);
868 return -EINVAL;
869}
870
871/*
 872 *	Get the options. Note for future reference: the GET of IP options gets
 873 *	the _received_ ones; the SET sets the _sent_ ones.
874 */
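/*
 * Userspace sketch (illustrative only, fd is a placeholder descriptor):
 * the options can be read back with
 *
 *	unsigned char opts[40];
 *	socklen_t optlen = sizeof(opts);
 *	getsockopt(fd, SOL_IP, IP_OPTIONS, opts, &optlen);
 *
 * while setsockopt(IP_OPTIONS) installs the options to be sent.
 */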
875
876int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen)
877{
878 struct inet_sock *inet = inet_sk(sk);
879 int val;
880 int len;
881
882 if(level!=SOL_IP)
883 return -EOPNOTSUPP;
884
885#ifdef CONFIG_IP_MROUTE
886 if(optname>=MRT_BASE && optname <=MRT_BASE+10)
887 {
888 return ip_mroute_getsockopt(sk,optname,optval,optlen);
889 }
890#endif
891
892 if(get_user(len,optlen))
893 return -EFAULT;
894 if(len < 0)
895 return -EINVAL;
896
897 lock_sock(sk);
898
899 switch(optname) {
900 case IP_OPTIONS:
901 {
902 unsigned char optbuf[sizeof(struct ip_options)+40];
903 struct ip_options * opt = (struct ip_options*)optbuf;
904 opt->optlen = 0;
905 if (inet->opt)
906 memcpy(optbuf, inet->opt,
907 sizeof(struct ip_options)+
908 inet->opt->optlen);
909 release_sock(sk);
910
911 if (opt->optlen == 0)
912 return put_user(0, optlen);
913
914 ip_options_undo(opt);
915
916 len = min_t(unsigned int, len, opt->optlen);
917 if(put_user(len, optlen))
918 return -EFAULT;
919 if(copy_to_user(optval, opt->__data, len))
920 return -EFAULT;
921 return 0;
922 }
923 case IP_PKTINFO:
924 val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
925 break;
926 case IP_RECVTTL:
927 val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
928 break;
929 case IP_RECVTOS:
930 val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
931 break;
932 case IP_RECVOPTS:
933 val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
934 break;
935 case IP_RETOPTS:
936 val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
937 break;
938 case IP_TOS:
939 val = inet->tos;
940 break;
941 case IP_TTL:
942 val = (inet->uc_ttl == -1 ?
943 sysctl_ip_default_ttl :
944 inet->uc_ttl);
945 break;
946 case IP_HDRINCL:
947 val = inet->hdrincl;
948 break;
949 case IP_MTU_DISCOVER:
950 val = inet->pmtudisc;
951 break;
952 case IP_MTU:
953 {
954 struct dst_entry *dst;
955 val = 0;
956 dst = sk_dst_get(sk);
957 if (dst) {
958 val = dst_mtu(dst);
959 dst_release(dst);
960 }
961 if (!val) {
962 release_sock(sk);
963 return -ENOTCONN;
964 }
965 break;
966 }
967 case IP_RECVERR:
968 val = inet->recverr;
969 break;
970 case IP_MULTICAST_TTL:
971 val = inet->mc_ttl;
972 break;
973 case IP_MULTICAST_LOOP:
974 val = inet->mc_loop;
975 break;
976 case IP_MULTICAST_IF:
977 {
978 struct in_addr addr;
979 len = min_t(unsigned int, len, sizeof(struct in_addr));
980 addr.s_addr = inet->mc_addr;
981 release_sock(sk);
982
983 if(put_user(len, optlen))
984 return -EFAULT;
985 if(copy_to_user(optval, &addr, len))
986 return -EFAULT;
987 return 0;
988 }
989 case IP_MSFILTER:
990 {
991 struct ip_msfilter msf;
992 int err;
993
994 if (len < IP_MSFILTER_SIZE(0)) {
995 release_sock(sk);
996 return -EINVAL;
997 }
998 if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
999 release_sock(sk);
1000 return -EFAULT;
1001 }
1002 err = ip_mc_msfget(sk, &msf,
1003 (struct ip_msfilter __user *)optval, optlen);
1004 release_sock(sk);
1005 return err;
1006 }
1007 case MCAST_MSFILTER:
1008 {
1009 struct group_filter gsf;
1010 int err;
1011
1012 if (len < GROUP_FILTER_SIZE(0)) {
1013 release_sock(sk);
1014 return -EINVAL;
1015 }
1016 if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
1017 release_sock(sk);
1018 return -EFAULT;
1019 }
1020 err = ip_mc_gsfget(sk, &gsf,
1021 (struct group_filter __user *)optval, optlen);
1022 release_sock(sk);
1023 return err;
1024 }
1025 case IP_PKTOPTIONS:
1026 {
1027 struct msghdr msg;
1028
1029 release_sock(sk);
1030
1031 if (sk->sk_type != SOCK_STREAM)
1032 return -ENOPROTOOPT;
1033
1034 msg.msg_control = optval;
1035 msg.msg_controllen = len;
1036 msg.msg_flags = 0;
1037
1038 if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
1039 struct in_pktinfo info;
1040
1041 info.ipi_addr.s_addr = inet->rcv_saddr;
1042 info.ipi_spec_dst.s_addr = inet->rcv_saddr;
1043 info.ipi_ifindex = inet->mc_index;
1044 put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
1045 }
1046 if (inet->cmsg_flags & IP_CMSG_TTL) {
1047 int hlim = inet->mc_ttl;
1048 put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
1049 }
1050 len -= msg.msg_controllen;
1051 return put_user(len, optlen);
1052 }
1053 case IP_FREEBIND:
1054 val = inet->freebind;
1055 break;
1056 default:
1057#ifdef CONFIG_NETFILTER
1058 val = nf_getsockopt(sk, PF_INET, optname, optval,
1059 &len);
1060 release_sock(sk);
1061 if (val >= 0)
1062 val = put_user(len, optlen);
1063 return val;
1064#else
1065 release_sock(sk);
1066 return -ENOPROTOOPT;
1067#endif
1068 }
1069 release_sock(sk);
1070
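	/* Compatibility shim: a caller that supplied a buffer smaller than an
	 * int still gets an answer for small non-negative values, returned as
	 * a single byte, rather than an error.
	 */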
1071 if (len < sizeof(int) && len > 0 && val>=0 && val<255) {
1072 unsigned char ucval = (unsigned char)val;
1073 len = 1;
1074 if(put_user(len, optlen))
1075 return -EFAULT;
1076 if(copy_to_user(optval,&ucval,1))
1077 return -EFAULT;
1078 } else {
1079 len = min_t(unsigned int, sizeof(int), len);
1080 if(put_user(len, optlen))
1081 return -EFAULT;
1082 if(copy_to_user(optval,&val,len))
1083 return -EFAULT;
1084 }
1085 return 0;
1086}
1087
1088EXPORT_SYMBOL(ip_cmsg_recv);
1089
1090#ifdef CONFIG_IP_SCTP_MODULE
1091EXPORT_SYMBOL(ip_getsockopt);
1092EXPORT_SYMBOL(ip_setsockopt);
1093#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
new file mode 100644
index 000000000000..1a23c5263b99
--- /dev/null
+++ b/net/ipv4/ipcomp.c
@@ -0,0 +1,524 @@
1/*
2 * IP Payload Compression Protocol (IPComp) - RFC3173.
3 *
4 * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation; either version 2 of the License, or (at your option)
9 * any later version.
10 *
11 * Todo:
12 * - Tunable compression parameters.
13 * - Compression stats.
14 * - Adaptive compression.
15 */
16#include <linux/config.h>
17#include <linux/module.h>
18#include <asm/scatterlist.h>
19#include <asm/semaphore.h>
20#include <linux/crypto.h>
21#include <linux/pfkeyv2.h>
22#include <linux/percpu.h>
23#include <linux/smp.h>
24#include <linux/list.h>
25#include <linux/vmalloc.h>
26#include <linux/rtnetlink.h>
27#include <net/ip.h>
28#include <net/xfrm.h>
29#include <net/icmp.h>
30#include <net/ipcomp.h>
31
32struct ipcomp_tfms {
33 struct list_head list;
34 struct crypto_tfm **tfms;
35 int users;
36};
37
38static DECLARE_MUTEX(ipcomp_resource_sem);
39static void **ipcomp_scratches;
40static int ipcomp_scratch_users;
41static LIST_HEAD(ipcomp_tfms_list);
42
43static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
44{
45 int err, plen, dlen;
46 struct iphdr *iph;
47 struct ipcomp_data *ipcd = x->data;
48 u8 *start, *scratch;
49 struct crypto_tfm *tfm;
50 int cpu;
51
52 plen = skb->len;
53 dlen = IPCOMP_SCRATCH_SIZE;
54 start = skb->data;
55
56 cpu = get_cpu();
57 scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
58 tfm = *per_cpu_ptr(ipcd->tfms, cpu);
59
60 err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
61 if (err)
62 goto out;
63
64 if (dlen < (plen + sizeof(struct ip_comp_hdr))) {
65 err = -EINVAL;
66 goto out;
67 }
68
69 err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC);
70 if (err)
71 goto out;
72
73 skb_put(skb, dlen - plen);
74 memcpy(skb->data, scratch, dlen);
75 iph = skb->nh.iph;
76 iph->tot_len = htons(dlen + iph->ihl * 4);
77out:
78 put_cpu();
79 return err;
80}
81
82static int ipcomp_input(struct xfrm_state *x,
83 struct xfrm_decap_state *decap, struct sk_buff *skb)
84{
85 u8 nexthdr;
86 int err = 0;
87 struct iphdr *iph;
88 union {
89 struct iphdr iph;
90 char buf[60];
91 } tmp_iph;
92
93
94 if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
95 skb_linearize(skb, GFP_ATOMIC) != 0) {
96 err = -ENOMEM;
97 goto out;
98 }
99
100 skb->ip_summed = CHECKSUM_NONE;
101
102 /* Remove ipcomp header and decompress original payload */
103 iph = skb->nh.iph;
104 memcpy(&tmp_iph, iph, iph->ihl * 4);
105 nexthdr = *(u8 *)skb->data;
106 skb_pull(skb, sizeof(struct ip_comp_hdr));
107 skb->nh.raw += sizeof(struct ip_comp_hdr);
108 memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4);
109 iph = skb->nh.iph;
110 iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr));
111 iph->protocol = nexthdr;
112 skb->h.raw = skb->data;
113 err = ipcomp_decompress(x, skb);
114
115out:
116 return err;
117}
118
119static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
120{
121 int err, plen, dlen, ihlen;
122 struct iphdr *iph = skb->nh.iph;
123 struct ipcomp_data *ipcd = x->data;
124 u8 *start, *scratch;
125 struct crypto_tfm *tfm;
126 int cpu;
127
128 ihlen = iph->ihl * 4;
129 plen = skb->len - ihlen;
130 dlen = IPCOMP_SCRATCH_SIZE;
131 start = skb->data + ihlen;
132
133 cpu = get_cpu();
134 scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
135 tfm = *per_cpu_ptr(ipcd->tfms, cpu);
136
137 err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
138 if (err)
139 goto out;
140
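	/* If the compressed payload plus the IPComp header would not be smaller
	 * than the original payload, compressing is not worth it; the caller
	 * then transmits the packet uncompressed.
	 */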
141 if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) {
142 err = -EMSGSIZE;
143 goto out;
144 }
145
146 memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
147 put_cpu();
148
149 pskb_trim(skb, ihlen + dlen + sizeof(struct ip_comp_hdr));
150 return 0;
151
152out:
153 put_cpu();
154 return err;
155}
156
157static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
158{
159 int err;
160 struct iphdr *iph;
161 struct ip_comp_hdr *ipch;
162 struct ipcomp_data *ipcd = x->data;
163 int hdr_len = 0;
164
165 iph = skb->nh.iph;
166 iph->tot_len = htons(skb->len);
167 hdr_len = iph->ihl * 4;
168 if ((skb->len - hdr_len) < ipcd->threshold) {
169 /* Don't bother compressing */
170 goto out_ok;
171 }
172
173 if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
174 skb_linearize(skb, GFP_ATOMIC) != 0) {
175 goto out_ok;
176 }
177
178 err = ipcomp_compress(x, skb);
179 iph = skb->nh.iph;
180
181 if (err) {
182 goto out_ok;
183 }
184
185 /* Install ipcomp header, convert into ipcomp datagram. */
186 iph->tot_len = htons(skb->len);
187 ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4);
188 ipch->nexthdr = iph->protocol;
189 ipch->flags = 0;
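	/* RFC 3173 CPIs are only 16 bits wide; the low 16 bits of the SPI
	 * serve as the on-the-wire CPI here.
	 */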
190 ipch->cpi = htons((u16 )ntohl(x->id.spi));
191 iph->protocol = IPPROTO_COMP;
192 ip_send_check(iph);
193 return 0;
194
195out_ok:
196 if (x->props.mode)
197 ip_send_check(iph);
198 return 0;
199}
200
201static void ipcomp4_err(struct sk_buff *skb, u32 info)
202{
203 u32 spi;
204 struct iphdr *iph = (struct iphdr *)skb->data;
205 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
206 struct xfrm_state *x;
207
208 if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
209 skb->h.icmph->code != ICMP_FRAG_NEEDED)
210 return;
211
212 spi = ntohl(ntohs(ipch->cpi));
213 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr,
214 spi, IPPROTO_COMP, AF_INET);
215 if (!x)
216 return;
217 NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
218 spi, NIPQUAD(iph->daddr)));
219 xfrm_state_put(x);
220}
221
222/* We always hold one tunnel user reference to indicate a tunnel */
223static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
224{
225 struct xfrm_state *t;
226
227 t = xfrm_state_alloc();
228 if (t == NULL)
229 goto out;
230
231 t->id.proto = IPPROTO_IPIP;
232 t->id.spi = x->props.saddr.a4;
233 t->id.daddr.a4 = x->id.daddr.a4;
234 memcpy(&t->sel, &x->sel, sizeof(t->sel));
235 t->props.family = AF_INET;
236 t->props.mode = 1;
237 t->props.saddr.a4 = x->props.saddr.a4;
238 t->props.flags = x->props.flags;
239
240 t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family);
241 if (t->type == NULL)
242 goto error;
243
244 if (t->type->init_state(t, NULL))
245 goto error;
246
247 t->km.state = XFRM_STATE_VALID;
248 atomic_set(&t->tunnel_users, 1);
249out:
250 return t;
251
252error:
253 t->km.state = XFRM_STATE_DEAD;
254 xfrm_state_put(t);
255 t = NULL;
256 goto out;
257}
258
259/*
260 * Must be protected by xfrm_cfg_sem. State and tunnel user references are
261 * always incremented on success.
262 */
263static int ipcomp_tunnel_attach(struct xfrm_state *x)
264{
265 int err = 0;
266 struct xfrm_state *t;
267
268 t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4,
269 x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
270 if (!t) {
271 t = ipcomp_tunnel_create(x);
272 if (!t) {
273 err = -EINVAL;
274 goto out;
275 }
276 xfrm_state_insert(t);
277 xfrm_state_hold(t);
278 }
279 x->tunnel = t;
280 atomic_inc(&t->tunnel_users);
281out:
282 return err;
283}
284
285static void ipcomp_free_scratches(void)
286{
287 int i;
288 void **scratches;
289
290 if (--ipcomp_scratch_users)
291 return;
292
293 scratches = ipcomp_scratches;
294 if (!scratches)
295 return;
296
297 for_each_cpu(i) {
298 void *scratch = *per_cpu_ptr(scratches, i);
299 if (scratch)
300 vfree(scratch);
301 }
302
303 free_percpu(scratches);
304}
305
306static void **ipcomp_alloc_scratches(void)
307{
308 int i;
309 void **scratches;
310
311 if (ipcomp_scratch_users++)
312 return ipcomp_scratches;
313
314 scratches = alloc_percpu(void *);
315 if (!scratches)
316 return NULL;
317
318 ipcomp_scratches = scratches;
319
320 for_each_cpu(i) {
321 void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE);
322 if (!scratch)
323 return NULL;
324 *per_cpu_ptr(scratches, i) = scratch;
325 }
326
327 return scratches;
328}
329
330static void ipcomp_free_tfms(struct crypto_tfm **tfms)
331{
332 struct ipcomp_tfms *pos;
333 int cpu;
334
335 list_for_each_entry(pos, &ipcomp_tfms_list, list) {
336 if (pos->tfms == tfms)
337 break;
338 }
339
340 BUG_TRAP(pos);
341
342 if (--pos->users)
343 return;
344
345 list_del(&pos->list);
346 kfree(pos);
347
348 if (!tfms)
349 return;
350
351 for_each_cpu(cpu) {
352 struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu);
353 if (tfm)
354 crypto_free_tfm(tfm);
355 }
356 free_percpu(tfms);
357}
358
359static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name)
360{
361 struct ipcomp_tfms *pos;
362 struct crypto_tfm **tfms;
363 int cpu;
364
365 /* This can be any valid CPU ID so we don't need locking. */
366 cpu = smp_processor_id();
367
368 list_for_each_entry(pos, &ipcomp_tfms_list, list) {
369 struct crypto_tfm *tfm;
370
371 tfms = pos->tfms;
372 tfm = *per_cpu_ptr(tfms, cpu);
373
374 if (!strcmp(crypto_tfm_alg_name(tfm), alg_name)) {
375 pos->users++;
376 return tfms;
377 }
378 }
379
380 pos = kmalloc(sizeof(*pos), GFP_KERNEL);
381 if (!pos)
382 return NULL;
383
384 pos->users = 1;
385 INIT_LIST_HEAD(&pos->list);
386 list_add(&pos->list, &ipcomp_tfms_list);
387
388 pos->tfms = tfms = alloc_percpu(struct crypto_tfm *);
389 if (!tfms)
390 goto error;
391
392 for_each_cpu(cpu) {
393 struct crypto_tfm *tfm = crypto_alloc_tfm(alg_name, 0);
394 if (!tfm)
395 goto error;
396 *per_cpu_ptr(tfms, cpu) = tfm;
397 }
398
399 return tfms;
400
401error:
402 ipcomp_free_tfms(tfms);
403 return NULL;
404}
405
406static void ipcomp_free_data(struct ipcomp_data *ipcd)
407{
408 if (ipcd->tfms)
409 ipcomp_free_tfms(ipcd->tfms);
410 ipcomp_free_scratches();
411}
412
413static void ipcomp_destroy(struct xfrm_state *x)
414{
415 struct ipcomp_data *ipcd = x->data;
416 if (!ipcd)
417 return;
418 xfrm_state_delete_tunnel(x);
419 down(&ipcomp_resource_sem);
420 ipcomp_free_data(ipcd);
421 up(&ipcomp_resource_sem);
422 kfree(ipcd);
423}
424
425static int ipcomp_init_state(struct xfrm_state *x, void *args)
426{
427 int err;
428 struct ipcomp_data *ipcd;
429 struct xfrm_algo_desc *calg_desc;
430
431 err = -EINVAL;
432 if (!x->calg)
433 goto out;
434
435 if (x->encap)
436 goto out;
437
438 err = -ENOMEM;
439 ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL);
440 if (!ipcd)
441 goto out;
442
443 memset(ipcd, 0, sizeof(*ipcd));
444 x->props.header_len = 0;
445 if (x->props.mode)
446 x->props.header_len += sizeof(struct iphdr);
447
448 down(&ipcomp_resource_sem);
449 if (!ipcomp_alloc_scratches())
450 goto error;
451
452 ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name);
453 if (!ipcd->tfms)
454 goto error;
455 up(&ipcomp_resource_sem);
456
457 if (x->props.mode) {
458 err = ipcomp_tunnel_attach(x);
459 if (err)
460 goto error_tunnel;
461 }
462
463 calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0);
464 BUG_ON(!calg_desc);
465 ipcd->threshold = calg_desc->uinfo.comp.threshold;
466 x->data = ipcd;
467 err = 0;
468out:
469 return err;
470
471error_tunnel:
472 down(&ipcomp_resource_sem);
473error:
474 ipcomp_free_data(ipcd);
475 up(&ipcomp_resource_sem);
476 kfree(ipcd);
477 goto out;
478}
479
480static struct xfrm_type ipcomp_type = {
481 .description = "IPCOMP4",
482 .owner = THIS_MODULE,
483 .proto = IPPROTO_COMP,
484 .init_state = ipcomp_init_state,
485 .destructor = ipcomp_destroy,
486 .input = ipcomp_input,
487 .output = ipcomp_output
488};
489
490static struct net_protocol ipcomp4_protocol = {
491 .handler = xfrm4_rcv,
492 .err_handler = ipcomp4_err,
493 .no_policy = 1,
494};
495
496static int __init ipcomp4_init(void)
497{
498 if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
499 printk(KERN_INFO "ipcomp init: can't add xfrm type\n");
500 return -EAGAIN;
501 }
502 if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
503 printk(KERN_INFO "ipcomp init: can't add protocol\n");
504 xfrm_unregister_type(&ipcomp_type, AF_INET);
505 return -EAGAIN;
506 }
507 return 0;
508}
509
510static void __exit ipcomp4_fini(void)
511{
512 if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
513 printk(KERN_INFO "ip ipcomp close: can't remove protocol\n");
514 if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
515 printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n");
516}
517
518module_init(ipcomp4_init);
519module_exit(ipcomp4_fini);
520
521MODULE_LICENSE("GPL");
522MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173");
523MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
524
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
new file mode 100644
index 000000000000..f2509034ce72
--- /dev/null
+++ b/net/ipv4/ipconfig.c
@@ -0,0 +1,1507 @@
1/*
2 * $Id: ipconfig.c,v 1.46 2002/02/01 22:01:04 davem Exp $
3 *
4 * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or
5 * user-supplied information to configure own IP address and routes.
6 *
7 * Copyright (C) 1996-1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
8 *
9 * Derived from network configuration code in fs/nfs/nfsroot.c,
10 * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me.
11 *
12 * BOOTP rewritten to construct and analyse packets itself instead
13 * of misusing the IP layer. num_bugs_causing_wrong_arp_replies--;
14 * -- MJ, December 1998
15 *
16 * Fixed ip_auto_config_setup calling at startup in the new "Linker Magic"
17 * initialization scheme.
18 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999
19 *
20 * DHCP support added. To users this looks like a whole separate
21 * protocol, but we know it's just a bag on the side of BOOTP.
22 * -- Chip Salzenberg <chip@valinux.com>, May 2000
23 *
24 * Ported DHCP support from 2.2.16 to 2.4.0-test4
25 * -- Eric Biederman <ebiederman@lnxi.com>, 30 Aug 2000
26 *
27 * Merged changes from 2.2.19 into 2.4.3
28 * -- Eric Biederman <ebiederman@lnxi.com>, 22 April Aug 2001
29 *
30 * Multiple Nameservers in /proc/net/pnp
31 * -- Josef Siemes <jsiemes@web.de>, Aug 2002
32 */
33
34#include <linux/config.h>
35#include <linux/types.h>
36#include <linux/string.h>
37#include <linux/kernel.h>
38#include <linux/jiffies.h>
39#include <linux/random.h>
40#include <linux/init.h>
41#include <linux/utsname.h>
42#include <linux/in.h>
43#include <linux/if.h>
44#include <linux/inet.h>
45#include <linux/netdevice.h>
46#include <linux/if_arp.h>
47#include <linux/skbuff.h>
48#include <linux/ip.h>
49#include <linux/socket.h>
50#include <linux/route.h>
51#include <linux/udp.h>
52#include <linux/proc_fs.h>
53#include <linux/seq_file.h>
54#include <linux/major.h>
55#include <linux/root_dev.h>
56#include <linux/delay.h>
57#include <net/arp.h>
58#include <net/ip.h>
59#include <net/ipconfig.h>
60
61#include <asm/uaccess.h>
62#include <net/checksum.h>
63#include <asm/processor.h>
64
65/* Define this to allow debugging output */
66#undef IPCONFIG_DEBUG
67
68#ifdef IPCONFIG_DEBUG
69#define DBG(x) printk x
70#else
71#define DBG(x) do { } while(0)
72#endif
73
74#if defined(CONFIG_IP_PNP_DHCP)
75#define IPCONFIG_DHCP
76#endif
77#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP)
78#define IPCONFIG_BOOTP
79#endif
80#if defined(CONFIG_IP_PNP_RARP)
81#define IPCONFIG_RARP
82#endif
83#if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP)
84#define IPCONFIG_DYNAMIC
85#endif
86
87/* Define the friendly delay before and after opening net devices */
88#define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */
89#define CONF_POST_OPEN 1 /* After opening: 1 second */
90
91/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
92#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
93#define CONF_SEND_RETRIES 6 /* Send six requests per open */
94#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */
95#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */
96#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
97#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */
98#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */
99#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
100 - '3' from resolv.h */
101
102
103/*
104 * Public IP configuration
105 */
106
107/* This is used by platforms which might be able to set the ipconfig
108 * variables using firmware environment vars. If ic_set_manually is set,
109 * such platforms should ignore their firmware variables.
110 */
111int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */
112
113static int ic_enable __initdata = 0; /* IP config enabled? */
114
115/* Protocol choice */
116int ic_proto_enabled __initdata = 0
117#ifdef IPCONFIG_BOOTP
118 | IC_BOOTP
119#endif
120#ifdef CONFIG_IP_PNP_DHCP
121 | IC_USE_DHCP
122#endif
123#ifdef IPCONFIG_RARP
124 | IC_RARP
125#endif
126 ;
127
128static int ic_host_name_set __initdata = 0; /* Host name set by us? */
129
130u32 ic_myaddr = INADDR_NONE; /* My IP address */
131static u32 ic_netmask = INADDR_NONE; /* Netmask for local subnet */
132u32 ic_gateway = INADDR_NONE; /* Gateway IP address */
133
134u32 ic_servaddr = INADDR_NONE; /* Boot server IP address */
135
136u32 root_server_addr = INADDR_NONE; /* Address of NFS server */
137u8 root_server_path[256] = { 0, }; /* Path to mount as root */
138
139/* Persistent data: */
140
141static int ic_proto_used; /* Protocol used, if any */
142static u32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
143static u8 ic_domain[64]; /* DNS (not NIS) domain name */
144
145/*
146 * Private state.
147 */
148
149/* Name of user-selected boot device */
150static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
151
152/* Protocols supported by available interfaces */
153static int ic_proto_have_if __initdata = 0;
154
155#ifdef IPCONFIG_DYNAMIC
156static DEFINE_SPINLOCK(ic_recv_lock);
157static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */
158#endif
159#ifdef IPCONFIG_DHCP
160static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */
161#endif
162
163
164/*
165 * Network devices
166 */
167
168struct ic_device {
169 struct ic_device *next;
170 struct net_device *dev;
171 unsigned short flags;
172 short able;
173 u32 xid;
174};
175
176static struct ic_device *ic_first_dev __initdata = NULL;/* List of open devices */
177static struct net_device *ic_dev __initdata = NULL; /* Selected device */
178
179static int __init ic_open_devs(void)
180{
181 struct ic_device *d, **last;
182 struct net_device *dev;
183 unsigned short oflags;
184
185 last = &ic_first_dev;
186 rtnl_shlock();
187
188 /* bring loopback device up first */
189 if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0)
190 printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name);
191
192 for (dev = dev_base; dev; dev = dev->next) {
193 if (dev == &loopback_dev)
194 continue;
195 if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
196 (!(dev->flags & IFF_LOOPBACK) &&
197 (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
198 strncmp(dev->name, "dummy", 5))) {
199 int able = 0;
200 if (dev->mtu >= 364)
201 able |= IC_BOOTP;
202 else
203				printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small\n", dev->name, dev->mtu);
204 if (!(dev->flags & IFF_NOARP))
205 able |= IC_RARP;
206 able &= ic_proto_enabled;
207 if (ic_proto_enabled && !able)
208 continue;
209 oflags = dev->flags;
210 if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
211 printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
212 continue;
213 }
214 if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
215 rtnl_shunlock();
216 return -1;
217 }
218 d->dev = dev;
219 *last = d;
220 last = &d->next;
221 d->flags = oflags;
222 d->able = able;
223 if (able & IC_BOOTP)
224 get_random_bytes(&d->xid, sizeof(u32));
225 else
226 d->xid = 0;
227 ic_proto_have_if |= able;
228 DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n",
229 dev->name, able, d->xid));
230 }
231 }
232 rtnl_shunlock();
233
234 *last = NULL;
235
236 if (!ic_first_dev) {
237 if (user_dev_name[0])
238 printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
239 else
240 printk(KERN_ERR "IP-Config: No network devices available.\n");
241 return -1;
242 }
243 return 0;
244}
245
246static void __init ic_close_devs(void)
247{
248 struct ic_device *d, *next;
249 struct net_device *dev;
250
251 rtnl_shlock();
252 next = ic_first_dev;
253 while ((d = next)) {
254 next = d->next;
255 dev = d->dev;
256 if (dev != ic_dev) {
257 DBG(("IP-Config: Downing %s\n", dev->name));
258 dev_change_flags(dev, d->flags);
259 }
260 kfree(d);
261 }
262 rtnl_shunlock();
263}
264
265/*
266 * Interface to various network functions.
267 */
268
269static inline void
270set_sockaddr(struct sockaddr_in *sin, u32 addr, u16 port)
271{
272 sin->sin_family = AF_INET;
273 sin->sin_addr.s_addr = addr;
274 sin->sin_port = port;
275}
276
277static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
278{
279 int res;
280
281 mm_segment_t oldfs = get_fs();
282 set_fs(get_ds());
283 res = devinet_ioctl(cmd, (struct ifreq __user *) arg);
284 set_fs(oldfs);
285 return res;
286}
287
288static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
289{
290 int res;
291
292 mm_segment_t oldfs = get_fs();
293 set_fs(get_ds());
294 res = ip_rt_ioctl(cmd, (void __user *) arg);
295 set_fs(oldfs);
296 return res;
297}
298
299/*
300 * Set up interface addresses and routes.
301 */
302
303static int __init ic_setup_if(void)
304{
305 struct ifreq ir;
306 struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr;
307 int err;
308
309 memset(&ir, 0, sizeof(ir));
310 strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
311 set_sockaddr(sin, ic_myaddr, 0);
312 if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) {
313 printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err);
314 return -1;
315 }
316 set_sockaddr(sin, ic_netmask, 0);
317 if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
318 printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err);
319 return -1;
320 }
321 set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
322 if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
323 printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err);
324 return -1;
325 }
326 return 0;
327}
328
329static int __init ic_setup_routes(void)
330{
331 /* No need to setup device routes, only the default route... */
332
333 if (ic_gateway != INADDR_NONE) {
334 struct rtentry rm;
335 int err;
336
337 memset(&rm, 0, sizeof(rm));
338 if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
339 printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n");
340 return -1;
341 }
342 set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
343 set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0);
344 set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
345 rm.rt_flags = RTF_UP | RTF_GATEWAY;
346 if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
347 printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err);
348 return -1;
349 }
350 }
351
352 return 0;
353}
354
355/*
356 * Fill in default values for all missing parameters.
357 */
358
359static int __init ic_defaults(void)
360{
361 /*
362 * At this point we have no userspace running so need not
363 * claim locks on system_utsname
364 */
365
366 if (!ic_host_name_set)
367 sprintf(system_utsname.nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr));
368
369 if (root_server_addr == INADDR_NONE)
370 root_server_addr = ic_servaddr;
371
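	/* Classful fallback, e.g.: a 10.x.x.x address gets 255.0.0.0, a
	 * 172.16.x.x address gets 255.255.0.0 and a 192.168.x.x address gets
	 * 255.255.255.0.
	 */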
372 if (ic_netmask == INADDR_NONE) {
373 if (IN_CLASSA(ntohl(ic_myaddr)))
374 ic_netmask = htonl(IN_CLASSA_NET);
375 else if (IN_CLASSB(ntohl(ic_myaddr)))
376 ic_netmask = htonl(IN_CLASSB_NET);
377 else if (IN_CLASSC(ntohl(ic_myaddr)))
378 ic_netmask = htonl(IN_CLASSC_NET);
379 else {
380 printk(KERN_ERR "IP-Config: Unable to guess netmask for address %u.%u.%u.%u\n",
381 NIPQUAD(ic_myaddr));
382 return -1;
383 }
384 printk("IP-Config: Guessing netmask %u.%u.%u.%u\n", NIPQUAD(ic_netmask));
385 }
386
387 return 0;
388}
389
390/*
391 * RARP support.
392 */
393
394#ifdef IPCONFIG_RARP
395
396static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
397
398static struct packet_type rarp_packet_type __initdata = {
399 .type = __constant_htons(ETH_P_RARP),
400 .func = ic_rarp_recv,
401};
402
403static inline void ic_rarp_init(void)
404{
405 dev_add_pack(&rarp_packet_type);
406}
407
408static inline void ic_rarp_cleanup(void)
409{
410 dev_remove_pack(&rarp_packet_type);
411}
412
413/*
414 * Process received RARP packet.
415 */
416static int __init
417ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
418{
419 struct arphdr *rarp;
420 unsigned char *rarp_ptr;
421 unsigned long sip, tip;
422 unsigned char *sha, *tha; /* s for "source", t for "target" */
423 struct ic_device *d;
424
425 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
426 return NET_RX_DROP;
427
428 if (!pskb_may_pull(skb, sizeof(struct arphdr)))
429 goto drop;
430
431 /* Basic sanity checks can be done without the lock. */
432 rarp = (struct arphdr *)skb->h.raw;
433
434 /* If this test doesn't pass, it's not IP, or we should
435 * ignore it anyway.
436 */
437 if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd))
438 goto drop;
439
440 /* If it's not a RARP reply, delete it. */
441 if (rarp->ar_op != htons(ARPOP_RREPLY))
442 goto drop;
443
444 /* If it's not Ethernet, delete it. */
445 if (rarp->ar_pro != htons(ETH_P_IP))
446 goto drop;
447
448 if (!pskb_may_pull(skb,
449 sizeof(struct arphdr) +
450 (2 * dev->addr_len) +
451 (2 * 4)))
452 goto drop;
453
454 /* OK, it is all there and looks valid, process... */
455 rarp = (struct arphdr *)skb->h.raw;
456 rarp_ptr = (unsigned char *) (rarp + 1);
457
458 /* One reply at a time, please. */
459 spin_lock(&ic_recv_lock);
460
461 /* If we already have a reply, just drop the packet */
462 if (ic_got_reply)
463 goto drop_unlock;
464
465 /* Find the ic_device that the packet arrived on */
466 d = ic_first_dev;
467 while (d && d->dev != dev)
468 d = d->next;
469 if (!d)
470 goto drop_unlock; /* should never happen */
471
472 /* Extract variable-width fields */
473 sha = rarp_ptr;
474 rarp_ptr += dev->addr_len;
475 memcpy(&sip, rarp_ptr, 4);
476 rarp_ptr += 4;
477 tha = rarp_ptr;
478 rarp_ptr += dev->addr_len;
479 memcpy(&tip, rarp_ptr, 4);
480
481 /* Discard packets which are not meant for us. */
482 if (memcmp(tha, dev->dev_addr, dev->addr_len))
483 goto drop_unlock;
484
485 /* Discard packets which are not from specified server. */
486 if (ic_servaddr != INADDR_NONE && ic_servaddr != sip)
487 goto drop_unlock;
488
489 /* We have a winner! */
490 ic_dev = dev;
491 if (ic_myaddr == INADDR_NONE)
492 ic_myaddr = tip;
493 ic_servaddr = sip;
494 ic_got_reply = IC_RARP;
495
496drop_unlock:
497 /* Show's over. Nothing to see here. */
498 spin_unlock(&ic_recv_lock);
499
500drop:
501 /* Throw the packet out. */
502 kfree_skb(skb);
503 return 0;
504}
505
506
507/*
508 * Send RARP request packet over a single interface.
509 */
510static void __init ic_rarp_send_if(struct ic_device *d)
511{
512 struct net_device *dev = d->dev;
513 arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL,
514 dev->dev_addr, dev->dev_addr);
515}
516#endif
517
518/*
519 * DHCP/BOOTP support.
520 */
521
522#ifdef IPCONFIG_BOOTP
523
524struct bootp_pkt { /* BOOTP packet format */
525 struct iphdr iph; /* IP header */
526 struct udphdr udph; /* UDP header */
527 u8 op; /* 1=request, 2=reply */
528 u8 htype; /* HW address type */
529 u8 hlen; /* HW address length */
530 u8 hops; /* Used only by gateways */
531 u32 xid; /* Transaction ID */
532 u16 secs; /* Seconds since we started */
533 u16 flags; /* Just what it says */
534 u32 client_ip; /* Client's IP address if known */
535 u32 your_ip; /* Assigned IP address */
536 u32 server_ip; /* (Next, e.g. NFS) Server's IP address */
537 u32 relay_ip; /* IP address of BOOTP relay */
538 u8 hw_addr[16]; /* Client's HW address */
539 u8 serv_name[64]; /* Server host name */
540 u8 boot_file[128]; /* Name of boot file */
541 u8 exten[312]; /* DHCP options / BOOTP vendor extensions */
542};
543
544/* packet ops */
545#define BOOTP_REQUEST 1
546#define BOOTP_REPLY 2
547
548/* DHCP message types */
549#define DHCPDISCOVER 1
550#define DHCPOFFER 2
551#define DHCPREQUEST 3
552#define DHCPDECLINE 4
553#define DHCPACK 5
554#define DHCPNAK 6
555#define DHCPRELEASE 7
556#define DHCPINFORM 8
557
558static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
559
560static struct packet_type bootp_packet_type __initdata = {
561 .type = __constant_htons(ETH_P_IP),
562 .func = ic_bootp_recv,
563};
564
565
566/*
567 * Initialize DHCP/BOOTP extension fields in the request.
568 */
569
570static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
571
572#ifdef IPCONFIG_DHCP
573
574static void __init
575ic_dhcp_init_options(u8 *options)
576{
577 u8 mt = ((ic_servaddr == INADDR_NONE)
578 ? DHCPDISCOVER : DHCPREQUEST);
579 u8 *e = options;
580
581#ifdef IPCONFIG_DEBUG
582 printk("DHCP: Sending message type %d\n", mt);
583#endif
584
585 memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
586 e += 4;
587
588 *e++ = 53; /* DHCP message type */
589 *e++ = 1;
590 *e++ = mt;
591
592 if (mt == DHCPREQUEST) {
593 *e++ = 54; /* Server ID (IP address) */
594 *e++ = 4;
595 memcpy(e, &ic_servaddr, 4);
596 e += 4;
597
598 *e++ = 50; /* Requested IP address */
599 *e++ = 4;
600 memcpy(e, &ic_myaddr, 4);
601 e += 4;
602 }
603
604 /* always? */
605 {
606 static const u8 ic_req_params[] = {
607 1, /* Subnet mask */
608 3, /* Default gateway */
609 6, /* DNS server */
610 12, /* Host name */
611 15, /* Domain name */
612 17, /* Boot path */
613 40, /* NIS domain name */
614 };
615
616 *e++ = 55; /* Parameter request list */
617 *e++ = sizeof(ic_req_params);
618 memcpy(e, ic_req_params, sizeof(ic_req_params));
619 e += sizeof(ic_req_params);
620 }
621
622 *e++ = 255; /* End of the list */
623}
624
625#endif /* IPCONFIG_DHCP */
626
627static void __init ic_bootp_init_ext(u8 *e)
628{
629 memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
630 e += 4;
631 *e++ = 1; /* Subnet mask request */
632 *e++ = 4;
633 e += 4;
634 *e++ = 3; /* Default gateway request */
635 *e++ = 4;
636 e += 4;
637 *e++ = 5; /* Name server request */
638 *e++ = 8;
639 e += 8;
640 *e++ = 12; /* Host name request */
641 *e++ = 32;
642 e += 32;
643 *e++ = 40; /* NIS Domain name request */
644 *e++ = 32;
645 e += 32;
646 *e++ = 17; /* Boot path */
647 *e++ = 40;
648 e += 40;
649
650 *e++ = 57; /* set extension buffer size for reply */
651 *e++ = 2;
652 *e++ = 1; /* 128+236+8+20+14, see dhcpd sources */
653 *e++ = 150;
654
655 *e++ = 255; /* End of the list */
656}
657
658
659/*
660 * Initialize the DHCP/BOOTP mechanism.
661 */
662static inline void ic_bootp_init(void)
663{
664 int i;
665
666 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
667 ic_nameservers[i] = INADDR_NONE;
668
669 dev_add_pack(&bootp_packet_type);
670}
671
672
673/*
674 * DHCP/BOOTP cleanup.
675 */
676static inline void ic_bootp_cleanup(void)
677{
678 dev_remove_pack(&bootp_packet_type);
679}
680
681
682/*
683 * Send DHCP/BOOTP request to single interface.
684 */
685static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff)
686{
687 struct net_device *dev = d->dev;
688 struct sk_buff *skb;
689 struct bootp_pkt *b;
690 int hh_len = LL_RESERVED_SPACE(dev);
691 struct iphdr *h;
692
693 /* Allocate packet */
694 skb = alloc_skb(sizeof(struct bootp_pkt) + hh_len + 15, GFP_KERNEL);
695 if (!skb)
696 return;
697 skb_reserve(skb, hh_len);
698 b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
699 memset(b, 0, sizeof(struct bootp_pkt));
700
701 /* Construct IP header */
702 skb->nh.iph = h = &b->iph;
703 h->version = 4;
704 h->ihl = 5;
705 h->tot_len = htons(sizeof(struct bootp_pkt));
706 h->frag_off = htons(IP_DF);
707 h->ttl = 64;
708 h->protocol = IPPROTO_UDP;
709 h->daddr = INADDR_BROADCAST;
710 h->check = ip_fast_csum((unsigned char *) h, h->ihl);
711
712 /* Construct UDP header */
713 b->udph.source = htons(68);
714 b->udph.dest = htons(67);
715 b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr));
716 /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
717
718 /* Construct DHCP/BOOTP header */
719 b->op = BOOTP_REQUEST;
720 if (dev->type < 256) /* check for false types */
721 b->htype = dev->type;
722 else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
723 b->htype = ARPHRD_IEEE802;
724 else if (dev->type == ARPHRD_FDDI)
725 b->htype = ARPHRD_ETHER;
726 else {
727 printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name);
728 b->htype = dev->type; /* can cause undefined behavior */
729 }
730 b->hlen = dev->addr_len;
731 b->your_ip = INADDR_NONE;
732 b->server_ip = INADDR_NONE;
733 memcpy(b->hw_addr, dev->dev_addr, dev->addr_len);
734 b->secs = htons(jiffies_diff / HZ);
735 b->xid = d->xid;
736
737 /* add DHCP options or BOOTP extensions */
738#ifdef IPCONFIG_DHCP
739 if (ic_proto_enabled & IC_USE_DHCP)
740 ic_dhcp_init_options(b->exten);
741 else
742#endif
743 ic_bootp_init_ext(b->exten);
744
745 /* Chain packet down the line... */
746 skb->dev = dev;
747 skb->protocol = htons(ETH_P_IP);
748 if ((dev->hard_header &&
749 dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) ||
750 dev_queue_xmit(skb) < 0)
751 printk("E");
752}
753
754
755/*
756 * Copy BOOTP-supplied string if not already set.
757 */
758static int __init ic_bootp_string(char *dest, char *src, int len, int max)
759{
760 if (!len)
761 return 0;
762 if (len > max-1)
763 len = max-1;
764 memcpy(dest, src, len);
765 dest[len] = '\0';
766 return 1;
767}
768
769
770/*
771 * Process BOOTP extensions.
772 */
773static void __init ic_do_bootp_ext(u8 *ext)
774{
775 u8 servers;
776 int i;
777
778#ifdef IPCONFIG_DEBUG
779 u8 *c;
780
781 printk("DHCP/BOOTP: Got extension %d:",*ext);
782 for(c=ext+2; c<ext+2+ext[1]; c++)
783 printk(" %02x", *c);
784 printk("\n");
785#endif
786
787 switch (*ext++) {
788 case 1: /* Subnet mask */
789 if (ic_netmask == INADDR_NONE)
790 memcpy(&ic_netmask, ext+1, 4);
791 break;
792 case 3: /* Default gateway */
793 if (ic_gateway == INADDR_NONE)
794 memcpy(&ic_gateway, ext+1, 4);
795 break;
796 case 6: /* DNS server */
797 servers= *ext/4;
798 if (servers > CONF_NAMESERVERS_MAX)
799 servers = CONF_NAMESERVERS_MAX;
800 for (i = 0; i < servers; i++) {
801 if (ic_nameservers[i] == INADDR_NONE)
802 memcpy(&ic_nameservers[i], ext+1+4*i, 4);
803 }
804 break;
805 case 12: /* Host name */
806 ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN);
807 ic_host_name_set = 1;
808 break;
809 case 15: /* Domain name (DNS) */
810 ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
811 break;
812 case 17: /* Root path */
813 if (!root_server_path[0])
814 ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path));
815 break;
816 case 40: /* NIS Domain name (_not_ DNS) */
817 ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN);
818 break;
819 }
820}
821
822
823/*
824 * Receive BOOTP reply.
825 */
826static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
827{
828 struct bootp_pkt *b;
829 struct iphdr *h;
830 struct ic_device *d;
831 int len, ext_len;
832
833 /* Perform verifications before taking the lock. */
834 if (skb->pkt_type == PACKET_OTHERHOST)
835 goto drop;
836
837 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
838 return NET_RX_DROP;
839
840 if (!pskb_may_pull(skb,
841 sizeof(struct iphdr) +
842 sizeof(struct udphdr)))
843 goto drop;
844
845 b = (struct bootp_pkt *) skb->nh.iph;
846 h = &b->iph;
847
848 if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
849 goto drop;
850
851 /* Fragments are not supported */
852 if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
853 if (net_ratelimit())
854 printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented "
855 "reply.\n");
856 goto drop;
857 }
858
859 if (skb->len < ntohs(h->tot_len))
860 goto drop;
861
862 if (ip_fast_csum((char *) h, h->ihl))
863 goto drop;
864
865 if (b->udph.source != htons(67) || b->udph.dest != htons(68))
866 goto drop;
867
868 if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
869 goto drop;
870
871 len = ntohs(b->udph.len) - sizeof(struct udphdr);
872 ext_len = len - (sizeof(*b) -
873 sizeof(struct iphdr) -
874 sizeof(struct udphdr) -
875 sizeof(b->exten));
876 if (ext_len < 0)
877 goto drop;
878
879 /* Ok the front looks good, make sure we can get at the rest. */
880 if (!pskb_may_pull(skb, skb->len))
881 goto drop;
882
883 b = (struct bootp_pkt *) skb->nh.iph;
884 h = &b->iph;
885
886 /* One reply at a time, please. */
887 spin_lock(&ic_recv_lock);
888
889 /* If we already have a reply, just drop the packet */
890 if (ic_got_reply)
891 goto drop_unlock;
892
893 /* Find the ic_device that the packet arrived on */
894 d = ic_first_dev;
895 while (d && d->dev != dev)
896 d = d->next;
897 if (!d)
898 goto drop_unlock; /* should never happen */
899
900 /* Is it a reply to our BOOTP request? */
901 if (b->op != BOOTP_REPLY ||
902 b->xid != d->xid) {
903 if (net_ratelimit())
904 printk(KERN_ERR "DHCP/BOOTP: Reply not for us, "
905 "op[%x] xid[%x]\n",
906 b->op, b->xid);
907 goto drop_unlock;
908 }
909
910 /* Parse extensions */
911 if (ext_len >= 4 &&
912 !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */
913 u8 *end = (u8 *) b + ntohs(b->iph.tot_len);
914 u8 *ext;
915
916#ifdef IPCONFIG_DHCP
917 if (ic_proto_enabled & IC_USE_DHCP) {
918 u32 server_id = INADDR_NONE;
919 int mt = 0;
920
921 ext = &b->exten[4];
922 while (ext < end && *ext != 0xff) {
923 u8 *opt = ext++;
924 if (*opt == 0) /* Padding */
925 continue;
926 ext += *ext + 1;
927 if (ext >= end)
928 break;
929 switch (*opt) {
930 case 53: /* Message type */
931 if (opt[1])
932 mt = opt[2];
933 break;
934 case 54: /* Server ID (IP address) */
935 if (opt[1] >= 4)
936 memcpy(&server_id, opt + 2, 4);
937 break;
938 };
939 }
940
941#ifdef IPCONFIG_DEBUG
942 printk("DHCP: Got message type %d\n", mt);
943#endif
944
945 switch (mt) {
946 case DHCPOFFER:
947 /* While in the process of accepting one offer,
948 * ignore all others.
949 */
950 if (ic_myaddr != INADDR_NONE)
951 goto drop_unlock;
952
953 /* Let's accept that offer. */
954 ic_myaddr = b->your_ip;
955 ic_servaddr = server_id;
956#ifdef IPCONFIG_DEBUG
957 printk("DHCP: Offered address %u.%u.%u.%u",
958 NIPQUAD(ic_myaddr));
959 printk(" by server %u.%u.%u.%u\n",
960 NIPQUAD(ic_servaddr));
961#endif
962 /* The DHCP indicated server address takes
963 * precedence over the bootp header one if
964 * they are different.
965 */
966 if ((server_id != INADDR_NONE) &&
967 (b->server_ip != server_id))
968 b->server_ip = ic_servaddr;
969 break;
970
971 case DHCPACK:
972 if (memcmp(dev->dev_addr, b->hw_addr, dev->addr_len) != 0)
973 goto drop_unlock;
974
975 /* Yeah! */
976 break;
977
978 default:
979 /* Urque. Forget it*/
980 ic_myaddr = INADDR_NONE;
981 ic_servaddr = INADDR_NONE;
982 goto drop_unlock;
983 };
984
985 ic_dhcp_msgtype = mt;
986
987 }
988#endif /* IPCONFIG_DHCP */
989
990 ext = &b->exten[4];
991 while (ext < end && *ext != 0xff) {
992 u8 *opt = ext++;
993 if (*opt == 0) /* Padding */
994 continue;
995 ext += *ext + 1;
996 if (ext < end)
997 ic_do_bootp_ext(opt);
998 }
999 }
1000
1001 /* We have a winner! */
1002 ic_dev = dev;
1003 ic_myaddr = b->your_ip;
1004 ic_servaddr = b->server_ip;
1005 if (ic_gateway == INADDR_NONE && b->relay_ip)
1006 ic_gateway = b->relay_ip;
1007 if (ic_nameservers[0] == INADDR_NONE)
1008 ic_nameservers[0] = ic_servaddr;
1009 ic_got_reply = IC_BOOTP;
1010
1011drop_unlock:
1012 /* Show's over. Nothing to see here. */
1013 spin_unlock(&ic_recv_lock);
1014
1015drop:
1016 /* Throw the packet out. */
1017 kfree_skb(skb);
1018
1019 return 0;
1020}
1021
1022
1023#endif
1024
1025
1026/*
1027 * Dynamic IP configuration -- DHCP, BOOTP, RARP.
1028 */
1029
1030#ifdef IPCONFIG_DYNAMIC
1031
1032static int __init ic_dynamic(void)
1033{
1034 int retries;
1035 struct ic_device *d;
1036 unsigned long start_jiffies, timeout, jiff;
1037 int do_bootp = ic_proto_have_if & IC_BOOTP;
1038 int do_rarp = ic_proto_have_if & IC_RARP;
1039
1040 /*
1041 * If none of DHCP/BOOTP/RARP was selected, return with an error.
1042 * This routine gets only called when some pieces of information
1043 * are missing, and without DHCP/BOOTP/RARP we are unable to get it.
1044 */
1045 if (!ic_proto_enabled) {
1046 printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
1047 return -1;
1048 }
1049
1050#ifdef IPCONFIG_BOOTP
1051 if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
1052 printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n");
1053#endif
1054#ifdef IPCONFIG_RARP
1055 if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
1056 printk(KERN_ERR "RARP: No suitable device found.\n");
1057#endif
1058
1059 if (!ic_proto_have_if)
1060 /* Error message already printed */
1061 return -1;
1062
1063 /*
1064 * Setup protocols
1065 */
1066#ifdef IPCONFIG_BOOTP
1067 if (do_bootp)
1068 ic_bootp_init();
1069#endif
1070#ifdef IPCONFIG_RARP
1071 if (do_rarp)
1072 ic_rarp_init();
1073#endif
1074
1075 /*
1076 * Send requests and wait, until we get an answer. This loop
1077 * seems to be a terrible waste of CPU time, but actually there is
1078 * only one process running at all, so we don't need to use any
1079 * scheduler functions.
1080 * [Actually we could now, but the nothing else running note still
1081 * applies.. - AC]
1082 */
1083 printk(KERN_NOTICE "Sending %s%s%s requests .",
1084 do_bootp
1085 ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
1086 (do_bootp && do_rarp) ? " and " : "",
1087 do_rarp ? "RARP" : "");
1088
1089 start_jiffies = jiffies;
1090 d = ic_first_dev;
1091 retries = CONF_SEND_RETRIES;
1092 get_random_bytes(&timeout, sizeof(timeout));
1093 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
1094 for(;;) {
1095#ifdef IPCONFIG_BOOTP
1096 if (do_bootp && (d->able & IC_BOOTP))
1097 ic_bootp_send_if(d, jiffies - start_jiffies);
1098#endif
1099#ifdef IPCONFIG_RARP
1100 if (do_rarp && (d->able & IC_RARP))
1101 ic_rarp_send_if(d);
1102#endif
1103
1104 jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout);
1105 while (time_before(jiffies, jiff) && !ic_got_reply) {
1106 set_current_state(TASK_UNINTERRUPTIBLE);
1107 schedule_timeout(1);
1108 }
1109#ifdef IPCONFIG_DHCP
1110 /* DHCP isn't done until we get a DHCPACK. */
1111 if ((ic_got_reply & IC_BOOTP)
1112 && (ic_proto_enabled & IC_USE_DHCP)
1113 && ic_dhcp_msgtype != DHCPACK)
1114 {
1115 ic_got_reply = 0;
1116 printk(",");
1117 continue;
1118 }
1119#endif /* IPCONFIG_DHCP */
1120
1121 if (ic_got_reply) {
1122 printk(" OK\n");
1123 break;
1124 }
1125
1126 if ((d = d->next))
1127 continue;
1128
1129 if (! --retries) {
1130 printk(" timed out!\n");
1131 break;
1132 }
1133
1134 d = ic_first_dev;
1135
1136 timeout = timeout CONF_TIMEOUT_MULT;
1137 if (timeout > CONF_TIMEOUT_MAX)
1138 timeout = CONF_TIMEOUT_MAX;
1139
1140 printk(".");
1141 }
1142
1143#ifdef IPCONFIG_BOOTP
1144 if (do_bootp)
1145 ic_bootp_cleanup();
1146#endif
1147#ifdef IPCONFIG_RARP
1148 if (do_rarp)
1149 ic_rarp_cleanup();
1150#endif
1151
1152 if (!ic_got_reply)
1153 return -1;
1154
1155 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
1156 ((ic_got_reply & IC_RARP) ? "RARP"
1157 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
1158 NIPQUAD(ic_servaddr));
1159 printk("my address is %u.%u.%u.%u\n", NIPQUAD(ic_myaddr));
1160
1161 return 0;
1162}
1163
1164#endif /* IPCONFIG_DYNAMIC */
1165
1166#ifdef CONFIG_PROC_FS
1167
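/* The resulting /proc/net/pnp reads like a resolv.conf fragment; with
 * hypothetical values it might look like:
 *
 *	#PROTO: DHCP
 *	domain example.org
 *	nameserver 192.168.1.1
 *	bootserver 192.168.1.2
 */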
1168static int pnp_seq_show(struct seq_file *seq, void *v)
1169{
1170 int i;
1171
1172 if (ic_proto_used & IC_PROTO)
1173 seq_printf(seq, "#PROTO: %s\n",
1174 (ic_proto_used & IC_RARP) ? "RARP"
1175 : (ic_proto_used & IC_USE_DHCP) ? "DHCP" : "BOOTP");
1176 else
1177 seq_puts(seq, "#MANUAL\n");
1178
1179 if (ic_domain[0])
1180 seq_printf(seq,
1181 "domain %s\n", ic_domain);
1182 for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
1183 if (ic_nameservers[i] != INADDR_NONE)
1184 seq_printf(seq,
1185 "nameserver %u.%u.%u.%u\n",
1186 NIPQUAD(ic_nameservers[i]));
1187 }
1188 if (ic_servaddr != INADDR_NONE)
1189 seq_printf(seq,
1190 "bootserver %u.%u.%u.%u\n",
1191 NIPQUAD(ic_servaddr));
1192 return 0;
1193}
1194
1195static int pnp_seq_open(struct inode *inode, struct file *file)
1196{
1197 return single_open(file, pnp_seq_show, NULL);
1198}
1199
1200static struct file_operations pnp_seq_fops = {
1201 .owner = THIS_MODULE,
1202 .open = pnp_seq_open,
1203 .read = seq_read,
1204 .llseek = seq_lseek,
1205 .release = single_release,
1206};
1207#endif /* CONFIG_PROC_FS */
1208
1209/*
1210 * Extract IP address from the parameter string if needed. Note that we
1211 * need to have root_server_addr set _before_ IPConfig gets called as it
1212 * can override it.
1213 */
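/* For example (hypothetical input): given name = "10.0.0.1:/export/root" the
 * function returns the address 10.0.0.1 and rewrites name in place to
 * "/export/root"; if no dotted-quad prefix is found it returns INADDR_NONE
 * and leaves name untouched.
 */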
1214u32 __init root_nfs_parse_addr(char *name)
1215{
1216 u32 addr;
1217 int octets = 0;
1218 char *cp, *cq;
1219
1220 cp = cq = name;
1221 while (octets < 4) {
1222 while (*cp >= '0' && *cp <= '9')
1223 cp++;
1224 if (cp == cq || cp - cq > 3)
1225 break;
1226 if (*cp == '.' || octets == 3)
1227 octets++;
1228 if (octets < 4)
1229 cp++;
1230 cq = cp;
1231 }
1232 if (octets == 4 && (*cp == ':' || *cp == '\0')) {
1233 if (*cp == ':')
1234 *cp++ = '\0';
1235 addr = in_aton(name);
1236 memmove(name, cp, strlen(cp) + 1);
1237 } else
1238 addr = INADDR_NONE;
1239
1240 return addr;
1241}
1242
1243/*
1244 * IP Autoconfig dispatcher.
1245 */
1246
1247static int __init ip_auto_config(void)
1248{
1249 u32 addr;
1250
1251#ifdef CONFIG_PROC_FS
1252 proc_net_fops_create("pnp", S_IRUGO, &pnp_seq_fops);
1253#endif /* CONFIG_PROC_FS */
1254
1255 if (!ic_enable)
1256 return 0;
1257
1258 DBG(("IP-Config: Entered.\n"));
1259#ifdef IPCONFIG_DYNAMIC
1260 try_try_again:
1261#endif
1262 /* Give hardware a chance to settle */
1263 msleep(CONF_PRE_OPEN);
1264
1265 /* Setup all network devices */
1266 if (ic_open_devs() < 0)
1267 return -1;
1268
1269 /* Give drivers a chance to settle */
1270 ssleep(CONF_POST_OPEN);
1271
1272 /*
1273 * If the config information is insufficient (e.g., our IP address or
1274 * IP address of the boot server is missing or we have multiple network
1275 * interfaces and no default was set), use BOOTP or RARP to get the
1276 * missing values.
1277 */
1278 if (ic_myaddr == INADDR_NONE ||
1279#ifdef CONFIG_ROOT_NFS
1280 (MAJOR(ROOT_DEV) == UNNAMED_MAJOR
1281 && root_server_addr == INADDR_NONE
1282 && ic_servaddr == INADDR_NONE) ||
1283#endif
1284 ic_first_dev->next) {
1285#ifdef IPCONFIG_DYNAMIC
1286
1287 int retries = CONF_OPEN_RETRIES;
1288
1289 if (ic_dynamic() < 0) {
1290 ic_close_devs();
1291
1292 /*
1293 * I don't know why, but sometimes the
1294 * eepro100 driver (at least) gets upset and
1295 * doesn't work the first time it's opened.
1296 * But then if you close it and reopen it, it
1297 * works just fine. So we need to try that at
1298 * least once before giving up.
1299 *
1300 * Also, if the root will be NFS-mounted, we
1301 * have nowhere to go if DHCP fails. So we
1302 * just have to keep trying forever.
1303 *
1304 * -- Chip
1305 */
1306#ifdef CONFIG_ROOT_NFS
1307 if (ROOT_DEV == Root_NFS) {
1308 printk(KERN_ERR
1309 "IP-Config: Retrying forever (NFS root)...\n");
1310 goto try_try_again;
1311 }
1312#endif
1313
1314 if (--retries) {
1315 printk(KERN_ERR
1316 "IP-Config: Reopening network devices...\n");
1317 goto try_try_again;
1318 }
1319
1320 /* Oh, well. At least we tried. */
1321 printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n");
1322 return -1;
1323 }
1324#else /* !DYNAMIC */
1325 printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
1326 ic_close_devs();
1327 return -1;
1328#endif /* IPCONFIG_DYNAMIC */
1329 } else {
1330 /* Device selected manually or only one device -> use it */
1331 ic_dev = ic_first_dev->dev;
1332 }
1333
1334 addr = root_nfs_parse_addr(root_server_path);
1335 if (root_server_addr == INADDR_NONE)
1336 root_server_addr = addr;
1337
1338 /*
1339 * Use defaults wherever applicable.
1340 */
1341 if (ic_defaults() < 0)
1342 return -1;
1343
1344 /*
1345 * Close all network devices except the device we've
1346 * autoconfigured and set up routes.
1347 */
1348 ic_close_devs();
1349 if (ic_setup_if() < 0 || ic_setup_routes() < 0)
1350 return -1;
1351
1352 /*
1353 * Record which protocol was actually used.
1354 */
1355#ifdef IPCONFIG_DYNAMIC
1356 ic_proto_used = ic_got_reply | (ic_proto_enabled & IC_USE_DHCP);
1357#endif
1358
1359#ifndef IPCONFIG_SILENT
1360 /*
1361 * Clue in the operator.
1362 */
1363 printk("IP-Config: Complete:");
1364 printk("\n device=%s", ic_dev->name);
1365 printk(", addr=%u.%u.%u.%u", NIPQUAD(ic_myaddr));
1366 printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask));
1367 printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway));
1368 printk(",\n host=%s, domain=%s, nis-domain=%s",
1369 system_utsname.nodename, ic_domain, system_utsname.domainname);
1370 printk(",\n bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr));
1371 printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr));
1372 printk(", rootpath=%s", root_server_path);
1373 printk("\n");
1374#endif /* !SILENT */
1375
1376 return 0;
1377}
1378
1379late_initcall(ip_auto_config);
1380
1381
1382/*
1383 * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
1384 * command line parameter. It consists of option fields separated by colons in
1385 * the following order:
1386 *
1387 * <client-ip>:<server-ip>:<gw-ip>:<netmask>:<host name>:<device>:<PROTO>
1388 *
1389 * Any of the fields can be empty which means to use a default value:
1390 * <client-ip> - address given by BOOTP or RARP
1391 * <server-ip> - address of host returning BOOTP or RARP packet
1392 * <gw-ip> - none, or the address returned by BOOTP
1393 * <netmask> - automatically determined from <client-ip>, or the
1394 * one returned by BOOTP
1395 * <host name> - <client-ip> in ASCII notation, or the name returned
1396 * by BOOTP
1397 * <device> - use all available devices
1398 * <PROTO>:
1399 * off|none - don't do autoconfig at all (DEFAULT)
1400 * on|any - use any configured protocol
1401 * dhcp|bootp|rarp - use only the specified protocol
1402 * both - use both BOOTP and RARP (not DHCP)
1403 */
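/* Examples (hypothetical addresses): a fully static configuration with
 * autoconfiguration disabled,
 *
 *	ip=192.168.1.10:192.168.1.2:192.168.1.1:255.255.255.0:client:eth0:off
 *
 * and, assuming DHCP support is compiled in, a configuration that lets DHCP
 * fill in everything on whichever interface answers first:
 *
 *	ip=dhcp
 */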
1404static int __init ic_proto_name(char *name)
1405{
1406 if (!strcmp(name, "on") || !strcmp(name, "any")) {
1407 return 1;
1408 }
1409#ifdef CONFIG_IP_PNP_DHCP
1410 else if (!strcmp(name, "dhcp")) {
1411 ic_proto_enabled &= ~IC_RARP;
1412 return 1;
1413 }
1414#endif
1415#ifdef CONFIG_IP_PNP_BOOTP
1416 else if (!strcmp(name, "bootp")) {
1417 ic_proto_enabled &= ~(IC_RARP | IC_USE_DHCP);
1418 return 1;
1419 }
1420#endif
1421#ifdef CONFIG_IP_PNP_RARP
1422 else if (!strcmp(name, "rarp")) {
1423 ic_proto_enabled &= ~(IC_BOOTP | IC_USE_DHCP);
1424 return 1;
1425 }
1426#endif
1427#ifdef IPCONFIG_DYNAMIC
1428 else if (!strcmp(name, "both")) {
1429 ic_proto_enabled &= ~IC_USE_DHCP; /* backward compat :-( */
1430 return 1;
1431 }
1432#endif
1433 return 0;
1434}
1435
1436static int __init ip_auto_config_setup(char *addrs)
1437{
1438 char *cp, *ip, *dp;
1439 int num = 0;
1440
1441 ic_set_manually = 1;
1442
1443 ic_enable = (*addrs &&
1444 (strcmp(addrs, "off") != 0) &&
1445 (strcmp(addrs, "none") != 0));
1446 if (!ic_enable)
1447 return 1;
1448
1449 if (ic_proto_name(addrs))
1450 return 1;
1451
1452 /* Parse the whole string */
1453 ip = addrs;
1454 while (ip && *ip) {
1455 if ((cp = strchr(ip, ':')))
1456 *cp++ = '\0';
1457 if (strlen(ip) > 0) {
1458 DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip));
1459 switch (num) {
1460 case 0:
1461 if ((ic_myaddr = in_aton(ip)) == INADDR_ANY)
1462 ic_myaddr = INADDR_NONE;
1463 break;
1464 case 1:
1465 if ((ic_servaddr = in_aton(ip)) == INADDR_ANY)
1466 ic_servaddr = INADDR_NONE;
1467 break;
1468 case 2:
1469 if ((ic_gateway = in_aton(ip)) == INADDR_ANY)
1470 ic_gateway = INADDR_NONE;
1471 break;
1472 case 3:
1473 if ((ic_netmask = in_aton(ip)) == INADDR_ANY)
1474 ic_netmask = INADDR_NONE;
1475 break;
1476 case 4:
1477 if ((dp = strchr(ip, '.'))) {
1478 *dp++ = '\0';
1479 strlcpy(system_utsname.domainname, dp,
1480 sizeof(system_utsname.domainname));
1481 }
1482 strlcpy(system_utsname.nodename, ip,
1483 sizeof(system_utsname.nodename));
1484 ic_host_name_set = 1;
1485 break;
1486 case 5:
1487 strlcpy(user_dev_name, ip, sizeof(user_dev_name));
1488 break;
1489 case 6:
1490 ic_proto_name(ip);
1491 break;
1492 }
1493 }
1494 ip = cp;
1495 num++;
1496 }
1497
1498 return 1;
1499}
1500
1501static int __init nfsaddrs_config_setup(char *addrs)
1502{
1503 return ip_auto_config_setup(addrs);
1504}
1505
1506__setup("ip=", ip_auto_config_setup);
1507__setup("nfsaddrs=", nfsaddrs_config_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
new file mode 100644
index 000000000000..68a78731f722
--- /dev/null
+++ b/net/ipv4/ipip.c
@@ -0,0 +1,905 @@
1/*
2 * Linux NET3: IP/IP protocol decoder.
3 *
4 * Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $
5 *
6 * Authors:
7 * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
8 *
9 * Fixes:
10 * Alan Cox : Merged and made usable non modular (its so tiny its silly as
11 * a module taking up 2 pages).
12 * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
13 * to keep ip_forward happy.
14 * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
15 * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL
16 * David Woodhouse : Perform some basic ICMP handling.
17 * IPIP Routing without decapsulation.
18 * Carlos Picoto : GRE over IP support
19 * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
20 * I do not want to merge them together.
21 *
22 * This program is free software; you can redistribute it and/or
23 * modify it under the terms of the GNU General Public License
24 * as published by the Free Software Foundation; either version
25 * 2 of the License, or (at your option) any later version.
26 *
27 */
28
29/* tunnel.c: an IP tunnel driver
30
31 The purpose of this driver is to provide an IP tunnel through
32 which you can tunnel network traffic transparently across subnets.
33
34 This was written by looking at Nick Holloway's dummy driver
35 Thanks for the great code!
36
37 -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
38
39 Minor tweaks:
40 Cleaned up the code a little and added some pre-1.3.0 tweaks.
41 dev->hard_header/hard_header_len changed to use no headers.
42 Comments/bracketing tweaked.
43 Made the tunnels use dev->name not tunnel: when error reporting.
44 Added tx_dropped stat
45
46 -Alan Cox (Alan.Cox@linux.org) 21 March 95
47
48 Reworked:
49 Changed to tunnel to destination gateway in addition to the
50 tunnel's pointopoint address
51 Almost completely rewritten
52 Note: There is currently no firewall or ICMP handling done.
53
54 -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96
55
56*/
57
58/* Things I wish I had known when writing the tunnel driver:
59
60 When the tunnel_xmit() function is called, the skb contains the
61 packet to be sent (plus a great deal of extra info), and dev
62 contains the tunnel device that _we_ are.
63
64 When we are passed a packet, we are expected to fill in the
65 source address with our source IP address.
66
67 What is the proper way to allocate, copy and free a buffer?
68 After you allocate it, it is a "0 length" chunk of memory
69 starting at zero. If you want to add headers to the buffer
70 later, you'll have to call "skb_reserve(skb, amount)" with
71 the amount of memory you want reserved. Then, you call
72 "skb_put(skb, amount)" with the amount of space you want in
73 the buffer. skb_put() returns a pointer to the top (#0) of
74 that buffer. skb->len is set to the amount of space you have
75 "allocated" with skb_put(). You can then write up to skb->len
76 bytes to that buffer. If you need more, you can call skb_put()
77 again with the additional amount of space you need. You can
78 find out how much more space you can allocate by calling
79 "skb_tailroom(skb)".
80 Now, to add header space, call "skb_push(skb, header_len)".
81 This creates space at the beginning of the buffer and returns
82 a pointer to this new space. If later you need to strip a
83 header from a buffer, call "skb_pull(skb, header_len)".
84 skb_headroom() will return how much space is left at the top
85 of the buffer (before the main data). Remember, this headroom
86 space must be reserved before the skb_put() function is called.
87 */
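/* Illustrative sketch (added in this edit, not part of the original file):
 * the calls described above, in the order a transmit-style path would use
 * them.  The "payload"/"payload_len" names and the headroom size are
 * hypothetical.
 *
 *	skb = alloc_skb(hdr_room + payload_len, GFP_ATOMIC);
 *	skb_reserve(skb, hdr_room);                          reserve headroom
 *	memcpy(skb_put(skb, payload_len), payload, payload_len);
 *	                                                     append payload data
 *	skb_push(skb, sizeof(struct iphdr));                 prepend a header
 *	skb_pull(skb, sizeof(struct iphdr));                 strip it again
 *
 * At any point skb_headroom()/skb_tailroom() report how much room remains
 * at the front and back of the buffer.
 */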
88
89/*
90 This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c
91
92 For comments look at net/ipv4/ip_gre.c --ANK
93 */
94
95
96#include <linux/config.h>
97#include <linux/module.h>
98#include <linux/types.h>
99#include <linux/sched.h>
100#include <linux/kernel.h>
101#include <asm/uaccess.h>
102#include <linux/skbuff.h>
103#include <linux/netdevice.h>
104#include <linux/in.h>
105#include <linux/tcp.h>
106#include <linux/udp.h>
107#include <linux/if_arp.h>
108#include <linux/mroute.h>
109#include <linux/init.h>
110#include <linux/netfilter_ipv4.h>
111
112#include <net/sock.h>
113#include <net/ip.h>
114#include <net/icmp.h>
115#include <net/protocol.h>
116#include <net/ipip.h>
117#include <net/inet_ecn.h>
118#include <net/xfrm.h>
119
120#define HASH_SIZE 16
121#define HASH(addr) ((addr^(addr>>4))&0xF)
122
123static int ipip_fb_tunnel_init(struct net_device *dev);
124static int ipip_tunnel_init(struct net_device *dev);
125static void ipip_tunnel_setup(struct net_device *dev);
126
127static struct net_device *ipip_fb_tunnel_dev;
128
129static struct ip_tunnel *tunnels_r_l[HASH_SIZE];
130static struct ip_tunnel *tunnels_r[HASH_SIZE];
131static struct ip_tunnel *tunnels_l[HASH_SIZE];
132static struct ip_tunnel *tunnels_wc[1];
133static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l };
134
135static DEFINE_RWLOCK(ipip_lock);
136
137static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local)
138{
139 unsigned h0 = HASH(remote);
140 unsigned h1 = HASH(local);
141 struct ip_tunnel *t;
142
143 for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
144 if (local == t->parms.iph.saddr &&
145 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
146 return t;
147 }
148 for (t = tunnels_r[h0]; t; t = t->next) {
149 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
150 return t;
151 }
152 for (t = tunnels_l[h1]; t; t = t->next) {
153 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
154 return t;
155 }
156 if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP))
157 return t;
158 return NULL;
159}
160
161static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t)
162{
163 u32 remote = t->parms.iph.daddr;
164 u32 local = t->parms.iph.saddr;
165 unsigned h = 0;
166 int prio = 0;
167
168 if (remote) {
169 prio |= 2;
170 h ^= HASH(remote);
171 }
172 if (local) {
173 prio |= 1;
174 h ^= HASH(local);
175 }
176 return &tunnels[prio][h];
177}
178
179
180static void ipip_tunnel_unlink(struct ip_tunnel *t)
181{
182 struct ip_tunnel **tp;
183
184 for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) {
185 if (t == *tp) {
186 write_lock_bh(&ipip_lock);
187 *tp = t->next;
188 write_unlock_bh(&ipip_lock);
189 break;
190 }
191 }
192}
193
194static void ipip_tunnel_link(struct ip_tunnel *t)
195{
196 struct ip_tunnel **tp = ipip_bucket(t);
197
198 t->next = *tp;
199 write_lock_bh(&ipip_lock);
200 *tp = t;
201 write_unlock_bh(&ipip_lock);
202}
203
204static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
205{
206 u32 remote = parms->iph.daddr;
207 u32 local = parms->iph.saddr;
208 struct ip_tunnel *t, **tp, *nt;
209 struct net_device *dev;
210 unsigned h = 0;
211 int prio = 0;
212 char name[IFNAMSIZ];
213
214 if (remote) {
215 prio |= 2;
216 h ^= HASH(remote);
217 }
218 if (local) {
219 prio |= 1;
220 h ^= HASH(local);
221 }
222 for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
223 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
224 return t;
225 }
226 if (!create)
227 return NULL;
228
229 if (parms->name[0])
230 strlcpy(name, parms->name, IFNAMSIZ);
231 else {
232 int i;
233 for (i=1; i<100; i++) {
234 sprintf(name, "tunl%d", i);
235 if (__dev_get_by_name(name) == NULL)
236 break;
237 }
238 if (i==100)
239 goto failed;
240 }
241
242 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
243 if (dev == NULL)
244 return NULL;
245
246 nt = dev->priv;
247 SET_MODULE_OWNER(dev);
248 dev->init = ipip_tunnel_init;
249 nt->parms = *parms;
250
251 if (register_netdevice(dev) < 0) {
252 free_netdev(dev);
253 goto failed;
254 }
255
256 dev_hold(dev);
257 ipip_tunnel_link(nt);
258 /* Do not decrement MOD_USE_COUNT here. */
259 return nt;
260
261failed:
262 return NULL;
263}
264
265static void ipip_tunnel_uninit(struct net_device *dev)
266{
267 if (dev == ipip_fb_tunnel_dev) {
268 write_lock_bh(&ipip_lock);
269 tunnels_wc[0] = NULL;
270 write_unlock_bh(&ipip_lock);
271 } else
272 ipip_tunnel_unlink((struct ip_tunnel*)dev->priv);
273 dev_put(dev);
274}
275
276static void ipip_err(struct sk_buff *skb, void *__unused)
277{
278#ifndef I_WISH_WORLD_WERE_PERFECT
279
280/* It is not :-( All the routers (except for Linux) return only
281 8 bytes of packet payload. It means, that precise relaying of
282 ICMP in the real Internet is absolutely infeasible.
283 */
284 struct iphdr *iph = (struct iphdr*)skb->data;
285 int type = skb->h.icmph->type;
286 int code = skb->h.icmph->code;
287 struct ip_tunnel *t;
288
289 switch (type) {
290 default:
291 case ICMP_PARAMETERPROB:
292 return;
293
294 case ICMP_DEST_UNREACH:
295 switch (code) {
296 case ICMP_SR_FAILED:
297 case ICMP_PORT_UNREACH:
298 /* Impossible event. */
299 return;
300 case ICMP_FRAG_NEEDED:
301 /* Soft state for pmtu is maintained by IP core. */
302 return;
303 default:
304 /* All others are translated to HOST_UNREACH.
305 rfc2003 contains "deep thoughts" about NET_UNREACH,
306 I believe they are just ether pollution. --ANK
307 */
308 break;
309 }
310 break;
311 case ICMP_TIME_EXCEEDED:
312 if (code != ICMP_EXC_TTL)
313 return;
314 break;
315 }
316
317 read_lock(&ipip_lock);
318 t = ipip_tunnel_lookup(iph->daddr, iph->saddr);
319 if (t == NULL || t->parms.iph.daddr == 0)
320 goto out;
321 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
322 goto out;
323
324 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
325 t->err_count++;
326 else
327 t->err_count = 1;
328 t->err_time = jiffies;
329out:
330 read_unlock(&ipip_lock);
331 return;
332#else
333	struct iphdr *iph = (struct iphdr*)skb->data;
334 int hlen = iph->ihl<<2;
335 struct iphdr *eiph;
336 int type = skb->h.icmph->type;
337 int code = skb->h.icmph->code;
338 int rel_type = 0;
339 int rel_code = 0;
340 int rel_info = 0;
341 struct sk_buff *skb2;
342 struct flowi fl;
343 struct rtable *rt;
344
345	if (skb->len < hlen + sizeof(struct iphdr))
346 return;
347	eiph = (struct iphdr*)(skb->data + hlen);
348
349 switch (type) {
350 default:
351 return;
352 case ICMP_PARAMETERPROB:
353 if (skb->h.icmph->un.gateway < hlen)
354 return;
355
356 /* So... This guy found something strange INSIDE encapsulated
357		   packet. Well, he is a fool, but what can we do?
358 */
359 rel_type = ICMP_PARAMETERPROB;
360 rel_info = skb->h.icmph->un.gateway - hlen;
361 break;
362
363 case ICMP_DEST_UNREACH:
364 switch (code) {
365 case ICMP_SR_FAILED:
366 case ICMP_PORT_UNREACH:
367 /* Impossible event. */
368 return;
369 case ICMP_FRAG_NEEDED:
370 /* And it is the only really necessary thing :-) */
371 rel_info = ntohs(skb->h.icmph->un.frag.mtu);
372 if (rel_info < hlen+68)
373 return;
374 rel_info -= hlen;
375 /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
376 if (rel_info > ntohs(eiph->tot_len))
377 return;
378 break;
379 default:
380 /* All others are translated to HOST_UNREACH.
381 rfc2003 contains "deep thoughts" about NET_UNREACH,
382 I believe, it is just ether pollution. --ANK
383 */
384 rel_type = ICMP_DEST_UNREACH;
385 rel_code = ICMP_HOST_UNREACH;
386 break;
387 }
388 break;
389 case ICMP_TIME_EXCEEDED:
390 if (code != ICMP_EXC_TTL)
391 return;
392 break;
393 }
394
395 /* Prepare fake skb to feed it to icmp_send */
396 skb2 = skb_clone(skb, GFP_ATOMIC);
397 if (skb2 == NULL)
398 return;
399 dst_release(skb2->dst);
400 skb2->dst = NULL;
401 skb_pull(skb2, skb->data - (u8*)eiph);
402 skb2->nh.raw = skb2->data;
403
404 /* Try to guess incoming interface */
405 memset(&fl, 0, sizeof(fl));
406 fl.fl4_daddr = eiph->saddr;
407 fl.fl4_tos = RT_TOS(eiph->tos);
408 fl.proto = IPPROTO_IPIP;
409	if (ip_route_output_key(&rt, &fl)) {
410 kfree_skb(skb2);
411 return;
412 }
413 skb2->dev = rt->u.dst.dev;
414
415 /* route "incoming" packet */
416 if (rt->rt_flags&RTCF_LOCAL) {
417 ip_rt_put(rt);
418 rt = NULL;
419 fl.fl4_daddr = eiph->daddr;
420 fl.fl4_src = eiph->saddr;
421 fl.fl4_tos = eiph->tos;
422 if (ip_route_output_key(&rt, &fl) ||
423 rt->u.dst.dev->type != ARPHRD_TUNNEL) {
424 ip_rt_put(rt);
425 kfree_skb(skb2);
426 return;
427 }
428 } else {
429 ip_rt_put(rt);
430 if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
431 skb2->dst->dev->type != ARPHRD_TUNNEL) {
432 kfree_skb(skb2);
433 return;
434 }
435 }
436
437 /* change mtu on this route */
438 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
439 if (rel_info > dst_mtu(skb2->dst)) {
440 kfree_skb(skb2);
441 return;
442 }
443 skb2->dst->ops->update_pmtu(skb2->dst, rel_info);
444 rel_info = htonl(rel_info);
445 } else if (type == ICMP_TIME_EXCEEDED) {
446 struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
447 if (t->parms.iph.ttl) {
448 rel_type = ICMP_DEST_UNREACH;
449 rel_code = ICMP_HOST_UNREACH;
450 }
451 }
452
453 icmp_send(skb2, rel_type, rel_code, rel_info);
454 kfree_skb(skb2);
455 return;
456#endif
457}
458
459static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff *skb)
460{
461 struct iphdr *inner_iph = skb->nh.iph;
462
463 if (INET_ECN_is_ce(outer_iph->tos))
464 IP_ECN_set_ce(inner_iph);
465}
466
467static int ipip_rcv(struct sk_buff *skb)
468{
469 struct iphdr *iph;
470 struct ip_tunnel *tunnel;
471
472 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
473 goto out;
474
475 iph = skb->nh.iph;
476
477 read_lock(&ipip_lock);
478 if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) {
479 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
480 read_unlock(&ipip_lock);
481 kfree_skb(skb);
482 return 0;
483 }
484
485 secpath_reset(skb);
486
487 skb->mac.raw = skb->nh.raw;
488 skb->nh.raw = skb->data;
489 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
490 skb->protocol = htons(ETH_P_IP);
491 skb->pkt_type = PACKET_HOST;
492
493 tunnel->stat.rx_packets++;
494 tunnel->stat.rx_bytes += skb->len;
495 skb->dev = tunnel->dev;
496 dst_release(skb->dst);
497 skb->dst = NULL;
498 nf_reset(skb);
499 ipip_ecn_decapsulate(iph, skb);
500 netif_rx(skb);
501 read_unlock(&ipip_lock);
502 return 0;
503 }
504 read_unlock(&ipip_lock);
505
506out:
507 return -1;
508}
509
510/*
511 * This function assumes it is being called from dev_queue_xmit()
512 * and that skb is filled properly by that function.
513 */
514
515static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
516{
517 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
518 struct net_device_stats *stats = &tunnel->stat;
519 struct iphdr *tiph = &tunnel->parms.iph;
520 u8 tos = tunnel->parms.iph.tos;
521 u16 df = tiph->frag_off;
522 struct rtable *rt; /* Route to the other host */
523 struct net_device *tdev; /* Device to other host */
524 struct iphdr *old_iph = skb->nh.iph;
525 struct iphdr *iph; /* Our new IP header */
526 int max_headroom; /* The extra header space needed */
527 u32 dst = tiph->daddr;
528 int mtu;
529
530 if (tunnel->recursion++) {
531 tunnel->stat.collisions++;
532 goto tx_error;
533 }
534
535 if (skb->protocol != htons(ETH_P_IP))
536 goto tx_error;
537
538 if (tos&1)
539 tos = old_iph->tos;
540
541 if (!dst) {
542 /* NBMA tunnel */
543 if ((rt = (struct rtable*)skb->dst) == NULL) {
544 tunnel->stat.tx_fifo_errors++;
545 goto tx_error;
546 }
547 if ((dst = rt->rt_gateway) == 0)
548 goto tx_error_icmp;
549 }
550
551 {
552 struct flowi fl = { .oif = tunnel->parms.link,
553 .nl_u = { .ip4_u =
554 { .daddr = dst,
555 .saddr = tiph->saddr,
556 .tos = RT_TOS(tos) } },
557 .proto = IPPROTO_IPIP };
558 if (ip_route_output_key(&rt, &fl)) {
559 tunnel->stat.tx_carrier_errors++;
560 goto tx_error_icmp;
561 }
562 }
563 tdev = rt->u.dst.dev;
564
565 if (tdev == dev) {
566 ip_rt_put(rt);
567 tunnel->stat.collisions++;
568 goto tx_error;
569 }
570
571 if (tiph->frag_off)
572 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
573 else
574 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
575
576 if (mtu < 68) {
577 tunnel->stat.collisions++;
578 ip_rt_put(rt);
579 goto tx_error;
580 }
581 if (skb->dst)
582 skb->dst->ops->update_pmtu(skb->dst, mtu);
583
584 df |= (old_iph->frag_off&htons(IP_DF));
585
586 if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
587 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
588 ip_rt_put(rt);
589 goto tx_error;
590 }
591
592 if (tunnel->err_count > 0) {
593 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
594 tunnel->err_count--;
595 dst_link_failure(skb);
596 } else
597 tunnel->err_count = 0;
598 }
599
600 /*
601 * Okay, now see if we can stuff it in the buffer as-is.
602 */
603 max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
604
605 if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
606 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
607 if (!new_skb) {
608 ip_rt_put(rt);
609 stats->tx_dropped++;
610 dev_kfree_skb(skb);
611 tunnel->recursion--;
612 return 0;
613 }
614 if (skb->sk)
615 skb_set_owner_w(new_skb, skb->sk);
616 dev_kfree_skb(skb);
617 skb = new_skb;
618 old_iph = skb->nh.iph;
619 }
620
621 skb->h.raw = skb->nh.raw;
622 skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
623 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
624 dst_release(skb->dst);
625 skb->dst = &rt->u.dst;
626
627 /*
628 * Push down and install the IPIP header.
629 */
630
631 iph = skb->nh.iph;
632 iph->version = 4;
633 iph->ihl = sizeof(struct iphdr)>>2;
634 iph->frag_off = df;
635 iph->protocol = IPPROTO_IPIP;
636 iph->tos = INET_ECN_encapsulate(tos, old_iph->tos);
637 iph->daddr = rt->rt_dst;
638 iph->saddr = rt->rt_src;
639
640 if ((iph->ttl = tiph->ttl) == 0)
641 iph->ttl = old_iph->ttl;
642
643 nf_reset(skb);
644
645 IPTUNNEL_XMIT();
646 tunnel->recursion--;
647 return 0;
648
649tx_error_icmp:
650 dst_link_failure(skb);
651tx_error:
652 stats->tx_errors++;
653 dev_kfree_skb(skb);
654 tunnel->recursion--;
655 return 0;
656}
657
658static int
659ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
660{
661 int err = 0;
662 struct ip_tunnel_parm p;
663 struct ip_tunnel *t;
664
665 switch (cmd) {
666 case SIOCGETTUNNEL:
667 t = NULL;
668 if (dev == ipip_fb_tunnel_dev) {
669 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
670 err = -EFAULT;
671 break;
672 }
673 t = ipip_tunnel_locate(&p, 0);
674 }
675 if (t == NULL)
676 t = (struct ip_tunnel*)dev->priv;
677 memcpy(&p, &t->parms, sizeof(p));
678 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
679 err = -EFAULT;
680 break;
681
682 case SIOCADDTUNNEL:
683 case SIOCCHGTUNNEL:
684 err = -EPERM;
685 if (!capable(CAP_NET_ADMIN))
686 goto done;
687
688 err = -EFAULT;
689 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
690 goto done;
691
692 err = -EINVAL;
693 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
694 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
695 goto done;
696 if (p.iph.ttl)
697 p.iph.frag_off |= htons(IP_DF);
698
699 t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
700
701 if (dev != ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
702 if (t != NULL) {
703 if (t->dev != dev) {
704 err = -EEXIST;
705 break;
706 }
707 } else {
708 if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
709 (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
710 err = -EINVAL;
711 break;
712 }
713 t = (struct ip_tunnel*)dev->priv;
714 ipip_tunnel_unlink(t);
715 t->parms.iph.saddr = p.iph.saddr;
716 t->parms.iph.daddr = p.iph.daddr;
717 memcpy(dev->dev_addr, &p.iph.saddr, 4);
718 memcpy(dev->broadcast, &p.iph.daddr, 4);
719 ipip_tunnel_link(t);
720 netdev_state_change(dev);
721 }
722 }
723
724 if (t) {
725 err = 0;
726 if (cmd == SIOCCHGTUNNEL) {
727 t->parms.iph.ttl = p.iph.ttl;
728 t->parms.iph.tos = p.iph.tos;
729 t->parms.iph.frag_off = p.iph.frag_off;
730 }
731 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
732 err = -EFAULT;
733 } else
734 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
735 break;
736
737 case SIOCDELTUNNEL:
738 err = -EPERM;
739 if (!capable(CAP_NET_ADMIN))
740 goto done;
741
742 if (dev == ipip_fb_tunnel_dev) {
743 err = -EFAULT;
744 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
745 goto done;
746 err = -ENOENT;
747 if ((t = ipip_tunnel_locate(&p, 0)) == NULL)
748 goto done;
749 err = -EPERM;
750 if (t->dev == ipip_fb_tunnel_dev)
751 goto done;
752 dev = t->dev;
753 }
754 err = unregister_netdevice(dev);
755 break;
756
757 default:
758 err = -EINVAL;
759 }
760
761done:
762 return err;
763}
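/* Illustrative sketch (added in this edit, not part of the original file):
 * how user space typically drives the ioctl handler above to create a new
 * tunnel.  The file descriptor and addresses are hypothetical; the request
 * is addressed to the fallback device "tunl0", exactly as ipmr_new_tunnel()
 * does from inside the kernel in ipmr.c.  fd can be any AF_INET socket.
 *
 *	struct ip_tunnel_parm p;
 *	struct ifreq ifr;
 *
 *	memset(&p, 0, sizeof(p));
 *	p.iph.version  = 4;
 *	p.iph.ihl      = 5;
 *	p.iph.protocol = IPPROTO_IPIP;
 *	p.iph.saddr    = local_addr;
 *	p.iph.daddr    = remote_addr;
 *	strcpy(p.name, "mytunl");
 *	strcpy(ifr.ifr_name, "tunl0");
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	err = ioctl(fd, SIOCADDTUNNEL, &ifr);
 */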
764
765static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev)
766{
767 return &(((struct ip_tunnel*)dev->priv)->stat);
768}
769
770static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
771{
772 if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
773 return -EINVAL;
774 dev->mtu = new_mtu;
775 return 0;
776}
777
778static void ipip_tunnel_setup(struct net_device *dev)
779{
780 SET_MODULE_OWNER(dev);
781 dev->uninit = ipip_tunnel_uninit;
782 dev->hard_start_xmit = ipip_tunnel_xmit;
783 dev->get_stats = ipip_tunnel_get_stats;
784 dev->do_ioctl = ipip_tunnel_ioctl;
785 dev->change_mtu = ipip_tunnel_change_mtu;
786 dev->destructor = free_netdev;
787
788 dev->type = ARPHRD_TUNNEL;
789 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
790 dev->mtu = 1500 - sizeof(struct iphdr);
791 dev->flags = IFF_NOARP;
792 dev->iflink = 0;
793 dev->addr_len = 4;
794}
795
796static int ipip_tunnel_init(struct net_device *dev)
797{
798 struct net_device *tdev = NULL;
799 struct ip_tunnel *tunnel;
800 struct iphdr *iph;
801
802 tunnel = (struct ip_tunnel*)dev->priv;
803 iph = &tunnel->parms.iph;
804
805 tunnel->dev = dev;
806 strcpy(tunnel->parms.name, dev->name);
807
808 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
809 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
810
811 if (iph->daddr) {
812 struct flowi fl = { .oif = tunnel->parms.link,
813 .nl_u = { .ip4_u =
814 { .daddr = iph->daddr,
815 .saddr = iph->saddr,
816 .tos = RT_TOS(iph->tos) } },
817 .proto = IPPROTO_IPIP };
818 struct rtable *rt;
819 if (!ip_route_output_key(&rt, &fl)) {
820 tdev = rt->u.dst.dev;
821 ip_rt_put(rt);
822 }
823 dev->flags |= IFF_POINTOPOINT;
824 }
825
826 if (!tdev && tunnel->parms.link)
827 tdev = __dev_get_by_index(tunnel->parms.link);
828
829 if (tdev) {
830 dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
831 dev->mtu = tdev->mtu - sizeof(struct iphdr);
832 }
833 dev->iflink = tunnel->parms.link;
834
835 return 0;
836}
837
838static int __init ipip_fb_tunnel_init(struct net_device *dev)
839{
840 struct ip_tunnel *tunnel = dev->priv;
841 struct iphdr *iph = &tunnel->parms.iph;
842
843 tunnel->dev = dev;
844 strcpy(tunnel->parms.name, dev->name);
845
846 iph->version = 4;
847 iph->protocol = IPPROTO_IPIP;
848 iph->ihl = 5;
849
850 dev_hold(dev);
851 tunnels_wc[0] = tunnel;
852 return 0;
853}
854
855static struct xfrm_tunnel ipip_handler = {
856 .handler = ipip_rcv,
857 .err_handler = ipip_err,
858};
859
860static char banner[] __initdata =
861 KERN_INFO "IPv4 over IPv4 tunneling driver\n";
862
863static int __init ipip_init(void)
864{
865 int err;
866
867 printk(banner);
868
869 if (xfrm4_tunnel_register(&ipip_handler) < 0) {
870 printk(KERN_INFO "ipip init: can't register tunnel\n");
871 return -EAGAIN;
872 }
873
874 ipip_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
875 "tunl0",
876 ipip_tunnel_setup);
877 if (!ipip_fb_tunnel_dev) {
878 err = -ENOMEM;
879 goto err1;
880 }
881
882 ipip_fb_tunnel_dev->init = ipip_fb_tunnel_init;
883
884 if ((err = register_netdev(ipip_fb_tunnel_dev)))
885 goto err2;
886 out:
887 return err;
888 err2:
889 free_netdev(ipip_fb_tunnel_dev);
890 err1:
891 xfrm4_tunnel_deregister(&ipip_handler);
892 goto out;
893}
894
895static void __exit ipip_fini(void)
896{
897 if (xfrm4_tunnel_deregister(&ipip_handler) < 0)
898 printk(KERN_INFO "ipip close: can't deregister tunnel\n");
899
900 unregister_netdev(ipip_fb_tunnel_dev);
901}
902
903module_init(ipip_init);
904module_exit(ipip_fini);
905MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
new file mode 100644
index 000000000000..e21c049ec62a
--- /dev/null
+++ b/net/ipv4/ipmr.c
@@ -0,0 +1,1900 @@
1/*
2 * IP multicast routing support for mrouted 3.6/3.8
3 *
4 * (c) 1995 Alan Cox, <alan@redhat.com>
5 * Linux Consultancy and Custom Driver Development
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13 *
14 * Fixes:
15 * Michael Chastain : Incorrect size of copying.
16 * Alan Cox : Added the cache manager code
17 * Alan Cox : Fixed the clone/copy bug and device race.
18 * Mike McLagan : Routing by source
19 * Malcolm Beattie : Buffer handling fixes.
20 * Alexey Kuznetsov : Double buffer free and other fixes.
21 * SVR Anand : Fixed several multicast bugs and problems.
22 * Alexey Kuznetsov : Status, optimisations and more.
23 * Brad Parker : Better behaviour on mrouted upcall
24 * overflow.
25 * Carlos Picoto : PIMv1 Support
26 * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
27 *			Relax this requirement to work with older peers.
28 *
29 */
30
31#include <linux/config.h>
32#include <asm/system.h>
33#include <asm/uaccess.h>
34#include <linux/types.h>
35#include <linux/sched.h>
36#include <linux/errno.h>
37#include <linux/timer.h>
38#include <linux/mm.h>
39#include <linux/kernel.h>
40#include <linux/fcntl.h>
41#include <linux/stat.h>
42#include <linux/socket.h>
43#include <linux/in.h>
44#include <linux/inet.h>
45#include <linux/netdevice.h>
46#include <linux/inetdevice.h>
47#include <linux/igmp.h>
48#include <linux/proc_fs.h>
49#include <linux/seq_file.h>
50#include <linux/mroute.h>
51#include <linux/init.h>
52#include <net/ip.h>
53#include <net/protocol.h>
54#include <linux/skbuff.h>
55#include <net/sock.h>
56#include <net/icmp.h>
57#include <net/udp.h>
58#include <net/raw.h>
59#include <linux/notifier.h>
60#include <linux/if_arp.h>
61#include <linux/netfilter_ipv4.h>
62#include <net/ipip.h>
63#include <net/checksum.h>
64
65#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
66#define CONFIG_IP_PIMSM 1
67#endif
68
69static struct sock *mroute_socket;
70
71
72/* Big lock, protecting vif table, mrt cache and mroute socket state.
73 Note that the changes are semaphored via rtnl_lock.
74 */
75
76static DEFINE_RWLOCK(mrt_lock);
77
78/*
79 * Multicast router control variables
80 */
81
82static struct vif_device vif_table[MAXVIFS]; /* Devices */
83static int maxvif;
84
85#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
86
87static int mroute_do_assert; /* Set in PIM assert */
88static int mroute_do_pim;
89
90static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */
91
92static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */
93static atomic_t cache_resolve_queue_len; /* Size of unresolved */
94
95/* Special spinlock for queue of unresolved entries */
96static DEFINE_SPINLOCK(mfc_unres_lock);
97
98/* We return to original Alan's scheme. Hash table of resolved
99 entries is changed only in process context and protected
100 with weak lock mrt_lock. Queue of unresolved entries is protected
101 with strong spinlock mfc_unres_lock.
102
103 In this case data path is free of exclusive locks at all.
104 */
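/* Illustrative sketch (added in this edit, not part of the original file):
 * the locking pattern described above, as used throughout this file.
 *
 *	data path (softirq):		read_lock(&mrt_lock);
 *					... look up vif_table / mfc_cache_array ...
 *					read_unlock(&mrt_lock);
 *
 *	updates (process context):	write_lock_bh(&mrt_lock);
 *					... link/unlink a cache or vif entry ...
 *					write_unlock_bh(&mrt_lock);
 *
 *	unresolved queue:		spin_lock_bh(&mfc_unres_lock);
 *					... walk or modify mfc_unres_queue ...
 *					spin_unlock_bh(&mfc_unres_lock);
 */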
105
106static kmem_cache_t *mrt_cachep;
107
108static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
109static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
110static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
111
112#ifdef CONFIG_IP_PIMSM_V2
113static struct net_protocol pim_protocol;
114#endif
115
116static struct timer_list ipmr_expire_timer;
117
118/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
119
120static
121struct net_device *ipmr_new_tunnel(struct vifctl *v)
122{
123 struct net_device *dev;
124
125 dev = __dev_get_by_name("tunl0");
126
127 if (dev) {
128 int err;
129 struct ifreq ifr;
130 mm_segment_t oldfs;
131 struct ip_tunnel_parm p;
132 struct in_device *in_dev;
133
134 memset(&p, 0, sizeof(p));
135 p.iph.daddr = v->vifc_rmt_addr.s_addr;
136 p.iph.saddr = v->vifc_lcl_addr.s_addr;
137 p.iph.version = 4;
138 p.iph.ihl = 5;
139 p.iph.protocol = IPPROTO_IPIP;
140 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
141 ifr.ifr_ifru.ifru_data = (void*)&p;
142
143 oldfs = get_fs(); set_fs(KERNEL_DS);
144 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
145 set_fs(oldfs);
146
147 dev = NULL;
148
149 if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
150 dev->flags |= IFF_MULTICAST;
151
152 in_dev = __in_dev_get(dev);
153 if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
154 goto failure;
155 in_dev->cnf.rp_filter = 0;
156
157 if (dev_open(dev))
158 goto failure;
159 }
160 }
161 return dev;
162
163failure:
164 /* allow the register to be completed before unregistering. */
165 rtnl_unlock();
166 rtnl_lock();
167
168 unregister_netdevice(dev);
169 return NULL;
170}
171
172#ifdef CONFIG_IP_PIMSM
173
174static int reg_vif_num = -1;
175
176static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
177{
178 read_lock(&mrt_lock);
179 ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
180 ((struct net_device_stats*)dev->priv)->tx_packets++;
181 ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
182 read_unlock(&mrt_lock);
183 kfree_skb(skb);
184 return 0;
185}
186
187static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
188{
189 return (struct net_device_stats*)dev->priv;
190}
191
192static void reg_vif_setup(struct net_device *dev)
193{
194 dev->type = ARPHRD_PIMREG;
195 dev->mtu = 1500 - sizeof(struct iphdr) - 8;
196 dev->flags = IFF_NOARP;
197 dev->hard_start_xmit = reg_vif_xmit;
198 dev->get_stats = reg_vif_get_stats;
199 dev->destructor = free_netdev;
200}
201
202static struct net_device *ipmr_reg_vif(void)
203{
204 struct net_device *dev;
205 struct in_device *in_dev;
206
207 dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
208 reg_vif_setup);
209
210 if (dev == NULL)
211 return NULL;
212
213 if (register_netdevice(dev)) {
214 free_netdev(dev);
215 return NULL;
216 }
217 dev->iflink = 0;
218
219 if ((in_dev = inetdev_init(dev)) == NULL)
220 goto failure;
221
222 in_dev->cnf.rp_filter = 0;
223
224 if (dev_open(dev))
225 goto failure;
226
227 return dev;
228
229failure:
230 /* allow the register to be completed before unregistering. */
231 rtnl_unlock();
232 rtnl_lock();
233
234 unregister_netdevice(dev);
235 return NULL;
236}
237#endif
238
239/*
240 * Delete a VIF entry
241 */
242
243static int vif_delete(int vifi)
244{
245 struct vif_device *v;
246 struct net_device *dev;
247 struct in_device *in_dev;
248
249 if (vifi < 0 || vifi >= maxvif)
250 return -EADDRNOTAVAIL;
251
252 v = &vif_table[vifi];
253
254 write_lock_bh(&mrt_lock);
255 dev = v->dev;
256 v->dev = NULL;
257
258 if (!dev) {
259 write_unlock_bh(&mrt_lock);
260 return -EADDRNOTAVAIL;
261 }
262
263#ifdef CONFIG_IP_PIMSM
264 if (vifi == reg_vif_num)
265 reg_vif_num = -1;
266#endif
267
268 if (vifi+1 == maxvif) {
269 int tmp;
270 for (tmp=vifi-1; tmp>=0; tmp--) {
271 if (VIF_EXISTS(tmp))
272 break;
273 }
274 maxvif = tmp+1;
275 }
276
277 write_unlock_bh(&mrt_lock);
278
279 dev_set_allmulti(dev, -1);
280
281 if ((in_dev = __in_dev_get(dev)) != NULL) {
282 in_dev->cnf.mc_forwarding--;
283 ip_rt_multicast_event(in_dev);
284 }
285
286 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
287 unregister_netdevice(dev);
288
289 dev_put(dev);
290 return 0;
291}
292
293/* Destroy an unresolved cache entry, killing queued skbs
294 and reporting error to netlink readers.
295 */
296
297static void ipmr_destroy_unres(struct mfc_cache *c)
298{
299 struct sk_buff *skb;
300
301 atomic_dec(&cache_resolve_queue_len);
302
303 while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
304 if (skb->nh.iph->version == 0) {
305 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
306 nlh->nlmsg_type = NLMSG_ERROR;
307 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
308 skb_trim(skb, nlh->nlmsg_len);
309 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
310 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
311 } else
312 kfree_skb(skb);
313 }
314
315 kmem_cache_free(mrt_cachep, c);
316}
317
318
319/* Single timer process for all the unresolved queue. */
320
321static void ipmr_expire_process(unsigned long dummy)
322{
323 unsigned long now;
324 unsigned long expires;
325 struct mfc_cache *c, **cp;
326
327 if (!spin_trylock(&mfc_unres_lock)) {
328 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
329 return;
330 }
331
332 if (atomic_read(&cache_resolve_queue_len) == 0)
333 goto out;
334
335 now = jiffies;
336 expires = 10*HZ;
337 cp = &mfc_unres_queue;
338
339 while ((c=*cp) != NULL) {
340 if (time_after(c->mfc_un.unres.expires, now)) {
341 unsigned long interval = c->mfc_un.unres.expires - now;
342 if (interval < expires)
343 expires = interval;
344 cp = &c->next;
345 continue;
346 }
347
348 *cp = c->next;
349
350 ipmr_destroy_unres(c);
351 }
352
353 if (atomic_read(&cache_resolve_queue_len))
354 mod_timer(&ipmr_expire_timer, jiffies + expires);
355
356out:
357 spin_unlock(&mfc_unres_lock);
358}
359
360/* Fill oifs list. It is called under write locked mrt_lock. */
361
362static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls)
363{
364 int vifi;
365
366 cache->mfc_un.res.minvif = MAXVIFS;
367 cache->mfc_un.res.maxvif = 0;
368 memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
369
370 for (vifi=0; vifi<maxvif; vifi++) {
371 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
372 cache->mfc_un.res.ttls[vifi] = ttls[vifi];
373 if (cache->mfc_un.res.minvif > vifi)
374 cache->mfc_un.res.minvif = vifi;
375 if (cache->mfc_un.res.maxvif <= vifi)
376 cache->mfc_un.res.maxvif = vifi + 1;
377 }
378 }
379}
380
381static int vif_add(struct vifctl *vifc, int mrtsock)
382{
383 int vifi = vifc->vifc_vifi;
384 struct vif_device *v = &vif_table[vifi];
385 struct net_device *dev;
386 struct in_device *in_dev;
387
388 /* Is vif busy ? */
389 if (VIF_EXISTS(vifi))
390 return -EADDRINUSE;
391
392 switch (vifc->vifc_flags) {
393#ifdef CONFIG_IP_PIMSM
394 case VIFF_REGISTER:
395 /*
396 * Special Purpose VIF in PIM
397 * All the packets will be sent to the daemon
398 */
399 if (reg_vif_num >= 0)
400 return -EADDRINUSE;
401 dev = ipmr_reg_vif();
402 if (!dev)
403 return -ENOBUFS;
404 break;
405#endif
406 case VIFF_TUNNEL:
407 dev = ipmr_new_tunnel(vifc);
408 if (!dev)
409 return -ENOBUFS;
410 break;
411 case 0:
412 dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
413 if (!dev)
414 return -EADDRNOTAVAIL;
415 __dev_put(dev);
416 break;
417 default:
418 return -EINVAL;
419 }
420
421 if ((in_dev = __in_dev_get(dev)) == NULL)
422 return -EADDRNOTAVAIL;
423 in_dev->cnf.mc_forwarding++;
424 dev_set_allmulti(dev, +1);
425 ip_rt_multicast_event(in_dev);
426
427 /*
428 * Fill in the VIF structures
429 */
430 v->rate_limit=vifc->vifc_rate_limit;
431 v->local=vifc->vifc_lcl_addr.s_addr;
432 v->remote=vifc->vifc_rmt_addr.s_addr;
433 v->flags=vifc->vifc_flags;
434 if (!mrtsock)
435 v->flags |= VIFF_STATIC;
436 v->threshold=vifc->vifc_threshold;
437 v->bytes_in = 0;
438 v->bytes_out = 0;
439 v->pkt_in = 0;
440 v->pkt_out = 0;
441 v->link = dev->ifindex;
442 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
443 v->link = dev->iflink;
444
445 /* And finish update writing critical data */
446 write_lock_bh(&mrt_lock);
447 dev_hold(dev);
448 v->dev=dev;
449#ifdef CONFIG_IP_PIMSM
450 if (v->flags&VIFF_REGISTER)
451 reg_vif_num = vifi;
452#endif
453 if (vifi+1 > maxvif)
454 maxvif = vifi+1;
455 write_unlock_bh(&mrt_lock);
456 return 0;
457}
458
459static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
460{
461 int line=MFC_HASH(mcastgrp,origin);
462 struct mfc_cache *c;
463
464 for (c=mfc_cache_array[line]; c; c = c->next) {
465 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
466 break;
467 }
468 return c;
469}
470
471/*
472 * Allocate a multicast cache entry
473 */
474static struct mfc_cache *ipmr_cache_alloc(void)
475{
476 struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
477 if(c==NULL)
478 return NULL;
479 memset(c, 0, sizeof(*c));
480 c->mfc_un.res.minvif = MAXVIFS;
481 return c;
482}
483
484static struct mfc_cache *ipmr_cache_alloc_unres(void)
485{
486 struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
487 if(c==NULL)
488 return NULL;
489 memset(c, 0, sizeof(*c));
490 skb_queue_head_init(&c->mfc_un.unres.unresolved);
491 c->mfc_un.unres.expires = jiffies + 10*HZ;
492 return c;
493}
494
495/*
496 * A cache entry has gone into a resolved state from queued
497 */
498
499static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
500{
501 struct sk_buff *skb;
502
503 /*
504 * Play the pending entries through our router
505 */
506
507 while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
508 if (skb->nh.iph->version == 0) {
509 int err;
510 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
511
512 if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
513 nlh->nlmsg_len = skb->tail - (u8*)nlh;
514 } else {
515 nlh->nlmsg_type = NLMSG_ERROR;
516 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
517 skb_trim(skb, nlh->nlmsg_len);
518 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE;
519 }
520 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
521 } else
522 ip_mr_forward(skb, c, 0);
523 }
524}
525
526/*
527 * Bounce a cache query up to mrouted. We could use netlink for this but mrouted
528 * expects the following bizarre scheme.
529 *
530 * Called under mrt_lock.
531 */
532
533static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
534{
535 struct sk_buff *skb;
536 int ihl = pkt->nh.iph->ihl<<2;
537 struct igmphdr *igmp;
538 struct igmpmsg *msg;
539 int ret;
540
541#ifdef CONFIG_IP_PIMSM
542 if (assert == IGMPMSG_WHOLEPKT)
543 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
544 else
545#endif
546 skb = alloc_skb(128, GFP_ATOMIC);
547
548 if(!skb)
549 return -ENOBUFS;
550
551#ifdef CONFIG_IP_PIMSM
552 if (assert == IGMPMSG_WHOLEPKT) {
553 /* Ugly, but we have no choice with this interface.
554 Duplicate old header, fix ihl, length etc.
555 And all this only to mangle msg->im_msgtype and
556 to set msg->im_mbz to "mbz" :-)
557 */
558 msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
559 skb->nh.raw = skb->h.raw = (u8*)msg;
560 memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
561 msg->im_msgtype = IGMPMSG_WHOLEPKT;
562 msg->im_mbz = 0;
563 msg->im_vif = reg_vif_num;
564 skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
565 skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
566 } else
567#endif
568 {
569
570 /*
571 * Copy the IP header
572 */
573
574 skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
575 memcpy(skb->data,pkt->data,ihl);
576 skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */
577 msg = (struct igmpmsg*)skb->nh.iph;
578 msg->im_vif = vifi;
579 skb->dst = dst_clone(pkt->dst);
580
581 /*
582 * Add our header
583 */
584
585 igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
586 igmp->type =
587 msg->im_msgtype = assert;
588 igmp->code = 0;
589 skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */
590 skb->h.raw = skb->nh.raw;
591 }
592
593 if (mroute_socket == NULL) {
594 kfree_skb(skb);
595 return -EINVAL;
596 }
597
598 /*
599 * Deliver to mrouted
600 */
601 if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
602 if (net_ratelimit())
603 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
604 kfree_skb(skb);
605 }
606
607 return ret;
608}
609
610/*
611 * Queue a packet for resolution. It gets locked cache entry!
612 */
613
614static int
615ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
616{
617 int err;
618 struct mfc_cache *c;
619
620 spin_lock_bh(&mfc_unres_lock);
621 for (c=mfc_unres_queue; c; c=c->next) {
622 if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
623 c->mfc_origin == skb->nh.iph->saddr)
624 break;
625 }
626
627 if (c == NULL) {
628 /*
629 * Create a new entry if allowable
630 */
631
632 if (atomic_read(&cache_resolve_queue_len)>=10 ||
633 (c=ipmr_cache_alloc_unres())==NULL) {
634 spin_unlock_bh(&mfc_unres_lock);
635
636 kfree_skb(skb);
637 return -ENOBUFS;
638 }
639
640 /*
641 * Fill in the new cache entry
642 */
643 c->mfc_parent=-1;
644 c->mfc_origin=skb->nh.iph->saddr;
645 c->mfc_mcastgrp=skb->nh.iph->daddr;
646
647 /*
648 * Reflect first query at mrouted.
649 */
650 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
651 /* If the report failed throw the cache entry
652 out - Brad Parker
653 */
654 spin_unlock_bh(&mfc_unres_lock);
655
656 kmem_cache_free(mrt_cachep, c);
657 kfree_skb(skb);
658 return err;
659 }
660
661 atomic_inc(&cache_resolve_queue_len);
662 c->next = mfc_unres_queue;
663 mfc_unres_queue = c;
664
665 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
666 }
667
668 /*
669 * See if we can append the packet
670 */
671 if (c->mfc_un.unres.unresolved.qlen>3) {
672 kfree_skb(skb);
673 err = -ENOBUFS;
674 } else {
675 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
676 err = 0;
677 }
678
679 spin_unlock_bh(&mfc_unres_lock);
680 return err;
681}
682
683/*
684 * MFC cache manipulation by user space mroute daemon
685 */
686
687static int ipmr_mfc_delete(struct mfcctl *mfc)
688{
689 int line;
690 struct mfc_cache *c, **cp;
691
692 line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
693
694 for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
695 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
696 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
697 write_lock_bh(&mrt_lock);
698 *cp = c->next;
699 write_unlock_bh(&mrt_lock);
700
701 kmem_cache_free(mrt_cachep, c);
702 return 0;
703 }
704 }
705 return -ENOENT;
706}
707
708static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
709{
710 int line;
711 struct mfc_cache *uc, *c, **cp;
712
713 line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
714
715 for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
716 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
717 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
718 break;
719 }
720
721 if (c != NULL) {
722 write_lock_bh(&mrt_lock);
723 c->mfc_parent = mfc->mfcc_parent;
724 ipmr_update_threshoulds(c, mfc->mfcc_ttls);
725 if (!mrtsock)
726 c->mfc_flags |= MFC_STATIC;
727 write_unlock_bh(&mrt_lock);
728 return 0;
729 }
730
731 if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
732 return -EINVAL;
733
734 c=ipmr_cache_alloc();
735 if (c==NULL)
736 return -ENOMEM;
737
738 c->mfc_origin=mfc->mfcc_origin.s_addr;
739 c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
740 c->mfc_parent=mfc->mfcc_parent;
741 ipmr_update_threshoulds(c, mfc->mfcc_ttls);
742 if (!mrtsock)
743 c->mfc_flags |= MFC_STATIC;
744
745 write_lock_bh(&mrt_lock);
746 c->next = mfc_cache_array[line];
747 mfc_cache_array[line] = c;
748 write_unlock_bh(&mrt_lock);
749
750 /*
751 * Check to see if we resolved a queued list. If so we
752 * need to send on the frames and tidy up.
753 */
754 spin_lock_bh(&mfc_unres_lock);
755 for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
756 cp = &uc->next) {
757 if (uc->mfc_origin == c->mfc_origin &&
758 uc->mfc_mcastgrp == c->mfc_mcastgrp) {
759 *cp = uc->next;
760 if (atomic_dec_and_test(&cache_resolve_queue_len))
761 del_timer(&ipmr_expire_timer);
762 break;
763 }
764 }
765 spin_unlock_bh(&mfc_unres_lock);
766
767 if (uc) {
768 ipmr_cache_resolve(uc, c);
769 kmem_cache_free(mrt_cachep, uc);
770 }
771 return 0;
772}
773
774/*
775 * Close the multicast socket, and clear the vif tables etc
776 */
777
778static void mroute_clean_tables(struct sock *sk)
779{
780 int i;
781
782 /*
783 * Shut down all active vif entries
784 */
785 for(i=0; i<maxvif; i++) {
786 if (!(vif_table[i].flags&VIFF_STATIC))
787 vif_delete(i);
788 }
789
790 /*
791 * Wipe the cache
792 */
793 for (i=0;i<MFC_LINES;i++) {
794 struct mfc_cache *c, **cp;
795
796 cp = &mfc_cache_array[i];
797 while ((c = *cp) != NULL) {
798 if (c->mfc_flags&MFC_STATIC) {
799 cp = &c->next;
800 continue;
801 }
802 write_lock_bh(&mrt_lock);
803 *cp = c->next;
804 write_unlock_bh(&mrt_lock);
805
806 kmem_cache_free(mrt_cachep, c);
807 }
808 }
809
810 if (atomic_read(&cache_resolve_queue_len) != 0) {
811 struct mfc_cache *c;
812
813 spin_lock_bh(&mfc_unres_lock);
814 while (mfc_unres_queue != NULL) {
815 c = mfc_unres_queue;
816 mfc_unres_queue = c->next;
817 spin_unlock_bh(&mfc_unres_lock);
818
819 ipmr_destroy_unres(c);
820
821 spin_lock_bh(&mfc_unres_lock);
822 }
823 spin_unlock_bh(&mfc_unres_lock);
824 }
825}
826
827static void mrtsock_destruct(struct sock *sk)
828{
829 rtnl_lock();
830 if (sk == mroute_socket) {
831 ipv4_devconf.mc_forwarding--;
832
833 write_lock_bh(&mrt_lock);
834 mroute_socket=NULL;
835 write_unlock_bh(&mrt_lock);
836
837 mroute_clean_tables(sk);
838 }
839 rtnl_unlock();
840}
841
842/*
843 * Socket options and virtual interface manipulation. The whole
844 * virtual interface system is a complete heap, but unfortunately
845 * that's how BSD mrouted happens to think. Maybe one day with a proper
846 * MOSPF/PIM router set up we can clean this up.
847 */
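/* Illustrative sketch (added in this edit, not part of the original file):
 * the sequence a routing daemon such as mrouted follows against the
 * interface below.  Variable names and the local address are hypothetical;
 * the MRT_* values come from <linux/mroute.h>.
 *
 *	int one = 1;
 *	struct vifctl vc;
 *
 *	s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);	(required by MRT_INIT)
 *	setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 *	memset(&vc, 0, sizeof(vc));
 *	vc.vifc_vifi = 0;
 *	vc.vifc_threshold = 1;
 *	vc.vifc_lcl_addr.s_addr = local_addr;
 *	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *	...
 *	setsockopt(s, IPPROTO_IP, MRT_DONE, NULL, 0);	(tear down on exit)
 */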
848
849int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
850{
851 int ret;
852 struct vifctl vif;
853 struct mfcctl mfc;
854
855 if(optname!=MRT_INIT)
856 {
857 if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
858 return -EACCES;
859 }
860
861 switch(optname)
862 {
863 case MRT_INIT:
864 if (sk->sk_type != SOCK_RAW ||
865 inet_sk(sk)->num != IPPROTO_IGMP)
866 return -EOPNOTSUPP;
867 if(optlen!=sizeof(int))
868 return -ENOPROTOOPT;
869
870 rtnl_lock();
871 if (mroute_socket) {
872 rtnl_unlock();
873 return -EADDRINUSE;
874 }
875
876 ret = ip_ra_control(sk, 1, mrtsock_destruct);
877 if (ret == 0) {
878 write_lock_bh(&mrt_lock);
879 mroute_socket=sk;
880 write_unlock_bh(&mrt_lock);
881
882 ipv4_devconf.mc_forwarding++;
883 }
884 rtnl_unlock();
885 return ret;
886 case MRT_DONE:
887 if (sk!=mroute_socket)
888 return -EACCES;
889 return ip_ra_control(sk, 0, NULL);
890 case MRT_ADD_VIF:
891 case MRT_DEL_VIF:
892 if(optlen!=sizeof(vif))
893 return -EINVAL;
894 if (copy_from_user(&vif,optval,sizeof(vif)))
895 return -EFAULT;
896 if(vif.vifc_vifi >= MAXVIFS)
897 return -ENFILE;
898 rtnl_lock();
899 if (optname==MRT_ADD_VIF) {
900 ret = vif_add(&vif, sk==mroute_socket);
901 } else {
902 ret = vif_delete(vif.vifc_vifi);
903 }
904 rtnl_unlock();
905 return ret;
906
907 /*
908 * Manipulate the forwarding caches. These live
909 * in a sort of kernel/user symbiosis.
910 */
911 case MRT_ADD_MFC:
912 case MRT_DEL_MFC:
913 if(optlen!=sizeof(mfc))
914 return -EINVAL;
915 if (copy_from_user(&mfc,optval, sizeof(mfc)))
916 return -EFAULT;
917 rtnl_lock();
918 if (optname==MRT_DEL_MFC)
919 ret = ipmr_mfc_delete(&mfc);
920 else
921 ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
922 rtnl_unlock();
923 return ret;
924 /*
925 * Control PIM assert.
926 */
927 case MRT_ASSERT:
928 {
929 int v;
930 if(get_user(v,(int __user *)optval))
931 return -EFAULT;
932 mroute_do_assert=(v)?1:0;
933 return 0;
934 }
935#ifdef CONFIG_IP_PIMSM
936 case MRT_PIM:
937 {
938 int v, ret;
939 if(get_user(v,(int __user *)optval))
940 return -EFAULT;
941 v = (v)?1:0;
942 rtnl_lock();
943 ret = 0;
944 if (v != mroute_do_pim) {
945 mroute_do_pim = v;
946 mroute_do_assert = v;
947#ifdef CONFIG_IP_PIMSM_V2
948 if (mroute_do_pim)
949 ret = inet_add_protocol(&pim_protocol,
950 IPPROTO_PIM);
951 else
952 ret = inet_del_protocol(&pim_protocol,
953 IPPROTO_PIM);
954 if (ret < 0)
955 ret = -EAGAIN;
956#endif
957 }
958 rtnl_unlock();
959 return ret;
960 }
961#endif
962 /*
963 * Spurious command, or MRT_VERSION which you cannot
964 * set.
965 */
966 default:
967 return -ENOPROTOOPT;
968 }
969}
970
971/*
972 * Getsock opt support for the multicast routing system.
973 */
974
975int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
976{
977 int olr;
978 int val;
979
980 if(optname!=MRT_VERSION &&
981#ifdef CONFIG_IP_PIMSM
982 optname!=MRT_PIM &&
983#endif
984 optname!=MRT_ASSERT)
985 return -ENOPROTOOPT;
986
987 if (get_user(olr, optlen))
988 return -EFAULT;
989
990 olr = min_t(unsigned int, olr, sizeof(int));
991 if (olr < 0)
992 return -EINVAL;
993
994 if(put_user(olr,optlen))
995 return -EFAULT;
996 if(optname==MRT_VERSION)
997 val=0x0305;
998#ifdef CONFIG_IP_PIMSM
999 else if(optname==MRT_PIM)
1000 val=mroute_do_pim;
1001#endif
1002 else
1003 val=mroute_do_assert;
1004 if(copy_to_user(optval,&val,olr))
1005 return -EFAULT;
1006 return 0;
1007}
1008
1009/*
1010 * The IP multicast ioctl support routines.
1011 */
1012
1013int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1014{
1015 struct sioc_sg_req sr;
1016 struct sioc_vif_req vr;
1017 struct vif_device *vif;
1018 struct mfc_cache *c;
1019
1020 switch(cmd)
1021 {
1022 case SIOCGETVIFCNT:
1023 if (copy_from_user(&vr,arg,sizeof(vr)))
1024 return -EFAULT;
1025 if(vr.vifi>=maxvif)
1026 return -EINVAL;
1027 read_lock(&mrt_lock);
1028 vif=&vif_table[vr.vifi];
1029 if(VIF_EXISTS(vr.vifi)) {
1030 vr.icount=vif->pkt_in;
1031 vr.ocount=vif->pkt_out;
1032 vr.ibytes=vif->bytes_in;
1033 vr.obytes=vif->bytes_out;
1034 read_unlock(&mrt_lock);
1035
1036 if (copy_to_user(arg,&vr,sizeof(vr)))
1037 return -EFAULT;
1038 return 0;
1039 }
1040 read_unlock(&mrt_lock);
1041 return -EADDRNOTAVAIL;
1042 case SIOCGETSGCNT:
1043 if (copy_from_user(&sr,arg,sizeof(sr)))
1044 return -EFAULT;
1045
1046 read_lock(&mrt_lock);
1047 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1048 if (c) {
1049 sr.pktcnt = c->mfc_un.res.pkt;
1050 sr.bytecnt = c->mfc_un.res.bytes;
1051 sr.wrong_if = c->mfc_un.res.wrong_if;
1052 read_unlock(&mrt_lock);
1053
1054 if (copy_to_user(arg,&sr,sizeof(sr)))
1055 return -EFAULT;
1056 return 0;
1057 }
1058 read_unlock(&mrt_lock);
1059 return -EADDRNOTAVAIL;
1060 default:
1061 return -ENOIOCTLCMD;
1062 }
1063}
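/* Illustrative sketch (added in this edit, not part of the original file):
 * reading the per-VIF and per-(S,G) counters maintained above from user
 * space.  "mrsock" is assumed to be the raw IGMP socket on which MRT_INIT
 * was issued; src/grp are hypothetical addresses.
 *
 *	struct sioc_vif_req vr;
 *	struct sioc_sg_req sr;
 *
 *	vr.vifi = 0;
 *	ioctl(mrsock, SIOCGETVIFCNT, &vr);	vr.icount/ocount/ibytes/obytes
 *
 *	sr.src.s_addr = src;
 *	sr.grp.s_addr = grp;
 *	ioctl(mrsock, SIOCGETSGCNT, &sr);	sr.pktcnt/bytecnt/wrong_if
 */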
1064
1065
1066static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1067{
1068 struct vif_device *v;
1069 int ct;
1070 if (event != NETDEV_UNREGISTER)
1071 return NOTIFY_DONE;
1072 v=&vif_table[0];
1073 for(ct=0;ct<maxvif;ct++,v++) {
1074 if (v->dev==ptr)
1075 vif_delete(ct);
1076 }
1077 return NOTIFY_DONE;
1078}
1079
1080
1081static struct notifier_block ip_mr_notifier={
1082 .notifier_call = ipmr_device_event,
1083};
1084
1085/*
1086 * Encapsulate a packet by attaching a valid IPIP header to it.
1087 * This avoids tunnel drivers and other mess and gives us the speed so
1088 * important for multicast video.
1089 */
1090
1091static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
1092{
1093 struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1094
1095 iph->version = 4;
1096 iph->tos = skb->nh.iph->tos;
1097 iph->ttl = skb->nh.iph->ttl;
1098 iph->frag_off = 0;
1099 iph->daddr = daddr;
1100 iph->saddr = saddr;
1101 iph->protocol = IPPROTO_IPIP;
1102 iph->ihl = 5;
1103 iph->tot_len = htons(skb->len);
1104 ip_select_ident(iph, skb->dst, NULL);
1105 ip_send_check(iph);
1106
1107 skb->h.ipiph = skb->nh.iph;
1108 skb->nh.iph = iph;
1109 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1110 nf_reset(skb);
1111}
1112
1113static inline int ipmr_forward_finish(struct sk_buff *skb)
1114{
1115 struct ip_options * opt = &(IPCB(skb)->opt);
1116
1117 IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1118
1119 if (unlikely(opt->optlen))
1120 ip_forward_options(skb);
1121
1122 return dst_output(skb);
1123}
1124
1125/*
1126 * Processing handlers for ipmr_forward
1127 */
1128
1129static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1130{
1131 struct iphdr *iph = skb->nh.iph;
1132 struct vif_device *vif = &vif_table[vifi];
1133 struct net_device *dev;
1134 struct rtable *rt;
1135 int encap = 0;
1136
1137 if (vif->dev == NULL)
1138 goto out_free;
1139
1140#ifdef CONFIG_IP_PIMSM
1141 if (vif->flags & VIFF_REGISTER) {
1142 vif->pkt_out++;
1143 vif->bytes_out+=skb->len;
1144 ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len;
1145 ((struct net_device_stats*)vif->dev->priv)->tx_packets++;
1146 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1147 kfree_skb(skb);
1148 return;
1149 }
1150#endif
1151
1152 if (vif->flags&VIFF_TUNNEL) {
1153 struct flowi fl = { .oif = vif->link,
1154 .nl_u = { .ip4_u =
1155 { .daddr = vif->remote,
1156 .saddr = vif->local,
1157 .tos = RT_TOS(iph->tos) } },
1158 .proto = IPPROTO_IPIP };
1159 if (ip_route_output_key(&rt, &fl))
1160 goto out_free;
1161 encap = sizeof(struct iphdr);
1162 } else {
1163 struct flowi fl = { .oif = vif->link,
1164 .nl_u = { .ip4_u =
1165 { .daddr = iph->daddr,
1166 .tos = RT_TOS(iph->tos) } },
1167 .proto = IPPROTO_IPIP };
1168 if (ip_route_output_key(&rt, &fl))
1169 goto out_free;
1170 }
1171
1172 dev = rt->u.dst.dev;
1173
1174 if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1175		/* Do not fragment multicasts. Alas, IPv4 does not
1176		   allow us to send ICMP here, so such packets will
1177		   simply disappear into a black hole.
1178 */
1179
1180 IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1181 ip_rt_put(rt);
1182 goto out_free;
1183 }
1184
1185 encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1186
1187 if (skb_cow(skb, encap)) {
1188 ip_rt_put(rt);
1189 goto out_free;
1190 }
1191
1192 vif->pkt_out++;
1193 vif->bytes_out+=skb->len;
1194
1195 dst_release(skb->dst);
1196 skb->dst = &rt->u.dst;
1197 iph = skb->nh.iph;
1198 ip_decrease_ttl(iph);
1199
1200 /* FIXME: forward and output firewalls used to be called here.
1201 * What do we do with netfilter? -- RR */
1202 if (vif->flags & VIFF_TUNNEL) {
1203 ip_encap(skb, vif->local, vif->remote);
1204 /* FIXME: extra output firewall step used to be here. --RR */
1205 ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
1206 ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len;
1207 }
1208
1209 IPCB(skb)->flags |= IPSKB_FORWARDED;
1210
1211 /*
1212	 * RFC1584 teaches that a DVMRP/PIM router must deliver packets locally
1213	 * not only before forwarding, but also after forwarding on all output
1214	 * interfaces. Clearly, if the mrouter runs a multicasting
1215	 * program, it should receive packets regardless of which interface
1216	 * the program is joined on.
1217	 * If we did not do this, the program would have to join on all
1218	 * interfaces. On the other hand, a multihomed host (or router, but
1219	 * not mrouter) cannot join on more than one interface - it would
1220	 * result in receiving multiple copies of each packet.
1221 */
1222 NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
1223 ipmr_forward_finish);
1224 return;
1225
1226out_free:
1227 kfree_skb(skb);
1228 return;
1229}
1230
1231static int ipmr_find_vif(struct net_device *dev)
1232{
1233 int ct;
1234 for (ct=maxvif-1; ct>=0; ct--) {
1235 if (vif_table[ct].dev == dev)
1236 break;
1237 }
1238 return ct;
1239}
1240
1241/* "local" means that we should preserve one skb (for local delivery) */
1242
1243static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1244{
1245 int psend = -1;
1246 int vif, ct;
1247
1248 vif = cache->mfc_parent;
1249 cache->mfc_un.res.pkt++;
1250 cache->mfc_un.res.bytes += skb->len;
1251
1252 /*
1253 * Wrong interface: drop packet and (maybe) send PIM assert.
1254 */
1255 if (vif_table[vif].dev != skb->dev) {
1256 int true_vifi;
1257
1258 if (((struct rtable*)skb->dst)->fl.iif == 0) {
1259 /* It is our own packet, looped back.
1260 Very complicated situation...
1261
1262			   The best workaround until the routing daemons are
1263			   fixed is not to redistribute a packet if it was
1264			   sent through the wrong interface. It means that
1265 multicast applications WILL NOT work for
1266 (S,G), which have default multicast route pointing
1267 to wrong oif. In any case, it is not a good
1268 idea to use multicasting applications on router.
1269 */
1270 goto dont_forward;
1271 }
1272
1273 cache->mfc_un.res.wrong_if++;
1274 true_vifi = ipmr_find_vif(skb->dev);
1275
1276 if (true_vifi >= 0 && mroute_do_assert &&
1277 /* pimsm uses asserts, when switching from RPT to SPT,
1278 so that we cannot check that packet arrived on an oif.
1279 It is bad, but otherwise we would need to move pretty
1280 large chunk of pimd to kernel. Ough... --ANK
1281 */
1282 (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1283 time_after(jiffies,
1284 cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1285 cache->mfc_un.res.last_assert = jiffies;
1286 ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1287 }
1288 goto dont_forward;
1289 }
1290
1291 vif_table[vif].pkt_in++;
1292 vif_table[vif].bytes_in+=skb->len;
1293
1294 /*
1295 * Forward the frame
1296 */
1297 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1298 if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
1299 if (psend != -1) {
1300 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1301 if (skb2)
1302 ipmr_queue_xmit(skb2, cache, psend);
1303 }
1304 psend=ct;
1305 }
1306 }
1307 if (psend != -1) {
1308 if (local) {
1309 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1310 if (skb2)
1311 ipmr_queue_xmit(skb2, cache, psend);
1312 } else {
1313 ipmr_queue_xmit(skb, cache, psend);
1314 return 0;
1315 }
1316 }
1317
1318dont_forward:
1319 if (!local)
1320 kfree_skb(skb);
1321 return 0;
1322}
1323
1324
1325/*
1326 * Multicast packets for forwarding arrive here
1327 */
1328
1329int ip_mr_input(struct sk_buff *skb)
1330{
1331 struct mfc_cache *cache;
1332 int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1333
1334	/* Packet is looped back after forwarding; it should not be
1335	   forwarded a second time, but it can still be delivered locally.
1336 */
1337 if (IPCB(skb)->flags&IPSKB_FORWARDED)
1338 goto dont_forward;
1339
1340 if (!local) {
1341 if (IPCB(skb)->opt.router_alert) {
1342 if (ip_call_ra_chain(skb))
1343 return 0;
1344 } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
1345			/* IGMPv1 (and broken IGMPv2 implementations such as
1346			   Cisco IOS <= 11.2(8)) do not put the router alert
1347			   option into IGMP packets destined to routable
1348 groups. It is very bad, because it means
1349 that we can forward NO IGMP messages.
1350 */
1351 read_lock(&mrt_lock);
1352 if (mroute_socket) {
1353 raw_rcv(mroute_socket, skb);
1354 read_unlock(&mrt_lock);
1355 return 0;
1356 }
1357 read_unlock(&mrt_lock);
1358 }
1359 }
1360
1361 read_lock(&mrt_lock);
1362 cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
1363
1364 /*
1365 * No usable cache entry
1366 */
1367 if (cache==NULL) {
1368 int vif;
1369
1370 if (local) {
1371 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1372 ip_local_deliver(skb);
1373 if (skb2 == NULL) {
1374 read_unlock(&mrt_lock);
1375 return -ENOBUFS;
1376 }
1377 skb = skb2;
1378 }
1379
1380 vif = ipmr_find_vif(skb->dev);
1381 if (vif >= 0) {
1382 int err = ipmr_cache_unresolved(vif, skb);
1383 read_unlock(&mrt_lock);
1384
1385 return err;
1386 }
1387 read_unlock(&mrt_lock);
1388 kfree_skb(skb);
1389 return -ENODEV;
1390 }
1391
1392 ip_mr_forward(skb, cache, local);
1393
1394 read_unlock(&mrt_lock);
1395
1396 if (local)
1397 return ip_local_deliver(skb);
1398
1399 return 0;
1400
1401dont_forward:
1402 if (local)
1403 return ip_local_deliver(skb);
1404 kfree_skb(skb);
1405 return 0;
1406}
1407
1408#ifdef CONFIG_IP_PIMSM_V1
1409/*
1410 * Handle IGMP messages of PIMv1
1411 */
1412
1413int pim_rcv_v1(struct sk_buff * skb)
1414{
1415 struct igmphdr *pim;
1416 struct iphdr *encap;
1417 struct net_device *reg_dev = NULL;
1418
1419 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1420 goto drop;
1421
1422 pim = (struct igmphdr*)skb->h.raw;
1423
1424 if (!mroute_do_pim ||
1425 skb->len < sizeof(*pim) + sizeof(*encap) ||
1426 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1427 goto drop;
1428
1429 encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
1430 /*
1431 Check that:
1432 a. packet is really destined to a multicast group
1433 b. packet is not a NULL-REGISTER
1434 c. packet is not truncated
1435 */
1436 if (!MULTICAST(encap->daddr) ||
1437 encap->tot_len == 0 ||
1438 ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1439 goto drop;
1440
1441 read_lock(&mrt_lock);
1442 if (reg_vif_num >= 0)
1443 reg_dev = vif_table[reg_vif_num].dev;
1444 if (reg_dev)
1445 dev_hold(reg_dev);
1446 read_unlock(&mrt_lock);
1447
1448 if (reg_dev == NULL)
1449 goto drop;
1450
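	/*
	 * Strip the outer IP header and the PIMv1 register header, then
	 * re-inject the inner multicast packet as if it had arrived on the
	 * register vif device, so that it goes through normal multicast
	 * forwarding.
	 */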
1451 skb->mac.raw = skb->nh.raw;
1452 skb_pull(skb, (u8*)encap - skb->data);
1453 skb->nh.iph = (struct iphdr *)skb->data;
1454 skb->dev = reg_dev;
1455 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1456 skb->protocol = htons(ETH_P_IP);
1457 skb->ip_summed = 0;
1458 skb->pkt_type = PACKET_HOST;
1459 dst_release(skb->dst);
1460 skb->dst = NULL;
1461 ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1462 ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1463 nf_reset(skb);
1464 netif_rx(skb);
1465 dev_put(reg_dev);
1466 return 0;
1467 drop:
1468 kfree_skb(skb);
1469 return 0;
1470}
1471#endif
1472
1473#ifdef CONFIG_IP_PIMSM_V2
1474static int pim_rcv(struct sk_buff * skb)
1475{
1476 struct pimreghdr *pim;
1477 struct iphdr *encap;
1478 struct net_device *reg_dev = NULL;
1479
1480 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1481 goto drop;
1482
1483 pim = (struct pimreghdr*)skb->h.raw;
1484 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1485 (pim->flags&PIM_NULL_REGISTER) ||
1486 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1487 (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1488 goto drop;
1489
1490 /* check if the inner packet is destined to mcast group */
1491 encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
1492 if (!MULTICAST(encap->daddr) ||
1493 encap->tot_len == 0 ||
1494 ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1495 goto drop;
1496
1497 read_lock(&mrt_lock);
1498 if (reg_vif_num >= 0)
1499 reg_dev = vif_table[reg_vif_num].dev;
1500 if (reg_dev)
1501 dev_hold(reg_dev);
1502 read_unlock(&mrt_lock);
1503
1504 if (reg_dev == NULL)
1505 goto drop;
1506
1507 skb->mac.raw = skb->nh.raw;
1508 skb_pull(skb, (u8*)encap - skb->data);
1509 skb->nh.iph = (struct iphdr *)skb->data;
1510 skb->dev = reg_dev;
1511 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1512 skb->protocol = htons(ETH_P_IP);
1513 skb->ip_summed = 0;
1514 skb->pkt_type = PACKET_HOST;
1515 dst_release(skb->dst);
1516 ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1517 ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1518 skb->dst = NULL;
1519 nf_reset(skb);
1520 netif_rx(skb);
1521 dev_put(reg_dev);
1522 return 0;
1523 drop:
1524 kfree_skb(skb);
1525 return 0;
1526}
1527#endif
1528
1529static int
1530ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1531{
1532 int ct;
1533 struct rtnexthop *nhp;
1534 struct net_device *dev = vif_table[c->mfc_parent].dev;
1535 u8 *b = skb->tail;
1536 struct rtattr *mp_head;
1537
1538 if (dev)
1539 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1540
1541 mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1542
1543 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1544 if (c->mfc_un.res.ttls[ct] < 255) {
1545 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1546 goto rtattr_failure;
1547 nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1548 nhp->rtnh_flags = 0;
1549 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1550 nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1551 nhp->rtnh_len = sizeof(*nhp);
1552 }
1553 }
1554 mp_head->rta_type = RTA_MULTIPATH;
1555 mp_head->rta_len = skb->tail - (u8*)mp_head;
1556 rtm->rtm_type = RTN_MULTICAST;
1557 return 1;
1558
1559rtattr_failure:
1560 skb_trim(skb, b - skb->data);
1561 return -EMSGSIZE;
1562}
1563
1564int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1565{
1566 int err;
1567 struct mfc_cache *cache;
1568 struct rtable *rt = (struct rtable*)skb->dst;
1569
1570 read_lock(&mrt_lock);
1571 cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1572
1573 if (cache==NULL) {
1574 struct net_device *dev;
1575 int vif;
1576
1577 if (nowait) {
1578 read_unlock(&mrt_lock);
1579 return -EAGAIN;
1580 }
1581
1582 dev = skb->dev;
1583 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1584 read_unlock(&mrt_lock);
1585 return -ENODEV;
1586 }
1587 skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
1588 skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
1589 skb->nh.iph->saddr = rt->rt_src;
1590 skb->nh.iph->daddr = rt->rt_dst;
1591 skb->nh.iph->version = 0;
1592 err = ipmr_cache_unresolved(vif, skb);
1593 read_unlock(&mrt_lock);
1594 return err;
1595 }
1596
1597 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1598 cache->mfc_flags |= MFC_NOTIFY;
1599 err = ipmr_fill_mroute(skb, cache, rtm);
1600 read_unlock(&mrt_lock);
1601 return err;
1602}
1603
1604#ifdef CONFIG_PROC_FS
1605/*
1606 * The /proc interfaces to multicast routing: /proc/ip_mr_cache and /proc/ip_mr_vif
1607 */
1608struct ipmr_vif_iter {
1609 int ct;
1610};
1611
1612static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1613 loff_t pos)
1614{
1615 for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1616 if(!VIF_EXISTS(iter->ct))
1617 continue;
1618 if (pos-- == 0)
1619 return &vif_table[iter->ct];
1620 }
1621 return NULL;
1622}
1623
1624static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1625{
1626 read_lock(&mrt_lock);
1627 return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1628 : SEQ_START_TOKEN;
1629}
1630
1631static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1632{
1633 struct ipmr_vif_iter *iter = seq->private;
1634
1635 ++*pos;
1636 if (v == SEQ_START_TOKEN)
1637 return ipmr_vif_seq_idx(iter, 0);
1638
1639 while (++iter->ct < maxvif) {
1640 if(!VIF_EXISTS(iter->ct))
1641 continue;
1642 return &vif_table[iter->ct];
1643 }
1644 return NULL;
1645}
1646
1647static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1648{
1649 read_unlock(&mrt_lock);
1650}
1651
1652static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1653{
1654 if (v == SEQ_START_TOKEN) {
1655 seq_puts(seq,
1656 "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
1657 } else {
1658 const struct vif_device *vif = v;
1659 const char *name = vif->dev ? vif->dev->name : "none";
1660
1661 seq_printf(seq,
1662 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
1663 vif - vif_table,
1664 name, vif->bytes_in, vif->pkt_in,
1665 vif->bytes_out, vif->pkt_out,
1666 vif->flags, vif->local, vif->remote);
1667 }
1668 return 0;
1669}
1670
1671static struct seq_operations ipmr_vif_seq_ops = {
1672 .start = ipmr_vif_seq_start,
1673 .next = ipmr_vif_seq_next,
1674 .stop = ipmr_vif_seq_stop,
1675 .show = ipmr_vif_seq_show,
1676};
1677
1678static int ipmr_vif_open(struct inode *inode, struct file *file)
1679{
1680 struct seq_file *seq;
1681 int rc = -ENOMEM;
1682 struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1683
1684 if (!s)
1685 goto out;
1686
1687 rc = seq_open(file, &ipmr_vif_seq_ops);
1688 if (rc)
1689 goto out_kfree;
1690
1691 s->ct = 0;
1692 seq = file->private_data;
1693 seq->private = s;
1694out:
1695 return rc;
1696out_kfree:
1697 kfree(s);
1698 goto out;
1699
1700}
1701
1702static struct file_operations ipmr_vif_fops = {
1703 .owner = THIS_MODULE,
1704 .open = ipmr_vif_open,
1705 .read = seq_read,
1706 .llseek = seq_lseek,
1707 .release = seq_release_private,
1708};
1709
1710struct ipmr_mfc_iter {
1711 struct mfc_cache **cache;
1712 int ct;
1713};
1714
1715
1716static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1717{
1718 struct mfc_cache *mfc;
1719
1720 it->cache = mfc_cache_array;
1721 read_lock(&mrt_lock);
1722 for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1723 for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1724 if (pos-- == 0)
1725 return mfc;
1726 read_unlock(&mrt_lock);
1727
1728 it->cache = &mfc_unres_queue;
1729 spin_lock_bh(&mfc_unres_lock);
1730 for(mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1731 if (pos-- == 0)
1732 return mfc;
1733 spin_unlock_bh(&mfc_unres_lock);
1734
1735 it->cache = NULL;
1736 return NULL;
1737}
1738
1739
1740static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1741{
1742 struct ipmr_mfc_iter *it = seq->private;
1743 it->cache = NULL;
1744 it->ct = 0;
1745 return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1746 : SEQ_START_TOKEN;
1747}
1748
1749static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1750{
1751 struct mfc_cache *mfc = v;
1752 struct ipmr_mfc_iter *it = seq->private;
1753
1754 ++*pos;
1755
1756 if (v == SEQ_START_TOKEN)
1757 return ipmr_mfc_seq_idx(seq->private, 0);
1758
1759 if (mfc->next)
1760 return mfc->next;
1761
1762 if (it->cache == &mfc_unres_queue)
1763 goto end_of_list;
1764
1765 BUG_ON(it->cache != mfc_cache_array);
1766
1767 while (++it->ct < MFC_LINES) {
1768 mfc = mfc_cache_array[it->ct];
1769 if (mfc)
1770 return mfc;
1771 }
1772
1773 /* exhausted cache_array, show unresolved */
1774 read_unlock(&mrt_lock);
1775 it->cache = &mfc_unres_queue;
1776 it->ct = 0;
1777
1778 spin_lock_bh(&mfc_unres_lock);
1779 mfc = mfc_unres_queue;
1780 if (mfc)
1781 return mfc;
1782
1783 end_of_list:
1784 spin_unlock_bh(&mfc_unres_lock);
1785 it->cache = NULL;
1786
1787 return NULL;
1788}
1789
1790static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1791{
1792 struct ipmr_mfc_iter *it = seq->private;
1793
1794 if (it->cache == &mfc_unres_queue)
1795 spin_unlock_bh(&mfc_unres_lock);
1796 else if (it->cache == mfc_cache_array)
1797 read_unlock(&mrt_lock);
1798}
1799
1800static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1801{
1802 int n;
1803
1804 if (v == SEQ_START_TOKEN) {
1805 seq_puts(seq,
1806 "Group Origin Iif Pkts Bytes Wrong Oifs\n");
1807 } else {
1808 const struct mfc_cache *mfc = v;
1809 const struct ipmr_mfc_iter *it = seq->private;
1810
1811 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1812 (unsigned long) mfc->mfc_mcastgrp,
1813 (unsigned long) mfc->mfc_origin,
1814 mfc->mfc_parent,
1815 mfc->mfc_un.res.pkt,
1816 mfc->mfc_un.res.bytes,
1817 mfc->mfc_un.res.wrong_if);
1818
1819 if (it->cache != &mfc_unres_queue) {
1820 for(n = mfc->mfc_un.res.minvif;
1821 n < mfc->mfc_un.res.maxvif; n++ ) {
1822 if(VIF_EXISTS(n)
1823 && mfc->mfc_un.res.ttls[n] < 255)
1824 seq_printf(seq,
1825 " %2d:%-3d",
1826 n, mfc->mfc_un.res.ttls[n]);
1827 }
1828 }
1829 seq_putc(seq, '\n');
1830 }
1831 return 0;
1832}
1833
1834static struct seq_operations ipmr_mfc_seq_ops = {
1835 .start = ipmr_mfc_seq_start,
1836 .next = ipmr_mfc_seq_next,
1837 .stop = ipmr_mfc_seq_stop,
1838 .show = ipmr_mfc_seq_show,
1839};
1840
1841static int ipmr_mfc_open(struct inode *inode, struct file *file)
1842{
1843 struct seq_file *seq;
1844 int rc = -ENOMEM;
1845 struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1846
1847 if (!s)
1848 goto out;
1849
1850 rc = seq_open(file, &ipmr_mfc_seq_ops);
1851 if (rc)
1852 goto out_kfree;
1853
1854 seq = file->private_data;
1855 seq->private = s;
1856out:
1857 return rc;
1858out_kfree:
1859 kfree(s);
1860 goto out;
1861
1862}
1863
1864static struct file_operations ipmr_mfc_fops = {
1865 .owner = THIS_MODULE,
1866 .open = ipmr_mfc_open,
1867 .read = seq_read,
1868 .llseek = seq_lseek,
1869 .release = seq_release_private,
1870};
1871#endif
1872
1873#ifdef CONFIG_IP_PIMSM_V2
1874static struct net_protocol pim_protocol = {
1875 .handler = pim_rcv,
1876};
1877#endif
1878
1879
1880/*
1881 * Setup for IP multicast routing
1882 */
1883
1884void __init ip_mr_init(void)
1885{
1886 mrt_cachep = kmem_cache_create("ip_mrt_cache",
1887 sizeof(struct mfc_cache),
1888 0, SLAB_HWCACHE_ALIGN,
1889 NULL, NULL);
1890 if (!mrt_cachep)
1891 panic("cannot allocate ip_mrt_cache");
1892
1893 init_timer(&ipmr_expire_timer);
1894 ipmr_expire_timer.function=ipmr_expire_process;
1895 register_netdevice_notifier(&ip_mr_notifier);
1896#ifdef CONFIG_PROC_FS
1897 proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1898 proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1899#endif
1900}
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
new file mode 100644
index 000000000000..63a82b4b64bb
--- /dev/null
+++ b/net/ipv4/ipvs/Kconfig
@@ -0,0 +1,244 @@
1#
2# IP Virtual Server configuration
3#
4menu "IP: Virtual Server Configuration"
5 depends on INET && NETFILTER
6
7config IP_VS
8 tristate "IP virtual server support (EXPERIMENTAL)"
9 depends on INET && NETFILTER
10 ---help---
11 IP Virtual Server support will let you build a high-performance
12 virtual server based on a cluster of two or more real servers. This
13 option must be enabled for at least one of the clustered computers
14 that will take care of intercepting incoming connections to a
15 single IP address and scheduling them to real servers.
16
17 Three request dispatching techniques are implemented: virtual
18 server via NAT, virtual server via tunneling, and virtual
19 server via direct routing. Several scheduling algorithms can
20 be used to choose which server the connection is directed to,
21 thus load balancing can be achieved among the servers. For more
22 information and its administration program, please visit the
23 following URL: <http://www.linuxvirtualserver.org/>.
24
25 If you want to compile it in kernel, say Y. To compile it as a
26 module, choose M here. If unsure, say N.
27
28config IP_VS_DEBUG
29 bool "IP virtual server debugging"
30 depends on IP_VS
31 ---help---
32 Say Y here if you want to get additional messages useful in
33 debugging the IP virtual server code. You can change the debug
34 level in /proc/sys/net/ipv4/vs/debug_level
35
36config IP_VS_TAB_BITS
37 int "IPVS connection table size (the Nth power of 2)"
38 depends on IP_VS
39 default "12"
40 ---help---
41 The IPVS connection hash table uses the chaining scheme to handle
42 hash collisions. Using a big IPVS connection hash table will greatly
43 reduce conflicts when there are hundreds of thousands of connections
44 in the hash table.
45
46 Note that the table size must be a power of 2. The table size will
47 be 2 raised to the power of the number you enter. The number to
48 choose is from 8 to 20; the default is 12, which means a table size
49 of 4096. Don't choose a number that is too small, otherwise you
50 will lose performance. You can adapt the table size yourself,
51 according to your virtual server application. It is good to set the
52 table size to not much less than the number of connections per
53 second multiplied by the average duration of a connection in the
54 table. For example, if your virtual server gets 200 connections per
55 second and a connection stays in the table for 200 seconds on
56 average, the table size should be not much less than 200x200; it is
57 good to set it to 32768 (2**15).
58
59 Note also that each connection effectively occupies 128 bytes and
60 each hash entry uses 8 bytes, so you can estimate how much memory is
61 needed for your box.
62
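# A rough estimate based on the sizes quoted above: with the default of
# 12 bits the hash table itself takes 2^12 * 8 = 32 KiB, and 100,000
# concurrent connections add roughly 100,000 * 128 bytes, i.e. about 12 MiB.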
63comment "IPVS transport protocol load balancing support"
64 depends on IP_VS
65
66config IP_VS_PROTO_TCP
67 bool "TCP load balancing support"
68 depends on IP_VS
69 ---help---
70 This option enables support for load balancing TCP transport
71 protocol. Say Y if unsure.
72
73config IP_VS_PROTO_UDP
74 bool "UDP load balancing support"
75 depends on IP_VS
76 ---help---
77 This option enables support for load balancing UDP transport
78 protocol. Say Y if unsure.
79
80config IP_VS_PROTO_ESP
81 bool "ESP load balancing support"
82 depends on IP_VS
83 ---help---
84 This option enables support for load balancing ESP (Encapsulation
85 Security Payload) transport protocol. Say Y if unsure.
86
87config IP_VS_PROTO_AH
88 bool "AH load balancing support"
89 depends on IP_VS
90 ---help---
91 This option enables support for load balancing AH (Authentication
92 Header) transport protocol. Say Y if unsure.
93
94comment "IPVS scheduler"
95 depends on IP_VS
96
97config IP_VS_RR
98 tristate "round-robin scheduling"
99 depends on IP_VS
100 ---help---
101 The round-robin scheduling algorithm simply directs network
102 connections to different real servers in a round-robin manner.
103
104 If you want to compile it in kernel, say Y. To compile it as a
105 module, choose M here. If unsure, say N.
106
107config IP_VS_WRR
108 tristate "weighted round-robin scheduling"
109 depends on IP_VS
110 ---help---
111 The weighted round-robin scheduling algorithm directs network
112 connections to different real servers based on server weights
113 in a round-robin manner. Servers with higher weights receive
114 new connections before those with lower weights and get more
115 connections than those with lower weights; servers with equal
116 weights get an equal share of new connections.
117
118 If you want to compile it in kernel, say Y. To compile it as a
119 module, choose M here. If unsure, say N.
120
121config IP_VS_LC
122 tristate "least-connection scheduling"
123 depends on IP_VS
124 ---help---
125 The least-connection scheduling algorithm directs network
126 connections to the server with the least number of active
127 connections.
128
129 If you want to compile it in kernel, say Y. To compile it as a
130 module, choose M here. If unsure, say N.
131
132config IP_VS_WLC
133 tristate "weighted least-connection scheduling"
134 depends on IP_VS
135 ---help---
136 The weighted least-connection scheduling algorithm directs network
137 connections to the server with the least active connections
138 normalized by the server weight.
139
140 If you want to compile it in kernel, say Y. To compile it as a
141 module, choose M here. If unsure, say N.
142
143config IP_VS_LBLC
144 tristate "locality-based least-connection scheduling"
145 depends on IP_VS
146 ---help---
147 The locality-based least-connection scheduling algorithm is for
148 destination IP load balancing. It is usually used in cache clusters.
149 This algorithm usually directs packets destined for an IP address to
150 its server if the server is alive and under load. If the server is
151 overloaded (its number of active connections is larger than its
152 weight) and there is a server running at half of its load, then the
153 weighted least-connection server is allocated to this IP address.
154
155 If you want to compile it in kernel, say Y. To compile it as a
156 module, choose M here. If unsure, say N.
157
158config IP_VS_LBLCR
159 tristate "locality-based least-connection with replication scheduling"
160 depends on IP_VS
161 ---help---
162 The locality-based least-connection with replication scheduling
163 algorithm is also for destination IP load balancing. It is
164 usually used in cache cluster. It differs from the LBLC scheduling
165 as follows: the load balancer maintains mappings from a target
166 to a set of server nodes that can serve the target. Requests for
167 a target are assigned to the least-connection node in the target's
168 server set. If all the nodes in the server set are overloaded,
169 it picks a least-connection node from the cluster and adds it
170 to the server set for the target. If the server set has not been
171 modified for the specified time, the most loaded node is removed
172 from the server set, in order to avoid a high degree of replication.
173
174 If you want to compile it in kernel, say Y. To compile it as a
175 module, choose M here. If unsure, say N.
176
177config IP_VS_DH
178 tristate "destination hashing scheduling"
179 depends on IP_VS
180 ---help---
181 The destination hashing scheduling algorithm assigns network
182 connections to the servers through looking up a statically assigned
183 hash table by their destination IP addresses.
184
185 If you want to compile it in kernel, say Y. To compile it as a
186 module, choose M here. If unsure, say N.
187
188config IP_VS_SH
189 tristate "source hashing scheduling"
190 depends on IP_VS
191 ---help---
192 The source hashing scheduling algorithm assigns network
193 connections to the servers through looking up a statically assigned
194 hash table by their source IP addresses.
195
196 If you want to compile it in kernel, say Y. To compile it as a
197 module, choose M here. If unsure, say N.
198
199config IP_VS_SED
200 tristate "shortest expected delay scheduling"
201 depends on IP_VS
202 ---help---
203 The shortest expected delay scheduling algorithm assigns network
204 connections to the server with the shortest expected delay. The
205 expected delay that the job will experience is (Ci + 1) / Ui if
206 sent to the ith server, in which Ci is the number of connections
207 on the ith server and Ui is the fixed service rate (weight)
208 of the ith server.
209
210 If you want to compile it in kernel, say Y. To compile it as a
211 module, choose M here. If unsure, say N.
212
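# A small illustration of the formula above (hypothetical numbers): with
# weights U1=1, U2=3 and current connection counts C1=0, C2=3, the expected
# delays are (0+1)/1 = 1.00 and (3+1)/3 = 1.33, so server 1 would be chosen.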
213config IP_VS_NQ
214 tristate "never queue scheduling"
215 depends on IP_VS
216 ---help---
217 The never queue scheduling algorithm adopts a two-speed model.
218 When there is an idle server available, the job will be sent to
219 the idle server, instead of waiting for a fast one. When there
220 is no idle server available, the job will be sent to the server
221 that minimizes its expected delay (the Shortest Expected Delay
222 scheduling algorithm).
223
224 If you want to compile it in kernel, say Y. To compile it as a
225 module, choose M here. If unsure, say N.
226
227comment "IPVS application helper"
228 depends on IP_VS
229
230config IP_VS_FTP
231 tristate "FTP protocol helper"
232 depends on IP_VS && IP_VS_PROTO_TCP
233 ---help---
234 FTP is a protocol that transfers IP addresses and/or port numbers in
235 the payload. In the virtual server via Network Address Translation,
236 the IP address and port number of real servers cannot be sent to
237 clients in ftp connections directly, so the FTP protocol helper is
238 required for tracking the connection and mangling it back to that of
239 the virtual service.
240
241 If you want to compile it in kernel, say Y. To compile it as a
242 module, choose M here. If unsure, say N.
243
244endmenu
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
new file mode 100644
index 000000000000..a788461a40c9
--- /dev/null
+++ b/net/ipv4/ipvs/Makefile
@@ -0,0 +1,34 @@
1#
2# Makefile for the IPVS modules on top of IPv4.
3#
4
5# IPVS transport protocol load balancing support
6ip_vs_proto-objs-y :=
7ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
8ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
9ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o
10ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
11
12ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
13 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
14 ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o \
15 $(ip_vs_proto-objs-y)
16
17
18# IPVS core
19obj-$(CONFIG_IP_VS) += ip_vs.o
20
21# IPVS schedulers
22obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
23obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
24obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
25obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
26obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
27obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
28obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
29obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
30obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
31obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
32
33# IPVS application helpers
34obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
new file mode 100644
index 000000000000..d9212addd193
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -0,0 +1,658 @@
1/*
2 * ip_vs_app.c: Application module support for IPVS
3 *
4 * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
14 * is that ip_vs_app module handles the reverse direction (incoming requests
15 * and outgoing responses).
16 *
17 * IP_MASQ_APP application masquerading module
18 *
19 * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25#include <linux/skbuff.h>
26#include <linux/in.h>
27#include <linux/ip.h>
28#include <net/protocol.h>
29#include <asm/system.h>
30#include <linux/stat.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33
34#include <net/ip_vs.h>
35
36EXPORT_SYMBOL(register_ip_vs_app);
37EXPORT_SYMBOL(unregister_ip_vs_app);
38EXPORT_SYMBOL(register_ip_vs_app_inc);
39
40/* ipvs application list head */
41static LIST_HEAD(ip_vs_app_list);
42static DECLARE_MUTEX(__ip_vs_app_mutex);
43
44
45/*
46 * Get an ip_vs_app object
47 */
48static inline int ip_vs_app_get(struct ip_vs_app *app)
49{
50 /* test and get the module atomically */
51 if (app->module)
52 return try_module_get(app->module);
53 else
54 return 1;
55}
56
57
58static inline void ip_vs_app_put(struct ip_vs_app *app)
59{
60 if (app->module)
61 module_put(app->module);
62}
63
64
65/*
66 * Allocate/initialize app incarnation and register it in proto apps.
67 */
68static int
69ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
70{
71 struct ip_vs_protocol *pp;
72 struct ip_vs_app *inc;
73 int ret;
74
75 if (!(pp = ip_vs_proto_get(proto)))
76 return -EPROTONOSUPPORT;
77
78 if (!pp->unregister_app)
79 return -EOPNOTSUPP;
80
81 inc = kmalloc(sizeof(struct ip_vs_app), GFP_KERNEL);
82 if (!inc)
83 return -ENOMEM;
84 memcpy(inc, app, sizeof(*inc));
85 INIT_LIST_HEAD(&inc->p_list);
86 INIT_LIST_HEAD(&inc->incs_list);
87 inc->app = app;
88 inc->port = htons(port);
89 atomic_set(&inc->usecnt, 0);
90
91 if (app->timeouts) {
92 inc->timeout_table =
93 ip_vs_create_timeout_table(app->timeouts,
94 app->timeouts_size);
95 if (!inc->timeout_table) {
96 ret = -ENOMEM;
97 goto out;
98 }
99 }
100
101 ret = pp->register_app(inc);
102 if (ret)
103 goto out;
104
105 list_add(&inc->a_list, &app->incs_list);
106 IP_VS_DBG(9, "%s application %s:%u registered\n",
107 pp->name, inc->name, inc->port);
108
109 return 0;
110
111 out:
112 if (inc->timeout_table)
113 kfree(inc->timeout_table);
114 kfree(inc);
115 return ret;
116}
117
118
119/*
120 * Release app incarnation
121 */
122static void
123ip_vs_app_inc_release(struct ip_vs_app *inc)
124{
125 struct ip_vs_protocol *pp;
126
127 if (!(pp = ip_vs_proto_get(inc->protocol)))
128 return;
129
130 if (pp->unregister_app)
131 pp->unregister_app(inc);
132
133 IP_VS_DBG(9, "%s App %s:%u unregistered\n",
134 pp->name, inc->name, inc->port);
135
136 list_del(&inc->a_list);
137
138 if (inc->timeout_table != NULL)
139 kfree(inc->timeout_table);
140 kfree(inc);
141}
142
143
144/*
145 * Get reference to app inc (only called from softirq)
146 *
147 */
148int ip_vs_app_inc_get(struct ip_vs_app *inc)
149{
150 int result;
151
152 atomic_inc(&inc->usecnt);
153 if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
154 atomic_dec(&inc->usecnt);
155 return result;
156}
157
158
159/*
160 * Put the app inc (only called from timer or net softirq)
161 */
162void ip_vs_app_inc_put(struct ip_vs_app *inc)
163{
164 ip_vs_app_put(inc->app);
165 atomic_dec(&inc->usecnt);
166}
167
168
169/*
170 * Register an application incarnation in protocol applications
171 */
172int
173register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
174{
175 int result;
176
177 down(&__ip_vs_app_mutex);
178
179 result = ip_vs_app_inc_new(app, proto, port);
180
181 up(&__ip_vs_app_mutex);
182
183 return result;
184}
185
186
187/*
188 * ip_vs_app registration routine
189 */
190int register_ip_vs_app(struct ip_vs_app *app)
191{
192 /* increase the module use count */
193 ip_vs_use_count_inc();
194
195 down(&__ip_vs_app_mutex);
196
197 list_add(&app->a_list, &ip_vs_app_list);
198
199 up(&__ip_vs_app_mutex);
200
201 return 0;
202}
203
204
205/*
206 * ip_vs_app unregistration routine
207 * We are sure there are no app incarnations attached to services
208 */
209void unregister_ip_vs_app(struct ip_vs_app *app)
210{
211 struct ip_vs_app *inc, *nxt;
212
213 down(&__ip_vs_app_mutex);
214
215 list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
216 ip_vs_app_inc_release(inc);
217 }
218
219 list_del(&app->a_list);
220
221 up(&__ip_vs_app_mutex);
222
223 /* decrease the module use count */
224 ip_vs_use_count_dec();
225}
226
227
228#if 0000
229/*
230 * Get reference to app by name (called from user context)
231 */
232struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
233{
234 struct ip_vs_app *app, *a = NULL;
235
236 down(&__ip_vs_app_mutex);
237
238 list_for_each_entry(app, &ip_vs_app_list, a_list) {
239 if (strcmp(app->name, appname))
240 continue;
241
242 /* softirq may call ip_vs_app_get too, so the caller
243 must disable softirq on the current CPU */
244 if (ip_vs_app_get(app))
245 a = app;
246 break;
247 }
248
249 up(&__ip_vs_app_mutex);
250
251 return a;
252}
253#endif
254
255
256/*
257 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
258 */
259int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
260{
261 return pp->app_conn_bind(cp);
262}
263
264
265/*
266 * Unbind cp from application incarnation (called by cp destructor)
267 */
268void ip_vs_unbind_app(struct ip_vs_conn *cp)
269{
270 struct ip_vs_app *inc = cp->app;
271
272 if (!inc)
273 return;
274
275 if (inc->unbind_conn)
276 inc->unbind_conn(inc, cp);
277 if (inc->done_conn)
278 inc->done_conn(inc, cp);
279 ip_vs_app_inc_put(inc);
280 cp->app = NULL;
281}
282
283
284/*
285 * Fixes th->seq based on ip_vs_seq info.
286 */
287static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
288{
289 __u32 seq = ntohl(th->seq);
290
291 /*
292 * Adjust seq with delta-offset for all packets after
293 * the most recent resized pkt seq and with previous_delta offset
294 * for all packets before most recent resized pkt seq.
295 */
296 if (vseq->delta || vseq->previous_delta) {
297 if(after(seq, vseq->init_seq)) {
298 th->seq = htonl(seq + vseq->delta);
299 IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
300 vseq->delta);
301 } else {
302 th->seq = htonl(seq + vseq->previous_delta);
303 IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
304 "(%d) to seq\n", vseq->previous_delta);
305 }
306 }
307}
308
309
310/*
311 * Fixes th->ack_seq based on ip_vs_seq info.
312 */
313static inline void
314vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
315{
316 __u32 ack_seq = ntohl(th->ack_seq);
317
318 /*
319 * Adjust ack_seq with delta-offset for
320 * the packets AFTER the most recent resized pkt has caused a shift;
321 * for packets before the most recent resized pkt, use previous_delta
322 */
323 if (vseq->delta || vseq->previous_delta) {
324 /* since ack_seq is the number of the octet that is expected
325 to be received next, compare it with init_seq+delta */
326 if(after(ack_seq, vseq->init_seq+vseq->delta)) {
327 th->ack_seq = htonl(ack_seq - vseq->delta);
328 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
329 "(%d) from ack_seq\n", vseq->delta);
330
331 } else {
332 th->ack_seq = htonl(ack_seq - vseq->previous_delta);
333 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
334 "previous_delta (%d) from ack_seq\n",
335 vseq->previous_delta);
336 }
337 }
338}
339
340
341/*
342 * Updates ip_vs_seq if pkt has been resized
343 * Assumes already checked proto==IPPROTO_TCP and diff!=0.
344 */
345static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
346 unsigned flag, __u32 seq, int diff)
347{
348 /* spinlock is to keep updating cp->flags atomic */
349 spin_lock(&cp->lock);
350 if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
351 vseq->previous_delta = vseq->delta;
352 vseq->delta += diff;
353 vseq->init_seq = seq;
354 cp->flags |= flag;
355 }
356 spin_unlock(&cp->lock);
357}
358
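/*
 * Illustration (hypothetical numbers): if an application helper grows an
 * outgoing payload by 10 bytes at sequence 1000, vs_seq_update() records
 * init_seq=1000 and delta=+10 in cp->out_seq.  vs_fix_seq() then adds 10
 * to the sequence number of later outgoing segments, and vs_fix_ack_seq()
 * subtracts 10 from incoming acknowledgements that cover data beyond that
 * point, so both ends keep seeing consistent sequence numbers.
 */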
359static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb,
360 struct ip_vs_app *app)
361{
362 int diff;
363 unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
364 struct tcphdr *th;
365 __u32 seq;
366
367 if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
368 return 0;
369
370 th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
371
372 /*
373 * Remember seq number in case this pkt gets resized
374 */
375 seq = ntohl(th->seq);
376
377 /*
378 * Fix seq stuff if flagged as so.
379 */
380 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
381 vs_fix_seq(&cp->out_seq, th);
382 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
383 vs_fix_ack_seq(&cp->in_seq, th);
384
385 /*
386 * Call private output hook function
387 */
388 if (app->pkt_out == NULL)
389 return 1;
390
391 if (!app->pkt_out(app, cp, pskb, &diff))
392 return 0;
393
394 /*
395 * Update ip_vs seq stuff if len has changed.
396 */
397 if (diff != 0)
398 vs_seq_update(cp, &cp->out_seq,
399 IP_VS_CONN_F_OUT_SEQ, seq, diff);
400
401 return 1;
402}
403
404/*
405 * Output pkt hook. Will call the bound ip_vs_app specific function.
406 * Called by the ipvs packet handler; assumes cp!=NULL was checked previously.
407 * Returns false if it can't handle the packet (oom).
408 */
409int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb)
410{
411 struct ip_vs_app *app;
412
413 /*
414 * check if application module is bound to
415 * this ip_vs_conn.
416 */
417 if ((app = cp->app) == NULL)
418 return 1;
419
420 /* TCP is complicated */
421 if (cp->protocol == IPPROTO_TCP)
422 return app_tcp_pkt_out(cp, pskb, app);
423
424 /*
425 * Call private output hook function
426 */
427 if (app->pkt_out == NULL)
428 return 1;
429
430 return app->pkt_out(app, cp, pskb, NULL);
431}
432
433
434static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb,
435 struct ip_vs_app *app)
436{
437 int diff;
438 unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
439 struct tcphdr *th;
440 __u32 seq;
441
442 if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
443 return 0;
444
445 th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
446
447 /*
448 * Remember seq number in case this pkt gets resized
449 */
450 seq = ntohl(th->seq);
451
452 /*
453 * Fix seq stuff if flagged as so.
454 */
455 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
456 vs_fix_seq(&cp->in_seq, th);
457 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
458 vs_fix_ack_seq(&cp->out_seq, th);
459
460 /*
461 * Call private input hook function
462 */
463 if (app->pkt_in == NULL)
464 return 1;
465
466 if (!app->pkt_in(app, cp, pskb, &diff))
467 return 0;
468
469 /*
470 * Update ip_vs seq stuff if len has changed.
471 */
472 if (diff != 0)
473 vs_seq_update(cp, &cp->in_seq,
474 IP_VS_CONN_F_IN_SEQ, seq, diff);
475
476 return 1;
477}
478
479/*
480 * Input pkt hook. Will call the bound ip_vs_app specific function.
481 * Called by the ipvs packet handler; assumes cp!=NULL was checked previously.
482 * Returns false if it can't handle the packet (oom).
483 */
484int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb)
485{
486 struct ip_vs_app *app;
487
488 /*
489 * check if application module is bound to
490 * this ip_vs_conn.
491 */
492 if ((app = cp->app) == NULL)
493 return 1;
494
495 /* TCP is complicated */
496 if (cp->protocol == IPPROTO_TCP)
497 return app_tcp_pkt_in(cp, pskb, app);
498
499 /*
500 * Call private input hook function
501 */
502 if (app->pkt_in == NULL)
503 return 1;
504
505 return app->pkt_in(app, cp, pskb, NULL);
506}
507
508
509#ifdef CONFIG_PROC_FS
510/*
511 * /proc/net/ip_vs_app entry function
512 */
513
514static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
515{
516 struct ip_vs_app *app, *inc;
517
518 list_for_each_entry(app, &ip_vs_app_list, a_list) {
519 list_for_each_entry(inc, &app->incs_list, a_list) {
520 if (pos-- == 0)
521 return inc;
522 }
523 }
524 return NULL;
525
526}
527
528static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
529{
530 down(&__ip_vs_app_mutex);
531
532 return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
533}
534
535static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
536{
537 struct ip_vs_app *inc, *app;
538 struct list_head *e;
539
540 ++*pos;
541 if (v == SEQ_START_TOKEN)
542 return ip_vs_app_idx(0);
543
544 inc = v;
545 app = inc->app;
546
547 if ((e = inc->a_list.next) != &app->incs_list)
548 return list_entry(e, struct ip_vs_app, a_list);
549
550 /* go on to next application */
551 for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
552 app = list_entry(e, struct ip_vs_app, a_list);
553 list_for_each_entry(inc, &app->incs_list, a_list) {
554 return inc;
555 }
556 }
557 return NULL;
558}
559
560static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
561{
562 up(&__ip_vs_app_mutex);
563}
564
565static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
566{
567 if (v == SEQ_START_TOKEN)
568 seq_puts(seq, "prot port usecnt name\n");
569 else {
570 const struct ip_vs_app *inc = v;
571
572 seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
573 ip_vs_proto_name(inc->protocol),
574 ntohs(inc->port),
575 atomic_read(&inc->usecnt),
576 inc->name);
577 }
578 return 0;
579}
580
581static struct seq_operations ip_vs_app_seq_ops = {
582 .start = ip_vs_app_seq_start,
583 .next = ip_vs_app_seq_next,
584 .stop = ip_vs_app_seq_stop,
585 .show = ip_vs_app_seq_show,
586};
587
588static int ip_vs_app_open(struct inode *inode, struct file *file)
589{
590 return seq_open(file, &ip_vs_app_seq_ops);
591}
592
593static struct file_operations ip_vs_app_fops = {
594 .owner = THIS_MODULE,
595 .open = ip_vs_app_open,
596 .read = seq_read,
597 .llseek = seq_lseek,
598 .release = seq_release,
599};
600#endif
601
602
603/*
604 * Replace a segment of data with a new segment
605 */
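/*
 * Three cases are handled below: the new segment is not longer than the
 * old one (shift the trailing data left and trim the skb), the extra
 * bytes fit in the existing tailroom (extend in place), or the skb head
 * must first be reallocated with pskb_expand_head().
 */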
606int ip_vs_skb_replace(struct sk_buff *skb, int pri,
607 char *o_buf, int o_len, char *n_buf, int n_len)
608{
609 struct iphdr *iph;
610 int diff;
611 int o_offset;
612 int o_left;
613
614 EnterFunction(9);
615
616 diff = n_len - o_len;
617 o_offset = o_buf - (char *)skb->data;
618 /* The length of left data after o_buf+o_len in the skb data */
619 o_left = skb->len - (o_offset + o_len);
620
621 if (diff <= 0) {
622 memmove(o_buf + n_len, o_buf + o_len, o_left);
623 memcpy(o_buf, n_buf, n_len);
624 skb_trim(skb, skb->len + diff);
625 } else if (diff <= skb_tailroom(skb)) {
626 skb_put(skb, diff);
627 memmove(o_buf + n_len, o_buf + o_len, o_left);
628 memcpy(o_buf, n_buf, n_len);
629 } else {
630 if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
631 return -ENOMEM;
632 skb_put(skb, diff);
633 memmove(skb->data + o_offset + n_len,
634 skb->data + o_offset + o_len, o_left);
635 memcpy(skb->data + o_offset, n_buf, n_len);
636 }
637
638 /* must update the iph total length here */
639 iph = skb->nh.iph;
640 iph->tot_len = htons(skb->len);
641
642 LeaveFunction(9);
643 return 0;
644}
645
646
647int ip_vs_app_init(void)
648{
649 /* we will replace it with proc_net_ipvs_create() soon */
650 proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops);
651 return 0;
652}
653
654
655void ip_vs_app_cleanup(void)
656{
657 proc_net_remove("ip_vs_app");
658}
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
new file mode 100644
index 000000000000..fd6feb5499fe
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -0,0 +1,920 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
21 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
22 *
23 * Changes:
24 *
25 */
26
27#include <linux/kernel.h>
28#include <linux/vmalloc.h>
29#include <linux/proc_fs.h> /* for proc_net_* */
30#include <linux/seq_file.h>
31#include <linux/jhash.h>
32#include <linux/random.h>
33
34#include <net/ip_vs.h>
35
36
37/*
38 * Connection hash table: for input and output packets lookups of IPVS
39 */
40static struct list_head *ip_vs_conn_tab;
41
42/* SLAB cache for IPVS connections */
43static kmem_cache_t *ip_vs_conn_cachep;
44
45/* counter for current IPVS connections */
46static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
47
48/* counter for no client port connections */
49static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
50
51/* random value for IPVS connection hash */
52static unsigned int ip_vs_conn_rnd;
53
54/*
55 * Fine locking granularity for big connection hash table
56 */
57#define CT_LOCKARRAY_BITS 4
58#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
59#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
60
61struct ip_vs_aligned_lock
62{
63 rwlock_t l;
64} __attribute__((__aligned__(SMP_CACHE_BYTES)));
65
66/* lock array for conn table */
67static struct ip_vs_aligned_lock
68__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
69
70static inline void ct_read_lock(unsigned key)
71{
72 read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
73}
74
75static inline void ct_read_unlock(unsigned key)
76{
77 read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
78}
79
80static inline void ct_write_lock(unsigned key)
81{
82 write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
83}
84
85static inline void ct_write_unlock(unsigned key)
86{
87 write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
88}
89
90static inline void ct_read_lock_bh(unsigned key)
91{
92 read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
93}
94
95static inline void ct_read_unlock_bh(unsigned key)
96{
97 read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
98}
99
100static inline void ct_write_lock_bh(unsigned key)
101{
102 write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
103}
104
105static inline void ct_write_unlock_bh(unsigned key)
106{
107 write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
108}
109
110
111/*
112 * Returns hash value for IPVS connection entry
113 */
114static unsigned int ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
115{
116 return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
117 & IP_VS_CONN_TAB_MASK;
118}
119
120
121/*
122 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
123 * returns bool success.
124 */
125static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
126{
127 unsigned hash;
128 int ret;
129
130 /* Hash by protocol, client address and port */
131 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
132
133 ct_write_lock(hash);
134
135 if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
136 list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
137 cp->flags |= IP_VS_CONN_F_HASHED;
138 atomic_inc(&cp->refcnt);
139 ret = 1;
140 } else {
141 IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
142 "called from %p\n", __builtin_return_address(0));
143 ret = 0;
144 }
145
146 ct_write_unlock(hash);
147
148 return ret;
149}
150
151
152/*
153 * UNhashes ip_vs_conn from ip_vs_conn_tab.
154 * returns bool success.
155 */
156static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
157{
158 unsigned hash;
159 int ret;
160
161 /* unhash it and decrease its reference counter */
162 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
163
164 ct_write_lock(hash);
165
166 if (cp->flags & IP_VS_CONN_F_HASHED) {
167 list_del(&cp->c_list);
168 cp->flags &= ~IP_VS_CONN_F_HASHED;
169 atomic_dec(&cp->refcnt);
170 ret = 1;
171 } else
172 ret = 0;
173
174 ct_write_unlock(hash);
175
176 return ret;
177}
178
179
180/*
181 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
182 * Called for pkts coming from OUTside-to-INside.
183 * s_addr, s_port: pkt source address (foreign host)
184 * d_addr, d_port: pkt dest address (load balancer)
185 */
186static inline struct ip_vs_conn *__ip_vs_conn_in_get
187(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
188{
189 unsigned hash;
190 struct ip_vs_conn *cp;
191
192 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
193
194 ct_read_lock(hash);
195
196 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
197 if (s_addr==cp->caddr && s_port==cp->cport &&
198 d_port==cp->vport && d_addr==cp->vaddr &&
199 protocol==cp->protocol) {
200 /* HIT */
201 atomic_inc(&cp->refcnt);
202 ct_read_unlock(hash);
203 return cp;
204 }
205 }
206
207 ct_read_unlock(hash);
208
209 return NULL;
210}
211
212struct ip_vs_conn *ip_vs_conn_in_get
213(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
214{
215 struct ip_vs_conn *cp;
216
217 cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
218 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
219 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
220
221 IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
222 ip_vs_proto_name(protocol),
223 NIPQUAD(s_addr), ntohs(s_port),
224 NIPQUAD(d_addr), ntohs(d_port),
225 cp?"hit":"not hit");
226
227 return cp;
228}
229
230
231/*
232 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
233 * Called for pkts coming from inside-to-OUTside.
234 * s_addr, s_port: pkt source address (inside host)
235 * d_addr, d_port: pkt dest address (foreign host)
236 */
237struct ip_vs_conn *ip_vs_conn_out_get
238(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
239{
240 unsigned hash;
241 struct ip_vs_conn *cp, *ret=NULL;
242
243 /*
244 * Check for "full" addressed entries
245 */
246 hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
247
248 ct_read_lock(hash);
249
250 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
251 if (d_addr == cp->caddr && d_port == cp->cport &&
252 s_port == cp->dport && s_addr == cp->daddr &&
253 protocol == cp->protocol) {
254 /* HIT */
255 atomic_inc(&cp->refcnt);
256 ret = cp;
257 break;
258 }
259 }
260
261 ct_read_unlock(hash);
262
263 IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
264 ip_vs_proto_name(protocol),
265 NIPQUAD(s_addr), ntohs(s_port),
266 NIPQUAD(d_addr), ntohs(d_port),
267 ret?"hit":"not hit");
268
269 return ret;
270}
271
272
273/*
274 * Put back the conn and restart its timer with its timeout
275 */
276void ip_vs_conn_put(struct ip_vs_conn *cp)
277{
278 /* re-arm the timer so it expires after the connection's timeout */
279 mod_timer(&cp->timer, jiffies+cp->timeout);
280
281 __ip_vs_conn_put(cp);
282}
283
284
285/*
286 * Fill a no_client_port connection with a client port number
287 */
288void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport)
289{
290 if (ip_vs_conn_unhash(cp)) {
291 spin_lock(&cp->lock);
292 if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
293 atomic_dec(&ip_vs_conn_no_cport_cnt);
294 cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
295 cp->cport = cport;
296 }
297 spin_unlock(&cp->lock);
298
299 /* hash it back on the new cport */
300 ip_vs_conn_hash(cp);
301 }
302}
303
304
305/*
306 * Bind a connection entry with the corresponding packet_xmit.
307 * Called by ip_vs_conn_new.
308 */
309static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
310{
311 switch (IP_VS_FWD_METHOD(cp)) {
312 case IP_VS_CONN_F_MASQ:
313 cp->packet_xmit = ip_vs_nat_xmit;
314 break;
315
316 case IP_VS_CONN_F_TUNNEL:
317 cp->packet_xmit = ip_vs_tunnel_xmit;
318 break;
319
320 case IP_VS_CONN_F_DROUTE:
321 cp->packet_xmit = ip_vs_dr_xmit;
322 break;
323
324 case IP_VS_CONN_F_LOCALNODE:
325 cp->packet_xmit = ip_vs_null_xmit;
326 break;
327
328 case IP_VS_CONN_F_BYPASS:
329 cp->packet_xmit = ip_vs_bypass_xmit;
330 break;
331 }
332}
333
334
335static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
336{
337 return atomic_read(&dest->activeconns)
338 + atomic_read(&dest->inactconns);
339}
340
341/*
342 * Bind a connection entry with a virtual service destination
343 * Called just after a new connection entry is created.
344 */
345static inline void
346ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
347{
348 /* if dest is NULL, then return directly */
349 if (!dest)
350 return;
351
352 /* Increase the refcnt counter of the dest */
353 atomic_inc(&dest->refcnt);
354
355 /* Bind with the destination and its corresponding transmitter */
356 cp->flags |= atomic_read(&dest->conn_flags);
357 cp->dest = dest;
358
359 IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
360 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
361 ip_vs_proto_name(cp->protocol),
362 NIPQUAD(cp->caddr), ntohs(cp->cport),
363 NIPQUAD(cp->vaddr), ntohs(cp->vport),
364 NIPQUAD(cp->daddr), ntohs(cp->dport),
365 ip_vs_fwd_tag(cp), cp->state,
366 cp->flags, atomic_read(&cp->refcnt),
367 atomic_read(&dest->refcnt));
368
369 /* Update the connection counters */
370 if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
371 /* It is a normal connection, so increase the inactive
372 connection counter because it is in TCP SYNRECV
373 state (inactive) or another protocol's inactive state */
374 atomic_inc(&dest->inactconns);
375 } else {
376 /* It is a persistent connection/template, so increase
377 the persistent connection counter */
378 atomic_inc(&dest->persistconns);
379 }
380
381 if (dest->u_threshold != 0 &&
382 ip_vs_dest_totalconns(dest) >= dest->u_threshold)
383 dest->flags |= IP_VS_DEST_F_OVERLOAD;
384}
385
386
387/*
388 * Unbind a connection entry with its VS destination
389 * Called by the ip_vs_conn_expire function.
390 */
391static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
392{
393 struct ip_vs_dest *dest = cp->dest;
394
395 if (!dest)
396 return;
397
398 IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
399 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
400 ip_vs_proto_name(cp->protocol),
401 NIPQUAD(cp->caddr), ntohs(cp->cport),
402 NIPQUAD(cp->vaddr), ntohs(cp->vport),
403 NIPQUAD(cp->daddr), ntohs(cp->dport),
404 ip_vs_fwd_tag(cp), cp->state,
405 cp->flags, atomic_read(&cp->refcnt),
406 atomic_read(&dest->refcnt));
407
408 /* Update the connection counters */
409 if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
410 /* It is a normal connection, so decrease the inactconns
411 or activeconns counter */
412 if (cp->flags & IP_VS_CONN_F_INACTIVE) {
413 atomic_dec(&dest->inactconns);
414 } else {
415 atomic_dec(&dest->activeconns);
416 }
417 } else {
418 /* It is a persistent connection/template, so decrease
419 the persistent connection counter */
420 atomic_dec(&dest->persistconns);
421 }
422
423 if (dest->l_threshold != 0) {
424 if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
425 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
426 } else if (dest->u_threshold != 0) {
427 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
428 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
429 } else {
430 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
431 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
432 }
433
434 /*
435 * Simply decrease the refcnt of the dest, because the
436 * dest will be either in service's destination list
437 * or in the trash.
438 */
439 atomic_dec(&dest->refcnt);
440}
441
442
443/*
444 * Checking if the destination of a connection template is available.
445 * If available, return 1, otherwise invalidate this connection
446 * template and return 0.
447 */
448int ip_vs_check_template(struct ip_vs_conn *ct)
449{
450 struct ip_vs_dest *dest = ct->dest;
451
452 /*
453 * Checking the dest server status.
454 */
455 if ((dest == NULL) ||
456 !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
457 (sysctl_ip_vs_expire_quiescent_template &&
458 (atomic_read(&dest->weight) == 0))) {
459 IP_VS_DBG(9, "check_template: dest not available for "
460 "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
461 "-> d:%u.%u.%u.%u:%d\n",
462 ip_vs_proto_name(ct->protocol),
463 NIPQUAD(ct->caddr), ntohs(ct->cport),
464 NIPQUAD(ct->vaddr), ntohs(ct->vport),
465 NIPQUAD(ct->daddr), ntohs(ct->dport));
466
467 /*
468 * Invalidate the connection template
469 */
470 if (ct->cport) {
471 if (ip_vs_conn_unhash(ct)) {
472 ct->dport = 65535;
473 ct->vport = 65535;
474 ct->cport = 0;
475 ip_vs_conn_hash(ct);
476 }
477 }
478
479 /*
480 * Simply decrease the refcnt of the template,
481 * don't restart its timer.
482 */
483 atomic_dec(&ct->refcnt);
484 return 0;
485 }
486 return 1;
487}
488
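/*
 * Timer callback for connection expiry: the entry is freed only when it
 * controls no other connections, can be unhashed, and its reference
 * count has dropped to one; otherwise it is kept (re-hashed if it was
 * unhashed) and ip_vs_conn_put() re-arms the timer for another timeout.
 */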
489static void ip_vs_conn_expire(unsigned long data)
490{
491 struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
492
493 cp->timeout = 60*HZ;
494
495 /*
496 * hey, I'm using it
497 */
498 atomic_inc(&cp->refcnt);
499
500 /*
501 * do I control anybody?
502 */
503 if (atomic_read(&cp->n_control))
504 goto expire_later;
505
506 /*
507 * unhash it if it is hashed in the conn table
508 */
509 if (!ip_vs_conn_unhash(cp))
510 goto expire_later;
511
512 /*
513 * refcnt==1 implies I'm the only referrer
514 */
515 if (likely(atomic_read(&cp->refcnt) == 1)) {
516 /* delete the timer if it is activated by other users */
517 if (timer_pending(&cp->timer))
518 del_timer(&cp->timer);
519
520 /* does anybody control me? */
521 if (cp->control)
522 ip_vs_control_del(cp);
523
524 if (unlikely(cp->app != NULL))
525 ip_vs_unbind_app(cp);
526 ip_vs_unbind_dest(cp);
527 if (cp->flags & IP_VS_CONN_F_NO_CPORT)
528 atomic_dec(&ip_vs_conn_no_cport_cnt);
529 atomic_dec(&ip_vs_conn_count);
530
531 kmem_cache_free(ip_vs_conn_cachep, cp);
532 return;
533 }
534
535 /* hash it back to the table */
536 ip_vs_conn_hash(cp);
537
538 expire_later:
539 IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
540 atomic_read(&cp->refcnt)-1,
541 atomic_read(&cp->n_control));
542
543 ip_vs_conn_put(cp);
544}
545
546
547void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
548{
549 if (del_timer(&cp->timer))
550 mod_timer(&cp->timer, jiffies);
551 __ip_vs_conn_put(cp);
552}
553
554
555/*
556 * Create a new connection entry and hash it into the ip_vs_conn_tab
557 */
558struct ip_vs_conn *
559ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
560 __u32 daddr, __u16 dport, unsigned flags,
561 struct ip_vs_dest *dest)
562{
563 struct ip_vs_conn *cp;
564 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
565
566 cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
567 if (cp == NULL) {
568 IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
569 return NULL;
570 }
571
572 memset(cp, 0, sizeof(*cp));
573 INIT_LIST_HEAD(&cp->c_list);
574 init_timer(&cp->timer);
575 cp->timer.data = (unsigned long)cp;
576 cp->timer.function = ip_vs_conn_expire;
577 cp->protocol = proto;
578 cp->caddr = caddr;
579 cp->cport = cport;
580 cp->vaddr = vaddr;
581 cp->vport = vport;
582 cp->daddr = daddr;
583 cp->dport = dport;
584 cp->flags = flags;
585 spin_lock_init(&cp->lock);
586
587 /*
588 * Mark the entry as referenced by the current thread before hashing
589 * it into the table, so that another thread running
590 * ip_vs_random_dropentry cannot drop this entry.
591 */
592 atomic_set(&cp->refcnt, 1);
593
594 atomic_set(&cp->n_control, 0);
595 atomic_set(&cp->in_pkts, 0);
596
597 atomic_inc(&ip_vs_conn_count);
598 if (flags & IP_VS_CONN_F_NO_CPORT)
599 atomic_inc(&ip_vs_conn_no_cport_cnt);
600
601 /* Bind the connection with a destination server */
602 ip_vs_bind_dest(cp, dest);
603
604 /* Set its state and timeout */
605 cp->state = 0;
606 cp->timeout = 3*HZ;
607
608 /* Bind its packet transmitter */
609 ip_vs_bind_xmit(cp);
610
611 if (unlikely(pp && atomic_read(&pp->appcnt)))
612 ip_vs_bind_app(cp, pp);
613
614 /* Hash it in the ip_vs_conn_tab finally */
615 ip_vs_conn_hash(cp);
616
617 return cp;
618}
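/*
 * Usage sketch for ip_vs_conn_new() (all addresses and ports below are
 * purely illustrative): a NAT forwarding entry for client
 * 192.168.1.7:34567 hitting VIP 10.0.0.1:80 and scheduled to real server
 * 192.168.10.2:8080 would be created roughly as
 *
 *	cp = ip_vs_conn_new(IPPROTO_TCP,
 *			    htonl(0xC0A80107), htons(34567),
 *			    htonl(0x0A000001), htons(80),
 *			    htonl(0xC0A80A02), htons(8080),
 *			    0, dest);
 *
 * The entry comes back with refcnt set to 1 and is already hashed into
 * ip_vs_conn_tab; the caller is expected to drop its reference with
 * ip_vs_conn_put() when it is done with the entry.
 */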
619
620
621/*
622 * /proc/net/ip_vs_conn entries
623 */
624#ifdef CONFIG_PROC_FS
625
626static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
627{
628 int idx;
629 struct ip_vs_conn *cp;
630
631 for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
632 ct_read_lock_bh(idx);
633 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
634 if (pos-- == 0) {
635 seq->private = &ip_vs_conn_tab[idx];
636 return cp;
637 }
638 }
639 ct_read_unlock_bh(idx);
640 }
641
642 return NULL;
643}
644
645static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
646{
647 seq->private = NULL;
648	return *pos ? ip_vs_conn_array(seq, *pos - 1) : SEQ_START_TOKEN;
649}
650
651static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
652{
653 struct ip_vs_conn *cp = v;
654 struct list_head *e, *l = seq->private;
655 int idx;
656
657 ++*pos;
658 if (v == SEQ_START_TOKEN)
659 return ip_vs_conn_array(seq, 0);
660
661 /* more on same hash chain? */
662 if ((e = cp->c_list.next) != l)
663 return list_entry(e, struct ip_vs_conn, c_list);
664
665 idx = l - ip_vs_conn_tab;
666 ct_read_unlock_bh(idx);
667
668 while (++idx < IP_VS_CONN_TAB_SIZE) {
669 ct_read_lock_bh(idx);
670 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
671 seq->private = &ip_vs_conn_tab[idx];
672 return cp;
673 }
674 ct_read_unlock_bh(idx);
675 }
676 seq->private = NULL;
677 return NULL;
678}
679
680static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
681{
682 struct list_head *l = seq->private;
683
684 if (l)
685 ct_read_unlock_bh(l - ip_vs_conn_tab);
686}
687
688static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
689{
690
691 if (v == SEQ_START_TOKEN)
692 seq_puts(seq,
693 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n");
694 else {
695 const struct ip_vs_conn *cp = v;
696
697 seq_printf(seq,
698 "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
699 ip_vs_proto_name(cp->protocol),
700 ntohl(cp->caddr), ntohs(cp->cport),
701 ntohl(cp->vaddr), ntohs(cp->vport),
702 ntohl(cp->daddr), ntohs(cp->dport),
703 ip_vs_state_name(cp->protocol, cp->state),
704 (cp->timer.expires-jiffies)/HZ);
705 }
706 return 0;
707}
708
709static struct seq_operations ip_vs_conn_seq_ops = {
710 .start = ip_vs_conn_seq_start,
711 .next = ip_vs_conn_seq_next,
712 .stop = ip_vs_conn_seq_stop,
713 .show = ip_vs_conn_seq_show,
714};
715
716static int ip_vs_conn_open(struct inode *inode, struct file *file)
717{
718 return seq_open(file, &ip_vs_conn_seq_ops);
719}
720
721static struct file_operations ip_vs_conn_fops = {
722 .owner = THIS_MODULE,
723 .open = ip_vs_conn_open,
724 .read = seq_read,
725 .llseek = seq_lseek,
726 .release = seq_release,
727};
728#endif
729
730
731/*
732 * Randomly drop connection entries before running out of memory
733 */
734static inline int todrop_entry(struct ip_vs_conn *cp)
735{
736 /*
737 * The drop rate array needs tuning for real environments.
738 * Called from timer bh only => no locking
739 */
740 static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
741 static char todrop_counter[9] = {0};
742 int i;
743
744 /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
745	   This leaves enough time for normal connections to get
746 through. */
747 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
748 return 0;
749
750	/* Don't drop the entry if its number of incoming packets is
751	   outside the range [0, 8] */
752 i = atomic_read(&cp->in_pkts);
753 if (i > 8 || i < 0) return 0;
754
755 if (!todrop_rate[i]) return 0;
756 if (--todrop_counter[i] > 0) return 0;
757
758 todrop_counter[i] = todrop_rate[i];
759 return 1;
760}
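/*
 * Worked example of the drop-rate table above: an entry whose in_pkts
 * counter reads 3 uses todrop_rate[3] = 3, so roughly one of every three
 * such entries examined is dropped (todrop_counter[3] cycles 3, 2, 1 and the
 * drop fires when it reaches zero).  Entries with in_pkts == 0 are never
 * dropped here, and the busier an entry is (up to 8 packets), the smaller
 * its individual chance of being picked.
 */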
761
762
763void ip_vs_random_dropentry(void)
764{
765 int idx;
766 struct ip_vs_conn *cp;
767 struct ip_vs_conn *ct;
768
769 /*
770 * Randomly scan 1/32 of the whole table every second
771 */
772 for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
773 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
774
775 /*
776 * Lock is actually needed in this loop.
777 */
778 ct_write_lock(hash);
779
780 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
781 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
782 /* connection template */
783 continue;
784
785 if (cp->protocol == IPPROTO_TCP) {
786 switch(cp->state) {
787 case IP_VS_TCP_S_SYN_RECV:
788 case IP_VS_TCP_S_SYNACK:
789 break;
790
791 case IP_VS_TCP_S_ESTABLISHED:
792 if (todrop_entry(cp))
793 break;
794 continue;
795
796 default:
797 continue;
798 }
799 } else {
800 if (!todrop_entry(cp))
801 continue;
802 }
803
804 /*
805 * Drop the entry, and drop its ct if not referenced
806 */
807 atomic_inc(&cp->refcnt);
808 ct_write_unlock(hash);
809
810 if ((ct = cp->control))
811 atomic_inc(&ct->refcnt);
812 IP_VS_DBG(4, "del connection\n");
813 ip_vs_conn_expire_now(cp);
814 if (ct) {
815 IP_VS_DBG(4, "del conn template\n");
816 ip_vs_conn_expire_now(ct);
817 }
818 ct_write_lock(hash);
819 }
820 ct_write_unlock(hash);
821 }
822}
823
824
825/*
826 * Flush all the connection entries in the ip_vs_conn_tab
827 */
828static void ip_vs_conn_flush(void)
829{
830 int idx;
831 struct ip_vs_conn *cp;
832 struct ip_vs_conn *ct;
833
834 flush_again:
835 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
836 /*
837 * Lock is actually needed in this loop.
838 */
839 ct_write_lock_bh(idx);
840
841 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
842 atomic_inc(&cp->refcnt);
843 ct_write_unlock(idx);
844
845 if ((ct = cp->control))
846 atomic_inc(&ct->refcnt);
847 IP_VS_DBG(4, "del connection\n");
848 ip_vs_conn_expire_now(cp);
849 if (ct) {
850 IP_VS_DBG(4, "del conn template\n");
851 ip_vs_conn_expire_now(ct);
852 }
853 ct_write_lock(idx);
854 }
855 ct_write_unlock_bh(idx);
856 }
857
858	/* the counter may not be zero, because some conn entries may still
859	   be held by a slow timer handler or be unhashed but still referenced */
860 if (atomic_read(&ip_vs_conn_count) != 0) {
861 schedule();
862 goto flush_again;
863 }
864}
865
866
867int ip_vs_conn_init(void)
868{
869 int idx;
870
871 /*
872 * Allocate the connection hash table and initialize its list heads
873 */
874 ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
875 if (!ip_vs_conn_tab)
876 return -ENOMEM;
877
878 /* Allocate ip_vs_conn slab cache */
879 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
880 sizeof(struct ip_vs_conn), 0,
881 SLAB_HWCACHE_ALIGN, NULL, NULL);
882 if (!ip_vs_conn_cachep) {
883 vfree(ip_vs_conn_tab);
884 return -ENOMEM;
885 }
886
887 IP_VS_INFO("Connection hash table configured "
888 "(size=%d, memory=%ldKbytes)\n",
889 IP_VS_CONN_TAB_SIZE,
890 (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
891 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
892 sizeof(struct ip_vs_conn));
893
894 for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
895 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
896 }
897
898 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
899 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
900 }
901
902 proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops);
903
904 /* calculate the random value for connection hash */
905 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
906
907 return 0;
908}
909
910
911void ip_vs_conn_cleanup(void)
912{
913 /* flush all the connection entries first */
914 ip_vs_conn_flush();
915
916 /* Release the empty cache */
917 kmem_cache_destroy(ip_vs_conn_cachep);
918 proc_net_remove("ip_vs_conn");
919 vfree(ip_vs_conn_tab);
920}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
new file mode 100644
index 000000000000..5fb257dd07cb
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -0,0 +1,1191 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
21 * and others.
22 *
23 * Changes:
24 * Paul `Rusty' Russell properly handle non-linear skbs
25 *
26 */
27
28#include <linux/module.h>
29#include <linux/kernel.h>
30#include <linux/ip.h>
31#include <linux/tcp.h>
32#include <linux/icmp.h>
33
34#include <net/ip.h>
35#include <net/tcp.h>
36#include <net/udp.h>
37#include <net/icmp.h> /* for icmp_send */
38#include <net/route.h>
39
40#include <linux/netfilter.h>
41#include <linux/netfilter_ipv4.h>
42
43#include <net/ip_vs.h>
44
45
46EXPORT_SYMBOL(register_ip_vs_scheduler);
47EXPORT_SYMBOL(unregister_ip_vs_scheduler);
48EXPORT_SYMBOL(ip_vs_skb_replace);
49EXPORT_SYMBOL(ip_vs_proto_name);
50EXPORT_SYMBOL(ip_vs_conn_new);
51EXPORT_SYMBOL(ip_vs_conn_in_get);
52EXPORT_SYMBOL(ip_vs_conn_out_get);
53#ifdef CONFIG_IP_VS_PROTO_TCP
54EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
55#endif
56EXPORT_SYMBOL(ip_vs_conn_put);
57#ifdef CONFIG_IP_VS_DEBUG
58EXPORT_SYMBOL(ip_vs_get_debug_level);
59#endif
60EXPORT_SYMBOL(ip_vs_make_skb_writable);
61
62
63/* ID used in ICMP lookups */
64#define icmp_id(icmph) (((icmph)->un).echo.id)
65
66const char *ip_vs_proto_name(unsigned proto)
67{
68 static char buf[20];
69
70 switch (proto) {
71 case IPPROTO_IP:
72 return "IP";
73 case IPPROTO_UDP:
74 return "UDP";
75 case IPPROTO_TCP:
76 return "TCP";
77 case IPPROTO_ICMP:
78 return "ICMP";
79 default:
80 sprintf(buf, "IP_%d", proto);
81 return buf;
82 }
83}
84
85void ip_vs_init_hash_table(struct list_head *table, int rows)
86{
87 while (--rows >= 0)
88 INIT_LIST_HEAD(&table[rows]);
89}
90
91static inline void
92ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
93{
94 struct ip_vs_dest *dest = cp->dest;
95 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
96 spin_lock(&dest->stats.lock);
97 dest->stats.inpkts++;
98 dest->stats.inbytes += skb->len;
99 spin_unlock(&dest->stats.lock);
100
101 spin_lock(&dest->svc->stats.lock);
102 dest->svc->stats.inpkts++;
103 dest->svc->stats.inbytes += skb->len;
104 spin_unlock(&dest->svc->stats.lock);
105
106 spin_lock(&ip_vs_stats.lock);
107 ip_vs_stats.inpkts++;
108 ip_vs_stats.inbytes += skb->len;
109 spin_unlock(&ip_vs_stats.lock);
110 }
111}
112
113
114static inline void
115ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
116{
117 struct ip_vs_dest *dest = cp->dest;
118 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
119 spin_lock(&dest->stats.lock);
120 dest->stats.outpkts++;
121 dest->stats.outbytes += skb->len;
122 spin_unlock(&dest->stats.lock);
123
124 spin_lock(&dest->svc->stats.lock);
125 dest->svc->stats.outpkts++;
126 dest->svc->stats.outbytes += skb->len;
127 spin_unlock(&dest->svc->stats.lock);
128
129 spin_lock(&ip_vs_stats.lock);
130 ip_vs_stats.outpkts++;
131 ip_vs_stats.outbytes += skb->len;
132 spin_unlock(&ip_vs_stats.lock);
133 }
134}
135
136
137static inline void
138ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
139{
140 spin_lock(&cp->dest->stats.lock);
141 cp->dest->stats.conns++;
142 spin_unlock(&cp->dest->stats.lock);
143
144 spin_lock(&svc->stats.lock);
145 svc->stats.conns++;
146 spin_unlock(&svc->stats.lock);
147
148 spin_lock(&ip_vs_stats.lock);
149 ip_vs_stats.conns++;
150 spin_unlock(&ip_vs_stats.lock);
151}
152
153
154static inline int
155ip_vs_set_state(struct ip_vs_conn *cp, int direction,
156 const struct sk_buff *skb,
157 struct ip_vs_protocol *pp)
158{
159 if (unlikely(!pp->state_transition))
160 return 0;
161 return pp->state_transition(cp, direction, skb, pp);
162}
163
164
165int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len)
166{
167 struct sk_buff *skb = *pskb;
168
169 /* skb is already used, better copy skb and its payload */
170 if (unlikely(skb_shared(skb) || skb->sk))
171 goto copy_skb;
172
173 /* skb data is already used, copy it */
174 if (unlikely(skb_cloned(skb)))
175 goto copy_data;
176
177 return pskb_may_pull(skb, writable_len);
178
179 copy_data:
180 if (unlikely(writable_len > skb->len))
181 return 0;
182 return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
183
184 copy_skb:
185 if (unlikely(writable_len > skb->len))
186 return 0;
187 skb = skb_copy(skb, GFP_ATOMIC);
188 if (!skb)
189 return 0;
190 BUG_ON(skb_is_nonlinear(skb));
191
192 /* Rest of kernel will get very unhappy if we pass it a
193 suddenly-orphaned skbuff */
194 if ((*pskb)->sk)
195 skb_set_owner_w(skb, (*pskb)->sk);
196 kfree_skb(*pskb);
197 *pskb = skb;
198 return 1;
199}
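/*
 * Typical calling pattern (as used by ip_vs_out() below): make the headers
 * writable before mangling them, and reload the skb pointer afterwards,
 * because the helper may have replaced the buffer:
 *
 *	if (!ip_vs_make_skb_writable(pskb, ihl))
 *		goto drop;
 *	skb = *pskb;
 *
 * A return value of 0 means the packet could not be made writable (e.g. the
 * requested length exceeds the packet, or allocation failed) and should be
 * dropped.
 */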
200
201/*
202 * IPVS persistent scheduling function
203 * It creates a connection entry according to its template if exists,
204 * or selects a server and creates a connection entry plus a template.
205 * Locking: we are svc user (svc->refcnt), so we hold all dests too
206 * Protocols supported: TCP, UDP
207 */
208static struct ip_vs_conn *
209ip_vs_sched_persist(struct ip_vs_service *svc,
210 const struct sk_buff *skb,
211 __u16 ports[2])
212{
213 struct ip_vs_conn *cp = NULL;
214 struct iphdr *iph = skb->nh.iph;
215 struct ip_vs_dest *dest;
216 struct ip_vs_conn *ct;
217 __u16 dport; /* destination port to forward */
218 __u32 snet; /* source network of the client, after masking */
219
220 /* Mask saddr with the netmask to adjust template granularity */
221 snet = iph->saddr & svc->netmask;
222
223 IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
224 "mnet %u.%u.%u.%u\n",
225 NIPQUAD(iph->saddr), ntohs(ports[0]),
226 NIPQUAD(iph->daddr), ntohs(ports[1]),
227 NIPQUAD(snet));
228
229 /*
230	 * FTP is a complicated protocol: it uses one control connection and
231	 * separate data connections. For active FTP, the FTP server initiates
232	 * the data connection to the client, usually from source port 20. For
233	 * passive FTP, the FTP server tells the client which port it passively
234	 * listens on, and the client opens the data connection. In tunneling
235	 * or direct routing mode, the load balancer only sees the
236	 * client-to-server half of the connection, so the data port is unknown
237	 * to it. Therefore a conn template like
238	 * <caddr, 0, vaddr, 0, daddr, 0> is created for a persistent FTP
239	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
240	 * is created for other persistent services.
241 */
242 if (ports[1] == svc->port) {
243 /* Check if a template already exists */
244 if (svc->port != FTPPORT)
245 ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
246 iph->daddr, ports[1]);
247 else
248 ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
249 iph->daddr, 0);
250
251 if (!ct || !ip_vs_check_template(ct)) {
252 /*
253 * No template found or the dest of the connection
254 * template is not available.
255 */
256 dest = svc->scheduler->schedule(svc, skb);
257 if (dest == NULL) {
258 IP_VS_DBG(1, "p-schedule: no dest found.\n");
259 return NULL;
260 }
261
262 /*
263 * Create a template like <protocol,caddr,0,
264 * vaddr,vport,daddr,dport> for non-ftp service,
265 * and <protocol,caddr,0,vaddr,0,daddr,0>
266 * for ftp service.
267 */
268 if (svc->port != FTPPORT)
269 ct = ip_vs_conn_new(iph->protocol,
270 snet, 0,
271 iph->daddr,
272 ports[1],
273 dest->addr, dest->port,
274 0,
275 dest);
276 else
277 ct = ip_vs_conn_new(iph->protocol,
278 snet, 0,
279 iph->daddr, 0,
280 dest->addr, 0,
281 0,
282 dest);
283 if (ct == NULL)
284 return NULL;
285
286 ct->timeout = svc->timeout;
287 } else {
288 /* set destination with the found template */
289 dest = ct->dest;
290 }
291 dport = dest->port;
292 } else {
293 /*
294 * Note: persistent fwmark-based services and persistent
295 * port zero service are handled here.
296 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
297 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
298 */
299 if (svc->fwmark)
300 ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0,
301 htonl(svc->fwmark), 0);
302 else
303 ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
304 iph->daddr, 0);
305
306 if (!ct || !ip_vs_check_template(ct)) {
307 /*
308 * If it is not persistent port zero, return NULL,
309 * otherwise create a connection template.
310 */
311 if (svc->port)
312 return NULL;
313
314 dest = svc->scheduler->schedule(svc, skb);
315 if (dest == NULL) {
316 IP_VS_DBG(1, "p-schedule: no dest found.\n");
317 return NULL;
318 }
319
320 /*
321 * Create a template according to the service
322 */
323 if (svc->fwmark)
324 ct = ip_vs_conn_new(IPPROTO_IP,
325 snet, 0,
326 htonl(svc->fwmark), 0,
327 dest->addr, 0,
328 0,
329 dest);
330 else
331 ct = ip_vs_conn_new(iph->protocol,
332 snet, 0,
333 iph->daddr, 0,
334 dest->addr, 0,
335 0,
336 dest);
337 if (ct == NULL)
338 return NULL;
339
340 ct->timeout = svc->timeout;
341 } else {
342 /* set destination with the found template */
343 dest = ct->dest;
344 }
345 dport = ports[1];
346 }
347
348 /*
349 * Create a new connection according to the template
350 */
351 cp = ip_vs_conn_new(iph->protocol,
352 iph->saddr, ports[0],
353 iph->daddr, ports[1],
354 dest->addr, dport,
355 0,
356 dest);
357 if (cp == NULL) {
358 ip_vs_conn_put(ct);
359 return NULL;
360 }
361
362 /*
363 * Add its control
364 */
365 ip_vs_control_add(cp, ct);
366 ip_vs_conn_put(ct);
367
368 ip_vs_conn_stats(cp, svc);
369 return cp;
370}
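/*
 * Example of the template/connection pair built above (addresses are
 * illustrative): for a persistent TCP service 10.0.0.1:80 with netmask
 * 255.255.255.0, a first packet from client 192.168.1.7:34567 that is
 * scheduled to real server 192.168.10.2:80 creates
 *
 *	template:   <TCP, 192.168.1.0, 0,     10.0.0.1, 80, 192.168.10.2, 80>
 *	connection: <TCP, 192.168.1.7, 34567, 10.0.0.1, 80, 192.168.10.2, 80>
 *
 * with the connection controlled by the template.  Any later connection
 * from the same /24 to that virtual service finds the template and is
 * therefore sent to the same real server for as long as the template
 * (svc->timeout) lives.
 */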
371
372
373/*
374 * IPVS main scheduling function
375 * It selects a server according to the virtual service, and
376 * creates a connection entry.
377 * Protocols supported: TCP, UDP
378 */
379struct ip_vs_conn *
380ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
381{
382 struct ip_vs_conn *cp = NULL;
383 struct iphdr *iph = skb->nh.iph;
384 struct ip_vs_dest *dest;
385 __u16 _ports[2], *pptr;
386
387 pptr = skb_header_pointer(skb, iph->ihl*4,
388 sizeof(_ports), _ports);
389 if (pptr == NULL)
390 return NULL;
391
392 /*
393 * Persistent service
394 */
395 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
396 return ip_vs_sched_persist(svc, skb, pptr);
397
398 /*
399 * Non-persistent service
400 */
401 if (!svc->fwmark && pptr[1] != svc->port) {
402 if (!svc->port)
403 IP_VS_ERR("Schedule: port zero only supported "
404 "in persistent services, "
405 "check your ipvs configuration\n");
406 return NULL;
407 }
408
409 dest = svc->scheduler->schedule(svc, skb);
410 if (dest == NULL) {
411 IP_VS_DBG(1, "Schedule: no dest found.\n");
412 return NULL;
413 }
414
415 /*
416 * Create a connection entry.
417 */
418 cp = ip_vs_conn_new(iph->protocol,
419 iph->saddr, pptr[0],
420 iph->daddr, pptr[1],
421 dest->addr, dest->port?dest->port:pptr[1],
422 0,
423 dest);
424 if (cp == NULL)
425 return NULL;
426
427 IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
428 "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
429 ip_vs_fwd_tag(cp),
430 NIPQUAD(cp->caddr), ntohs(cp->cport),
431 NIPQUAD(cp->vaddr), ntohs(cp->vport),
432 NIPQUAD(cp->daddr), ntohs(cp->dport),
433 cp->flags, atomic_read(&cp->refcnt));
434
435 ip_vs_conn_stats(cp, svc);
436 return cp;
437}
438
439
440/*
441 * Pass or drop the packet.
442 * Called by ip_vs_in, when the virtual service is available but
443 * no destination is available for a new connection.
444 */
445int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
446 struct ip_vs_protocol *pp)
447{
448 __u16 _ports[2], *pptr;
449 struct iphdr *iph = skb->nh.iph;
450
451 pptr = skb_header_pointer(skb, iph->ihl*4,
452 sizeof(_ports), _ports);
453 if (pptr == NULL) {
454 ip_vs_service_put(svc);
455 return NF_DROP;
456 }
457
458	/* if it is a fwmark-based service, the cache_bypass sysctl is on
459 and the destination is RTN_UNICAST (and not local), then create
460 a cache_bypass connection entry */
461 if (sysctl_ip_vs_cache_bypass && svc->fwmark
462 && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
463 int ret, cs;
464 struct ip_vs_conn *cp;
465
466 ip_vs_service_put(svc);
467
468 /* create a new connection entry */
469 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
470 cp = ip_vs_conn_new(iph->protocol,
471 iph->saddr, pptr[0],
472 iph->daddr, pptr[1],
473 0, 0,
474 IP_VS_CONN_F_BYPASS,
475 NULL);
476 if (cp == NULL)
477 return NF_DROP;
478
479 /* statistics */
480 ip_vs_in_stats(cp, skb);
481
482 /* set state */
483 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
484
485 /* transmit the first SYN packet */
486 ret = cp->packet_xmit(skb, cp, pp);
487 /* do not touch skb anymore */
488
489 atomic_inc(&cp->in_pkts);
490 ip_vs_conn_put(cp);
491 return ret;
492 }
493
494 /*
495	 * When a virtual ftp service is present, packets destined
496	 * for other services on the VIP (except services listed in the
497	 * ipvs table) may get here; pass them through, because it is
498	 * not IPVS's job to decide to drop such packets.
499 */
500 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
501 ip_vs_service_put(svc);
502 return NF_ACCEPT;
503 }
504
505 ip_vs_service_put(svc);
506
507 /*
508 * Notify the client that the destination is unreachable, and
509 * release the socket buffer.
510	 * Since we are at the IP layer and no TCP socket actually
511	 * exists, a TCP RST cannot be sent; ICMP_PORT_UNREACH is sent
512	 * here instead, for both TCP and UDP. --WZ
513 */
514 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
515 return NF_DROP;
516}
517
518
519/*
520 * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
521 * chain, and is used for VS/NAT.
522 * It detects packets for VS/NAT connections and sends the packets
523	 * immediately. This avoids iptable_nat mangling packets that
524	 * belong to VS/NAT connections.
525 */
526static unsigned int ip_vs_post_routing(unsigned int hooknum,
527 struct sk_buff **pskb,
528 const struct net_device *in,
529 const struct net_device *out,
530 int (*okfn)(struct sk_buff *))
531{
532 if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
533 return NF_ACCEPT;
534
535 /* The packet was sent from IPVS, exit this chain */
536 (*okfn)(*pskb);
537
538 return NF_STOLEN;
539}
540
541u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
542{
543 return (u16) csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
544}
545
546static inline struct sk_buff *
547ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
548{
549 skb = ip_defrag(skb, user);
550 if (skb)
551 ip_send_check(skb->nh.iph);
552 return skb;
553}
554
555/*
556 * Packet has been made sufficiently writable in caller
557 * - inout: 1=in->out, 0=out->in
558 */
559void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
560 struct ip_vs_conn *cp, int inout)
561{
562 struct iphdr *iph = skb->nh.iph;
563 unsigned int icmp_offset = iph->ihl*4;
564 struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset);
565 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
566
567 if (inout) {
568 iph->saddr = cp->vaddr;
569 ip_send_check(iph);
570 ciph->daddr = cp->vaddr;
571 ip_send_check(ciph);
572 } else {
573 iph->daddr = cp->daddr;
574 ip_send_check(iph);
575 ciph->saddr = cp->daddr;
576 ip_send_check(ciph);
577 }
578
579 /* the TCP/UDP port */
580 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
581 __u16 *ports = (void *)ciph + ciph->ihl*4;
582
583 if (inout)
584 ports[1] = cp->vport;
585 else
586 ports[0] = cp->dport;
587 }
588
589 /* And finally the ICMP checksum */
590 icmph->checksum = 0;
591 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
592 skb->ip_summed = CHECKSUM_UNNECESSARY;
593
594 if (inout)
595 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
596 "Forwarding altered outgoing ICMP");
597 else
598 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
599 "Forwarding altered incoming ICMP");
600}
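/*
 * Example of the rewrite performed above (addresses illustrative): for an
 * outgoing (inout == 1) ICMP_DEST_UNREACH generated by real server
 * 192.168.10.2 about a client packet 192.168.1.7 -> 192.168.10.2:8080, the
 * outer source 192.168.10.2 becomes the VIP (cp->vaddr) and, inside the
 * quoted header, the destination 192.168.10.2:8080 becomes
 * cp->vaddr:cp->vport, so the client only ever sees the virtual address.
 * The incoming (inout == 0) case is symmetric: the outer destination and
 * the quoted source/port are rewritten to cp->daddr/cp->dport so the ICMP
 * reaches and matches the real server.
 */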
601
602/*
603 * Handle ICMP messages in the inside-to-outside direction (outgoing).
604 * Find any that might be relevant, check against existing connections,
605 * forward to the right destination host if relevant.
606 * Currently handles error types - unreachable, quench, ttl exceeded.
607 * (Only used in VS/NAT)
608 */
609static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
610{
611 struct sk_buff *skb = *pskb;
612 struct iphdr *iph;
613 struct icmphdr _icmph, *ic;
614 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
615 struct ip_vs_conn *cp;
616 struct ip_vs_protocol *pp;
617 unsigned int offset, ihl, verdict;
618
619 *related = 1;
620
621 /* reassemble IP fragments */
622 if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
623 skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
624 if (!skb)
625 return NF_STOLEN;
626 *pskb = skb;
627 }
628
629 iph = skb->nh.iph;
630 offset = ihl = iph->ihl * 4;
631 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
632 if (ic == NULL)
633 return NF_DROP;
634
635 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
636 ic->type, ntohs(icmp_id(ic)),
637 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
638
639 /*
640 * Work through seeing if this is for us.
641 * These checks are supposed to be in an order that means easy
642 * things are checked first to speed up processing.... however
643 * this means that some packets will manage to get a long way
644 * down this stack and then be rejected, but that's life.
645 */
646 if ((ic->type != ICMP_DEST_UNREACH) &&
647 (ic->type != ICMP_SOURCE_QUENCH) &&
648 (ic->type != ICMP_TIME_EXCEEDED)) {
649 *related = 0;
650 return NF_ACCEPT;
651 }
652
653 /* Now find the contained IP header */
654 offset += sizeof(_icmph);
655 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
656 if (cih == NULL)
657 return NF_ACCEPT; /* The packet looks wrong, ignore */
658
659 pp = ip_vs_proto_get(cih->protocol);
660 if (!pp)
661 return NF_ACCEPT;
662
663 /* Is the embedded protocol header present? */
664 if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
665 pp->dont_defrag))
666 return NF_ACCEPT;
667
668 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
669
670 offset += cih->ihl * 4;
671
672 /* The embedded headers contain source and dest in reverse order */
673 cp = pp->conn_out_get(skb, pp, cih, offset, 1);
674 if (!cp)
675 return NF_ACCEPT;
676
677 verdict = NF_DROP;
678
679 if (IP_VS_FWD_METHOD(cp) != 0) {
680		IP_VS_ERR("shouldn't reach here, because the box is on the "
681			  "half connection in the tun/dr module.\n");
682 }
683
684 /* Ensure the checksum is correct */
685 if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
686 ip_vs_checksum_complete(skb, ihl)) {
687 /* Failed checksum! */
688		IP_VS_DBG(1, "Forward ICMP: failed checksum from %u.%u.%u.%u!\n",
689 NIPQUAD(iph->saddr));
690 goto out;
691 }
692
693 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
694 offset += 2 * sizeof(__u16);
695 if (!ip_vs_make_skb_writable(pskb, offset))
696 goto out;
697 skb = *pskb;
698
699 ip_vs_nat_icmp(skb, pp, cp, 1);
700
701 /* do the statistics and put it back */
702 ip_vs_out_stats(cp, skb);
703
704 skb->nfcache |= NFC_IPVS_PROPERTY;
705 verdict = NF_ACCEPT;
706
707 out:
708 __ip_vs_conn_put(cp);
709
710 return verdict;
711}
712
713static inline int is_tcp_reset(const struct sk_buff *skb)
714{
715 struct tcphdr _tcph, *th;
716
717 th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
718 sizeof(_tcph), &_tcph);
719 if (th == NULL)
720 return 0;
721 return th->rst;
722}
723
724/*
725 * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
726 * Check if outgoing packet belongs to the established ip_vs_conn,
727 * rewrite addresses of the packet and send it on its way...
728 */
729static unsigned int
730ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
731 const struct net_device *in, const struct net_device *out,
732 int (*okfn)(struct sk_buff *))
733{
734 struct sk_buff *skb = *pskb;
735 struct iphdr *iph;
736 struct ip_vs_protocol *pp;
737 struct ip_vs_conn *cp;
738 int ihl;
739
740 EnterFunction(11);
741
742 if (skb->nfcache & NFC_IPVS_PROPERTY)
743 return NF_ACCEPT;
744
745 iph = skb->nh.iph;
746 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
747 int related, verdict = ip_vs_out_icmp(pskb, &related);
748
749 if (related)
750 return verdict;
751 skb = *pskb;
752 iph = skb->nh.iph;
753 }
754
755 pp = ip_vs_proto_get(iph->protocol);
756 if (unlikely(!pp))
757 return NF_ACCEPT;
758
759 /* reassemble IP fragments */
760 if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
761 !pp->dont_defrag)) {
762 skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
763 if (!skb)
764 return NF_STOLEN;
765 iph = skb->nh.iph;
766 *pskb = skb;
767 }
768
769 ihl = iph->ihl << 2;
770
771 /*
772 * Check if the packet belongs to an existing entry
773 */
774 cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
775
776 if (unlikely(!cp)) {
777 if (sysctl_ip_vs_nat_icmp_send &&
778 (pp->protocol == IPPROTO_TCP ||
779 pp->protocol == IPPROTO_UDP)) {
780 __u16 _ports[2], *pptr;
781
782 pptr = skb_header_pointer(skb, ihl,
783 sizeof(_ports), _ports);
784 if (pptr == NULL)
785 return NF_ACCEPT; /* Not for me */
786 if (ip_vs_lookup_real_service(iph->protocol,
787 iph->saddr, pptr[0])) {
788 /*
789				 * Notify the real server that there is
790				 * no existing entry, unless the packet
791				 * is a TCP RST.
792 */
793 if (iph->protocol != IPPROTO_TCP
794 || !is_tcp_reset(skb)) {
795 icmp_send(skb,ICMP_DEST_UNREACH,
796 ICMP_PORT_UNREACH, 0);
797 return NF_DROP;
798 }
799 }
800 }
801 IP_VS_DBG_PKT(12, pp, skb, 0,
802 "packet continues traversal as normal");
803 return NF_ACCEPT;
804 }
805
806 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
807
808 if (!ip_vs_make_skb_writable(pskb, ihl))
809 goto drop;
810
811 /* mangle the packet */
812 if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
813 goto drop;
814 skb = *pskb;
815 skb->nh.iph->saddr = cp->vaddr;
816 ip_send_check(skb->nh.iph);
817
818 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
819
820 ip_vs_out_stats(cp, skb);
821 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
822 ip_vs_conn_put(cp);
823
824 skb->nfcache |= NFC_IPVS_PROPERTY;
825
826 LeaveFunction(11);
827 return NF_ACCEPT;
828
829 drop:
830 ip_vs_conn_put(cp);
831 kfree_skb(*pskb);
832 return NF_STOLEN;
833}
834
835
836/*
837 * Handle ICMP messages in the outside-to-inside direction (incoming).
838 * Find any that might be relevant, check against existing connections,
839 * forward to the right destination host if relevant.
840 * Currently handles error types - unreachable, quench, ttl exceeded.
841 */
842static int
843ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
844{
845 struct sk_buff *skb = *pskb;
846 struct iphdr *iph;
847 struct icmphdr _icmph, *ic;
848 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
849 struct ip_vs_conn *cp;
850 struct ip_vs_protocol *pp;
851 unsigned int offset, ihl, verdict;
852
853 *related = 1;
854
855 /* reassemble IP fragments */
856 if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
857 skb = ip_vs_gather_frags(skb,
858 hooknum == NF_IP_LOCAL_IN ?
859 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
860 if (!skb)
861 return NF_STOLEN;
862 *pskb = skb;
863 }
864
865 iph = skb->nh.iph;
866 offset = ihl = iph->ihl * 4;
867 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
868 if (ic == NULL)
869 return NF_DROP;
870
871 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
872 ic->type, ntohs(icmp_id(ic)),
873 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
874
875 /*
876 * Work through seeing if this is for us.
877 * These checks are supposed to be in an order that means easy
878 * things are checked first to speed up processing.... however
879 * this means that some packets will manage to get a long way
880 * down this stack and then be rejected, but that's life.
881 */
882 if ((ic->type != ICMP_DEST_UNREACH) &&
883 (ic->type != ICMP_SOURCE_QUENCH) &&
884 (ic->type != ICMP_TIME_EXCEEDED)) {
885 *related = 0;
886 return NF_ACCEPT;
887 }
888
889 /* Now find the contained IP header */
890 offset += sizeof(_icmph);
891 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
892 if (cih == NULL)
893 return NF_ACCEPT; /* The packet looks wrong, ignore */
894
895 pp = ip_vs_proto_get(cih->protocol);
896 if (!pp)
897 return NF_ACCEPT;
898
899 /* Is the embedded protocol header present? */
900 if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
901 pp->dont_defrag))
902 return NF_ACCEPT;
903
904 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
905
906 offset += cih->ihl * 4;
907
908 /* The embedded headers contain source and dest in reverse order */
909 cp = pp->conn_in_get(skb, pp, cih, offset, 1);
910 if (!cp)
911 return NF_ACCEPT;
912
913 verdict = NF_DROP;
914
915 /* Ensure the checksum is correct */
916 if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
917 ip_vs_checksum_complete(skb, ihl)) {
918 /* Failed checksum! */
919		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %u.%u.%u.%u!\n",
920 NIPQUAD(iph->saddr));
921 goto out;
922 }
923
924 /* do the statistics and put it back */
925 ip_vs_in_stats(cp, skb);
926 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
927 offset += 2 * sizeof(__u16);
928 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
929 /* do not touch skb anymore */
930
931 out:
932 __ip_vs_conn_put(cp);
933
934 return verdict;
935}
936
937/*
938 * Check if it's for virtual services, look it up,
939 * and send it on its way...
940 */
941static unsigned int
942ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
943 const struct net_device *in, const struct net_device *out,
944 int (*okfn)(struct sk_buff *))
945{
946 struct sk_buff *skb = *pskb;
947 struct iphdr *iph;
948 struct ip_vs_protocol *pp;
949 struct ip_vs_conn *cp;
950 int ret, restart;
951 int ihl;
952
953 /*
954 * Big tappo: only PACKET_HOST (neither loopback nor mcasts)
955 * ... don't know why 1st test DOES NOT include 2nd (?)
956 */
957 if (unlikely(skb->pkt_type != PACKET_HOST
958 || skb->dev == &loopback_dev || skb->sk)) {
959 IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
960 skb->pkt_type,
961 skb->nh.iph->protocol,
962 NIPQUAD(skb->nh.iph->daddr));
963 return NF_ACCEPT;
964 }
965
966 iph = skb->nh.iph;
967 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
968 int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);
969
970 if (related)
971 return verdict;
972 skb = *pskb;
973 iph = skb->nh.iph;
974 }
975
976 /* Protocol supported? */
977 pp = ip_vs_proto_get(iph->protocol);
978 if (unlikely(!pp))
979 return NF_ACCEPT;
980
981 ihl = iph->ihl << 2;
982
983 /*
984 * Check if the packet belongs to an existing connection entry
985 */
986 cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
987
988 if (unlikely(!cp)) {
989 int v;
990
991 if (!pp->conn_schedule(skb, pp, &v, &cp))
992 return v;
993 }
994
995 if (unlikely(!cp)) {
996 /* sorry, all this trouble for a no-hit :) */
997 IP_VS_DBG_PKT(12, pp, skb, 0,
998 "packet continues traversal as normal");
999 return NF_ACCEPT;
1000 }
1001
1002 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1003
1004 /* Check the server status */
1005 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1006 /* the destination server is not available */
1007
1008 if (sysctl_ip_vs_expire_nodest_conn) {
1009 /* try to expire the connection immediately */
1010 ip_vs_conn_expire_now(cp);
1011 } else {
1012 /* don't restart its timer, and silently
1013 drop the packet. */
1014 __ip_vs_conn_put(cp);
1015 }
1016 return NF_DROP;
1017 }
1018
1019 ip_vs_in_stats(cp, skb);
1020 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1021 if (cp->packet_xmit)
1022 ret = cp->packet_xmit(skb, cp, pp);
1023 /* do not touch skb anymore */
1024 else {
1025 IP_VS_DBG_RL("warning: packet_xmit is null");
1026 ret = NF_ACCEPT;
1027 }
1028
1029	/* increase its packet counter and check whether it needs
1030	   to be synchronized */
1031 atomic_inc(&cp->in_pkts);
1032 if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1033 (cp->protocol != IPPROTO_TCP ||
1034 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1035 (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
1036 == sysctl_ip_vs_sync_threshold[0]))
1037 ip_vs_sync_conn(cp);
1038
1039 ip_vs_conn_put(cp);
1040 return ret;
1041}
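/*
 * Worked example of the sync check above: with the default
 * sysctl_ip_vs_sync_threshold[] = { 3, 50 } and this box acting as sync
 * master, an established TCP connection is pushed to the backup daemon on
 * its 3rd, 53rd, 103rd, ... incoming packet (in_pkts % 50 == 3), so very
 * short-lived connections are never synced and long-lived ones are
 * refreshed roughly once per 50 packets.
 */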
1042
1043
1044/*
1045 * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
1046 * related packets destined for 0.0.0.0/0.
1047 * When fwmark-based virtual service is used, such as transparent
1048 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1049 * but ICMP destined for 0.0.0.0/0 cannot easily be marked and
1050 * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
1051 * and send them to ip_vs_in_icmp.
1052 */
1053static unsigned int
1054ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
1055 const struct net_device *in, const struct net_device *out,
1056 int (*okfn)(struct sk_buff *))
1057{
1058 int r;
1059
1060 if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
1061 return NF_ACCEPT;
1062
1063 return ip_vs_in_icmp(pskb, &r, hooknum);
1064}
1065
1066
1067/* After packet filtering, forward packet through VS/DR, VS/TUN,
1068 or VS/NAT(change destination), so that filtering rules can be
1069 applied to IPVS. */
1070static struct nf_hook_ops ip_vs_in_ops = {
1071 .hook = ip_vs_in,
1072 .owner = THIS_MODULE,
1073 .pf = PF_INET,
1074 .hooknum = NF_IP_LOCAL_IN,
1075 .priority = 100,
1076};
1077
1078/* After packet filtering, change source only for VS/NAT */
1079static struct nf_hook_ops ip_vs_out_ops = {
1080 .hook = ip_vs_out,
1081 .owner = THIS_MODULE,
1082 .pf = PF_INET,
1083 .hooknum = NF_IP_FORWARD,
1084 .priority = 100,
1085};
1086
1087/* After packet filtering (but before ip_vs_out_icmp), catch icmp
1088 destined for 0.0.0.0/0, which is for incoming IPVS connections */
1089static struct nf_hook_ops ip_vs_forward_icmp_ops = {
1090 .hook = ip_vs_forward_icmp,
1091 .owner = THIS_MODULE,
1092 .pf = PF_INET,
1093 .hooknum = NF_IP_FORWARD,
1094 .priority = 99,
1095};
1096
1097/* Before the netfilter connection tracking, exit from POST_ROUTING */
1098static struct nf_hook_ops ip_vs_post_routing_ops = {
1099 .hook = ip_vs_post_routing,
1100 .owner = THIS_MODULE,
1101 .pf = PF_INET,
1102 .hooknum = NF_IP_POST_ROUTING,
1103 .priority = NF_IP_PRI_NAT_SRC-1,
1104};
1105
1106
1107/*
1108 * Initialize IP Virtual Server
1109 */
1110static int __init ip_vs_init(void)
1111{
1112 int ret;
1113
1114 ret = ip_vs_control_init();
1115 if (ret < 0) {
1116 IP_VS_ERR("can't setup control.\n");
1117 goto cleanup_nothing;
1118 }
1119
1120 ip_vs_protocol_init();
1121
1122 ret = ip_vs_app_init();
1123 if (ret < 0) {
1124 IP_VS_ERR("can't setup application helper.\n");
1125 goto cleanup_protocol;
1126 }
1127
1128 ret = ip_vs_conn_init();
1129 if (ret < 0) {
1130 IP_VS_ERR("can't setup connection table.\n");
1131 goto cleanup_app;
1132 }
1133
1134 ret = nf_register_hook(&ip_vs_in_ops);
1135 if (ret < 0) {
1136 IP_VS_ERR("can't register in hook.\n");
1137 goto cleanup_conn;
1138 }
1139
1140 ret = nf_register_hook(&ip_vs_out_ops);
1141 if (ret < 0) {
1142 IP_VS_ERR("can't register out hook.\n");
1143 goto cleanup_inops;
1144 }
1145 ret = nf_register_hook(&ip_vs_post_routing_ops);
1146 if (ret < 0) {
1147 IP_VS_ERR("can't register post_routing hook.\n");
1148 goto cleanup_outops;
1149 }
1150 ret = nf_register_hook(&ip_vs_forward_icmp_ops);
1151 if (ret < 0) {
1152 IP_VS_ERR("can't register forward_icmp hook.\n");
1153 goto cleanup_postroutingops;
1154 }
1155
1156 IP_VS_INFO("ipvs loaded.\n");
1157 return ret;
1158
1159 cleanup_postroutingops:
1160 nf_unregister_hook(&ip_vs_post_routing_ops);
1161 cleanup_outops:
1162 nf_unregister_hook(&ip_vs_out_ops);
1163 cleanup_inops:
1164 nf_unregister_hook(&ip_vs_in_ops);
1165 cleanup_conn:
1166 ip_vs_conn_cleanup();
1167 cleanup_app:
1168 ip_vs_app_cleanup();
1169 cleanup_protocol:
1170 ip_vs_protocol_cleanup();
1171 ip_vs_control_cleanup();
1172 cleanup_nothing:
1173 return ret;
1174}
1175
1176static void __exit ip_vs_cleanup(void)
1177{
1178 nf_unregister_hook(&ip_vs_forward_icmp_ops);
1179 nf_unregister_hook(&ip_vs_post_routing_ops);
1180 nf_unregister_hook(&ip_vs_out_ops);
1181 nf_unregister_hook(&ip_vs_in_ops);
1182 ip_vs_conn_cleanup();
1183 ip_vs_app_cleanup();
1184 ip_vs_protocol_cleanup();
1185 ip_vs_control_cleanup();
1186 IP_VS_INFO("ipvs unloaded.\n");
1187}
1188
1189module_init(ip_vs_init);
1190module_exit(ip_vs_cleanup);
1191MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
new file mode 100644
index 000000000000..218d9701036e
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -0,0 +1,2391 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * Changes:
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/init.h>
25#include <linux/types.h>
26#include <linux/fs.h>
27#include <linux/sysctl.h>
28#include <linux/proc_fs.h>
29#include <linux/workqueue.h>
30#include <linux/swap.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33
34#include <linux/netfilter.h>
35#include <linux/netfilter_ipv4.h>
36
37#include <net/ip.h>
38#include <net/sock.h>
39
40#include <asm/uaccess.h>
41
42#include <net/ip_vs.h>
43
44/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
45static DECLARE_MUTEX(__ip_vs_mutex);
46
47/* lock for service table */
48static DEFINE_RWLOCK(__ip_vs_svc_lock);
49
50/* lock for table with the real services */
51static DEFINE_RWLOCK(__ip_vs_rs_lock);
52
53/* lock for state and timeout tables */
54static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
55
56/* lock for drop entry handling */
57static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
58
59/* lock for drop packet handling */
60static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
61
62/* 1/rate drop and drop-entry variables */
63int ip_vs_drop_rate = 0;
64int ip_vs_drop_counter = 0;
65static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
66
67/* number of virtual services */
68static int ip_vs_num_services = 0;
69
70/* sysctl variables */
71static int sysctl_ip_vs_drop_entry = 0;
72static int sysctl_ip_vs_drop_packet = 0;
73static int sysctl_ip_vs_secure_tcp = 0;
74static int sysctl_ip_vs_amemthresh = 1024;
75static int sysctl_ip_vs_am_droprate = 10;
76int sysctl_ip_vs_cache_bypass = 0;
77int sysctl_ip_vs_expire_nodest_conn = 0;
78int sysctl_ip_vs_expire_quiescent_template = 0;
79int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
80int sysctl_ip_vs_nat_icmp_send = 0;
81
82
83#ifdef CONFIG_IP_VS_DEBUG
84static int sysctl_ip_vs_debug_level = 0;
85
86int ip_vs_get_debug_level(void)
87{
88 return sysctl_ip_vs_debug_level;
89}
90#endif
91
92/*
93 * update_defense_level is called from keventd and from sysctl.
94 */
95static void update_defense_level(void)
96{
97 struct sysinfo i;
98 static int old_secure_tcp = 0;
99 int availmem;
100 int nomem;
101 int to_change = -1;
102
103 /* we only count free and buffered memory (in pages) */
104 si_meminfo(&i);
105 availmem = i.freeram + i.bufferram;
106	/* however, in linux 2.5 i.bufferram is the total page cache size,
107	   so we would need to adjust it */
108 /* si_swapinfo(&i); */
109 /* availmem = availmem - (i.totalswap - i.freeswap); */
110
111 nomem = (availmem < sysctl_ip_vs_amemthresh);
112
113 /* drop_entry */
114 spin_lock(&__ip_vs_dropentry_lock);
115 switch (sysctl_ip_vs_drop_entry) {
116 case 0:
117 atomic_set(&ip_vs_dropentry, 0);
118 break;
119 case 1:
120 if (nomem) {
121 atomic_set(&ip_vs_dropentry, 1);
122 sysctl_ip_vs_drop_entry = 2;
123 } else {
124 atomic_set(&ip_vs_dropentry, 0);
125 }
126 break;
127 case 2:
128 if (nomem) {
129 atomic_set(&ip_vs_dropentry, 1);
130 } else {
131 atomic_set(&ip_vs_dropentry, 0);
132 sysctl_ip_vs_drop_entry = 1;
133		}
134 break;
135 case 3:
136 atomic_set(&ip_vs_dropentry, 1);
137 break;
138 }
139 spin_unlock(&__ip_vs_dropentry_lock);
140
141 /* drop_packet */
142 spin_lock(&__ip_vs_droppacket_lock);
143 switch (sysctl_ip_vs_drop_packet) {
144 case 0:
145 ip_vs_drop_rate = 0;
146 break;
147 case 1:
148 if (nomem) {
149 ip_vs_drop_rate = ip_vs_drop_counter
150 = sysctl_ip_vs_amemthresh /
151 (sysctl_ip_vs_amemthresh-availmem);
152 sysctl_ip_vs_drop_packet = 2;
153 } else {
154 ip_vs_drop_rate = 0;
155 }
156 break;
157 case 2:
158 if (nomem) {
159 ip_vs_drop_rate = ip_vs_drop_counter
160 = sysctl_ip_vs_amemthresh /
161 (sysctl_ip_vs_amemthresh-availmem);
162 } else {
163 ip_vs_drop_rate = 0;
164 sysctl_ip_vs_drop_packet = 1;
165 }
166 break;
167 case 3:
168 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
169 break;
170 }
171 spin_unlock(&__ip_vs_droppacket_lock);
172
173 /* secure_tcp */
174 write_lock(&__ip_vs_securetcp_lock);
175 switch (sysctl_ip_vs_secure_tcp) {
176 case 0:
177 if (old_secure_tcp >= 2)
178 to_change = 0;
179 break;
180 case 1:
181 if (nomem) {
182 if (old_secure_tcp < 2)
183 to_change = 1;
184 sysctl_ip_vs_secure_tcp = 2;
185 } else {
186 if (old_secure_tcp >= 2)
187 to_change = 0;
188 }
189 break;
190 case 2:
191 if (nomem) {
192 if (old_secure_tcp < 2)
193 to_change = 1;
194 } else {
195 if (old_secure_tcp >= 2)
196 to_change = 0;
197 sysctl_ip_vs_secure_tcp = 1;
198 }
199 break;
200 case 3:
201 if (old_secure_tcp < 2)
202 to_change = 1;
203 break;
204 }
205 old_secure_tcp = sysctl_ip_vs_secure_tcp;
206 if (to_change >= 0)
207 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
208 write_unlock(&__ip_vs_securetcp_lock);
209}
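/*
 * Worked example of the drop_packet rate computed above: with
 * sysctl_ip_vs_amemthresh at its default of 1024 pages and only
 * availmem = 768 pages free, nomem is true and
 *
 *	ip_vs_drop_rate = 1024 / (1024 - 768) = 4
 *
 * so, while memory stays low, roughly one of every four new connection
 * requests is refused by the drop_packet defense; the closer availmem gets
 * to zero, the closer the rate gets to 1 (drop every new request).
 */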
210
211
212/*
213 * Timer for checking the defense
214 */
215#define DEFENSE_TIMER_PERIOD 1*HZ
216static void defense_work_handler(void *data);
217static DECLARE_WORK(defense_work, defense_work_handler, NULL);
218
219static void defense_work_handler(void *data)
220{
221 update_defense_level();
222 if (atomic_read(&ip_vs_dropentry))
223 ip_vs_random_dropentry();
224
225 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
226}
227
228int
229ip_vs_use_count_inc(void)
230{
231 return try_module_get(THIS_MODULE);
232}
233
234void
235ip_vs_use_count_dec(void)
236{
237 module_put(THIS_MODULE);
238}
239
240
241/*
242 * Hash table: for virtual service lookups
243 */
244#define IP_VS_SVC_TAB_BITS 8
245#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
246#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
247
248/* the service table hashed by <protocol, addr, port> */
249static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
250/* the service table hashed by fwmark */
251static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
252
253/*
254 * Hash table: for real service lookups
255 */
256#define IP_VS_RTAB_BITS 4
257#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
258#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
259
260static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
261
262/*
263 * Trash for destinations
264 */
265static LIST_HEAD(ip_vs_dest_trash);
266
267/*
268 * FTP & NULL virtual service counters
269 */
270static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
271static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
272
273
274/*
275 * Returns hash value for virtual service
276 */
277static __inline__ unsigned
278ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
279{
280 register unsigned porth = ntohs(port);
281
282 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
283 & IP_VS_SVC_TAB_MASK;
284}
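/*
 * Example hash computation (values illustrative): for a TCP service on
 * 10.0.0.1:80, porth = 80, ntohl(addr) = 0x0A000001 and
 *
 *	(6 ^ 0x0A000001 ^ (80 >> 8) ^ 80) & 0xFF = 0x57
 *
 * so the service lands in bucket 87 of ip_vs_svc_table.  Folding the high
 * bits of the port into the low bits helps keep services that differ only
 * in port number from clustering in a few buckets.
 */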
285
286/*
287 * Returns hash value of fwmark for virtual service lookup
288 */
289static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
290{
291 return fwmark & IP_VS_SVC_TAB_MASK;
292}
293
294/*
295 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
296 * or in the ip_vs_svc_fwm_table by fwmark.
297 * Should be called with locked tables.
298 */
299static int ip_vs_svc_hash(struct ip_vs_service *svc)
300{
301 unsigned hash;
302
303 if (svc->flags & IP_VS_SVC_F_HASHED) {
304 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
305 "called from %p\n", __builtin_return_address(0));
306 return 0;
307 }
308
309 if (svc->fwmark == 0) {
310 /*
311 * Hash it by <protocol,addr,port> in ip_vs_svc_table
312 */
313 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
314 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
315 } else {
316 /*
317 * Hash it by fwmark in ip_vs_svc_fwm_table
318 */
319 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
320 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
321 }
322
323 svc->flags |= IP_VS_SVC_F_HASHED;
324 /* increase its refcnt because it is referenced by the svc table */
325 atomic_inc(&svc->refcnt);
326 return 1;
327}
328
329
330/*
331 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
332 * Should be called with locked tables.
333 */
334static int ip_vs_svc_unhash(struct ip_vs_service *svc)
335{
336 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
337 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
338 "called from %p\n", __builtin_return_address(0));
339 return 0;
340 }
341
342 if (svc->fwmark == 0) {
343 /* Remove it from the ip_vs_svc_table table */
344 list_del(&svc->s_list);
345 } else {
346 /* Remove it from the ip_vs_svc_fwm_table table */
347 list_del(&svc->f_list);
348 }
349
350 svc->flags &= ~IP_VS_SVC_F_HASHED;
351 atomic_dec(&svc->refcnt);
352 return 1;
353}
354
355
356/*
357 * Get service by {proto,addr,port} in the service table.
358 */
359static __inline__ struct ip_vs_service *
360__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
361{
362 unsigned hash;
363 struct ip_vs_service *svc;
364
365 /* Check for "full" addressed entries */
366 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
367
368 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
369 if ((svc->addr == vaddr)
370 && (svc->port == vport)
371 && (svc->protocol == protocol)) {
372 /* HIT */
373 atomic_inc(&svc->usecnt);
374 return svc;
375 }
376 }
377
378 return NULL;
379}
380
381
382/*
383 * Get service by {fwmark} in the service table.
384 */
385static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
386{
387 unsigned hash;
388 struct ip_vs_service *svc;
389
390 /* Check for fwmark addressed entries */
391 hash = ip_vs_svc_fwm_hashkey(fwmark);
392
393 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
394 if (svc->fwmark == fwmark) {
395 /* HIT */
396 atomic_inc(&svc->usecnt);
397 return svc;
398 }
399 }
400
401 return NULL;
402}
403
404struct ip_vs_service *
405ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
406{
407 struct ip_vs_service *svc;
408
409 read_lock(&__ip_vs_svc_lock);
410
411 /*
412 * Check the table hashed by fwmark first
413 */
414 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
415 goto out;
416
417 /*
418 * Check the table hashed by <protocol,addr,port>
419 * for "full" addressed entries
420 */
421 svc = __ip_vs_service_get(protocol, vaddr, vport);
422
423 if (svc == NULL
424 && protocol == IPPROTO_TCP
425 && atomic_read(&ip_vs_ftpsvc_counter)
426 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
427 /*
428 * Check if ftp service entry exists, the packet
429 * might belong to FTP data connections.
430 */
431 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
432 }
433
434 if (svc == NULL
435 && atomic_read(&ip_vs_nullsvc_counter)) {
436 /*
437 * Check if the catch-all port (port zero) exists
438 */
439 svc = __ip_vs_service_get(protocol, vaddr, 0);
440 }
441
442 out:
443 read_unlock(&__ip_vs_svc_lock);
444
445 IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
446 fwmark, ip_vs_proto_name(protocol),
447 NIPQUAD(vaddr), ntohs(vport),
448 svc?"hit":"not hit");
449
450 return svc;
451}
452
453
454static inline void
455__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
456{
457 atomic_inc(&svc->refcnt);
458 dest->svc = svc;
459}
460
461static inline void
462__ip_vs_unbind_svc(struct ip_vs_dest *dest)
463{
464 struct ip_vs_service *svc = dest->svc;
465
466 dest->svc = NULL;
467 if (atomic_dec_and_test(&svc->refcnt))
468 kfree(svc);
469}
470
471
472/*
473 * Returns hash value for real service
474 */
475static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
476{
477 register unsigned porth = ntohs(port);
478
479 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
480 & IP_VS_RTAB_MASK;
481}
482
483/*
484 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
485 * should be called with locked tables.
486 */
487static int ip_vs_rs_hash(struct ip_vs_dest *dest)
488{
489 unsigned hash;
490
491 if (!list_empty(&dest->d_list)) {
492 return 0;
493 }
494
495 /*
496 * Hash by proto,addr,port,
497 * which are the parameters of the real service.
498 */
499 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
500 list_add(&dest->d_list, &ip_vs_rtable[hash]);
501
502 return 1;
503}
504
505/*
506 * UNhashes ip_vs_dest from ip_vs_rtable.
507 * should be called with locked tables.
508 */
509static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
510{
511 /*
512 * Remove it from the ip_vs_rtable table.
513 */
514 if (!list_empty(&dest->d_list)) {
515 list_del(&dest->d_list);
516 INIT_LIST_HEAD(&dest->d_list);
517 }
518
519 return 1;
520}
521
522/*
523 * Lookup real service by <proto,addr,port> in the real service table.
524 */
525struct ip_vs_dest *
526ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
527{
528 unsigned hash;
529 struct ip_vs_dest *dest;
530
531 /*
532 * Check for "full" addressed entries
533 * Return the first found entry
534 */
535 hash = ip_vs_rs_hashkey(daddr, dport);
536
537 read_lock(&__ip_vs_rs_lock);
538 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
539 if ((dest->addr == daddr)
540 && (dest->port == dport)
541 && ((dest->protocol == protocol) ||
542 dest->vfwmark)) {
543 /* HIT */
544 read_unlock(&__ip_vs_rs_lock);
545 return dest;
546 }
547 }
548 read_unlock(&__ip_vs_rs_lock);
549
550 return NULL;
551}
552
553/*
554 * Lookup destination by {addr,port} in the given service
555 */
556static struct ip_vs_dest *
557ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
558{
559 struct ip_vs_dest *dest;
560
561 /*
562 * Find the destination for the given service
563 */
564 list_for_each_entry(dest, &svc->destinations, n_list) {
565 if ((dest->addr == daddr) && (dest->port == dport)) {
566 /* HIT */
567 return dest;
568 }
569 }
570
571 return NULL;
572}
573
574
575/*
576 * Lookup dest by {svc,addr,port} in the destination trash.
577 * The destination trash is used to hold the destinations that are removed
578 * from the service table but are still referenced by some conn entries.
579	 * The trash exists because a dest may be only temporarily down (taken
580	 * down by the administrator or by a monitor program); it can then be
581	 * picked back from the trash, the remaining connections to it can
582	 * continue, and its counting information remains useful for
583	 * scheduling.
584 */
585static struct ip_vs_dest *
586ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
587{
588 struct ip_vs_dest *dest, *nxt;
589
590 /*
591 * Find the destination in trash
592 */
593 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
594 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
595 "refcnt=%d\n",
596 dest->vfwmark,
597 NIPQUAD(dest->addr), ntohs(dest->port),
598 atomic_read(&dest->refcnt));
599 if (dest->addr == daddr &&
600 dest->port == dport &&
601 dest->vfwmark == svc->fwmark &&
602 dest->protocol == svc->protocol &&
603 (svc->fwmark ||
604 (dest->vaddr == svc->addr &&
605 dest->vport == svc->port))) {
606 /* HIT */
607 return dest;
608 }
609
610 /*
611 * Try to purge the destination from trash if not referenced
612 */
613 if (atomic_read(&dest->refcnt) == 1) {
614 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
615 "from trash\n",
616 dest->vfwmark,
617 NIPQUAD(dest->addr), ntohs(dest->port));
618 list_del(&dest->n_list);
619 ip_vs_dst_reset(dest);
620 __ip_vs_unbind_svc(dest);
621 kfree(dest);
622 }
623 }
624
625 return NULL;
626}
627
628
629/*
630 * Clean up all the destinations in the trash
631 * Called by the ip_vs_control_cleanup()
632 *
633 * When ip_vs_control_cleanup is activated by the ipvs module exit,
634 * the service tables must have been flushed and all the connections
635 * have expired, and the refcnt of each destination in the trash must
636 * be 1, so we simply release them here.
637 */
638static void ip_vs_trash_cleanup(void)
639{
640 struct ip_vs_dest *dest, *nxt;
641
642 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
643 list_del(&dest->n_list);
644 ip_vs_dst_reset(dest);
645 __ip_vs_unbind_svc(dest);
646 kfree(dest);
647 }
648}
649
650
651static void
652ip_vs_zero_stats(struct ip_vs_stats *stats)
653{
654 spin_lock_bh(&stats->lock);
655 memset(stats, 0, (char *)&stats->lock - (char *)stats);
656 spin_unlock_bh(&stats->lock);
657 ip_vs_zero_estimator(stats);
658}
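/*
 * The memset above clears only the counter fields that precede the lock,
 * relying on the spinlock being declared after the counters in
 * struct ip_vs_stats.  A user-space sketch of the same pointer-arithmetic
 * idiom, with an illustrative (non-kernel) struct:
 */
#include <stdio.h>
#include <string.h>

struct demo_stats {
	unsigned int conns;		/* counters to be cleared */
	unsigned int inpkts;
	unsigned int outpkts;
	int lock;			/* stand-in for the spinlock; stays last */
};

static void demo_zero_stats(struct demo_stats *s)
{
	/* zero everything up to, but not including, the lock */
	memset(s, 0, (char *)&s->lock - (char *)s);
}

int main(void)
{
	struct demo_stats s = { 1, 2, 3, 42 };

	demo_zero_stats(&s);
	printf("%u %u %u lock=%d\n", s.conns, s.inpkts, s.outpkts, s.lock);
	return 0;			/* prints: 0 0 0 lock=42 */
}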
659
660/*
661 * Update a destination in the given service
662 */
663static void
664__ip_vs_update_dest(struct ip_vs_service *svc,
665 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
666{
667 int conn_flags;
668
669 /* set the weight and the flags */
670 atomic_set(&dest->weight, udest->weight);
671 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
672
673 /* check if local node and update the flags */
674 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
675 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
676 | IP_VS_CONN_F_LOCALNODE;
677 }
678
679 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
680 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
681 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
682 } else {
683 /*
684 * Put the real service in ip_vs_rtable if not present.
685 * For now only for NAT!
686 */
687 write_lock_bh(&__ip_vs_rs_lock);
688 ip_vs_rs_hash(dest);
689 write_unlock_bh(&__ip_vs_rs_lock);
690 }
691 atomic_set(&dest->conn_flags, conn_flags);
692
693 /* bind the service */
694 if (!dest->svc) {
695 __ip_vs_bind_svc(dest, svc);
696 } else {
697 if (dest->svc != svc) {
698 __ip_vs_unbind_svc(dest);
699 ip_vs_zero_stats(&dest->stats);
700 __ip_vs_bind_svc(dest, svc);
701 }
702 }
703
704 /* set the dest status flags */
705 dest->flags |= IP_VS_DEST_F_AVAILABLE;
706
707 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
708 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
709 dest->u_threshold = udest->u_threshold;
710 dest->l_threshold = udest->l_threshold;
711}
712
713
714/*
715 * Create a destination for the given service
716 */
717static int
718ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
719 struct ip_vs_dest **dest_p)
720{
721 struct ip_vs_dest *dest;
722 unsigned atype;
723
724 EnterFunction(2);
725
726 atype = inet_addr_type(udest->addr);
727 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
728 return -EINVAL;
729
730 dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
731 if (dest == NULL) {
732 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
733 return -ENOMEM;
734 }
735 memset(dest, 0, sizeof(struct ip_vs_dest));
736
737 dest->protocol = svc->protocol;
738 dest->vaddr = svc->addr;
739 dest->vport = svc->port;
740 dest->vfwmark = svc->fwmark;
741 dest->addr = udest->addr;
742 dest->port = udest->port;
743
744 atomic_set(&dest->activeconns, 0);
745 atomic_set(&dest->inactconns, 0);
746 atomic_set(&dest->persistconns, 0);
747 atomic_set(&dest->refcnt, 0);
748
749 INIT_LIST_HEAD(&dest->d_list);
750 spin_lock_init(&dest->dst_lock);
751 spin_lock_init(&dest->stats.lock);
752 __ip_vs_update_dest(svc, dest, udest);
753 ip_vs_new_estimator(&dest->stats);
754
755 *dest_p = dest;
756
757 LeaveFunction(2);
758 return 0;
759}
760
761
762/*
763 * Add a destination into an existing service
764 */
765static int
766ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
767{
768 struct ip_vs_dest *dest;
769 __u32 daddr = udest->addr;
770 __u16 dport = udest->port;
771 int ret;
772
773 EnterFunction(2);
774
775 if (udest->weight < 0) {
776 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
777 return -ERANGE;
778 }
779
780 if (udest->l_threshold > udest->u_threshold) {
781 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
782 "upper threshold\n");
783 return -ERANGE;
784 }
785
786 /*
787 * Check if the dest already exists in the list
788 */
789 dest = ip_vs_lookup_dest(svc, daddr, dport);
790 if (dest != NULL) {
791 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
792 return -EEXIST;
793 }
794
795 /*
796 * Check if the dest already exists in the trash and
797 * is from the same service
798 */
799 dest = ip_vs_trash_get_dest(svc, daddr, dport);
800 if (dest != NULL) {
801 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
802 "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
803 NIPQUAD(daddr), ntohs(dport),
804 atomic_read(&dest->refcnt),
805 dest->vfwmark,
806 NIPQUAD(dest->vaddr),
807 ntohs(dest->vport));
808 __ip_vs_update_dest(svc, dest, udest);
809
810 /*
811 * Get the destination from the trash
812 */
813 list_del(&dest->n_list);
814
815 ip_vs_new_estimator(&dest->stats);
816
817 write_lock_bh(&__ip_vs_svc_lock);
818
819 /*
820 * Wait until all other svc users go away.
821 */
822 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
823
824 list_add(&dest->n_list, &svc->destinations);
825 svc->num_dests++;
826
827 /* call the update_service function of its scheduler */
828 svc->scheduler->update_service(svc);
829
830 write_unlock_bh(&__ip_vs_svc_lock);
831 return 0;
832 }
833
834 /*
835 * Allocate and initialize the dest structure
836 */
837 ret = ip_vs_new_dest(svc, udest, &dest);
838 if (ret) {
839 return ret;
840 }
841
842 /*
843 * Add the dest entry into the list
844 */
845 atomic_inc(&dest->refcnt);
846
847 write_lock_bh(&__ip_vs_svc_lock);
848
849 /*
850 * Wait until all other svc users go away.
851 */
852 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
853
854 list_add(&dest->n_list, &svc->destinations);
855 svc->num_dests++;
856
857 /* call the update_service function of its scheduler */
858 svc->scheduler->update_service(svc);
859
860 write_unlock_bh(&__ip_vs_svc_lock);
861
862 LeaveFunction(2);
863
864 return 0;
865}
866
867
868/*
869 * Edit a destination in the given service
870 */
871static int
872ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
873{
874 struct ip_vs_dest *dest;
875 __u32 daddr = udest->addr;
876 __u16 dport = udest->port;
877
878 EnterFunction(2);
879
880 if (udest->weight < 0) {
881 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
882 return -ERANGE;
883 }
884
885 if (udest->l_threshold > udest->u_threshold) {
886 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
887 "upper threshold\n");
888 return -ERANGE;
889 }
890
891 /*
892 * Lookup the destination list
893 */
894 dest = ip_vs_lookup_dest(svc, daddr, dport);
895 if (dest == NULL) {
896 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
897 return -ENOENT;
898 }
899
900 __ip_vs_update_dest(svc, dest, udest);
901
902 write_lock_bh(&__ip_vs_svc_lock);
903
904 /* Wait until all other svc users go away */
905	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
906
907 /* call the update_service, because server weight may be changed */
908 svc->scheduler->update_service(svc);
909
910 write_unlock_bh(&__ip_vs_svc_lock);
911
912 LeaveFunction(2);
913
914 return 0;
915}
916
917
918/*
919 * Delete a destination (must be already unlinked from the service)
920 */
921static void __ip_vs_del_dest(struct ip_vs_dest *dest)
922{
923 ip_vs_kill_estimator(&dest->stats);
924
925 /*
926	 * Remove it from the doubly-linked list of real services.
927 */
928 write_lock_bh(&__ip_vs_rs_lock);
929 ip_vs_rs_unhash(dest);
930 write_unlock_bh(&__ip_vs_rs_lock);
931
932 /*
933 * Decrease the refcnt of the dest, and free the dest
934 * if nobody refers to it (refcnt=0). Otherwise, throw
935 * the destination into the trash.
936 */
937 if (atomic_dec_and_test(&dest->refcnt)) {
938 ip_vs_dst_reset(dest);
939 /* simply decrease svc->refcnt here, let the caller check
940 and release the service if nobody refers to it.
941 Only user context can release destination and service,
942 and only one user context can update virtual service at a
943 time, so the operation here is OK */
944 atomic_dec(&dest->svc->refcnt);
945 kfree(dest);
946 } else {
947 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
948 NIPQUAD(dest->addr), ntohs(dest->port),
949 atomic_read(&dest->refcnt));
950 list_add(&dest->n_list, &ip_vs_dest_trash);
951 atomic_inc(&dest->refcnt);
952 }
953}
954
955
956/*
957 * Unlink a destination from the given service
958 */
959static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
960 struct ip_vs_dest *dest,
961 int svcupd)
962{
963 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
964
965 /*
966	 * Remove it from the doubly-linked destination list.
967 */
968 list_del(&dest->n_list);
969 svc->num_dests--;
970 if (svcupd) {
971 /*
972 * Call the update_service function of its scheduler
973 */
974 svc->scheduler->update_service(svc);
975 }
976}
977
978
979/*
980 * Delete a destination server in the given service
981 */
982static int
983ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
984{
985 struct ip_vs_dest *dest;
986 __u32 daddr = udest->addr;
987 __u16 dport = udest->port;
988
989 EnterFunction(2);
990
991 dest = ip_vs_lookup_dest(svc, daddr, dport);
992 if (dest == NULL) {
993 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
994 return -ENOENT;
995 }
996
997 write_lock_bh(&__ip_vs_svc_lock);
998
999 /*
1000 * Wait until all other svc users go away.
1001 */
1002 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1003
1004 /*
1005 * Unlink dest from the service
1006 */
1007 __ip_vs_unlink_dest(svc, dest, 1);
1008
1009 write_unlock_bh(&__ip_vs_svc_lock);
1010
1011 /*
1012 * Delete the destination
1013 */
1014 __ip_vs_del_dest(dest);
1015
1016 LeaveFunction(2);
1017
1018 return 0;
1019}
1020
1021
1022/*
1023 * Add a service into the service hash table
1024 */
1025static int
1026ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1027{
1028 int ret = 0;
1029 struct ip_vs_scheduler *sched = NULL;
1030 struct ip_vs_service *svc = NULL;
1031
1032 /* increase the module use count */
1033 ip_vs_use_count_inc();
1034
1035 /* Lookup the scheduler by 'u->sched_name' */
1036 sched = ip_vs_scheduler_get(u->sched_name);
1037 if (sched == NULL) {
1038 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1039 u->sched_name);
1040 ret = -ENOENT;
1041 goto out_mod_dec;
1042 }
1043
1044 svc = (struct ip_vs_service *)
1045 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1046 if (svc == NULL) {
1047 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1048 ret = -ENOMEM;
1049 goto out_err;
1050 }
1051 memset(svc, 0, sizeof(struct ip_vs_service));
1052
1053 /* I'm the first user of the service */
1054 atomic_set(&svc->usecnt, 1);
1055 atomic_set(&svc->refcnt, 0);
1056
1057 svc->protocol = u->protocol;
1058 svc->addr = u->addr;
1059 svc->port = u->port;
1060 svc->fwmark = u->fwmark;
1061 svc->flags = u->flags;
1062 svc->timeout = u->timeout * HZ;
1063 svc->netmask = u->netmask;
1064
1065 INIT_LIST_HEAD(&svc->destinations);
1066 rwlock_init(&svc->sched_lock);
1067 spin_lock_init(&svc->stats.lock);
1068
1069 /* Bind the scheduler */
1070 ret = ip_vs_bind_scheduler(svc, sched);
1071 if (ret)
1072 goto out_err;
1073 sched = NULL;
1074
1075 /* Update the virtual service counters */
1076 if (svc->port == FTPPORT)
1077 atomic_inc(&ip_vs_ftpsvc_counter);
1078 else if (svc->port == 0)
1079 atomic_inc(&ip_vs_nullsvc_counter);
1080
1081 ip_vs_new_estimator(&svc->stats);
1082 ip_vs_num_services++;
1083
1084 /* Hash the service into the service table */
1085 write_lock_bh(&__ip_vs_svc_lock);
1086 ip_vs_svc_hash(svc);
1087 write_unlock_bh(&__ip_vs_svc_lock);
1088
1089 *svc_p = svc;
1090 return 0;
1091
1092 out_err:
1093 if (svc != NULL) {
1094 if (svc->scheduler)
1095 ip_vs_unbind_scheduler(svc);
1096 if (svc->inc) {
1097 local_bh_disable();
1098 ip_vs_app_inc_put(svc->inc);
1099 local_bh_enable();
1100 }
1101 kfree(svc);
1102 }
1103 ip_vs_scheduler_put(sched);
1104
1105 out_mod_dec:
1106 /* decrease the module use count */
1107 ip_vs_use_count_dec();
1108
1109 return ret;
1110}
1111
1112
1113/*
1114 * Edit a service and bind it with a new scheduler
1115 */
1116static int
1117ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1118{
1119 struct ip_vs_scheduler *sched, *old_sched;
1120 int ret = 0;
1121
1122 /*
1123 * Lookup the scheduler, by 'u->sched_name'
1124 */
1125 sched = ip_vs_scheduler_get(u->sched_name);
1126 if (sched == NULL) {
1127 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1128 u->sched_name);
1129 return -ENOENT;
1130 }
1131 old_sched = sched;
1132
1133 write_lock_bh(&__ip_vs_svc_lock);
1134
1135 /*
1136 * Wait until all other svc users go away.
1137 */
1138 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1139
1140 /*
1141 * Set the flags and timeout value
1142 */
1143 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1144 svc->timeout = u->timeout * HZ;
1145 svc->netmask = u->netmask;
1146
1147 old_sched = svc->scheduler;
1148 if (sched != old_sched) {
1149 /*
1150 * Unbind the old scheduler
1151 */
1152 if ((ret = ip_vs_unbind_scheduler(svc))) {
1153 old_sched = sched;
1154 goto out;
1155 }
1156
1157 /*
1158 * Bind the new scheduler
1159 */
1160 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1161 /*
1162 * If ip_vs_bind_scheduler fails, restore the old
1163 * scheduler.
1164			 * The main reason for failure is lack of memory.
1165			 *
1166			 * The question is whether the old scheduler can always
1167			 * be restored. TODO: if at some point it cannot be
1168			 * restored, we must delete the service,
1169 * otherwise the system may crash.
1170 */
1171 ip_vs_bind_scheduler(svc, old_sched);
1172 old_sched = sched;
1173 goto out;
1174 }
1175 }
1176
1177 out:
1178 write_unlock_bh(&__ip_vs_svc_lock);
1179
1180 if (old_sched)
1181 ip_vs_scheduler_put(old_sched);
1182
1183 return ret;
1184}
1185
1186
1187/*
1188 * Delete a service from the service list
1189 * - The service must be unlinked, unlocked and not referenced!
1190 * - We are called under _bh lock
1191 */
1192static void __ip_vs_del_service(struct ip_vs_service *svc)
1193{
1194 struct ip_vs_dest *dest, *nxt;
1195 struct ip_vs_scheduler *old_sched;
1196
1197 ip_vs_num_services--;
1198 ip_vs_kill_estimator(&svc->stats);
1199
1200 /* Unbind scheduler */
1201 old_sched = svc->scheduler;
1202 ip_vs_unbind_scheduler(svc);
1203 if (old_sched)
1204 ip_vs_scheduler_put(old_sched);
1205
1206 /* Unbind app inc */
1207 if (svc->inc) {
1208 ip_vs_app_inc_put(svc->inc);
1209 svc->inc = NULL;
1210 }
1211
1212 /*
1213 * Unlink the whole destination list
1214 */
1215 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1216 __ip_vs_unlink_dest(svc, dest, 0);
1217 __ip_vs_del_dest(dest);
1218 }
1219
1220 /*
1221 * Update the virtual service counters
1222 */
1223 if (svc->port == FTPPORT)
1224 atomic_dec(&ip_vs_ftpsvc_counter);
1225 else if (svc->port == 0)
1226 atomic_dec(&ip_vs_nullsvc_counter);
1227
1228 /*
1229 * Free the service if nobody refers to it
1230 */
1231 if (atomic_read(&svc->refcnt) == 0)
1232 kfree(svc);
1233
1234 /* decrease the module use count */
1235 ip_vs_use_count_dec();
1236}
1237
1238/*
1239 * Delete a service from the service list
1240 */
1241static int ip_vs_del_service(struct ip_vs_service *svc)
1242{
1243 if (svc == NULL)
1244 return -EEXIST;
1245
1246 /*
1247 * Unhash it from the service table
1248 */
1249 write_lock_bh(&__ip_vs_svc_lock);
1250
1251 ip_vs_svc_unhash(svc);
1252
1253 /*
1254 * Wait until all the svc users go away.
1255 */
1256 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1257
1258 __ip_vs_del_service(svc);
1259
1260 write_unlock_bh(&__ip_vs_svc_lock);
1261
1262 return 0;
1263}
1264
1265
1266/*
1267 * Flush all the virtual services
1268 */
1269static int ip_vs_flush(void)
1270{
1271 int idx;
1272 struct ip_vs_service *svc, *nxt;
1273
1274 /*
1275 * Flush the service table hashed by <protocol,addr,port>
1276 */
1277 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1278 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1279 write_lock_bh(&__ip_vs_svc_lock);
1280 ip_vs_svc_unhash(svc);
1281 /*
1282 * Wait until all the svc users go away.
1283 */
1284 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1285 __ip_vs_del_service(svc);
1286 write_unlock_bh(&__ip_vs_svc_lock);
1287 }
1288 }
1289
1290 /*
1291 * Flush the service table hashed by fwmark
1292 */
1293 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1294 list_for_each_entry_safe(svc, nxt,
1295 &ip_vs_svc_fwm_table[idx], f_list) {
1296 write_lock_bh(&__ip_vs_svc_lock);
1297 ip_vs_svc_unhash(svc);
1298 /*
1299 * Wait until all the svc users go away.
1300 */
1301 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1302 __ip_vs_del_service(svc);
1303 write_unlock_bh(&__ip_vs_svc_lock);
1304 }
1305 }
1306
1307 return 0;
1308}
1309
1310
1311/*
1312 * Zero counters in a service or all services
1313 */
1314static int ip_vs_zero_service(struct ip_vs_service *svc)
1315{
1316 struct ip_vs_dest *dest;
1317
1318 write_lock_bh(&__ip_vs_svc_lock);
1319 list_for_each_entry(dest, &svc->destinations, n_list) {
1320 ip_vs_zero_stats(&dest->stats);
1321 }
1322 ip_vs_zero_stats(&svc->stats);
1323 write_unlock_bh(&__ip_vs_svc_lock);
1324 return 0;
1325}
1326
1327static int ip_vs_zero_all(void)
1328{
1329 int idx;
1330 struct ip_vs_service *svc;
1331
1332 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1333 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1334 ip_vs_zero_service(svc);
1335 }
1336 }
1337
1338 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1339 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1340 ip_vs_zero_service(svc);
1341 }
1342 }
1343
1344 ip_vs_zero_stats(&ip_vs_stats);
1345 return 0;
1346}
1347
1348
1349static int
1350proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1351 void __user *buffer, size_t *lenp, loff_t *ppos)
1352{
1353 int *valp = table->data;
1354 int val = *valp;
1355 int rc;
1356
1357 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1358 if (write && (*valp != val)) {
1359 if ((*valp < 0) || (*valp > 3)) {
1360 /* Restore the correct value */
1361 *valp = val;
1362 } else {
1363 local_bh_disable();
1364 update_defense_level();
1365 local_bh_enable();
1366 }
1367 }
1368 return rc;
1369}
1370
1371
1372static int
1373proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1374 void __user *buffer, size_t *lenp, loff_t *ppos)
1375{
1376 int *valp = table->data;
1377 int val[2];
1378 int rc;
1379
1380 /* backup the value first */
1381 memcpy(val, valp, sizeof(val));
1382
1383 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1384 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1385 /* Restore the correct value */
1386 memcpy(valp, val, sizeof(val));
1387 }
1388 return rc;
1389}
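/*
 * A small user-space sketch of driving this handler, assuming the usual
 * /proc/sys/net/ipv4/vs/ path built from the ctl_table chain defined below
 * (root privileges required).  The value is a pair of integers, commonly a
 * sync threshold and period; the handler silently restores the previous pair
 * unless 0 <= first < second.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/vs/sync_threshold", "w");

	if (!f)
		return 1;
	fprintf(f, "3 50\n");	/* accepted: 0 <= 3 < 50 */
	fclose(f);
	return 0;
}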
1390
1391
1392/*
1393 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1394 */
1395
1396static struct ctl_table vs_vars[] = {
1397 {
1398 .ctl_name = NET_IPV4_VS_AMEMTHRESH,
1399 .procname = "amemthresh",
1400 .data = &sysctl_ip_vs_amemthresh,
1401 .maxlen = sizeof(int),
1402 .mode = 0644,
1403 .proc_handler = &proc_dointvec,
1404 },
1405#ifdef CONFIG_IP_VS_DEBUG
1406 {
1407 .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
1408 .procname = "debug_level",
1409 .data = &sysctl_ip_vs_debug_level,
1410 .maxlen = sizeof(int),
1411 .mode = 0644,
1412 .proc_handler = &proc_dointvec,
1413 },
1414#endif
1415 {
1416 .ctl_name = NET_IPV4_VS_AMDROPRATE,
1417 .procname = "am_droprate",
1418 .data = &sysctl_ip_vs_am_droprate,
1419 .maxlen = sizeof(int),
1420 .mode = 0644,
1421 .proc_handler = &proc_dointvec,
1422 },
1423 {
1424 .ctl_name = NET_IPV4_VS_DROP_ENTRY,
1425 .procname = "drop_entry",
1426 .data = &sysctl_ip_vs_drop_entry,
1427 .maxlen = sizeof(int),
1428 .mode = 0644,
1429 .proc_handler = &proc_do_defense_mode,
1430 },
1431 {
1432 .ctl_name = NET_IPV4_VS_DROP_PACKET,
1433 .procname = "drop_packet",
1434 .data = &sysctl_ip_vs_drop_packet,
1435 .maxlen = sizeof(int),
1436 .mode = 0644,
1437 .proc_handler = &proc_do_defense_mode,
1438 },
1439 {
1440 .ctl_name = NET_IPV4_VS_SECURE_TCP,
1441 .procname = "secure_tcp",
1442 .data = &sysctl_ip_vs_secure_tcp,
1443 .maxlen = sizeof(int),
1444 .mode = 0644,
1445 .proc_handler = &proc_do_defense_mode,
1446 },
1447#if 0
1448 {
1449 .ctl_name = NET_IPV4_VS_TO_ES,
1450 .procname = "timeout_established",
1451 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1452 .maxlen = sizeof(int),
1453 .mode = 0644,
1454 .proc_handler = &proc_dointvec_jiffies,
1455 },
1456 {
1457 .ctl_name = NET_IPV4_VS_TO_SS,
1458 .procname = "timeout_synsent",
1459 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1460 .maxlen = sizeof(int),
1461 .mode = 0644,
1462 .proc_handler = &proc_dointvec_jiffies,
1463 },
1464 {
1465 .ctl_name = NET_IPV4_VS_TO_SR,
1466 .procname = "timeout_synrecv",
1467 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1468 .maxlen = sizeof(int),
1469 .mode = 0644,
1470 .proc_handler = &proc_dointvec_jiffies,
1471 },
1472 {
1473 .ctl_name = NET_IPV4_VS_TO_FW,
1474 .procname = "timeout_finwait",
1475 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1476 .maxlen = sizeof(int),
1477 .mode = 0644,
1478 .proc_handler = &proc_dointvec_jiffies,
1479 },
1480 {
1481 .ctl_name = NET_IPV4_VS_TO_TW,
1482 .procname = "timeout_timewait",
1483 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1484 .maxlen = sizeof(int),
1485 .mode = 0644,
1486 .proc_handler = &proc_dointvec_jiffies,
1487 },
1488 {
1489 .ctl_name = NET_IPV4_VS_TO_CL,
1490 .procname = "timeout_close",
1491 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1492 .maxlen = sizeof(int),
1493 .mode = 0644,
1494 .proc_handler = &proc_dointvec_jiffies,
1495 },
1496 {
1497 .ctl_name = NET_IPV4_VS_TO_CW,
1498 .procname = "timeout_closewait",
1499 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1500 .maxlen = sizeof(int),
1501 .mode = 0644,
1502 .proc_handler = &proc_dointvec_jiffies,
1503 },
1504 {
1505 .ctl_name = NET_IPV4_VS_TO_LA,
1506 .procname = "timeout_lastack",
1507 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1508 .maxlen = sizeof(int),
1509 .mode = 0644,
1510 .proc_handler = &proc_dointvec_jiffies,
1511 },
1512 {
1513 .ctl_name = NET_IPV4_VS_TO_LI,
1514 .procname = "timeout_listen",
1515 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1516 .maxlen = sizeof(int),
1517 .mode = 0644,
1518 .proc_handler = &proc_dointvec_jiffies,
1519 },
1520 {
1521 .ctl_name = NET_IPV4_VS_TO_SA,
1522 .procname = "timeout_synack",
1523 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1524 .maxlen = sizeof(int),
1525 .mode = 0644,
1526 .proc_handler = &proc_dointvec_jiffies,
1527 },
1528 {
1529 .ctl_name = NET_IPV4_VS_TO_UDP,
1530 .procname = "timeout_udp",
1531 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1532 .maxlen = sizeof(int),
1533 .mode = 0644,
1534 .proc_handler = &proc_dointvec_jiffies,
1535 },
1536 {
1537 .ctl_name = NET_IPV4_VS_TO_ICMP,
1538 .procname = "timeout_icmp",
1539 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1540 .maxlen = sizeof(int),
1541 .mode = 0644,
1542 .proc_handler = &proc_dointvec_jiffies,
1543 },
1544#endif
1545 {
1546 .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
1547 .procname = "cache_bypass",
1548 .data = &sysctl_ip_vs_cache_bypass,
1549 .maxlen = sizeof(int),
1550 .mode = 0644,
1551 .proc_handler = &proc_dointvec,
1552 },
1553 {
1554 .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1555 .procname = "expire_nodest_conn",
1556 .data = &sysctl_ip_vs_expire_nodest_conn,
1557 .maxlen = sizeof(int),
1558 .mode = 0644,
1559 .proc_handler = &proc_dointvec,
1560 },
1561 {
1562 .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1563 .procname = "expire_quiescent_template",
1564 .data = &sysctl_ip_vs_expire_quiescent_template,
1565 .maxlen = sizeof(int),
1566 .mode = 0644,
1567 .proc_handler = &proc_dointvec,
1568 },
1569 {
1570 .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
1571 .procname = "sync_threshold",
1572 .data = &sysctl_ip_vs_sync_threshold,
1573 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1574 .mode = 0644,
1575 .proc_handler = &proc_do_sync_threshold,
1576 },
1577 {
1578 .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
1579 .procname = "nat_icmp_send",
1580 .data = &sysctl_ip_vs_nat_icmp_send,
1581 .maxlen = sizeof(int),
1582 .mode = 0644,
1583 .proc_handler = &proc_dointvec,
1584 },
1585 { .ctl_name = 0 }
1586};
1587
1588static ctl_table vs_table[] = {
1589 {
1590 .ctl_name = NET_IPV4_VS,
1591 .procname = "vs",
1592 .mode = 0555,
1593 .child = vs_vars
1594 },
1595 { .ctl_name = 0 }
1596};
1597
1598static ctl_table ipv4_table[] = {
1599 {
1600 .ctl_name = NET_IPV4,
1601 .procname = "ipv4",
1602 .mode = 0555,
1603 .child = vs_table,
1604 },
1605 { .ctl_name = 0 }
1606};
1607
1608static ctl_table vs_root_table[] = {
1609 {
1610 .ctl_name = CTL_NET,
1611 .procname = "net",
1612 .mode = 0555,
1613 .child = ipv4_table,
1614 },
1615 { .ctl_name = 0 }
1616};
1617
1618static struct ctl_table_header * sysctl_header;
1619
1620#ifdef CONFIG_PROC_FS
1621
1622struct ip_vs_iter {
1623 struct list_head *table;
1624 int bucket;
1625};
1626
1627/*
1628 * Write the contents of the VS rule table to a PROCfs file.
1629 * (It is kept just for backward compatibility)
1630 */
1631static inline const char *ip_vs_fwd_name(unsigned flags)
1632{
1633 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1634 case IP_VS_CONN_F_LOCALNODE:
1635 return "Local";
1636 case IP_VS_CONN_F_TUNNEL:
1637 return "Tunnel";
1638 case IP_VS_CONN_F_DROUTE:
1639 return "Route";
1640 default:
1641 return "Masq";
1642 }
1643}
1644
1645
1646/* Get the Nth entry in the two lists */
1647static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1648{
1649 struct ip_vs_iter *iter = seq->private;
1650 int idx;
1651 struct ip_vs_service *svc;
1652
1653 /* look in hash by protocol */
1654 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1655 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1656 if (pos-- == 0){
1657 iter->table = ip_vs_svc_table;
1658 iter->bucket = idx;
1659 return svc;
1660 }
1661 }
1662 }
1663
1664 /* keep looking in fwmark */
1665 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1666 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1667 if (pos-- == 0) {
1668 iter->table = ip_vs_svc_fwm_table;
1669 iter->bucket = idx;
1670 return svc;
1671 }
1672 }
1673 }
1674
1675 return NULL;
1676}
1677
1678static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1679{
1680
1681 read_lock_bh(&__ip_vs_svc_lock);
1682 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1683}
1684
1685
1686static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1687{
1688 struct list_head *e;
1689 struct ip_vs_iter *iter;
1690 struct ip_vs_service *svc;
1691
1692 ++*pos;
1693 if (v == SEQ_START_TOKEN)
1694 return ip_vs_info_array(seq,0);
1695
1696 svc = v;
1697 iter = seq->private;
1698
1699 if (iter->table == ip_vs_svc_table) {
1700 /* next service in table hashed by protocol */
1701 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1702 return list_entry(e, struct ip_vs_service, s_list);
1703
1704
1705 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1706 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1707 s_list) {
1708 return svc;
1709 }
1710 }
1711
1712 iter->table = ip_vs_svc_fwm_table;
1713 iter->bucket = -1;
1714 goto scan_fwmark;
1715 }
1716
1717 /* next service in hashed by fwmark */
1718 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1719 return list_entry(e, struct ip_vs_service, f_list);
1720
1721 scan_fwmark:
1722 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1723 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1724 f_list)
1725 return svc;
1726 }
1727
1728 return NULL;
1729}
1730
1731static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1732{
1733 read_unlock_bh(&__ip_vs_svc_lock);
1734}
1735
1736
1737static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1738{
1739 if (v == SEQ_START_TOKEN) {
1740 seq_printf(seq,
1741 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1742 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1743 seq_puts(seq,
1744 "Prot LocalAddress:Port Scheduler Flags\n");
1745 seq_puts(seq,
1746 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1747 } else {
1748 const struct ip_vs_service *svc = v;
1749 const struct ip_vs_iter *iter = seq->private;
1750 const struct ip_vs_dest *dest;
1751
1752 if (iter->table == ip_vs_svc_table)
1753 seq_printf(seq, "%s %08X:%04X %s ",
1754 ip_vs_proto_name(svc->protocol),
1755 ntohl(svc->addr),
1756 ntohs(svc->port),
1757 svc->scheduler->name);
1758 else
1759 seq_printf(seq, "FWM %08X %s ",
1760 svc->fwmark, svc->scheduler->name);
1761
1762 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1763 seq_printf(seq, "persistent %d %08X\n",
1764 svc->timeout,
1765 ntohl(svc->netmask));
1766 else
1767 seq_putc(seq, '\n');
1768
1769 list_for_each_entry(dest, &svc->destinations, n_list) {
1770 seq_printf(seq,
1771 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1772 ntohl(dest->addr), ntohs(dest->port),
1773 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1774 atomic_read(&dest->weight),
1775 atomic_read(&dest->activeconns),
1776 atomic_read(&dest->inactconns));
1777 }
1778 }
1779 return 0;
1780}
1781
1782static struct seq_operations ip_vs_info_seq_ops = {
1783 .start = ip_vs_info_seq_start,
1784 .next = ip_vs_info_seq_next,
1785 .stop = ip_vs_info_seq_stop,
1786 .show = ip_vs_info_seq_show,
1787};
1788
1789static int ip_vs_info_open(struct inode *inode, struct file *file)
1790{
1791 struct seq_file *seq;
1792 int rc = -ENOMEM;
1793 struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1794
1795 if (!s)
1796 goto out;
1797
1798 rc = seq_open(file, &ip_vs_info_seq_ops);
1799 if (rc)
1800 goto out_kfree;
1801
1802 seq = file->private_data;
1803 seq->private = s;
1804 memset(s, 0, sizeof(*s));
1805out:
1806 return rc;
1807out_kfree:
1808 kfree(s);
1809 goto out;
1810}
1811
1812static struct file_operations ip_vs_info_fops = {
1813 .owner = THIS_MODULE,
1814 .open = ip_vs_info_open,
1815 .read = seq_read,
1816 .llseek = seq_lseek,
1817 .release = seq_release_private,
1818};
1819
1820#endif
1821
1822struct ip_vs_stats ip_vs_stats;
1823
1824#ifdef CONFIG_PROC_FS
1825static int ip_vs_stats_show(struct seq_file *seq, void *v)
1826{
1827
1828/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1829 seq_puts(seq,
1830 " Total Incoming Outgoing Incoming Outgoing\n");
1831 seq_printf(seq,
1832 " Conns Packets Packets Bytes Bytes\n");
1833
1834 spin_lock_bh(&ip_vs_stats.lock);
1835 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1836 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1837 (unsigned long long) ip_vs_stats.inbytes,
1838 (unsigned long long) ip_vs_stats.outbytes);
1839
1840/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1841 seq_puts(seq,
1842 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1843 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1844 ip_vs_stats.cps,
1845 ip_vs_stats.inpps,
1846 ip_vs_stats.outpps,
1847 ip_vs_stats.inbps,
1848 ip_vs_stats.outbps);
1849 spin_unlock_bh(&ip_vs_stats.lock);
1850
1851 return 0;
1852}
1853
1854static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1855{
1856 return single_open(file, ip_vs_stats_show, NULL);
1857}
1858
1859static struct file_operations ip_vs_stats_fops = {
1860 .owner = THIS_MODULE,
1861 .open = ip_vs_stats_seq_open,
1862 .read = seq_read,
1863 .llseek = seq_lseek,
1864 .release = single_release,
1865};
1866
1867#endif
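/*
 * A minimal sketch of a user-space reader for the /proc/net/ip_vs_stats file
 * created in ip_vs_control_init() below.  Note that ip_vs_stats_show() prints
 * the counters and rates in hexadecimal (%8X / %16LX), so a parser must use
 * %x / %llx rather than decimal conversions.
 */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/ip_vs_stats", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}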
1868
1869/*
1870 * Set timeout values for tcp tcpfin udp in the timeout_table.
1871 */
1872static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1873{
1874 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1875 u->tcp_timeout,
1876 u->tcp_fin_timeout,
1877 u->udp_timeout);
1878
1879#ifdef CONFIG_IP_VS_PROTO_TCP
1880 if (u->tcp_timeout) {
1881 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1882 = u->tcp_timeout * HZ;
1883 }
1884
1885 if (u->tcp_fin_timeout) {
1886 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1887 = u->tcp_fin_timeout * HZ;
1888 }
1889#endif
1890
1891#ifdef CONFIG_IP_VS_PROTO_UDP
1892 if (u->udp_timeout) {
1893 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1894 = u->udp_timeout * HZ;
1895 }
1896#endif
1897 return 0;
1898}
1899
1900
1901#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
1902#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
1903#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
1904 sizeof(struct ip_vs_dest_user))
1905#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
1906#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
1907#define MAX_ARG_LEN SVCDEST_ARG_LEN
1908
1909static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1910 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1911 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1912 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1913 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1914 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1915 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1916 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1917 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1918 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1919 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1920 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
1921};
1922
1923static int
1924do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1925{
1926 int ret;
1927 unsigned char arg[MAX_ARG_LEN];
1928 struct ip_vs_service_user *usvc;
1929 struct ip_vs_service *svc;
1930 struct ip_vs_dest_user *udest;
1931
1932 if (!capable(CAP_NET_ADMIN))
1933 return -EPERM;
1934
1935 if (len != set_arglen[SET_CMDID(cmd)]) {
1936 IP_VS_ERR("set_ctl: len %u != %u\n",
1937 len, set_arglen[SET_CMDID(cmd)]);
1938 return -EINVAL;
1939 }
1940
1941 if (copy_from_user(arg, user, len) != 0)
1942 return -EFAULT;
1943
1944 /* increase the module use count */
1945 ip_vs_use_count_inc();
1946
1947 if (down_interruptible(&__ip_vs_mutex)) {
1948 ret = -ERESTARTSYS;
1949 goto out_dec;
1950 }
1951
1952 if (cmd == IP_VS_SO_SET_FLUSH) {
1953 /* Flush the virtual service */
1954 ret = ip_vs_flush();
1955 goto out_unlock;
1956 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1957 /* Set timeout values for (tcp tcpfin udp) */
1958 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1959 goto out_unlock;
1960 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1961 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1962 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1963 goto out_unlock;
1964 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1965 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1966 ret = stop_sync_thread(dm->state);
1967 goto out_unlock;
1968 }
1969
1970 usvc = (struct ip_vs_service_user *)arg;
1971 udest = (struct ip_vs_dest_user *)(usvc + 1);
1972
1973 if (cmd == IP_VS_SO_SET_ZERO) {
1974 /* if no service address is set, zero counters in all */
1975 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1976 ret = ip_vs_zero_all();
1977 goto out_unlock;
1978 }
1979 }
1980
1981 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1982 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1983 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1984 usvc->protocol, NIPQUAD(usvc->addr),
1985 ntohs(usvc->port), usvc->sched_name);
1986 ret = -EFAULT;
1987 goto out_unlock;
1988 }
1989
1990 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1991 if (usvc->fwmark == 0)
1992 svc = __ip_vs_service_get(usvc->protocol,
1993 usvc->addr, usvc->port);
1994 else
1995 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1996
1997 if (cmd != IP_VS_SO_SET_ADD
1998 && (svc == NULL || svc->protocol != usvc->protocol)) {
1999 ret = -ESRCH;
2000 goto out_unlock;
2001 }
2002
2003 switch (cmd) {
2004 case IP_VS_SO_SET_ADD:
2005 if (svc != NULL)
2006 ret = -EEXIST;
2007 else
2008 ret = ip_vs_add_service(usvc, &svc);
2009 break;
2010 case IP_VS_SO_SET_EDIT:
2011 ret = ip_vs_edit_service(svc, usvc);
2012 break;
2013 case IP_VS_SO_SET_DEL:
2014 ret = ip_vs_del_service(svc);
2015 if (!ret)
2016 goto out_unlock;
2017 break;
2018 case IP_VS_SO_SET_ZERO:
2019 ret = ip_vs_zero_service(svc);
2020 break;
2021 case IP_VS_SO_SET_ADDDEST:
2022 ret = ip_vs_add_dest(svc, udest);
2023 break;
2024 case IP_VS_SO_SET_EDITDEST:
2025 ret = ip_vs_edit_dest(svc, udest);
2026 break;
2027 case IP_VS_SO_SET_DELDEST:
2028 ret = ip_vs_del_dest(svc, udest);
2029 break;
2030 default:
2031 ret = -EINVAL;
2032 }
2033
2034 if (svc)
2035 ip_vs_service_put(svc);
2036
2037 out_unlock:
2038 up(&__ip_vs_mutex);
2039 out_dec:
2040 /* decrease the module use count */
2041 ip_vs_use_count_dec();
2042
2043 return ret;
2044}
2045
2046
2047static void
2048ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2049{
2050 spin_lock_bh(&src->lock);
2051 memcpy(dst, src, (char*)&src->lock - (char*)src);
2052 spin_unlock_bh(&src->lock);
2053}
2054
2055static void
2056ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2057{
2058 dst->protocol = src->protocol;
2059 dst->addr = src->addr;
2060 dst->port = src->port;
2061 dst->fwmark = src->fwmark;
2062 strcpy(dst->sched_name, src->scheduler->name);
2063 dst->flags = src->flags;
2064 dst->timeout = src->timeout / HZ;
2065 dst->netmask = src->netmask;
2066 dst->num_dests = src->num_dests;
2067 ip_vs_copy_stats(&dst->stats, &src->stats);
2068}
2069
2070static inline int
2071__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2072 struct ip_vs_get_services __user *uptr)
2073{
2074 int idx, count=0;
2075 struct ip_vs_service *svc;
2076 struct ip_vs_service_entry entry;
2077 int ret = 0;
2078
2079 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081 if (count >= get->num_services)
2082 goto out;
2083 ip_vs_copy_service(&entry, svc);
2084 if (copy_to_user(&uptr->entrytable[count],
2085 &entry, sizeof(entry))) {
2086 ret = -EFAULT;
2087 goto out;
2088 }
2089 count++;
2090 }
2091 }
2092
2093 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2094 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2095 if (count >= get->num_services)
2096 goto out;
2097 ip_vs_copy_service(&entry, svc);
2098 if (copy_to_user(&uptr->entrytable[count],
2099 &entry, sizeof(entry))) {
2100 ret = -EFAULT;
2101 goto out;
2102 }
2103 count++;
2104 }
2105 }
2106 out:
2107 return ret;
2108}
2109
2110static inline int
2111__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2112 struct ip_vs_get_dests __user *uptr)
2113{
2114 struct ip_vs_service *svc;
2115 int ret = 0;
2116
2117 if (get->fwmark)
2118 svc = __ip_vs_svc_fwm_get(get->fwmark);
2119 else
2120 svc = __ip_vs_service_get(get->protocol,
2121 get->addr, get->port);
2122 if (svc) {
2123 int count = 0;
2124 struct ip_vs_dest *dest;
2125 struct ip_vs_dest_entry entry;
2126
2127 list_for_each_entry(dest, &svc->destinations, n_list) {
2128 if (count >= get->num_dests)
2129 break;
2130
2131 entry.addr = dest->addr;
2132 entry.port = dest->port;
2133 entry.conn_flags = atomic_read(&dest->conn_flags);
2134 entry.weight = atomic_read(&dest->weight);
2135 entry.u_threshold = dest->u_threshold;
2136 entry.l_threshold = dest->l_threshold;
2137 entry.activeconns = atomic_read(&dest->activeconns);
2138 entry.inactconns = atomic_read(&dest->inactconns);
2139 entry.persistconns = atomic_read(&dest->persistconns);
2140 ip_vs_copy_stats(&entry.stats, &dest->stats);
2141 if (copy_to_user(&uptr->entrytable[count],
2142 &entry, sizeof(entry))) {
2143 ret = -EFAULT;
2144 break;
2145 }
2146 count++;
2147 }
2148 ip_vs_service_put(svc);
2149 } else
2150 ret = -ESRCH;
2151 return ret;
2152}
2153
2154static inline void
2155__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2156{
2157#ifdef CONFIG_IP_VS_PROTO_TCP
2158 u->tcp_timeout =
2159 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2160 u->tcp_fin_timeout =
2161 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2162#endif
2163#ifdef CONFIG_IP_VS_PROTO_UDP
2164 u->udp_timeout =
2165 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2166#endif
2167}
2168
2169
2170#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2171#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2172#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2173#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2174#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2175#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2176#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2177
2178static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2179 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2180 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2181 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2182 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2183 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2184 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2185 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2186};
2187
2188static int
2189do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2190{
2191 unsigned char arg[128];
2192 int ret = 0;
2193
2194 if (!capable(CAP_NET_ADMIN))
2195 return -EPERM;
2196
2197 if (*len < get_arglen[GET_CMDID(cmd)]) {
2198 IP_VS_ERR("get_ctl: len %u < %u\n",
2199 *len, get_arglen[GET_CMDID(cmd)]);
2200 return -EINVAL;
2201 }
2202
2203 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2204 return -EFAULT;
2205
2206 if (down_interruptible(&__ip_vs_mutex))
2207 return -ERESTARTSYS;
2208
2209 switch (cmd) {
2210 case IP_VS_SO_GET_VERSION:
2211 {
2212 char buf[64];
2213
2214 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2215 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2216 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2217 ret = -EFAULT;
2218 goto out;
2219 }
2220 *len = strlen(buf)+1;
2221 }
2222 break;
2223
2224 case IP_VS_SO_GET_INFO:
2225 {
2226 struct ip_vs_getinfo info;
2227 info.version = IP_VS_VERSION_CODE;
2228 info.size = IP_VS_CONN_TAB_SIZE;
2229 info.num_services = ip_vs_num_services;
2230 if (copy_to_user(user, &info, sizeof(info)) != 0)
2231 ret = -EFAULT;
2232 }
2233 break;
2234
2235 case IP_VS_SO_GET_SERVICES:
2236 {
2237 struct ip_vs_get_services *get;
2238 int size;
2239
2240 get = (struct ip_vs_get_services *)arg;
2241 size = sizeof(*get) +
2242 sizeof(struct ip_vs_service_entry) * get->num_services;
2243 if (*len != size) {
2244 IP_VS_ERR("length: %u != %u\n", *len, size);
2245 ret = -EINVAL;
2246 goto out;
2247 }
2248 ret = __ip_vs_get_service_entries(get, user);
2249 }
2250 break;
2251
2252 case IP_VS_SO_GET_SERVICE:
2253 {
2254 struct ip_vs_service_entry *entry;
2255 struct ip_vs_service *svc;
2256
2257 entry = (struct ip_vs_service_entry *)arg;
2258 if (entry->fwmark)
2259 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2260 else
2261 svc = __ip_vs_service_get(entry->protocol,
2262 entry->addr, entry->port);
2263 if (svc) {
2264 ip_vs_copy_service(entry, svc);
2265 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2266 ret = -EFAULT;
2267 ip_vs_service_put(svc);
2268 } else
2269 ret = -ESRCH;
2270 }
2271 break;
2272
2273 case IP_VS_SO_GET_DESTS:
2274 {
2275 struct ip_vs_get_dests *get;
2276 int size;
2277
2278 get = (struct ip_vs_get_dests *)arg;
2279 size = sizeof(*get) +
2280 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2281 if (*len != size) {
2282 IP_VS_ERR("length: %u != %u\n", *len, size);
2283 ret = -EINVAL;
2284 goto out;
2285 }
2286 ret = __ip_vs_get_dest_entries(get, user);
2287 }
2288 break;
2289
2290 case IP_VS_SO_GET_TIMEOUT:
2291 {
2292 struct ip_vs_timeout_user t;
2293
2294 __ip_vs_get_timeouts(&t);
2295 if (copy_to_user(user, &t, sizeof(t)) != 0)
2296 ret = -EFAULT;
2297 }
2298 break;
2299
2300 case IP_VS_SO_GET_DAEMON:
2301 {
2302 struct ip_vs_daemon_user d[2];
2303
2304 memset(&d, 0, sizeof(d));
2305 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2306 d[0].state = IP_VS_STATE_MASTER;
2307 strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
2308 d[0].syncid = ip_vs_master_syncid;
2309 }
2310 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2311 d[1].state = IP_VS_STATE_BACKUP;
2312 strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
2313 d[1].syncid = ip_vs_backup_syncid;
2314 }
2315 if (copy_to_user(user, &d, sizeof(d)) != 0)
2316 ret = -EFAULT;
2317 }
2318 break;
2319
2320 default:
2321 ret = -EINVAL;
2322 }
2323
2324 out:
2325 up(&__ip_vs_mutex);
2326 return ret;
2327}
2328
2329
2330static struct nf_sockopt_ops ip_vs_sockopts = {
2331 .pf = PF_INET,
2332 .set_optmin = IP_VS_BASE_CTL,
2333 .set_optmax = IP_VS_SO_SET_MAX+1,
2334 .set = do_ip_vs_set_ctl,
2335 .get_optmin = IP_VS_BASE_CTL,
2336 .get_optmax = IP_VS_SO_GET_MAX+1,
2337 .get = do_ip_vs_get_ctl,
2338};
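/*
 * A hedged user-space sketch of how a management tool reaches these hooks
 * through plain {get,set}sockopt() calls at the IP level.  The "ip_vs.h"
 * include stands for a user-space copy of the IPVS definitions (the kernel
 * header is not normally exported), and the raw socket mirrors what tools
 * like ipvsadm typically open; both are assumptions.  CAP_NET_ADMIN is
 * required, as enforced in do_ip_vs_set_ctl()/do_ip_vs_get_ctl() above.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include "ip_vs.h"		/* assumed user-space IPVS definitions */

int main(void)
{
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
	char buf[64];
	socklen_t len = sizeof(buf);
	struct ip_vs_timeout_user to;

	if (fd < 0)
		return 1;

	/* returns the same banner that do_ip_vs_get_ctl() formats */
	if (getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_VERSION, buf, &len) == 0)
		printf("%s\n", buf);

	/* zero fields are left unchanged by ip_vs_set_timeout() */
	memset(&to, 0, sizeof(to));
	to.tcp_timeout = 900;		/* seconds */
	to.tcp_fin_timeout = 120;
	to.udp_timeout = 300;
	if (setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_TIMEOUT, &to, sizeof(to)))
		perror("IP_VS_SO_SET_TIMEOUT");
	return 0;
}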
2339
2340
2341int ip_vs_control_init(void)
2342{
2343 int ret;
2344 int idx;
2345
2346 EnterFunction(2);
2347
2348 ret = nf_register_sockopt(&ip_vs_sockopts);
2349 if (ret) {
2350 IP_VS_ERR("cannot register sockopt.\n");
2351 return ret;
2352 }
2353
2354 proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2355 proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2356
2357 sysctl_header = register_sysctl_table(vs_root_table, 0);
2358
2359 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2360 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2361 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2362 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2363 }
2364 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2365 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2366 }
2367
2368 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2369 spin_lock_init(&ip_vs_stats.lock);
2370 ip_vs_new_estimator(&ip_vs_stats);
2371
2372 /* Hook the defense timer */
2373 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2374
2375 LeaveFunction(2);
2376 return 0;
2377}
2378
2379
2380void ip_vs_control_cleanup(void)
2381{
2382 EnterFunction(2);
2383 ip_vs_trash_cleanup();
2384 cancel_rearming_delayed_work(&defense_work);
2385 ip_vs_kill_estimator(&ip_vs_stats);
2386 unregister_sysctl_table(sysctl_header);
2387 proc_net_remove("ip_vs_stats");
2388 proc_net_remove("ip_vs");
2389 nf_unregister_sockopt(&ip_vs_sockopts);
2390 LeaveFunction(2);
2391}
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
new file mode 100644
index 000000000000..f3bc320dce93
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -0,0 +1,258 @@
1/*
2 * IPVS: Destination Hashing scheduling module
3 *
4 * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * Inspired by the consistent hashing scheduler patch from
9 * Thomas Proell <proellt@gmx.de>
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 *
16 * Changes:
17 *
18 */
19
20/*
21 * The dh algorithm selects a server by the hash key of the destination IP
22 * address. The pseudo code is as follows:
23 *
24 * n <- servernode[dest_ip];
25 * if (n is dead) OR
26 * (n is overloaded) OR (n.weight <= 0) then
27 * return NULL;
28 *
29 * return n;
30 *
31 * Note that servernode is a 256-bucket hash table that maps the hash
32 * index derived from the packet destination IP address to the current server
33 * array. If the dh scheduler is used in a cache cluster, it is good to
34 * combine it with the cache_bypass feature. When the statically assigned
35 * server is dead or overloaded, the load balancer can bypass the cache
36 * server and send requests to the original server directly.
37 *
38 */
39
40#include <linux/module.h>
41#include <linux/kernel.h>
42
43#include <net/ip_vs.h>
44
45
46/*
47 * IPVS DH bucket
48 */
49struct ip_vs_dh_bucket {
50 struct ip_vs_dest *dest; /* real server (cache) */
51};
52
53/*
54 * for IPVS DH entry hash table
55 */
56#ifndef CONFIG_IP_VS_DH_TAB_BITS
57#define CONFIG_IP_VS_DH_TAB_BITS 8
58#endif
59#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
60#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
61#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
62
63
64/*
65 * Returns hash value for IPVS DH entry
66 */
67static inline unsigned ip_vs_dh_hashkey(__u32 addr)
68{
69 return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
70}
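/*
 * A small user-space sketch of the multiplicative hash above: 2654435761 is
 * the 32-bit golden-ratio constant, and masking with IP_VS_DH_TAB_MASK keeps
 * the low 8 bits as the bucket index.  The sample address is arbitrary.
 */
#include <stdio.h>
#include <arpa/inet.h>

#define DH_TAB_MASK 0xff		/* 256 buckets, as IP_VS_DH_TAB_SIZE */

static unsigned dh_hashkey(unsigned int addr_be)
{
	/* same computation as ip_vs_dh_hashkey() */
	return (ntohl(addr_be) * 2654435761UL) & DH_TAB_MASK;
}

int main(void)
{
	struct in_addr a;

	if (inet_aton("192.168.0.1", &a) == 0)
		return 1;
	printf("bucket = %u\n", dh_hashkey(a.s_addr));
	return 0;
}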
71
72
73/*
74 * Get ip_vs_dest associated with supplied parameters.
75 */
76static inline struct ip_vs_dest *
77ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr)
78{
79 return (tbl[ip_vs_dh_hashkey(addr)]).dest;
80}
81
82
83/*
84 * Assign all the hash buckets of the specified table with the service.
85 */
86static int
87ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
88{
89 int i;
90 struct ip_vs_dh_bucket *b;
91 struct list_head *p;
92 struct ip_vs_dest *dest;
93
94 b = tbl;
95 p = &svc->destinations;
96 for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
97 if (list_empty(p)) {
98 b->dest = NULL;
99 } else {
100 if (p == &svc->destinations)
101 p = p->next;
102
103 dest = list_entry(p, struct ip_vs_dest, n_list);
104 atomic_inc(&dest->refcnt);
105 b->dest = dest;
106
107 p = p->next;
108 }
109 b++;
110 }
111 return 0;
112}
113
114
115/*
116 * Flush all the hash buckets of the specified table.
117 */
118static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
119{
120 int i;
121 struct ip_vs_dh_bucket *b;
122
123 b = tbl;
124 for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
125 if (b->dest) {
126 atomic_dec(&b->dest->refcnt);
127 b->dest = NULL;
128 }
129 b++;
130 }
131}
132
133
134static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
135{
136 struct ip_vs_dh_bucket *tbl;
137
138 /* allocate the DH table for this service */
139 tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
140 GFP_ATOMIC);
141 if (tbl == NULL) {
142 IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
143 return -ENOMEM;
144 }
145 svc->sched_data = tbl;
146 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
147 "current service\n",
148 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
149
150 /* assign the hash buckets with the updated service */
151 ip_vs_dh_assign(tbl, svc);
152
153 return 0;
154}
155
156
157static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
158{
159 struct ip_vs_dh_bucket *tbl = svc->sched_data;
160
161 /* got to clean up hash buckets here */
162 ip_vs_dh_flush(tbl);
163
164 /* release the table itself */
165 kfree(svc->sched_data);
166 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
167 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
168
169 return 0;
170}
171
172
173static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
174{
175 struct ip_vs_dh_bucket *tbl = svc->sched_data;
176
177 /* got to clean up hash buckets here */
178 ip_vs_dh_flush(tbl);
179
180 /* assign the hash buckets with the updated service */
181 ip_vs_dh_assign(tbl, svc);
182
183 return 0;
184}
185
186
187/*
188 * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
189 * consider that the server is overloaded here.
190 */
191static inline int is_overloaded(struct ip_vs_dest *dest)
192{
193 return dest->flags & IP_VS_DEST_F_OVERLOAD;
194}
195
196
197/*
198 * Destination hashing scheduling
199 */
200static struct ip_vs_dest *
201ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
202{
203 struct ip_vs_dest *dest;
204 struct ip_vs_dh_bucket *tbl;
205 struct iphdr *iph = skb->nh.iph;
206
207 IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
208
209 tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
210 dest = ip_vs_dh_get(tbl, iph->daddr);
211 if (!dest
212 || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
213 || atomic_read(&dest->weight) <= 0
214 || is_overloaded(dest)) {
215 return NULL;
216 }
217
218 IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
219 "--> server %u.%u.%u.%u:%d\n",
220 NIPQUAD(iph->daddr),
221 NIPQUAD(dest->addr),
222 ntohs(dest->port));
223
224 return dest;
225}
226
227
228/*
229 * IPVS DH Scheduler structure
230 */
231static struct ip_vs_scheduler ip_vs_dh_scheduler =
232{
233 .name = "dh",
234 .refcnt = ATOMIC_INIT(0),
235 .module = THIS_MODULE,
236 .init_service = ip_vs_dh_init_svc,
237 .done_service = ip_vs_dh_done_svc,
238 .update_service = ip_vs_dh_update_svc,
239 .schedule = ip_vs_dh_schedule,
240};
241
242
243static int __init ip_vs_dh_init(void)
244{
245 INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list);
246 return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
247}
248
249
250static void __exit ip_vs_dh_cleanup(void)
251{
252 unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
253}
254
255
256module_init(ip_vs_dh_init);
257module_exit(ip_vs_dh_cleanup);
258MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
new file mode 100644
index 000000000000..67b3e2fc1fa1
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -0,0 +1,200 @@
1/*
2 * ip_vs_est.c: simple rate estimator for IPVS
3 *
4 * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16#include <linux/kernel.h>
17#include <linux/types.h>
18
19#include <net/ip_vs.h>
20
21/*
22  This code estimates the rate over a short interval (such as 8
23  seconds) for virtual services and real servers. To measure the rate over
24  a long interval, it is easy to implement a user-level daemon which
25  periodically reads those statistical counters and computes the rate.
26
27  Currently, the measurement is activated by a slow timer handler. Hopefully
28  this measurement will not introduce too much load.
29
30 We measure rate during the last 8 seconds every 2 seconds:
31
32 avgrate = avgrate*(1-W) + rate*W
33
34 where W = 2^(-2)
35
36 NOTES.
37
38  * The stored value for average bps is scaled by 2^5, so that the maximal
39    rate is ~2.15Gbits/s; average pps and cps are scaled by 2^10.
40
41  * A lot of code is taken from net/sched/estimator.c
42 */
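/*
 * A minimal user-space sketch (not kernel code) of the fixed-point form of
 * the formula above: avg += (rate - avg) * W with W = 2^-2, counters scaled
 * by 2^10.  The <<9 (rather than <<10) folds in the division by the 2-second
 * sampling interval, and the +0x1FF before >>10 rounds to the nearest unit,
 * mirroring the cps update in estimation_timer() below.  The sample counter
 * values are made up for illustration.
 */
#include <stdio.h>

int main(void)
{
	unsigned int samples[] = { 0, 200, 400, 600, 820, 1040 };
	unsigned int last_conns = 0, cps_scaled = 0;
	unsigned int i;

	for (i = 1; i < sizeof(samples) / sizeof(samples[0]); i++) {
		/* scaled by 2^10, but divided by 2 seconds */
		unsigned int rate = (samples[i] - last_conns) << 9;

		last_conns = samples[i];
		cps_scaled += ((long)rate - (long)cps_scaled) >> 2;
		printf("sample %u: ~%u conns/s\n", i, (cps_scaled + 0x1FF) >> 10);
	}
	return 0;
}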
43
44
45struct ip_vs_estimator
46{
47 struct ip_vs_estimator *next;
48 struct ip_vs_stats *stats;
49
50 u32 last_conns;
51 u32 last_inpkts;
52 u32 last_outpkts;
53 u64 last_inbytes;
54 u64 last_outbytes;
55
56 u32 cps;
57 u32 inpps;
58 u32 outpps;
59 u32 inbps;
60 u32 outbps;
61};
62
63
64static struct ip_vs_estimator *est_list = NULL;
65static DEFINE_RWLOCK(est_lock);
66static struct timer_list est_timer;
67
68static void estimation_timer(unsigned long arg)
69{
70 struct ip_vs_estimator *e;
71 struct ip_vs_stats *s;
72 u32 n_conns;
73 u32 n_inpkts, n_outpkts;
74 u64 n_inbytes, n_outbytes;
75 u32 rate;
76
77 read_lock(&est_lock);
78 for (e = est_list; e; e = e->next) {
79 s = e->stats;
80
81 spin_lock(&s->lock);
82 n_conns = s->conns;
83 n_inpkts = s->inpkts;
84 n_outpkts = s->outpkts;
85 n_inbytes = s->inbytes;
86 n_outbytes = s->outbytes;
87
88 /* scaled by 2^10, but divided 2 seconds */
89 rate = (n_conns - e->last_conns)<<9;
90 e->last_conns = n_conns;
91 e->cps += ((long)rate - (long)e->cps)>>2;
92 s->cps = (e->cps+0x1FF)>>10;
93
94 rate = (n_inpkts - e->last_inpkts)<<9;
95 e->last_inpkts = n_inpkts;
96 e->inpps += ((long)rate - (long)e->inpps)>>2;
97 s->inpps = (e->inpps+0x1FF)>>10;
98
99 rate = (n_outpkts - e->last_outpkts)<<9;
100 e->last_outpkts = n_outpkts;
101 e->outpps += ((long)rate - (long)e->outpps)>>2;
102 s->outpps = (e->outpps+0x1FF)>>10;
103
104 rate = (n_inbytes - e->last_inbytes)<<4;
105 e->last_inbytes = n_inbytes;
106 e->inbps += ((long)rate - (long)e->inbps)>>2;
107 s->inbps = (e->inbps+0xF)>>5;
108
109 rate = (n_outbytes - e->last_outbytes)<<4;
110 e->last_outbytes = n_outbytes;
111 e->outbps += ((long)rate - (long)e->outbps)>>2;
112 s->outbps = (e->outbps+0xF)>>5;
113 spin_unlock(&s->lock);
114 }
115 read_unlock(&est_lock);
116 mod_timer(&est_timer, jiffies + 2*HZ);
117}
118
119int ip_vs_new_estimator(struct ip_vs_stats *stats)
120{
121 struct ip_vs_estimator *est;
122
123 est = kmalloc(sizeof(*est), GFP_KERNEL);
124 if (est == NULL)
125 return -ENOMEM;
126
127 memset(est, 0, sizeof(*est));
128 est->stats = stats;
129 est->last_conns = stats->conns;
130 est->cps = stats->cps<<10;
131
132 est->last_inpkts = stats->inpkts;
133 est->inpps = stats->inpps<<10;
134
135 est->last_outpkts = stats->outpkts;
136 est->outpps = stats->outpps<<10;
137
138 est->last_inbytes = stats->inbytes;
139 est->inbps = stats->inbps<<5;
140
141 est->last_outbytes = stats->outbytes;
142 est->outbps = stats->outbps<<5;
143
144 write_lock_bh(&est_lock);
145 est->next = est_list;
146 if (est->next == NULL) {
147 init_timer(&est_timer);
148 est_timer.expires = jiffies + 2*HZ;
149 est_timer.function = estimation_timer;
150 add_timer(&est_timer);
151 }
152 est_list = est;
153 write_unlock_bh(&est_lock);
154 return 0;
155}
156
157void ip_vs_kill_estimator(struct ip_vs_stats *stats)
158{
159 struct ip_vs_estimator *est, **pest;
160 int killed = 0;
161
162 write_lock_bh(&est_lock);
163 pest = &est_list;
164 while ((est=*pest) != NULL) {
165 if (est->stats != stats) {
166 pest = &est->next;
167 continue;
168 }
169 *pest = est->next;
170 kfree(est);
171 killed++;
172 }
173 if (killed && est_list == NULL)
174 del_timer_sync(&est_timer);
175 write_unlock_bh(&est_lock);
176}
177
178void ip_vs_zero_estimator(struct ip_vs_stats *stats)
179{
180 struct ip_vs_estimator *e;
181
182 write_lock_bh(&est_lock);
183 for (e = est_list; e; e = e->next) {
184 if (e->stats != stats)
185 continue;
186
187 /* set counters zero */
188 e->last_conns = 0;
189 e->last_inpkts = 0;
190 e->last_outpkts = 0;
191 e->last_inbytes = 0;
192 e->last_outbytes = 0;
193 e->cps = 0;
194 e->inpps = 0;
195 e->outpps = 0;
196 e->inbps = 0;
197 e->outbps = 0;
198 }
199 write_unlock_bh(&est_lock);
200}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
new file mode 100644
index 000000000000..a19a33ceb811
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_ftp.c
@@ -0,0 +1,400 @@
1/*
2 * ip_vs_ftp.c: IPVS ftp application module
3 *
4 * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * Changes:
9 *
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 *
16 * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
17 * is that the ip_vs_ftp module handles the reverse direction of ip_masq_ftp.
18 *
19 * IP_MASQ_FTP ftp masquerading module
20 *
21 * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
22 *
23 * Author: Wouter Gadeyne
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/moduleparam.h>
29#include <linux/kernel.h>
30#include <linux/skbuff.h>
31#include <linux/in.h>
32#include <linux/ip.h>
33#include <net/protocol.h>
34#include <net/tcp.h>
35
36#include <net/ip_vs.h>
37
38
39#define SERVER_STRING "227 Entering Passive Mode ("
40#define CLIENT_STRING "PORT "
41
42
43/*
44 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by the helper.
45 * The first port is set to the default port.
46 */
47static int ports[IP_VS_APP_MAX_PORTS] = {21, 0};
48module_param_array(ports, int, NULL, 0);
49
50/*
51 * Debug level
52 */
53#ifdef CONFIG_IP_VS_DEBUG
54static int debug=0;
55module_param(debug, int, 0);
56#endif
57
58
59/* Dummy variable */
60static int ip_vs_ftp_pasv;
61
62
63static int
64ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
65{
66 return 0;
67}
68
69
70static int
71ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
72{
73 return 0;
74}
75
76
77/*
78 * Get <addr,port> from a string of the form "xxx,xxx,xxx,xxx,ppp,ppp",
79 * starting with the "pattern" and terminated by the "term" character.
80 * <addr,port> is in network order.
81 */
82static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
83 const char *pattern, size_t plen, char term,
84 __u32 *addr, __u16 *port,
85 char **start, char **end)
86{
87 unsigned char p[6];
88 int i = 0;
89
90 if (data_limit - data < plen) {
91 /* check if there is partial match */
92 if (strnicmp(data, pattern, data_limit - data) == 0)
93 return -1;
94 else
95 return 0;
96 }
97
98 if (strnicmp(data, pattern, plen) != 0) {
99 return 0;
100 }
101 *start = data + plen;
102
103 for (data = *start; *data != term; data++) {
104 if (data == data_limit)
105 return -1;
106 }
107 *end = data;
108
109 memset(p, 0, sizeof(p));
110 for (data = *start; data != *end; data++) {
111 if (*data >= '0' && *data <= '9') {
112 p[i] = p[i]*10 + *data - '0';
113 } else if (*data == ',' && i < 5) {
114 i++;
115 } else {
116 /* unexpected character */
117 return -1;
118 }
119 }
120
121 if (i != 5)
122 return -1;
123
124 *addr = (p[3]<<24) | (p[2]<<16) | (p[1]<<8) | p[0];
125 *port = (p[5]<<8) | p[4];
126 return 1;
127}
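As a worked example with invented values, the payload fragment "192,168,0,10,19,137" parses into p[] = {192,168,0,10,19,137}, i.e. address 192.168.0.10 and data port 19*256 + 137 = 5001. A stand-alone sketch of the same digit/comma walk, host byte order only:

	#include <stdio.h>

	int main(void)
	{
		const char *s = "192,168,0,10,19,137";	/* invented PASV/PORT payload */
		unsigned p[6] = { 0 };
		int i = 0;

		for (; *s; s++) {
			if (*s >= '0' && *s <= '9')
				p[i] = p[i] * 10 + (*s - '0');
			else if (*s == ',' && i < 5)
				i++;
		}
		printf("addr %u.%u.%u.%u port %u\n",
		       p[0], p[1], p[2], p[3], p[4] * 256 + p[5]);
		return 0;
	}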
128
129
130/*
131 * Look at outgoing ftp packets to catch the response to a PASV command
132 * from the server (inside-to-outside).
133 * When we see one, we build a connection entry with the client address,
134 * client port 0 (unknown at the moment), the server address and the
135 * server port. Mark the current connection entry as a control channel
136 * of the new entry. All this work is just so that the data connection
137 * can be scheduled to the right server later.
138 *
139 * The outgoing packet should be something like
140 * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
141 * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
142 */
143static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
144 struct sk_buff **pskb, int *diff)
145{
146 struct iphdr *iph;
147 struct tcphdr *th;
148 char *data, *data_limit;
149 char *start, *end;
150 __u32 from;
151 __u16 port;
152 struct ip_vs_conn *n_cp;
153 char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
154 unsigned buf_len;
155 int ret;
156
157 *diff = 0;
158
159 /* Only useful for established sessions */
160 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
161 return 1;
162
163 /* Linear packets are much easier to deal with. */
164 if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
165 return 0;
166
167 if (cp->app_data == &ip_vs_ftp_pasv) {
168 iph = (*pskb)->nh.iph;
169 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
170 data = (char *)th + (th->doff << 2);
171 data_limit = (*pskb)->tail;
172
173 if (ip_vs_ftp_get_addrport(data, data_limit,
174 SERVER_STRING,
175 sizeof(SERVER_STRING)-1, ')',
176 &from, &port,
177 &start, &end) != 1)
178 return 1;
179
180 IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> "
181 "%u.%u.%u.%u:%d detected\n",
182 NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0);
183
184 /*
185	 * Now update or create a connection entry for it
186 */
187 n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
188 cp->caddr, 0);
189 if (!n_cp) {
190 n_cp = ip_vs_conn_new(IPPROTO_TCP,
191 cp->caddr, 0,
192 cp->vaddr, port,
193 from, port,
194 IP_VS_CONN_F_NO_CPORT,
195 cp->dest);
196 if (!n_cp)
197 return 0;
198
199 /* add its controller */
200 ip_vs_control_add(n_cp, cp);
201 }
202
203 /*
204 * Replace the old passive address with the new one
205 */
206 from = n_cp->vaddr;
207 port = n_cp->vport;
208 sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
209 port&255, (port>>8)&255);
210 buf_len = strlen(buf);
211
212 /*
213 * Calculate required delta-offset to keep TCP happy
214 */
215 *diff = buf_len - (end-start);
216
217 if (*diff == 0) {
218 /* simply replace it with new passive address */
219 memcpy(start, buf, buf_len);
220 ret = 1;
221 } else {
222 ret = !ip_vs_skb_replace(*pskb, GFP_ATOMIC, start,
223 end-start, buf, buf_len);
224 }
225
226 cp->app_data = NULL;
227 ip_vs_tcp_conn_listen(n_cp);
228 ip_vs_conn_put(n_cp);
229 return ret;
230 }
231 return 1;
232}
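A concrete illustration of the rewrite performed above, with invented addresses: if the real server answers

	227 Entering Passive Mode (10,0,0,5,19,137)

and the virtual address of the service is 192.168.1.100, the helper substitutes the template's <vaddr,vport> so the client instead sees

	227 Entering Passive Mode (192,168,1,100,19,137)

and any change in payload length is recorded in *diff so that the TCP sequence numbers of later packets can be adjusted accordingly.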
233
234
235/*
236 * Look at incoming ftp packets to catch the PASV/PORT command
237 * (outside-to-inside).
238 *
239 * The incoming packet having the PORT command should be something like
240 * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
241 * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
242 * In this case, we create a connection entry using the client address and
243 * port, so that the active ftp data connection from the server can reach
244 * the client.
245 */
246static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
247 struct sk_buff **pskb, int *diff)
248{
249 struct iphdr *iph;
250 struct tcphdr *th;
251 char *data, *data_start, *data_limit;
252 char *start, *end;
253 __u32 to;
254 __u16 port;
255 struct ip_vs_conn *n_cp;
256
257 /* no diff required for incoming packets */
258 *diff = 0;
259
260 /* Only useful for established sessions */
261 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
262 return 1;
263
264 /* Linear packets are much easier to deal with. */
265 if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
266 return 0;
267
268 /*
269	 * Detect whether the client has requested passive mode
270 */
271 iph = (*pskb)->nh.iph;
272 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
273
274	/* Since there may be options in the TCP header and doff gives
275	   the header length in 32-bit words, the payload start is
276	   correctly computed as th + doff*4 */
277 data = data_start = (char *)th + (th->doff << 2);
278 data_limit = (*pskb)->tail;
279
280 while (data <= data_limit - 6) {
281 if (strnicmp(data, "PASV\r\n", 6) == 0) {
282 /* Passive mode on */
283 IP_VS_DBG(1-debug, "got PASV at %zd of %zd\n",
284 data - data_start,
285 data_limit - data_start);
286 cp->app_data = &ip_vs_ftp_pasv;
287 return 1;
288 }
289 data++;
290 }
291
292 /*
293	 * To support a virtual FTP server, the scenario is as follows:
294 * FTP client ----> Load Balancer ----> FTP server
295 * First detect the port number in the application data,
296 * then create a new connection entry for the coming data
297 * connection.
298 */
299 if (ip_vs_ftp_get_addrport(data_start, data_limit,
300 CLIENT_STRING, sizeof(CLIENT_STRING)-1,
301 '\r', &to, &port,
302 &start, &end) != 1)
303 return 1;
304
305 IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n",
306 NIPQUAD(to), ntohs(port));
307
308 /* Passive mode off */
309 cp->app_data = NULL;
310
311 /*
312 * Now update or create a connection entry for it
313 */
314 IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
315 ip_vs_proto_name(iph->protocol),
316 NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0);
317
318 n_cp = ip_vs_conn_in_get(iph->protocol,
319 to, port,
320 cp->vaddr, htons(ntohs(cp->vport)-1));
321 if (!n_cp) {
322 n_cp = ip_vs_conn_new(IPPROTO_TCP,
323 to, port,
324 cp->vaddr, htons(ntohs(cp->vport)-1),
325 cp->daddr, htons(ntohs(cp->dport)-1),
326 0,
327 cp->dest);
328 if (!n_cp)
329 return 0;
330
331 /* add its controller */
332 ip_vs_control_add(n_cp, cp);
333 }
334
335 /*
336	 * Move the new connection to the listen state
337 */
338 ip_vs_tcp_conn_listen(n_cp);
339 ip_vs_conn_put(n_cp);
340
341 return 1;
342}
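For the active-mode case, a client line such as "PORT 192,168,0,10,19,137\r\n" (addresses invented) yields client 192.168.0.10:5001, and the template is keyed on the ftp-data ports, one below the control ports, so the active data connection made from the ftp-data port can be matched and forwarded back to the client. A tiny sketch of that port arithmetic:

	#include <stdint.h>
	#include <stdio.h>
	#include <arpa/inet.h>

	int main(void)
	{
		/* control runs on the ftp port; ftp-data is control port - 1 */
		uint16_t vport = htons(21);
		uint16_t data_port = htons(ntohs(vport) - 1);

		printf("control %d, data %d\n", ntohs(vport), ntohs(data_port));
		return 0;
	}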
343
344
345static struct ip_vs_app ip_vs_ftp = {
346 .name = "ftp",
347 .type = IP_VS_APP_TYPE_FTP,
348 .protocol = IPPROTO_TCP,
349 .module = THIS_MODULE,
350 .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
351 .init_conn = ip_vs_ftp_init_conn,
352 .done_conn = ip_vs_ftp_done_conn,
353 .bind_conn = NULL,
354 .unbind_conn = NULL,
355 .pkt_out = ip_vs_ftp_out,
356 .pkt_in = ip_vs_ftp_in,
357};
358
359
360/*
361 * ip_vs_ftp initialization
362 */
363static int __init ip_vs_ftp_init(void)
364{
365 int i, ret;
366 struct ip_vs_app *app = &ip_vs_ftp;
367
368 ret = register_ip_vs_app(app);
369 if (ret)
370 return ret;
371
372 for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
373 if (!ports[i])
374 continue;
375 ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
376 if (ret)
377 break;
378 IP_VS_DBG(1-debug, "%s: loaded support on port[%d] = %d\n",
379 app->name, i, ports[i]);
380 }
381
382 if (ret)
383 unregister_ip_vs_app(app);
384
385 return ret;
386}
387
388
389/*
390 * ip_vs_ftp finish.
391 */
392static void __exit ip_vs_ftp_exit(void)
393{
394 unregister_ip_vs_app(&ip_vs_ftp);
395}
396
397
398module_init(ip_vs_ftp_init);
399module_exit(ip_vs_ftp_exit);
400MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
new file mode 100644
index 000000000000..c035838b780a
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -0,0 +1,624 @@
1/*
2 * IPVS: Locality-Based Least-Connection scheduling module
3 *
4 * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Martin Hamilton : fixed the terrible locking bugs
15 * *lock(tbl->lock) ==> *lock(&tbl->lock)
16 * Wensong Zhang : fixed the uninitialized tbl->lock bug
17 * Wensong Zhang : added doing full expiration check to
18 * collect stale entries of 24+ hours when
19 * no partial expire check in a half hour
20 * Julian Anastasov : replaced del_timer call with del_timer_sync
21 * to avoid the possible race between timer
22 * handler and del_timer thread in SMP
23 *
24 */
25
26/*
27 * The lblc algorithm is as follows (pseudo code):
28 *
29 * if cachenode[dest_ip] is null then
30 * n, cachenode[dest_ip] <- {weighted least-conn node};
31 * else
32 * n <- cachenode[dest_ip];
33 * if (n is dead) OR
34 * (n.conns>n.weight AND
35 * there is a node m with m.conns<m.weight/2) then
36 * n, cachenode[dest_ip] <- {weighted least-conn node};
37 *
38 * return n;
39 *
40 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
41 * me to write this module.
42 */
43
44#include <linux/module.h>
45#include <linux/kernel.h>
46
47/* for sysctl */
48#include <linux/fs.h>
49#include <linux/sysctl.h>
50
51#include <net/ip_vs.h>
52
53
54/*
55 * It is for garbage collection of stale IPVS lblc entries,
56 * when the table is full.
57 */
58#define CHECK_EXPIRE_INTERVAL (60*HZ)
59#define ENTRY_TIMEOUT (6*60*HZ)
60
61/*
62 * It is for full expiration check.
63 * When there is no partial expiration check (garbage collection)
64 * in a half hour, do a full expiration check to collect stale
65 * entries that haven't been touched for a day.
66 */
67#define COUNT_FOR_FULL_EXPIRATION 30
68static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
69
70
71/*
72 * for IPVS lblc entry hash table
73 */
74#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
75#define CONFIG_IP_VS_LBLC_TAB_BITS 10
76#endif
77#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
78#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
79#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
80
81
82/*
83 * IPVS lblc entry represents an association between destination
84 * IP address and its destination server
85 */
86struct ip_vs_lblc_entry {
87 struct list_head list;
88 __u32 addr; /* destination IP address */
89 struct ip_vs_dest *dest; /* real server (cache) */
90 unsigned long lastuse; /* last used time */
91};
92
93
94/*
95 * IPVS lblc hash table
96 */
97struct ip_vs_lblc_table {
98 rwlock_t lock; /* lock for this table */
99 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
100 atomic_t entries; /* number of entries */
101 int max_size; /* maximum size of entries */
102 struct timer_list periodic_timer; /* collect stale entries */
103 int rover; /* rover for expire check */
104 int counter; /* counter for no expire */
105};
106
107
108/*
109 * IPVS LBLC sysctl table
110 */
111
112static ctl_table vs_vars_table[] = {
113 {
114 .ctl_name = NET_IPV4_VS_LBLC_EXPIRE,
115 .procname = "lblc_expiration",
116 .data = &sysctl_ip_vs_lblc_expiration,
117 .maxlen = sizeof(int),
118 .mode = 0644,
119 .proc_handler = &proc_dointvec_jiffies,
120 },
121 { .ctl_name = 0 }
122};
123
124static ctl_table vs_table[] = {
125 {
126 .ctl_name = NET_IPV4_VS,
127 .procname = "vs",
128 .mode = 0555,
129 .child = vs_vars_table
130 },
131 { .ctl_name = 0 }
132};
133
134static ctl_table ipv4_table[] = {
135 {
136 .ctl_name = NET_IPV4,
137 .procname = "ipv4",
138 .mode = 0555,
139 .child = vs_table
140 },
141 { .ctl_name = 0 }
142};
143
144static ctl_table lblc_root_table[] = {
145 {
146 .ctl_name = CTL_NET,
147 .procname = "net",
148 .mode = 0555,
149 .child = ipv4_table
150 },
151 { .ctl_name = 0 }
152};
153
154static struct ctl_table_header * sysctl_header;
155
156/*
157 * new/free an ip_vs_lblc_entry, which is a mapping of a destination
158 * IP address to a server.
159 */
160static inline struct ip_vs_lblc_entry *
161ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
162{
163 struct ip_vs_lblc_entry *en;
164
165 en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
166 if (en == NULL) {
167 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
168 return NULL;
169 }
170
171 INIT_LIST_HEAD(&en->list);
172 en->addr = daddr;
173
174 atomic_inc(&dest->refcnt);
175 en->dest = dest;
176
177 return en;
178}
179
180
181static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
182{
183 list_del(&en->list);
184 /*
185	 * We don't kfree dest because it is referred to either by its
186	 * service or by the trash dest list.
187 */
188 atomic_dec(&en->dest->refcnt);
189 kfree(en);
190}
191
192
193/*
194 * Returns hash value for IPVS LBLC entry
195 */
196static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
197{
198 return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
199}
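The multiplier 2654435761 is the usual prime close to 2^32/phi used for Knuth-style multiplicative hashing, intended to spread nearby destination addresses over the 2^10 buckets. A quick stand-alone check (table size and addresses are illustrative only):

	#include <stdint.h>
	#include <stdio.h>

	#define TAB_MASK 1023	/* 2^10 buckets, as with IP_VS_LBLC_TAB_MASK */

	static unsigned hashkey(uint32_t host_order_addr)
	{
		return (host_order_addr * 2654435761UL) & TAB_MASK;
	}

	int main(void)
	{
		/* consecutive destination addresses land in well-spread buckets */
		for (uint32_t a = 0xc0a80001; a <= 0xc0a80004; a++)	/* 192.168.0.1 .. .4 */
			printf("%#x -> bucket %u\n", a, hashkey(a));
		return 0;
	}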
200
201
202/*
203 * Hash an entry in the ip_vs_lblc_table.
204 * returns bool success.
205 */
206static int
207ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
208{
209 unsigned hash;
210
211 if (!list_empty(&en->list)) {
212 IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
213 "called from %p\n", __builtin_return_address(0));
214 return 0;
215 }
216
217 /*
218 * Hash by destination IP address
219 */
220 hash = ip_vs_lblc_hashkey(en->addr);
221
222 write_lock(&tbl->lock);
223 list_add(&en->list, &tbl->bucket[hash]);
224 atomic_inc(&tbl->entries);
225 write_unlock(&tbl->lock);
226
227 return 1;
228}
229
230
231#if 0000
232/*
233 * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
234 * returns bool success.
235 */
236static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
237 struct ip_vs_lblc_entry *en)
238{
239 if (list_empty(&en->list)) {
240 IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
241 "called from %p\n", __builtin_return_address(0));
242 return 0;
243 }
244
245 /*
246 * Remove it from the table
247 */
248 write_lock(&tbl->lock);
249 list_del(&en->list);
250 INIT_LIST_HEAD(&en->list);
251 write_unlock(&tbl->lock);
252
253 return 1;
254}
255#endif
256
257
258/*
259 * Get ip_vs_lblc_entry associated with supplied parameters.
260 */
261static inline struct ip_vs_lblc_entry *
262ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
263{
264 unsigned hash;
265 struct ip_vs_lblc_entry *en;
266
267 hash = ip_vs_lblc_hashkey(addr);
268
269 read_lock(&tbl->lock);
270
271 list_for_each_entry(en, &tbl->bucket[hash], list) {
272 if (en->addr == addr) {
273 /* HIT */
274 read_unlock(&tbl->lock);
275 return en;
276 }
277 }
278
279 read_unlock(&tbl->lock);
280
281 return NULL;
282}
283
284
285/*
286 * Flush all the entries of the specified table.
287 */
288static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
289{
290 int i;
291 struct ip_vs_lblc_entry *en, *nxt;
292
293 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
294 write_lock(&tbl->lock);
295 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
296 ip_vs_lblc_free(en);
297 atomic_dec(&tbl->entries);
298 }
299 write_unlock(&tbl->lock);
300 }
301}
302
303
304static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
305{
306 unsigned long now = jiffies;
307 int i, j;
308 struct ip_vs_lblc_entry *en, *nxt;
309
310 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
311 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
312
313 write_lock(&tbl->lock);
314 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
315 if (time_before(now,
316 en->lastuse + sysctl_ip_vs_lblc_expiration))
317 continue;
318
319 ip_vs_lblc_free(en);
320 atomic_dec(&tbl->entries);
321 }
322 write_unlock(&tbl->lock);
323 }
324 tbl->rover = j;
325}
326
327
328/*
329 * Periodical timer handler for IPVS lblc table
330 * It is used to collect stale entries when the number of entries
331 * exceeds the maximum size of the table.
332 *
333 * Fixme: we probably need a more complicated algorithm to collect
334 * entries that have not been used for a long time even
335 * if the number of entries doesn't exceed the maximum size
336 * of the table.
337 * The full expiration check is for this purpose now.
338 */
339static void ip_vs_lblc_check_expire(unsigned long data)
340{
341 struct ip_vs_lblc_table *tbl;
342 unsigned long now = jiffies;
343 int goal;
344 int i, j;
345 struct ip_vs_lblc_entry *en, *nxt;
346
347 tbl = (struct ip_vs_lblc_table *)data;
348
349 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
350 /* do full expiration check */
351 ip_vs_lblc_full_check(tbl);
352 tbl->counter = 1;
353 goto out;
354 }
355
356 if (atomic_read(&tbl->entries) <= tbl->max_size) {
357 tbl->counter++;
358 goto out;
359 }
360
361 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
362 if (goal > tbl->max_size/2)
363 goal = tbl->max_size/2;
364
365 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
366 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
367
368 write_lock(&tbl->lock);
369 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
370 if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
371 continue;
372
373 ip_vs_lblc_free(en);
374 atomic_dec(&tbl->entries);
375 goal--;
376 }
377 write_unlock(&tbl->lock);
378 if (goal <= 0)
379 break;
380 }
381 tbl->rover = j;
382
383 out:
384 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
385}
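To make the collection goal above concrete with invented numbers: with max_size = 1024*16 = 16384 and 20000 entries in the table, goal = (20000 - 16384)*4/3 = 4821, which is below the max_size/2 = 8192 cap, so roughly that many stale entries are reclaimed before the rover stops.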
386
387
388static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
389{
390 int i;
391 struct ip_vs_lblc_table *tbl;
392
393 /*
394 * Allocate the ip_vs_lblc_table for this service
395 */
396 tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
397 if (tbl == NULL) {
398 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
399 return -ENOMEM;
400 }
401 svc->sched_data = tbl;
402 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
403 "current service\n",
404 sizeof(struct ip_vs_lblc_table));
405
406 /*
407 * Initialize the hash buckets
408 */
409 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
410 INIT_LIST_HEAD(&tbl->bucket[i]);
411 }
412 rwlock_init(&tbl->lock);
413 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
414 tbl->rover = 0;
415 tbl->counter = 1;
416
417 /*
418 * Hook periodic timer for garbage collection
419 */
420 init_timer(&tbl->periodic_timer);
421 tbl->periodic_timer.data = (unsigned long)tbl;
422 tbl->periodic_timer.function = ip_vs_lblc_check_expire;
423 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
424 add_timer(&tbl->periodic_timer);
425
426 return 0;
427}
428
429
430static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
431{
432 struct ip_vs_lblc_table *tbl = svc->sched_data;
433
434 /* remove periodic timer */
435 del_timer_sync(&tbl->periodic_timer);
436
437 /* got to clean up table entries here */
438 ip_vs_lblc_flush(tbl);
439
440 /* release the table itself */
441 kfree(svc->sched_data);
442 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
443 sizeof(struct ip_vs_lblc_table));
444
445 return 0;
446}
447
448
449static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
450{
451 return 0;
452}
453
454
455static inline struct ip_vs_dest *
456__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
457{
458 struct ip_vs_dest *dest, *least;
459 int loh, doh;
460
461 /*
462 * We think the overhead of processing active connections is fifty
463	 * times higher than that of inactive connections on average. (This
464	 * factor of fifty might not be accurate; we will change it later.) We
465 * use the following formula to estimate the overhead:
466 * dest->activeconns*50 + dest->inactconns
467 * and the load:
468 * (dest overhead) / dest->weight
469 *
470 * Remember -- no floats in kernel mode!!!
471 * The comparison of h1*w2 > h2*w1 is equivalent to that of
472 * h1/w1 > h2/w2
473 * if every weight is larger than zero.
474 *
475 * The server with weight=0 is quiesced and will not receive any
476 * new connection.
477 */
478 list_for_each_entry(dest, &svc->destinations, n_list) {
479 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
480 continue;
481 if (atomic_read(&dest->weight) > 0) {
482 least = dest;
483 loh = atomic_read(&least->activeconns) * 50
484 + atomic_read(&least->inactconns);
485 goto nextstage;
486 }
487 }
488 return NULL;
489
490 /*
491 * Find the destination with the least load.
492 */
493 nextstage:
494 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
495 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
496 continue;
497
498 doh = atomic_read(&dest->activeconns) * 50
499 + atomic_read(&dest->inactconns);
500 if (loh * atomic_read(&dest->weight) >
501 doh * atomic_read(&least->weight)) {
502 least = dest;
503 loh = doh;
504 }
505 }
506
507 IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
508 "activeconns %d refcnt %d weight %d overhead %d\n",
509 NIPQUAD(least->addr), ntohs(least->port),
510 atomic_read(&least->activeconns),
511 atomic_read(&least->refcnt),
512 atomic_read(&least->weight), loh);
513
514 return least;
515}
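A worked instance of the integer comparison above, with invented numbers: if the current candidate has loh = 120 and weight 2 while another destination has doh = 200 and weight 5, then 120*5 = 600 > 200*2 = 400, so the second server (load 200/5 = 40 versus 120/2 = 60) becomes the new least-loaded choice without any floating-point arithmetic.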
516
517
518/*
519 * If this destination server is overloaded and there is a less loaded
520 * server, then return true.
521 */
522static inline int
523is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
524{
525 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
526 struct ip_vs_dest *d;
527
528 list_for_each_entry(d, &svc->destinations, n_list) {
529 if (atomic_read(&d->activeconns)*2
530 < atomic_read(&d->weight)) {
531 return 1;
532 }
533 }
534 }
535 return 0;
536}
537
538
539/*
540 * Locality-Based (weighted) Least-Connection scheduling
541 */
542static struct ip_vs_dest *
543ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
544{
545 struct ip_vs_dest *dest;
546 struct ip_vs_lblc_table *tbl;
547 struct ip_vs_lblc_entry *en;
548 struct iphdr *iph = skb->nh.iph;
549
550 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
551
552 tbl = (struct ip_vs_lblc_table *)svc->sched_data;
553 en = ip_vs_lblc_get(tbl, iph->daddr);
554 if (en == NULL) {
555 dest = __ip_vs_wlc_schedule(svc, iph);
556 if (dest == NULL) {
557 IP_VS_DBG(1, "no destination available\n");
558 return NULL;
559 }
560 en = ip_vs_lblc_new(iph->daddr, dest);
561 if (en == NULL) {
562 return NULL;
563 }
564 ip_vs_lblc_hash(tbl, en);
565 } else {
566 dest = en->dest;
567 if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
568 || atomic_read(&dest->weight) <= 0
569 || is_overloaded(dest, svc)) {
570 dest = __ip_vs_wlc_schedule(svc, iph);
571 if (dest == NULL) {
572 IP_VS_DBG(1, "no destination available\n");
573 return NULL;
574 }
575 atomic_dec(&en->dest->refcnt);
576 atomic_inc(&dest->refcnt);
577 en->dest = dest;
578 }
579 }
580 en->lastuse = jiffies;
581
582 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
583 "--> server %u.%u.%u.%u:%d\n",
584 NIPQUAD(en->addr),
585 NIPQUAD(dest->addr),
586 ntohs(dest->port));
587
588 return dest;
589}
590
591
592/*
593 * IPVS LBLC Scheduler structure
594 */
595static struct ip_vs_scheduler ip_vs_lblc_scheduler =
596{
597 .name = "lblc",
598 .refcnt = ATOMIC_INIT(0),
599 .module = THIS_MODULE,
600 .init_service = ip_vs_lblc_init_svc,
601 .done_service = ip_vs_lblc_done_svc,
602 .update_service = ip_vs_lblc_update_svc,
603 .schedule = ip_vs_lblc_schedule,
604};
605
606
607static int __init ip_vs_lblc_init(void)
608{
609 INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
610 sysctl_header = register_sysctl_table(lblc_root_table, 0);
611 return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
612}
613
614
615static void __exit ip_vs_lblc_cleanup(void)
616{
617 unregister_sysctl_table(sysctl_header);
618 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
619}
620
621
622module_init(ip_vs_lblc_init);
623module_exit(ip_vs_lblc_cleanup);
624MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
new file mode 100644
index 000000000000..22b5dd55d271
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -0,0 +1,888 @@
1/*
2 * IPVS: Locality-Based Least-Connection with Replication scheduler
3 *
4 * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Julian Anastasov : Added the missing (dest->weight>0)
15 * condition in the ip_vs_dest_set_max.
16 *
17 */
18
19/*
20 * The lblc/r algorithm is as follows (pseudo code):
21 *
22 * if serverSet[dest_ip] is null then
23 * n, serverSet[dest_ip] <- {weighted least-conn node};
24 * else
25 * n <- {least-conn (alive) node in serverSet[dest_ip]};
26 * if (n is null) OR
27 * (n.conns>n.weight AND
28 * there is a node m with m.conns<m.weight/2) then
29 * n <- {weighted least-conn node};
30 * add n to serverSet[dest_ip];
31 * if |serverSet[dest_ip]| > 1 AND
32 * now - serverSet[dest_ip].lastMod > T then
33 * m <- {most conn node in serverSet[dest_ip]};
34 * remove m from serverSet[dest_ip];
35 * if serverSet[dest_ip] changed then
36 * serverSet[dest_ip].lastMod <- now;
37 *
38 * return n;
39 *
40 */
41
42#include <linux/module.h>
43#include <linux/kernel.h>
44
45/* for sysctl */
46#include <linux/fs.h>
47#include <linux/sysctl.h>
48/* for proc_net_create/proc_net_remove */
49#include <linux/proc_fs.h>
50
51#include <net/ip_vs.h>
52
53
54/*
55 * It is for garbage collection of stale IPVS lblcr entries,
56 * when the table is full.
57 */
58#define CHECK_EXPIRE_INTERVAL (60*HZ)
59#define ENTRY_TIMEOUT (6*60*HZ)
60
61/*
62 * It is for full expiration check.
63 * When there is no partial expiration check (garbage collection)
64 * in a half hour, do a full expiration check to collect stale
65 * entries that haven't been touched for a day.
66 */
67#define COUNT_FOR_FULL_EXPIRATION 30
68static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
69
70
71/*
72 * for IPVS lblcr entry hash table
73 */
74#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
75#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
76#endif
77#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
78#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
79#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
80
81
82/*
83 * IPVS destination set structure and operations
84 */
85struct ip_vs_dest_list {
86 struct ip_vs_dest_list *next; /* list link */
87 struct ip_vs_dest *dest; /* destination server */
88};
89
90struct ip_vs_dest_set {
91 atomic_t size; /* set size */
92 unsigned long lastmod; /* last modified time */
93 struct ip_vs_dest_list *list; /* destination list */
94 rwlock_t lock; /* lock for this list */
95};
96
97
98static struct ip_vs_dest_list *
99ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
100{
101 struct ip_vs_dest_list *e;
102
103 for (e=set->list; e!=NULL; e=e->next) {
104 if (e->dest == dest)
105			/* already exists */
106 return NULL;
107 }
108
109 e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
110 if (e == NULL) {
111 IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
112 return NULL;
113 }
114
115 atomic_inc(&dest->refcnt);
116 e->dest = dest;
117
118 /* link it to the list */
119 write_lock(&set->lock);
120 e->next = set->list;
121 set->list = e;
122 atomic_inc(&set->size);
123 write_unlock(&set->lock);
124
125 set->lastmod = jiffies;
126 return e;
127}
128
129static void
130ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
131{
132 struct ip_vs_dest_list *e, **ep;
133
134 write_lock(&set->lock);
135 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
136 if (e->dest == dest) {
137 /* HIT */
138 *ep = e->next;
139 atomic_dec(&set->size);
140 set->lastmod = jiffies;
141 atomic_dec(&e->dest->refcnt);
142 kfree(e);
143 break;
144 }
145 ep = &e->next;
146 }
147 write_unlock(&set->lock);
148}
149
150static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
151{
152 struct ip_vs_dest_list *e, **ep;
153
154 write_lock(&set->lock);
155 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
156 *ep = e->next;
157 /*
158		 * We don't kfree dest because it is referred to either
159 * by its service or by the trash dest list.
160 */
161 atomic_dec(&e->dest->refcnt);
162 kfree(e);
163 }
164 write_unlock(&set->lock);
165}
166
167/* get weighted least-connection node in the destination set */
168static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
169{
170 register struct ip_vs_dest_list *e;
171 struct ip_vs_dest *dest, *least;
172 int loh, doh;
173
174 if (set == NULL)
175 return NULL;
176
177 read_lock(&set->lock);
178 /* select the first destination server, whose weight > 0 */
179 for (e=set->list; e!=NULL; e=e->next) {
180 least = e->dest;
181 if (least->flags & IP_VS_DEST_F_OVERLOAD)
182 continue;
183
184 if ((atomic_read(&least->weight) > 0)
185 && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
186 loh = atomic_read(&least->activeconns) * 50
187 + atomic_read(&least->inactconns);
188 goto nextstage;
189 }
190 }
191 read_unlock(&set->lock);
192 return NULL;
193
194 /* find the destination with the weighted least load */
195 nextstage:
196 for (e=e->next; e!=NULL; e=e->next) {
197 dest = e->dest;
198 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
199 continue;
200
201 doh = atomic_read(&dest->activeconns) * 50
202 + atomic_read(&dest->inactconns);
203 if ((loh * atomic_read(&dest->weight) >
204 doh * atomic_read(&least->weight))
205 && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
206 least = dest;
207 loh = doh;
208 }
209 }
210 read_unlock(&set->lock);
211
212 IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
213 "activeconns %d refcnt %d weight %d overhead %d\n",
214 NIPQUAD(least->addr), ntohs(least->port),
215 atomic_read(&least->activeconns),
216 atomic_read(&least->refcnt),
217 atomic_read(&least->weight), loh);
218 return least;
219}
220
221
222/* get weighted most-connection node in the destination set */
223static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
224{
225 register struct ip_vs_dest_list *e;
226 struct ip_vs_dest *dest, *most;
227 int moh, doh;
228
229 if (set == NULL)
230 return NULL;
231
232 read_lock(&set->lock);
233 /* select the first destination server, whose weight > 0 */
234 for (e=set->list; e!=NULL; e=e->next) {
235 most = e->dest;
236 if (atomic_read(&most->weight) > 0) {
237 moh = atomic_read(&most->activeconns) * 50
238 + atomic_read(&most->inactconns);
239 goto nextstage;
240 }
241 }
242 read_unlock(&set->lock);
243 return NULL;
244
245 /* find the destination with the weighted most load */
246 nextstage:
247 for (e=e->next; e!=NULL; e=e->next) {
248 dest = e->dest;
249 doh = atomic_read(&dest->activeconns) * 50
250 + atomic_read(&dest->inactconns);
251 /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
252 if ((moh * atomic_read(&dest->weight) <
253 doh * atomic_read(&most->weight))
254 && (atomic_read(&dest->weight) > 0)) {
255 most = dest;
256 moh = doh;
257 }
258 }
259 read_unlock(&set->lock);
260
261 IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
262 "activeconns %d refcnt %d weight %d overhead %d\n",
263 NIPQUAD(most->addr), ntohs(most->port),
264 atomic_read(&most->activeconns),
265 atomic_read(&most->refcnt),
266 atomic_read(&most->weight), moh);
267 return most;
268}
269
270
271/*
272 * IPVS lblcr entry represents an association between destination
273 * IP address and its destination server set
274 */
275struct ip_vs_lblcr_entry {
276 struct list_head list;
277 __u32 addr; /* destination IP address */
278 struct ip_vs_dest_set set; /* destination server set */
279 unsigned long lastuse; /* last used time */
280};
281
282
283/*
284 * IPVS lblcr hash table
285 */
286struct ip_vs_lblcr_table {
287 rwlock_t lock; /* lock for this table */
288 struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
289 atomic_t entries; /* number of entries */
290 int max_size; /* maximum size of entries */
291 struct timer_list periodic_timer; /* collect stale entries */
292 int rover; /* rover for expire check */
293 int counter; /* counter for no expire */
294};
295
296
297/*
298 * IPVS LBLCR sysctl table
299 */
300
301static ctl_table vs_vars_table[] = {
302 {
303 .ctl_name = NET_IPV4_VS_LBLCR_EXPIRE,
304 .procname = "lblcr_expiration",
305 .data = &sysctl_ip_vs_lblcr_expiration,
306 .maxlen = sizeof(int),
307 .mode = 0644,
308 .proc_handler = &proc_dointvec_jiffies,
309 },
310 { .ctl_name = 0 }
311};
312
313static ctl_table vs_table[] = {
314 {
315 .ctl_name = NET_IPV4_VS,
316 .procname = "vs",
317 .mode = 0555,
318 .child = vs_vars_table
319 },
320 { .ctl_name = 0 }
321};
322
323static ctl_table ipv4_table[] = {
324 {
325 .ctl_name = NET_IPV4,
326 .procname = "ipv4",
327 .mode = 0555,
328 .child = vs_table
329 },
330 { .ctl_name = 0 }
331};
332
333static ctl_table lblcr_root_table[] = {
334 {
335 .ctl_name = CTL_NET,
336 .procname = "net",
337 .mode = 0555,
338 .child = ipv4_table
339 },
340 { .ctl_name = 0 }
341};
342
343static struct ctl_table_header * sysctl_header;
344
345/*
346 * new/free an ip_vs_lblcr_entry, which is a mapping of a destination
347 * IP address to a server.
348 */
349static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr)
350{
351 struct ip_vs_lblcr_entry *en;
352
353 en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
354 if (en == NULL) {
355 IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
356 return NULL;
357 }
358
359 INIT_LIST_HEAD(&en->list);
360 en->addr = daddr;
361
362	/* initialize its dest set */
363 atomic_set(&(en->set.size), 0);
364 en->set.list = NULL;
365 rwlock_init(&en->set.lock);
366
367 return en;
368}
369
370
371static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
372{
373 list_del(&en->list);
374 ip_vs_dest_set_eraseall(&en->set);
375 kfree(en);
376}
377
378
379/*
380 * Returns hash value for IPVS LBLCR entry
381 */
382static inline unsigned ip_vs_lblcr_hashkey(__u32 addr)
383{
384 return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
385}
386
387
388/*
389 * Hash an entry in the ip_vs_lblcr_table.
390 * returns bool success.
391 */
392static int
393ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
394{
395 unsigned hash;
396
397 if (!list_empty(&en->list)) {
398 IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
399 "called from %p\n", __builtin_return_address(0));
400 return 0;
401 }
402
403 /*
404 * Hash by destination IP address
405 */
406 hash = ip_vs_lblcr_hashkey(en->addr);
407
408 write_lock(&tbl->lock);
409 list_add(&en->list, &tbl->bucket[hash]);
410 atomic_inc(&tbl->entries);
411 write_unlock(&tbl->lock);
412
413 return 1;
414}
415
416
417#if 0000
418/*
419 * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
420 * returns bool success.
421 */
422static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
423 struct ip_vs_lblcr_entry *en)
424{
425 if (list_empty(&en->list)) {
426 IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
427 "called from %p\n", __builtin_return_address(0));
428 return 0;
429 }
430
431 /*
432 * Remove it from the table
433 */
434 write_lock(&tbl->lock);
435 list_del(&en->list);
436 INIT_LIST_HEAD(&en->list);
437 write_unlock(&tbl->lock);
438
439 return 1;
440}
441#endif
442
443
444/*
445 * Get ip_vs_lblcr_entry associated with supplied parameters.
446 */
447static inline struct ip_vs_lblcr_entry *
448ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr)
449{
450 unsigned hash;
451 struct ip_vs_lblcr_entry *en;
452
453 hash = ip_vs_lblcr_hashkey(addr);
454
455 read_lock(&tbl->lock);
456
457 list_for_each_entry(en, &tbl->bucket[hash], list) {
458 if (en->addr == addr) {
459 /* HIT */
460 read_unlock(&tbl->lock);
461 return en;
462 }
463 }
464
465 read_unlock(&tbl->lock);
466
467 return NULL;
468}
469
470
471/*
472 * Flush all the entries of the specified table.
473 */
474static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
475{
476 int i;
477 struct ip_vs_lblcr_entry *en, *nxt;
478
479 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
480 write_lock(&tbl->lock);
481 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
482 ip_vs_lblcr_free(en);
483 atomic_dec(&tbl->entries);
484 }
485 write_unlock(&tbl->lock);
486 }
487}
488
489
490static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
491{
492 unsigned long now = jiffies;
493 int i, j;
494 struct ip_vs_lblcr_entry *en, *nxt;
495
496 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
497 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
498
499 write_lock(&tbl->lock);
500 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
501 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
502 now))
503 continue;
504
505 ip_vs_lblcr_free(en);
506 atomic_dec(&tbl->entries);
507 }
508 write_unlock(&tbl->lock);
509 }
510 tbl->rover = j;
511}
512
513
514/*
515 * Periodical timer handler for IPVS lblcr table
516 * It is used to collect stale entries when the number of entries
517 * exceeds the maximum size of the table.
518 *
519 * Fixme: we probably need a more complicated algorithm to collect
520 * entries that have not been used for a long time even
521 * if the number of entries doesn't exceed the maximum size
522 * of the table.
523 * The full expiration check is for this purpose now.
524 */
525static void ip_vs_lblcr_check_expire(unsigned long data)
526{
527 struct ip_vs_lblcr_table *tbl;
528 unsigned long now = jiffies;
529 int goal;
530 int i, j;
531 struct ip_vs_lblcr_entry *en, *nxt;
532
533 tbl = (struct ip_vs_lblcr_table *)data;
534
535 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
536 /* do full expiration check */
537 ip_vs_lblcr_full_check(tbl);
538 tbl->counter = 1;
539 goto out;
540 }
541
542 if (atomic_read(&tbl->entries) <= tbl->max_size) {
543 tbl->counter++;
544 goto out;
545 }
546
547 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
548 if (goal > tbl->max_size/2)
549 goal = tbl->max_size/2;
550
551 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
552 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
553
554 write_lock(&tbl->lock);
555 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
556 if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
557 continue;
558
559 ip_vs_lblcr_free(en);
560 atomic_dec(&tbl->entries);
561 goal--;
562 }
563 write_unlock(&tbl->lock);
564 if (goal <= 0)
565 break;
566 }
567 tbl->rover = j;
568
569 out:
570 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
571}
572
573
574#ifdef CONFIG_IP_VS_LBLCR_DEBUG
575static struct ip_vs_lblcr_table *lblcr_table_list;
576
577/*
578 * /proc/net/ip_vs_lblcr to display the mappings of
579 * destination IP address <==> its serverSet
580 */
581static int
582ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length)
583{
584 off_t pos=0, begin;
585 int len=0, size;
586 struct ip_vs_lblcr_table *tbl;
587 unsigned long now = jiffies;
588 int i;
589 struct ip_vs_lblcr_entry *en;
590
591 tbl = lblcr_table_list;
592
593 size = sprintf(buffer, "LastTime Dest IP address Server set\n");
594 pos += size;
595 len += size;
596
597 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
598 read_lock_bh(&tbl->lock);
599 list_for_each_entry(en, &tbl->bucket[i], list) {
600 char tbuf[16];
601 struct ip_vs_dest_list *d;
602
603 sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr));
604 size = sprintf(buffer+len, "%8lu %-16s ",
605 now-en->lastuse, tbuf);
606
607 read_lock(&en->set.lock);
608 for (d=en->set.list; d!=NULL; d=d->next) {
609 size += sprintf(buffer+len+size,
610 "%u.%u.%u.%u ",
611 NIPQUAD(d->dest->addr));
612 }
613 read_unlock(&en->set.lock);
614 size += sprintf(buffer+len+size, "\n");
615 len += size;
616 pos += size;
617 if (pos <= offset)
618 len=0;
619 if (pos >= offset+length) {
620 read_unlock_bh(&tbl->lock);
621 goto done;
622 }
623 }
624 read_unlock_bh(&tbl->lock);
625 }
626
627 done:
628 begin = len - (pos - offset);
629 *start = buffer + begin;
630 len -= begin;
631 if(len>length)
632 len = length;
633 return len;
634}
635#endif
636
637
638static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
639{
640 int i;
641 struct ip_vs_lblcr_table *tbl;
642
643 /*
644 * Allocate the ip_vs_lblcr_table for this service
645 */
646 tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
647 if (tbl == NULL) {
648 IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
649 return -ENOMEM;
650 }
651 svc->sched_data = tbl;
652 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
653 "current service\n",
654 sizeof(struct ip_vs_lblcr_table));
655
656 /*
657 * Initialize the hash buckets
658 */
659 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
660 INIT_LIST_HEAD(&tbl->bucket[i]);
661 }
662 rwlock_init(&tbl->lock);
663 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
664 tbl->rover = 0;
665 tbl->counter = 1;
666
667 /*
668 * Hook periodic timer for garbage collection
669 */
670 init_timer(&tbl->periodic_timer);
671 tbl->periodic_timer.data = (unsigned long)tbl;
672 tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
673 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
674 add_timer(&tbl->periodic_timer);
675
676#ifdef CONFIG_IP_VS_LBLCR_DEBUG
677 lblcr_table_list = tbl;
678#endif
679 return 0;
680}
681
682
683static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
684{
685 struct ip_vs_lblcr_table *tbl = svc->sched_data;
686
687 /* remove periodic timer */
688 del_timer_sync(&tbl->periodic_timer);
689
690 /* got to clean up table entries here */
691 ip_vs_lblcr_flush(tbl);
692
693 /* release the table itself */
694 kfree(svc->sched_data);
695 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
696 sizeof(struct ip_vs_lblcr_table));
697
698 return 0;
699}
700
701
702static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
703{
704 return 0;
705}
706
707
708static inline struct ip_vs_dest *
709__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
710{
711 struct ip_vs_dest *dest, *least;
712 int loh, doh;
713
714 /*
715 * We think the overhead of processing active connections is fifty
716	 * times higher than that of inactive connections on average. (This
717	 * factor of fifty might not be accurate; we will change it later.) We
718 * use the following formula to estimate the overhead:
719 * dest->activeconns*50 + dest->inactconns
720 * and the load:
721 * (dest overhead) / dest->weight
722 *
723 * Remember -- no floats in kernel mode!!!
724 * The comparison of h1*w2 > h2*w1 is equivalent to that of
725 * h1/w1 > h2/w2
726 * if every weight is larger than zero.
727 *
728 * The server with weight=0 is quiesced and will not receive any
729 * new connection.
730 */
731 list_for_each_entry(dest, &svc->destinations, n_list) {
732 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
733 continue;
734
735 if (atomic_read(&dest->weight) > 0) {
736 least = dest;
737 loh = atomic_read(&least->activeconns) * 50
738 + atomic_read(&least->inactconns);
739 goto nextstage;
740 }
741 }
742 return NULL;
743
744 /*
745 * Find the destination with the least load.
746 */
747 nextstage:
748 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
749 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
750 continue;
751
752 doh = atomic_read(&dest->activeconns) * 50
753 + atomic_read(&dest->inactconns);
754 if (loh * atomic_read(&dest->weight) >
755 doh * atomic_read(&least->weight)) {
756 least = dest;
757 loh = doh;
758 }
759 }
760
761 IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
762 "activeconns %d refcnt %d weight %d overhead %d\n",
763 NIPQUAD(least->addr), ntohs(least->port),
764 atomic_read(&least->activeconns),
765 atomic_read(&least->refcnt),
766 atomic_read(&least->weight), loh);
767
768 return least;
769}
770
771
772/*
773 * If this destination server is overloaded and there is a less loaded
774 * server, then return true.
775 */
776static inline int
777is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
778{
779 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
780 struct ip_vs_dest *d;
781
782 list_for_each_entry(d, &svc->destinations, n_list) {
783 if (atomic_read(&d->activeconns)*2
784 < atomic_read(&d->weight)) {
785 return 1;
786 }
787 }
788 }
789 return 0;
790}
791
792
793/*
794 * Locality-Based (weighted) Least-Connection scheduling
795 */
796static struct ip_vs_dest *
797ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
798{
799 struct ip_vs_dest *dest;
800 struct ip_vs_lblcr_table *tbl;
801 struct ip_vs_lblcr_entry *en;
802 struct iphdr *iph = skb->nh.iph;
803
804 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
805
806 tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
807 en = ip_vs_lblcr_get(tbl, iph->daddr);
808 if (en == NULL) {
809 dest = __ip_vs_wlc_schedule(svc, iph);
810 if (dest == NULL) {
811 IP_VS_DBG(1, "no destination available\n");
812 return NULL;
813 }
814 en = ip_vs_lblcr_new(iph->daddr);
815 if (en == NULL) {
816 return NULL;
817 }
818 ip_vs_dest_set_insert(&en->set, dest);
819 ip_vs_lblcr_hash(tbl, en);
820 } else {
821 dest = ip_vs_dest_set_min(&en->set);
822 if (!dest || is_overloaded(dest, svc)) {
823 dest = __ip_vs_wlc_schedule(svc, iph);
824 if (dest == NULL) {
825 IP_VS_DBG(1, "no destination available\n");
826 return NULL;
827 }
828 ip_vs_dest_set_insert(&en->set, dest);
829 }
830 if (atomic_read(&en->set.size) > 1 &&
831 jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
832 struct ip_vs_dest *m;
833 m = ip_vs_dest_set_max(&en->set);
834 if (m)
835 ip_vs_dest_set_erase(&en->set, m);
836 }
837 }
838 en->lastuse = jiffies;
839
840 IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
841 "--> server %u.%u.%u.%u:%d\n",
842 NIPQUAD(en->addr),
843 NIPQUAD(dest->addr),
844 ntohs(dest->port));
845
846 return dest;
847}
848
849
850/*
851 * IPVS LBLCR Scheduler structure
852 */
853static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
854{
855 .name = "lblcr",
856 .refcnt = ATOMIC_INIT(0),
857 .module = THIS_MODULE,
858 .init_service = ip_vs_lblcr_init_svc,
859 .done_service = ip_vs_lblcr_done_svc,
860 .update_service = ip_vs_lblcr_update_svc,
861 .schedule = ip_vs_lblcr_schedule,
862};
863
864
865static int __init ip_vs_lblcr_init(void)
866{
867 INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
868 sysctl_header = register_sysctl_table(lblcr_root_table, 0);
869#ifdef CONFIG_IP_VS_LBLCR_DEBUG
870 proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo);
871#endif
872 return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
873}
874
875
876static void __exit ip_vs_lblcr_cleanup(void)
877{
878#ifdef CONFIG_IP_VS_LBLCR_DEBUG
879 proc_net_remove("ip_vs_lblcr");
880#endif
881 unregister_sysctl_table(sysctl_header);
882 unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
883}
884
885
886module_init(ip_vs_lblcr_init);
887module_exit(ip_vs_lblcr_cleanup);
888MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
new file mode 100644
index 000000000000..d88fef90a641
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lc.c
@@ -0,0 +1,123 @@
1/*
2 * IPVS: Least-Connection Scheduling module
3 *
4 * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Wensong Zhang : added the ip_vs_lc_update_svc
15 * Wensong Zhang : added any dest with weight=0 is quiesced
16 *
17 */
18
19#include <linux/module.h>
20#include <linux/kernel.h>
21
22#include <net/ip_vs.h>
23
24
25static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
26{
27 return 0;
28}
29
30
31static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
32{
33 return 0;
34}
35
36
37static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
38{
39 return 0;
40}
41
42
43static inline unsigned int
44ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
45{
46 /*
47 * We think the overhead of processing active connections is 256
48	 * times higher than that of inactive connections on average. (This
49	 * factor of 256 might not be accurate; we will change it later.) We
50 * use the following formula to estimate the overhead now:
51 * dest->activeconns*256 + dest->inactconns
52 */
53 return (atomic_read(&dest->activeconns) << 8) +
54 atomic_read(&dest->inactconns);
55}
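With invented counts, a destination holding 3 active and 40 inactive connections gets overhead (3<<8) + 40 = 808, while one holding 2 active and 300 inactive gets 812, so the former is still preferred: one active connection outweighs roughly 256 inactive ones in this estimate.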
56
57
58/*
59 * Least Connection scheduling
60 */
61static struct ip_vs_dest *
62ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
63{
64 struct ip_vs_dest *dest, *least = NULL;
65 unsigned int loh = 0, doh;
66
67 IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
68
69 /*
70	 * Simply select the server with the smallest value of
71	 * (activeconns<<8) + inactconns,
72	 * except servers whose weight is equal to zero.
73	 * A weight of zero means that the server is quiesced: the existing
74	 * connections to the server still get served, but no new
75	 * connections are assigned to it.
76 */
77
78 list_for_each_entry(dest, &svc->destinations, n_list) {
79 if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
80 atomic_read(&dest->weight) == 0)
81 continue;
82 doh = ip_vs_lc_dest_overhead(dest);
83 if (!least || doh < loh) {
84 least = dest;
85 loh = doh;
86 }
87 }
88
89 if (least)
90 IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
91 NIPQUAD(least->addr), ntohs(least->port),
92 atomic_read(&least->activeconns),
93 atomic_read(&least->inactconns));
94
95 return least;
96}
97
98
99static struct ip_vs_scheduler ip_vs_lc_scheduler = {
100 .name = "lc",
101 .refcnt = ATOMIC_INIT(0),
102 .module = THIS_MODULE,
103 .init_service = ip_vs_lc_init_svc,
104 .done_service = ip_vs_lc_done_svc,
105 .update_service = ip_vs_lc_update_svc,
106 .schedule = ip_vs_lc_schedule,
107};
108
109
110static int __init ip_vs_lc_init(void)
111{
112 INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
113 return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
114}
115
116static void __exit ip_vs_lc_cleanup(void)
117{
118 unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
119}
120
121module_init(ip_vs_lc_init);
122module_exit(ip_vs_lc_cleanup);
123MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
new file mode 100644
index 000000000000..bc2a9e5f2a7b
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_nq.c
@@ -0,0 +1,161 @@
1/*
2 * IPVS: Never Queue scheduling module
3 *
4 * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16
17/*
18 * The NQ algorithm adopts a two-speed model. When there is an idle server
19 * available, the job will be sent to the idle server, instead of waiting
20 * for a fast one. When there is no idle server available, the job will be
21 * sent to the server that minimizes its expected delay (the Shortest
22 * Expected Delay scheduling algorithm).
23 *
24 * See the following paper for more information:
25 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
26 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
27 * pages 986-994, 1988.
28 *
29 * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
30 *
31 * The difference between NQ and SED is that NQ can improve overall
32 * system utilization.
33 *
34 */
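A worked decision with invented numbers: an idle server (0 active connections) is picked immediately, regardless of weight. Otherwise, with server A at 4 active connections and weight 1 (overhead 4+1 = 5) and server B at 12 active connections and weight 3 (overhead 13), the cross-multiplied comparison 5*3 = 15 > 13*1 = 13 selects B, matching SED's (activeconns+1)/weight ordering without floating point.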
35
36#include <linux/module.h>
37#include <linux/kernel.h>
38
39#include <net/ip_vs.h>
40
41
42static int
43ip_vs_nq_init_svc(struct ip_vs_service *svc)
44{
45 return 0;
46}
47
48
49static int
50ip_vs_nq_done_svc(struct ip_vs_service *svc)
51{
52 return 0;
53}
54
55
56static int
57ip_vs_nq_update_svc(struct ip_vs_service *svc)
58{
59 return 0;
60}
61
62
63static inline unsigned int
64ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
65{
66 /*
67 * We only use the active connection number in the cost
68 * calculation here.
69 */
70 return atomic_read(&dest->activeconns) + 1;
71}
72
73
74/*
75 * Never Queue scheduling
76 */
77static struct ip_vs_dest *
78ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
79{
80 struct ip_vs_dest *dest, *least = NULL;
81 unsigned int loh = 0, doh;
82
83 IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
84
85 /*
86 * We calculate the load of each dest server as follows:
87 * (server expected overhead) / dest->weight
88 *
89 * Remember -- no floats in kernel mode!!!
90 * The comparison of h1*w2 > h2*w1 is equivalent to that of
91 * h1/w1 > h2/w2
92 * if every weight is larger than zero.
93 *
94 * The server with weight=0 is quiesced and will not receive any
95 * new connections.
96 */
97
98 list_for_each_entry(dest, &svc->destinations, n_list) {
99
100 if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
101 !atomic_read(&dest->weight))
102 continue;
103
104 doh = ip_vs_nq_dest_overhead(dest);
105
106 /* return the server directly if it is idle */
107 if (atomic_read(&dest->activeconns) == 0) {
108 least = dest;
109 loh = doh;
110 goto out;
111 }
112
113 if (!least ||
114 (loh * atomic_read(&dest->weight) >
115 doh * atomic_read(&least->weight))) {
116 least = dest;
117 loh = doh;
118 }
119 }
120
121 if (!least)
122 return NULL;
123
124 out:
125 IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
126 "activeconns %d refcnt %d weight %d overhead %d\n",
127 NIPQUAD(least->addr), ntohs(least->port),
128 atomic_read(&least->activeconns),
129 atomic_read(&least->refcnt),
130 atomic_read(&least->weight), loh);
131
132 return least;
133}
134
135
136static struct ip_vs_scheduler ip_vs_nq_scheduler =
137{
138 .name = "nq",
139 .refcnt = ATOMIC_INIT(0),
140 .module = THIS_MODULE,
141 .init_service = ip_vs_nq_init_svc,
142 .done_service = ip_vs_nq_done_svc,
143 .update_service = ip_vs_nq_update_svc,
144 .schedule = ip_vs_nq_schedule,
145};
146
147
148static int __init ip_vs_nq_init(void)
149{
150 INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list);
151 return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
152}
153
154static void __exit ip_vs_nq_cleanup(void)
155{
156 unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
157}
158
159module_init(ip_vs_nq_init);
160module_exit(ip_vs_nq_cleanup);
161MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
new file mode 100644
index 000000000000..253c46252bd5
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -0,0 +1,244 @@
1/*
2 * ip_vs_proto.c: transport protocol load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/module.h>
19#include <linux/kernel.h>
20#include <linux/skbuff.h>
21#include <linux/in.h>
22#include <linux/ip.h>
23#include <net/protocol.h>
24#include <net/tcp.h>
25#include <net/udp.h>
26#include <asm/system.h>
27#include <linux/stat.h>
28#include <linux/proc_fs.h>
29
30#include <net/ip_vs.h>
31
32
33/*
34 * IPVS protocols can only be registered/unregistered when the ipvs
35 * module is loaded/unloaded, so no lock is needed in accessing the
36 * ipvs protocol table.
37 */
38
39#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
40#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
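/*
 * e.g. TCP (protocol 6) hashes to bucket 6 and ESP (protocol 50) to
 * bucket 50 & 31 = 18; with only a handful of protocols ever registered,
 * each chain stays very short.
 */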
41
42static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
43
44
45/*
46 * register an ipvs protocol
47 */
48static int register_ip_vs_protocol(struct ip_vs_protocol *pp)
49{
50 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
51
52 pp->next = ip_vs_proto_table[hash];
53 ip_vs_proto_table[hash] = pp;
54
55 if (pp->init != NULL)
56 pp->init(pp);
57
58 return 0;
59}
60
61
62/*
63 * unregister an ipvs protocol
64 */
65static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
66{
67 struct ip_vs_protocol **pp_p;
68 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
69
70 pp_p = &ip_vs_proto_table[hash];
71 for (; *pp_p; pp_p = &(*pp_p)->next) {
72 if (*pp_p == pp) {
73 *pp_p = pp->next;
74 if (pp->exit != NULL)
75 pp->exit(pp);
76 return 0;
77 }
78 }
79
80 return -ESRCH;
81}
82
83
84/*
85 * get ip_vs_protocol object by its proto.
86 */
87struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
88{
89 struct ip_vs_protocol *pp;
90 unsigned hash = IP_VS_PROTO_HASH(proto);
91
92 for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
93 if (pp->protocol == proto)
94 return pp;
95 }
96
97 return NULL;
98}
99
100
101/*
102 * Propagate event for state change to all protocols
103 */
104void ip_vs_protocol_timeout_change(int flags)
105{
106 struct ip_vs_protocol *pp;
107 int i;
108
109 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
110 for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
111 if (pp->timeout_change)
112 pp->timeout_change(pp, flags);
113 }
114 }
115}
116
117
118int *
119ip_vs_create_timeout_table(int *table, int size)
120{
121 int *t;
122
123 t = kmalloc(size, GFP_ATOMIC);
124 if (t == NULL)
125 return NULL;
126 memcpy(t, table, size);
127 return t;
128}
129
130
131/*
132 * Set timeout value for state specified by name
133 */
134int
135ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
136{
137 int i;
138
139 if (!table || !name || !to)
140 return -EINVAL;
141
142 for (i = 0; i < num; i++) {
143 if (strcmp(names[i], name))
144 continue;
145 table[i] = to * HZ;
146 return 0;
147 }
148 return -ENOENT;
149}
150
151
152const char * ip_vs_state_name(__u16 proto, int state)
153{
154 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
155
156 if (pp == NULL || pp->state_name == NULL)
157 return "ERR!";
158 return pp->state_name(state);
159}
160
161
162void
163ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
164 const struct sk_buff *skb,
165 int offset,
166 const char *msg)
167{
168 char buf[128];
169 struct iphdr _iph, *ih;
170
171 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
172 if (ih == NULL)
173 sprintf(buf, "%s TRUNCATED", pp->name);
174 else if (ih->frag_off & __constant_htons(IP_OFFSET))
175 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
176 pp->name, NIPQUAD(ih->saddr),
177 NIPQUAD(ih->daddr));
178 else {
179 __u16 _ports[2], *pptr;
181 pptr = skb_header_pointer(skb, offset + ih->ihl*4,
182 sizeof(_ports), _ports);
183 if (pptr == NULL)
184 sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
185 pp->name,
186 NIPQUAD(ih->saddr),
187 NIPQUAD(ih->daddr));
188 else
189 sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
190 pp->name,
191 NIPQUAD(ih->saddr),
192 ntohs(pptr[0]),
193 NIPQUAD(ih->daddr),
194 ntohs(pptr[1]));
195 }
196
197 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
198}
199
200
201int ip_vs_protocol_init(void)
202{
203 char protocols[64];
204#define REGISTER_PROTOCOL(p) \
205 do { \
206 register_ip_vs_protocol(p); \
207 strcat(protocols, ", "); \
208 strcat(protocols, (p)->name); \
209 } while (0)
210
211 protocols[0] = '\0';
212 protocols[2] = '\0';
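	/*
	 * Each REGISTER_PROTOCOL() above prepends ", " before the name, so
	 * the message below starts at &protocols[2] to skip the leading
	 * separator; the extra '\0' written at protocols[2] keeps that
	 * string valid (empty) when no protocol is compiled in.
	 */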
213#ifdef CONFIG_IP_VS_PROTO_TCP
214 REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
215#endif
216#ifdef CONFIG_IP_VS_PROTO_UDP
217 REGISTER_PROTOCOL(&ip_vs_protocol_udp);
218#endif
219#ifdef CONFIG_IP_VS_PROTO_ICMP
220 REGISTER_PROTOCOL(&ip_vs_protocol_icmp);
221#endif
222#ifdef CONFIG_IP_VS_PROTO_AH
223 REGISTER_PROTOCOL(&ip_vs_protocol_ah);
224#endif
225#ifdef CONFIG_IP_VS_PROTO_ESP
226 REGISTER_PROTOCOL(&ip_vs_protocol_esp);
227#endif
228 IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
229
230 return 0;
231}
232
233
234void ip_vs_protocol_cleanup(void)
235{
236 struct ip_vs_protocol *pp;
237 int i;
238
239 /* unregister all the ipvs protocols */
240 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
241 while ((pp = ip_vs_proto_table[i]) != NULL)
242 unregister_ip_vs_protocol(pp);
243 }
244}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
new file mode 100644
index 000000000000..453e94a0bbd7
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -0,0 +1,177 @@
1/*
2 * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
5 *
6 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
7 * Wensong Zhang <wensong@linuxvirtualserver.org>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * version 2 as published by the Free Software Foundation;
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
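
/*
 * AH itself carries no port numbers, so the lookups below reuse the UDP
 * connection created by the ISAKMP/IKE exchange (port 500) between the
 * same two peers; this lets the IPsec data packets follow the
 * negotiation to the same real server.
 */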
39
40
41static struct ip_vs_conn *
42ah_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 __constant_htons(PORT_ISAKMP),
54 iph->daddr,
55 __constant_htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 __constant_htons(PORT_ISAKMP),
60 iph->saddr,
61 __constant_htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 __constant_htons(PORT_ISAKMP),
91 iph->daddr,
92 __constant_htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 __constant_htons(PORT_ISAKMP),
97 iph->saddr,
98 __constant_htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115ah_conn_schedule(struct sk_buff *skb,
116 struct ip_vs_protocol *pp,
117 int *verdict, struct ip_vs_conn **cpp)
118{
119 /*
120 * AH packets are handled only as related traffic, so pass the packet on to the IP stack.
121 */
122 *verdict = NF_ACCEPT;
123 return 0;
124}
125
126
127static void
128ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
129 int offset, const char *msg)
130{
131 char buf[256];
132 struct iphdr _iph, *ih;
133
134 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
135 if (ih == NULL)
136 sprintf(buf, "%s TRUNCATED", pp->name);
137 else
138 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
139 pp->name, NIPQUAD(ih->saddr),
140 NIPQUAD(ih->daddr));
141
142 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
143}
144
145
146static void ah_init(struct ip_vs_protocol *pp)
147{
148 /* nothing to do now */
149}
150
151
152static void ah_exit(struct ip_vs_protocol *pp)
153{
154 /* nothing to do now */
155}
156
157
158struct ip_vs_protocol ip_vs_protocol_ah = {
159 .name = "AH",
160 .protocol = IPPROTO_AH,
161 .dont_defrag = 1,
162 .init = ah_init,
163 .exit = ah_exit,
164 .conn_schedule = ah_conn_schedule,
165 .conn_in_get = ah_conn_in_get,
166 .conn_out_get = ah_conn_out_get,
167 .snat_handler = NULL,
168 .dnat_handler = NULL,
169 .csum_check = NULL,
170 .state_transition = NULL,
171 .register_app = NULL,
172 .unregister_app = NULL,
173 .app_conn_bind = NULL,
174 .debug_packet = ah_debug_packet,
175 .timeout_change = NULL, /* ISAKMP */
176 .set_state_timeout = NULL,
177};
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
new file mode 100644
index 000000000000..478e5c7c7e8e
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -0,0 +1,175 @@
1/*
2 * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
5 *
6 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
7 * Wensong Zhang <wensong@linuxvirtualserver.org>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * version 2 as published by the Free Software Foundation;
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
39
40
41static struct ip_vs_conn *
42esp_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 __constant_htons(PORT_ISAKMP),
54 iph->daddr,
55 __constant_htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 __constant_htons(PORT_ISAKMP),
60 iph->saddr,
61 __constant_htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 __constant_htons(PORT_ISAKMP),
91 iph->daddr,
92 __constant_htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 __constant_htons(PORT_ISAKMP),
97 iph->saddr,
98 __constant_htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
116 int *verdict, struct ip_vs_conn **cpp)
117{
118 /*
119 * ESP packets are handled only as related traffic, so pass the packet on to the IP stack.
120 */
121 *verdict = NF_ACCEPT;
122 return 0;
123}
124
125
126static void
127esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
128 int offset, const char *msg)
129{
130 char buf[256];
131 struct iphdr _iph, *ih;
132
133 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
134 if (ih == NULL)
135 sprintf(buf, "%s TRUNCATED", pp->name);
136 else
137 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
138 pp->name, NIPQUAD(ih->saddr),
139 NIPQUAD(ih->daddr));
140
141 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
142}
143
144
145static void esp_init(struct ip_vs_protocol *pp)
146{
147 /* nothing to do now */
148}
149
150
151static void esp_exit(struct ip_vs_protocol *pp)
152{
153 /* nothing to do now */
154}
155
156
157struct ip_vs_protocol ip_vs_protocol_esp = {
158 .name = "ESP",
159 .protocol = IPPROTO_ESP,
160 .dont_defrag = 1,
161 .init = esp_init,
162 .exit = esp_exit,
163 .conn_schedule = esp_conn_schedule,
164 .conn_in_get = esp_conn_in_get,
165 .conn_out_get = esp_conn_out_get,
166 .snat_handler = NULL,
167 .dnat_handler = NULL,
168 .csum_check = NULL,
169 .state_transition = NULL,
170 .register_app = NULL,
171 .unregister_app = NULL,
172 .app_conn_bind = NULL,
173 .debug_packet = esp_debug_packet,
174 .timeout_change = NULL, /* ISAKMP */
175};
diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c
new file mode 100644
index 000000000000..191e94aa1c1f
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_icmp.c
@@ -0,0 +1,182 @@
1/*
2 * ip_vs_proto_icmp.c: ICMP load balancing support for IP Virtual Server
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, March 2002
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * version 2 as published by the Free Software Foundation;
9 *
10 */
11
12#include <linux/module.h>
13#include <linux/kernel.h>
14#include <linux/icmp.h>
15#include <linux/netfilter.h>
16#include <linux/netfilter_ipv4.h>
17
18#include <net/ip_vs.h>
19
20
21static int icmp_timeouts[1] = { 1*60*HZ };
22
23static char * icmp_state_name_table[1] = { "ICMP" };
24
25static struct ip_vs_conn *
26icmp_conn_in_get(const struct sk_buff *skb,
27 struct ip_vs_protocol *pp,
28 const struct iphdr *iph,
29 unsigned int proto_off,
30 int inverse)
31{
32#if 0
33 struct ip_vs_conn *cp;
34
35 if (likely(!inverse)) {
36 cp = ip_vs_conn_in_get(iph->protocol,
37 iph->saddr, 0,
38 iph->daddr, 0);
39 } else {
40 cp = ip_vs_conn_in_get(iph->protocol,
41 iph->daddr, 0,
42 iph->saddr, 0);
43 }
44
45 return cp;
46
47#else
48 return NULL;
49#endif
50}
51
52static struct ip_vs_conn *
53icmp_conn_out_get(const struct sk_buff *skb,
54 struct ip_vs_protocol *pp,
55 const struct iphdr *iph,
56 unsigned int proto_off,
57 int inverse)
58{
59#if 0
60 struct ip_vs_conn *cp;
61
62 if (likely(!inverse)) {
63 cp = ip_vs_conn_out_get(iph->protocol,
64 iph->saddr, 0,
65 iph->daddr, 0);
66 } else {
67 cp = ip_vs_conn_out_get(iph->protocol,
68 iph->daddr, 0,
69 iph->saddr, 0);
70 }
71
72 return cp;
73#else
74 return NULL;
75#endif
76}
77
78static int
79icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
80 int *verdict, struct ip_vs_conn **cpp)
81{
82 *verdict = NF_ACCEPT;
83 return 0;
84}
85
86static int
87icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
88{
89 if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) {
90 if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
91 if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) {
92 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for");
93 return 0;
94 }
95 }
96 }
97 return 1;
98}
99
100static void
101icmp_debug_packet(struct ip_vs_protocol *pp,
102 const struct sk_buff *skb,
103 int offset,
104 const char *msg)
105{
106 char buf[256];
107 struct iphdr _iph, *ih;
108
109 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
110 if (ih == NULL)
111 sprintf(buf, "%s TRUNCATED", pp->name);
112 else if (ih->frag_off & __constant_htons(IP_OFFSET))
113 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
114 pp->name, NIPQUAD(ih->saddr),
115 NIPQUAD(ih->daddr));
116 else {
117 struct icmphdr _icmph, *ic;
118
119 ic = skb_header_pointer(skb, offset + ih->ihl*4,
120 sizeof(_icmph), &_icmph);
121 if (ic == NULL)
122 sprintf(buf, "%s TRUNCATED to %u bytes",
123 pp->name, skb->len - offset);
124 else
125 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d",
126 pp->name, NIPQUAD(ih->saddr),
127 NIPQUAD(ih->daddr),
128 ic->type, ic->code);
129 }
130 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
131}
132
133static int
134icmp_state_transition(struct ip_vs_conn *cp, int direction,
135 const struct sk_buff *skb,
136 struct ip_vs_protocol *pp)
137{
138 cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
139 return 1;
140}
141
142static int
143icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
144{
145 int num;
146 char **names;
147
148 num = IP_VS_ICMP_S_LAST;
149 names = icmp_state_name_table;
150 return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to);
151}
152
153
154static void icmp_init(struct ip_vs_protocol *pp)
155{
156 pp->timeout_table = icmp_timeouts;
157}
158
159static void icmp_exit(struct ip_vs_protocol *pp)
160{
161}
162
163struct ip_vs_protocol ip_vs_protocol_icmp = {
164 .name = "ICMP",
165 .protocol = IPPROTO_ICMP,
166 .dont_defrag = 0,
167 .init = icmp_init,
168 .exit = icmp_exit,
169 .conn_schedule = icmp_conn_schedule,
170 .conn_in_get = icmp_conn_in_get,
171 .conn_out_get = icmp_conn_out_get,
172 .snat_handler = NULL,
173 .dnat_handler = NULL,
174 .csum_check = icmp_csum_check,
175 .state_transition = icmp_state_transition,
176 .register_app = NULL,
177 .unregister_app = NULL,
178 .app_conn_bind = NULL,
179 .debug_packet = icmp_debug_packet,
180 .timeout_change = NULL,
181 .set_state_timeout = icmp_set_state_timeout,
182};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
new file mode 100644
index 000000000000..e65de675da74
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -0,0 +1,640 @@
1/*
2 * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/ip.h>
20#include <linux/tcp.h> /* for tcphdr */
21#include <net/ip.h>
22#include <net/tcp.h> /* for csum_tcpudp_magic */
23#include <linux/netfilter_ipv4.h>
24
25#include <net/ip_vs.h>
26
27
28static struct ip_vs_conn *
29tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
30 const struct iphdr *iph, unsigned int proto_off, int inverse)
31{
32 __u16 _ports[2], *pptr;
33
34 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
35 if (pptr == NULL)
36 return NULL;
37
38 if (likely(!inverse)) {
39 return ip_vs_conn_in_get(iph->protocol,
40 iph->saddr, pptr[0],
41 iph->daddr, pptr[1]);
42 } else {
43 return ip_vs_conn_in_get(iph->protocol,
44 iph->daddr, pptr[1],
45 iph->saddr, pptr[0]);
46 }
47}
48
49static struct ip_vs_conn *
50tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
51 const struct iphdr *iph, unsigned int proto_off, int inverse)
52{
53 __u16 _ports[2], *pptr;
54
55 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
56 if (pptr == NULL)
57 return NULL;
58
59 if (likely(!inverse)) {
60 return ip_vs_conn_out_get(iph->protocol,
61 iph->saddr, pptr[0],
62 iph->daddr, pptr[1]);
63 } else {
64 return ip_vs_conn_out_get(iph->protocol,
65 iph->daddr, pptr[1],
66 iph->saddr, pptr[0]);
67 }
68}
69
70
71static int
72tcp_conn_schedule(struct sk_buff *skb,
73 struct ip_vs_protocol *pp,
74 int *verdict, struct ip_vs_conn **cpp)
75{
76 struct ip_vs_service *svc;
77 struct tcphdr _tcph, *th;
78
79 th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
80 sizeof(_tcph), &_tcph);
81 if (th == NULL) {
82 *verdict = NF_DROP;
83 return 0;
84 }
85
86 if (th->syn &&
87 (svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
88 skb->nh.iph->daddr, th->dest))) {
89 if (ip_vs_todrop()) {
90 /*
91 * It seems that we are very loaded.
92 * We have to drop this packet :(
93 */
94 ip_vs_service_put(svc);
95 *verdict = NF_DROP;
96 return 0;
97 }
98
99 /*
100 * Let the virtual server select a real server for the
101 * incoming connection, and create a connection entry.
102 */
103 *cpp = ip_vs_schedule(svc, skb);
104 if (!*cpp) {
105 *verdict = ip_vs_leave(svc, skb, pp);
106 return 0;
107 }
108 ip_vs_service_put(svc);
109 }
110 return 1;
111}
112
113
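
/*
 * The helper below does an incremental (RFC 1624 style) checksum update:
 * the old address and old port are folded out of the stored checksum (as
 * their one's complements) and the new values are folded in, so only the
 * rewritten fields are touched instead of re-summing the whole segment.
 */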
114static inline void
115tcp_fast_csum_update(struct tcphdr *tcph, u32 oldip, u32 newip,
116 u16 oldport, u16 newport)
117{
118 tcph->check =
119 ip_vs_check_diff(~oldip, newip,
120 ip_vs_check_diff(oldport ^ 0xFFFF,
121 newport, tcph->check));
122}
123
124
125static int
126tcp_snat_handler(struct sk_buff **pskb,
127 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
128{
129 struct tcphdr *tcph;
130 unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
131
132 /* csum_check requires unshared skb */
133 if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
134 return 0;
135
136 if (unlikely(cp->app != NULL)) {
137 /* Some checks before mangling */
138 if (pp->csum_check && !pp->csum_check(*pskb, pp))
139 return 0;
140
141 /* Call application helper if needed */
142 if (!ip_vs_app_pkt_out(cp, pskb))
143 return 0;
144 }
145
146 tcph = (void *)(*pskb)->nh.iph + tcphoff;
147 tcph->source = cp->vport;
148
149 /* Adjust TCP checksums */
150 if (!cp->app) {
151 /* Only port and addr are changed, do fast csum update */
152 tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
153 cp->dport, cp->vport);
154 if ((*pskb)->ip_summed == CHECKSUM_HW)
155 (*pskb)->ip_summed = CHECKSUM_NONE;
156 } else {
157 /* full checksum calculation */
158 tcph->check = 0;
159 (*pskb)->csum = skb_checksum(*pskb, tcphoff,
160 (*pskb)->len - tcphoff, 0);
161 tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
162 (*pskb)->len - tcphoff,
163 cp->protocol,
164 (*pskb)->csum);
165 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
166 pp->name, tcph->check,
167 (char*)&(tcph->check) - (char*)tcph);
168 }
169 return 1;
170}
171
172
173static int
174tcp_dnat_handler(struct sk_buff **pskb,
175 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
176{
177 struct tcphdr *tcph;
178 unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
179
180 /* csum_check requires unshared skb */
181 if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
182 return 0;
183
184 if (unlikely(cp->app != NULL)) {
185 /* Some checks before mangling */
186 if (pp->csum_check && !pp->csum_check(*pskb, pp))
187 return 0;
188
189 /*
190 * Attempt ip_vs_app call.
191 * It will fix ip_vs_conn and iph ack_seq stuff
192 */
193 if (!ip_vs_app_pkt_in(cp, pskb))
194 return 0;
195 }
196
197 tcph = (void *)(*pskb)->nh.iph + tcphoff;
198 tcph->dest = cp->dport;
199
200 /*
201 * Adjust TCP checksums
202 */
203 if (!cp->app) {
204 /* Only port and addr are changed, do fast csum update */
205 tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
206 cp->vport, cp->dport);
207 if ((*pskb)->ip_summed == CHECKSUM_HW)
208 (*pskb)->ip_summed = CHECKSUM_NONE;
209 } else {
210 /* full checksum calculation */
211 tcph->check = 0;
212 (*pskb)->csum = skb_checksum(*pskb, tcphoff,
213 (*pskb)->len - tcphoff, 0);
214 tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
215 (*pskb)->len - tcphoff,
216 cp->protocol,
217 (*pskb)->csum);
218 (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
219 }
220 return 1;
221}
222
223
224static int
225tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
226{
227 unsigned int tcphoff = skb->nh.iph->ihl*4;
228
229 switch (skb->ip_summed) {
230 case CHECKSUM_NONE:
231 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
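		/* fall through: the checksum computed above is verified below */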
232 case CHECKSUM_HW:
233 if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
234 skb->len - tcphoff,
235 skb->nh.iph->protocol, skb->csum)) {
236 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
237 "Failed checksum for");
238 return 0;
239 }
240 break;
241 default:
242 /* CHECKSUM_UNNECESSARY */
243 break;
244 }
245
246 return 1;
247}
248
249
250#define TCP_DIR_INPUT 0
251#define TCP_DIR_OUTPUT 4
252#define TCP_DIR_INPUT_ONLY 8
253
254static int tcp_state_off[IP_VS_DIR_LAST] = {
255 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
256 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
257 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
258};
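
/*
 * The state tables below are indexed as
 *	tcp_state_table[direction_offset + event].next_state[current_state],
 * where event is 0..3 for syn/fin/ack/rst (see tcp_state_idx() further
 * down), which is why the per-direction offsets above step by four.
 */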
259
260/*
261 * Timeout table[state]
262 */
263static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
264 [IP_VS_TCP_S_NONE] = 2*HZ,
265 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
266 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
267 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
268 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
269 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
270 [IP_VS_TCP_S_CLOSE] = 10*HZ,
271 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
272 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
273 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
274 [IP_VS_TCP_S_SYNACK] = 120*HZ,
275 [IP_VS_TCP_S_LAST] = 2*HZ,
276};
277
278
279#if 0
280
281/* FIXME: This is going to die */
282
283static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
284 [IP_VS_TCP_S_NONE] = 2*HZ,
285 [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ,
286 [IP_VS_TCP_S_SYN_SENT] = 60*HZ,
287 [IP_VS_TCP_S_SYN_RECV] = 10*HZ,
288 [IP_VS_TCP_S_FIN_WAIT] = 60*HZ,
289 [IP_VS_TCP_S_TIME_WAIT] = 60*HZ,
290 [IP_VS_TCP_S_CLOSE] = 10*HZ,
291 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
292 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
293 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
294 [IP_VS_TCP_S_SYNACK] = 100*HZ,
295 [IP_VS_TCP_S_LAST] = 2*HZ,
296};
297
298#endif
299
300static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
301 [IP_VS_TCP_S_NONE] = "NONE",
302 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
303 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
304 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
305 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
306 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
307 [IP_VS_TCP_S_CLOSE] = "CLOSE",
308 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
309 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
310 [IP_VS_TCP_S_LISTEN] = "LISTEN",
311 [IP_VS_TCP_S_SYNACK] = "SYNACK",
312 [IP_VS_TCP_S_LAST] = "BUG!",
313};
314
315#define sNO IP_VS_TCP_S_NONE
316#define sES IP_VS_TCP_S_ESTABLISHED
317#define sSS IP_VS_TCP_S_SYN_SENT
318#define sSR IP_VS_TCP_S_SYN_RECV
319#define sFW IP_VS_TCP_S_FIN_WAIT
320#define sTW IP_VS_TCP_S_TIME_WAIT
321#define sCL IP_VS_TCP_S_CLOSE
322#define sCW IP_VS_TCP_S_CLOSE_WAIT
323#define sLA IP_VS_TCP_S_LAST_ACK
324#define sLI IP_VS_TCP_S_LISTEN
325#define sSA IP_VS_TCP_S_SYNACK
326
327struct tcp_states_t {
328 int next_state[IP_VS_TCP_S_LAST];
329};
330
331static const char * tcp_state_name(int state)
332{
333 if (state >= IP_VS_TCP_S_LAST)
334 return "ERR!";
335 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
336}
337
338static struct tcp_states_t tcp_states [] = {
339/* INPUT */
340/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
341/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
342/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
343/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
344/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
345
346/* OUTPUT */
347/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
348/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
349/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
350/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
351/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
352
353/* INPUT-ONLY */
354/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
355/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
356/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
357/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
358/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
359};
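
/*
 * Reading the normal table above: an incoming SYN while the connection
 * is still in sNO moves it to sSR, an incoming ACK while in sSA
 * completes the handshake and moves it to sES, and an incoming RST in
 * sES drops the connection straight to sCL.
 */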
360
361static struct tcp_states_t tcp_states_dos [] = {
362/* INPUT */
363/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
364/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
365/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
366/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
367/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
368
369/* OUTPUT */
370/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
371/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
372/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
373/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
374/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
375
376/* INPUT-ONLY */
377/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
378/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
379/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
380/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
381/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
382};
383
384static struct tcp_states_t *tcp_state_table = tcp_states;
385
386
387static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
388{
389 int on = (flags & 1); /* secure_tcp */
390
391 /*
392 ** FIXME: change secure_tcp to independent sysctl var
393 ** or make it per-service or per-app because it is valid
394 ** for most if not for all of the applications. Something
395 ** like "capabilities" (flags) for each object.
396 */
397 tcp_state_table = (on? tcp_states_dos : tcp_states);
398}
399
400static int
401tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
402{
403 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
404 tcp_state_name_table, sname, to);
405}
406
407static inline int tcp_state_idx(struct tcphdr *th)
408{
409 if (th->rst)
410 return 3;
411 if (th->syn)
412 return 0;
413 if (th->fin)
414 return 1;
415 if (th->ack)
416 return 2;
417 return -1;
418}
419
420static inline void
421set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
422 int direction, struct tcphdr *th)
423{
424 int state_idx;
425 int new_state = IP_VS_TCP_S_CLOSE;
426 int state_off = tcp_state_off[direction];
427
428 /*
429 * Update state offset to INPUT_ONLY if necessary
430 * or delete NO_OUTPUT flag if output packet detected
431 */
432 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
433 if (state_off == TCP_DIR_OUTPUT)
434 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
435 else
436 state_off = TCP_DIR_INPUT_ONLY;
437 }
438
439 if ((state_idx = tcp_state_idx(th)) < 0) {
440 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
441 goto tcp_state_out;
442 }
443
444 new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
445
446 tcp_state_out:
447 if (new_state != cp->state) {
448 struct ip_vs_dest *dest = cp->dest;
449
450 IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
451 "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
452 pp->name,
453 (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
454 th->syn? 'S' : '.',
455 th->fin? 'F' : '.',
456 th->ack? 'A' : '.',
457 th->rst? 'R' : '.',
458 NIPQUAD(cp->daddr), ntohs(cp->dport),
459 NIPQUAD(cp->caddr), ntohs(cp->cport),
460 tcp_state_name(cp->state),
461 tcp_state_name(new_state),
462 atomic_read(&cp->refcnt));
463 if (dest) {
464 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
465 (new_state != IP_VS_TCP_S_ESTABLISHED)) {
466 atomic_dec(&dest->activeconns);
467 atomic_inc(&dest->inactconns);
468 cp->flags |= IP_VS_CONN_F_INACTIVE;
469 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
470 (new_state == IP_VS_TCP_S_ESTABLISHED)) {
471 atomic_inc(&dest->activeconns);
472 atomic_dec(&dest->inactconns);
473 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
474 }
475 }
476 }
477
478 cp->timeout = pp->timeout_table[cp->state = new_state];
479}
480
481
482/*
483 * Handle state transitions
484 */
485static int
486tcp_state_transition(struct ip_vs_conn *cp, int direction,
487 const struct sk_buff *skb,
488 struct ip_vs_protocol *pp)
489{
490 struct tcphdr _tcph, *th;
491
492 th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
493 sizeof(_tcph), &_tcph);
494 if (th == NULL)
495 return 0;
496
497 spin_lock(&cp->lock);
498 set_tcp_state(pp, cp, direction, th);
499 spin_unlock(&cp->lock);
500
501 return 1;
502}
503
504
505/*
506 * Hash table for TCP application incarnations
507 */
508#define TCP_APP_TAB_BITS 4
509#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
510#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
511
512static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
513static DEFINE_SPINLOCK(tcp_app_lock);
514
515static inline __u16 tcp_app_hashkey(__u16 port)
516{
517 return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK;
518}
519
520
521static int tcp_register_app(struct ip_vs_app *inc)
522{
523 struct ip_vs_app *i;
524 __u16 hash, port = inc->port;
525 int ret = 0;
526
527 hash = tcp_app_hashkey(port);
528
529 spin_lock_bh(&tcp_app_lock);
530 list_for_each_entry(i, &tcp_apps[hash], p_list) {
531 if (i->port == port) {
532 ret = -EEXIST;
533 goto out;
534 }
535 }
536 list_add(&inc->p_list, &tcp_apps[hash]);
537 atomic_inc(&ip_vs_protocol_tcp.appcnt);
538
539 out:
540 spin_unlock_bh(&tcp_app_lock);
541 return ret;
542}
543
544
545static void
546tcp_unregister_app(struct ip_vs_app *inc)
547{
548 spin_lock_bh(&tcp_app_lock);
549 atomic_dec(&ip_vs_protocol_tcp.appcnt);
550 list_del(&inc->p_list);
551 spin_unlock_bh(&tcp_app_lock);
552}
553
554
555static int
556tcp_app_conn_bind(struct ip_vs_conn *cp)
557{
558 int hash;
559 struct ip_vs_app *inc;
560 int result = 0;
561
562 /* Default binding: bind app only for NAT */
563 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
564 return 0;
565
566 /* Lookup application incarnations and bind the right one */
567 hash = tcp_app_hashkey(cp->vport);
568
569 spin_lock(&tcp_app_lock);
570 list_for_each_entry(inc, &tcp_apps[hash], p_list) {
571 if (inc->port == cp->vport) {
572 if (unlikely(!ip_vs_app_inc_get(inc)))
573 break;
574 spin_unlock(&tcp_app_lock);
575
576 IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
577 "%u.%u.%u.%u:%u to app %s on port %u\n",
578 __FUNCTION__,
579 NIPQUAD(cp->caddr), ntohs(cp->cport),
580 NIPQUAD(cp->vaddr), ntohs(cp->vport),
581 inc->name, ntohs(inc->port));
582 cp->app = inc;
583 if (inc->init_conn)
584 result = inc->init_conn(inc, cp);
585 goto out;
586 }
587 }
588 spin_unlock(&tcp_app_lock);
589
590 out:
591 return result;
592}
593
594
595/*
596 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
597 */
598void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
599{
600 spin_lock(&cp->lock);
601 cp->state = IP_VS_TCP_S_LISTEN;
602 cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
603 spin_unlock(&cp->lock);
604}
605
606
607static void tcp_init(struct ip_vs_protocol *pp)
608{
609 IP_VS_INIT_HASH_TABLE(tcp_apps);
610 pp->timeout_table = tcp_timeouts;
611}
612
613
614static void tcp_exit(struct ip_vs_protocol *pp)
615{
616}
617
618
619struct ip_vs_protocol ip_vs_protocol_tcp = {
620 .name = "TCP",
621 .protocol = IPPROTO_TCP,
622 .dont_defrag = 0,
623 .appcnt = ATOMIC_INIT(0),
624 .init = tcp_init,
625 .exit = tcp_exit,
626 .register_app = tcp_register_app,
627 .unregister_app = tcp_unregister_app,
628 .conn_schedule = tcp_conn_schedule,
629 .conn_in_get = tcp_conn_in_get,
630 .conn_out_get = tcp_conn_out_get,
631 .snat_handler = tcp_snat_handler,
632 .dnat_handler = tcp_dnat_handler,
633 .csum_check = tcp_csum_check,
634 .state_name = tcp_state_name,
635 .state_transition = tcp_state_transition,
636 .app_conn_bind = tcp_app_conn_bind,
637 .debug_packet = ip_vs_tcpudp_debug_packet,
638 .timeout_change = tcp_timeout_change,
639 .set_state_timeout = tcp_set_state_timeout,
640};
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
new file mode 100644
index 000000000000..8ae5f2e0aefa
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -0,0 +1,427 @@
1/*
2 * ip_vs_proto_udp.c: UDP load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/netfilter_ipv4.h>
20
21#include <net/ip_vs.h>
22
23
24static struct ip_vs_conn *
25udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
26 const struct iphdr *iph, unsigned int proto_off, int inverse)
27{
28 struct ip_vs_conn *cp;
29 __u16 _ports[2], *pptr;
30
31 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
32 if (pptr == NULL)
33 return NULL;
34
35 if (likely(!inverse)) {
36 cp = ip_vs_conn_in_get(iph->protocol,
37 iph->saddr, pptr[0],
38 iph->daddr, pptr[1]);
39 } else {
40 cp = ip_vs_conn_in_get(iph->protocol,
41 iph->daddr, pptr[1],
42 iph->saddr, pptr[0]);
43 }
44
45 return cp;
46}
47
48
49static struct ip_vs_conn *
50udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
51 const struct iphdr *iph, unsigned int proto_off, int inverse)
52{
53 struct ip_vs_conn *cp;
54 __u16 _ports[2], *pptr;
55
56 pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4,
57 sizeof(_ports), _ports);
58 if (pptr == NULL)
59 return NULL;
60
61 if (likely(!inverse)) {
62 cp = ip_vs_conn_out_get(iph->protocol,
63 iph->saddr, pptr[0],
64 iph->daddr, pptr[1]);
65 } else {
66 cp = ip_vs_conn_out_get(iph->protocol,
67 iph->daddr, pptr[1],
68 iph->saddr, pptr[0]);
69 }
70
71 return cp;
72}
73
74
75static int
76udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
77 int *verdict, struct ip_vs_conn **cpp)
78{
79 struct ip_vs_service *svc;
80 struct udphdr _udph, *uh;
81
82 uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
83 sizeof(_udph), &_udph);
84 if (uh == NULL) {
85 *verdict = NF_DROP;
86 return 0;
87 }
88
89 if ((svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
90 skb->nh.iph->daddr, uh->dest))) {
91 if (ip_vs_todrop()) {
92 /*
93 * It seems that we are very loaded.
94 * We have to drop this packet :(
95 */
96 ip_vs_service_put(svc);
97 *verdict = NF_DROP;
98 return 0;
99 }
100
101 /*
102 * Let the virtual server select a real server for the
103 * incoming connection, and create a connection entry.
104 */
105 *cpp = ip_vs_schedule(svc, skb);
106 if (!*cpp) {
107 *verdict = ip_vs_leave(svc, skb, pp);
108 return 0;
109 }
110 ip_vs_service_put(svc);
111 }
112 return 1;
113}
114
115
116static inline void
117udp_fast_csum_update(struct udphdr *uhdr, u32 oldip, u32 newip,
118 u16 oldport, u16 newport)
119{
120 uhdr->check =
121 ip_vs_check_diff(~oldip, newip,
122 ip_vs_check_diff(oldport ^ 0xFFFF,
123 newport, uhdr->check));
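	/* In UDP a checksum field of zero means "no checksum", so a result that folds to zero is transmitted as 0xFFFF instead (RFC 768) */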
124 if (!uhdr->check)
125 uhdr->check = 0xFFFF;
126}
127
128static int
129udp_snat_handler(struct sk_buff **pskb,
130 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
131{
132 struct udphdr *udph;
133 unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
134
135 /* csum_check requires unshared skb */
136 if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
137 return 0;
138
139 if (unlikely(cp->app != NULL)) {
140 /* Some checks before mangling */
141 if (pp->csum_check && !pp->csum_check(*pskb, pp))
142 return 0;
143
144 /*
145 * Call application helper if needed
146 */
147 if (!ip_vs_app_pkt_out(cp, pskb))
148 return 0;
149 }
150
151 udph = (void *)(*pskb)->nh.iph + udphoff;
152 udph->source = cp->vport;
153
154 /*
155 * Adjust UDP checksums
156 */
157 if (!cp->app && (udph->check != 0)) {
158 /* Only port and addr are changed, do fast csum update */
159 udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
160 cp->dport, cp->vport);
161 if ((*pskb)->ip_summed == CHECKSUM_HW)
162 (*pskb)->ip_summed = CHECKSUM_NONE;
163 } else {
164 /* full checksum calculation */
165 udph->check = 0;
166 (*pskb)->csum = skb_checksum(*pskb, udphoff,
167 (*pskb)->len - udphoff, 0);
168 udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
169 (*pskb)->len - udphoff,
170 cp->protocol,
171 (*pskb)->csum);
172 if (udph->check == 0)
173 udph->check = 0xFFFF;
174 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
175 pp->name, udph->check,
176 (char*)&(udph->check) - (char*)udph);
177 }
178 return 1;
179}
180
181
182static int
183udp_dnat_handler(struct sk_buff **pskb,
184 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
185{
186 struct udphdr *udph;
187 unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
188
189 /* csum_check requires unshared skb */
190 if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
191 return 0;
192
193 if (unlikely(cp->app != NULL)) {
194 /* Some checks before mangling */
195 if (pp->csum_check && !pp->csum_check(*pskb, pp))
196 return 0;
197
198 /*
199 * Attempt ip_vs_app call.
200 * It will fix ip_vs_conn
201 */
202 if (!ip_vs_app_pkt_in(cp, pskb))
203 return 0;
204 }
205
206 udph = (void *)(*pskb)->nh.iph + udphoff;
207 udph->dest = cp->dport;
208
209 /*
210 * Adjust UDP checksums
211 */
212 if (!cp->app && (udph->check != 0)) {
213 /* Only port and addr are changed, do fast csum update */
214 udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
215 cp->vport, cp->dport);
216 if ((*pskb)->ip_summed == CHECKSUM_HW)
217 (*pskb)->ip_summed = CHECKSUM_NONE;
218 } else {
219 /* full checksum calculation */
220 udph->check = 0;
221 (*pskb)->csum = skb_checksum(*pskb, udphoff,
222 (*pskb)->len - udphoff, 0);
223 udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
224 (*pskb)->len - udphoff,
225 cp->protocol,
226 (*pskb)->csum);
227 if (udph->check == 0)
228 udph->check = 0xFFFF;
229 (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
230 }
231 return 1;
232}
233
234
235static int
236udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
237{
238 struct udphdr _udph, *uh;
239 unsigned int udphoff = skb->nh.iph->ihl*4;
240
241 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
242 if (uh == NULL)
243 return 0;
244
245 if (uh->check != 0) {
246 switch (skb->ip_summed) {
247 case CHECKSUM_NONE:
248 skb->csum = skb_checksum(skb, udphoff,
249 skb->len - udphoff, 0);
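		/* fall through: the checksum computed above is verified below */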
250 case CHECKSUM_HW:
251 if (csum_tcpudp_magic(skb->nh.iph->saddr,
252 skb->nh.iph->daddr,
253 skb->len - udphoff,
254 skb->nh.iph->protocol,
255 skb->csum)) {
256 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
257 "Failed checksum for");
258 return 0;
259 }
260 break;
261 default:
262 /* CHECKSUM_UNNECESSARY */
263 break;
264 }
265 }
266 return 1;
267}
268
269
270/*
271 * Note: the caller guarantees that only one of register_app,
272 * unregister_app or app_conn_bind is called each time.
273 */
274
275#define UDP_APP_TAB_BITS 4
276#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
277#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
278
279static struct list_head udp_apps[UDP_APP_TAB_SIZE];
280static DEFINE_SPINLOCK(udp_app_lock);
281
282static inline __u16 udp_app_hashkey(__u16 port)
283{
284 return ((port >> UDP_APP_TAB_BITS) ^ port) & UDP_APP_TAB_MASK;
285}
286
287
288static int udp_register_app(struct ip_vs_app *inc)
289{
290 struct ip_vs_app *i;
291 __u16 hash, port = inc->port;
292 int ret = 0;
293
294 hash = udp_app_hashkey(port);
295
296
297 spin_lock_bh(&udp_app_lock);
298 list_for_each_entry(i, &udp_apps[hash], p_list) {
299 if (i->port == port) {
300 ret = -EEXIST;
301 goto out;
302 }
303 }
304 list_add(&inc->p_list, &udp_apps[hash]);
305 atomic_inc(&ip_vs_protocol_udp.appcnt);
306
307 out:
308 spin_unlock_bh(&udp_app_lock);
309 return ret;
310}
311
312
313static void
314udp_unregister_app(struct ip_vs_app *inc)
315{
316 spin_lock_bh(&udp_app_lock);
317 atomic_dec(&ip_vs_protocol_udp.appcnt);
318 list_del(&inc->p_list);
319 spin_unlock_bh(&udp_app_lock);
320}
321
322
323static int udp_app_conn_bind(struct ip_vs_conn *cp)
324{
325 int hash;
326 struct ip_vs_app *inc;
327 int result = 0;
328
329 /* Default binding: bind app only for NAT */
330 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
331 return 0;
332
333 /* Lookup application incarnations and bind the right one */
334 hash = udp_app_hashkey(cp->vport);
335
336 spin_lock(&udp_app_lock);
337 list_for_each_entry(inc, &udp_apps[hash], p_list) {
338 if (inc->port == cp->vport) {
339 if (unlikely(!ip_vs_app_inc_get(inc)))
340 break;
341 spin_unlock(&udp_app_lock);
342
343 IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
344 "%u.%u.%u.%u:%u to app %s on port %u\n",
345 __FUNCTION__,
346 NIPQUAD(cp->caddr), ntohs(cp->cport),
347 NIPQUAD(cp->vaddr), ntohs(cp->vport),
348 inc->name, ntohs(inc->port));
349 cp->app = inc;
350 if (inc->init_conn)
351 result = inc->init_conn(inc, cp);
352 goto out;
353 }
354 }
355 spin_unlock(&udp_app_lock);
356
357 out:
358 return result;
359}
360
361
362static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
363 [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
364 [IP_VS_UDP_S_LAST] = 2*HZ,
365};
366
367static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
368 [IP_VS_UDP_S_NORMAL] = "UDP",
369 [IP_VS_UDP_S_LAST] = "BUG!",
370};
371
372
373static int
374udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
375{
376 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
377 udp_state_name_table, sname, to);
378}
379
380static const char * udp_state_name(int state)
381{
382 if (state >= IP_VS_UDP_S_LAST)
383 return "ERR!";
384 return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
385}
386
387static int
388udp_state_transition(struct ip_vs_conn *cp, int direction,
389 const struct sk_buff *skb,
390 struct ip_vs_protocol *pp)
391{
392 cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
393 return 1;
394}
395
396static void udp_init(struct ip_vs_protocol *pp)
397{
398 IP_VS_INIT_HASH_TABLE(udp_apps);
399 pp->timeout_table = udp_timeouts;
400}
401
402static void udp_exit(struct ip_vs_protocol *pp)
403{
404}
405
406
407struct ip_vs_protocol ip_vs_protocol_udp = {
408 .name = "UDP",
409 .protocol = IPPROTO_UDP,
410 .dont_defrag = 0,
411 .init = udp_init,
412 .exit = udp_exit,
413 .conn_schedule = udp_conn_schedule,
414 .conn_in_get = udp_conn_in_get,
415 .conn_out_get = udp_conn_out_get,
416 .snat_handler = udp_snat_handler,
417 .dnat_handler = udp_dnat_handler,
418 .csum_check = udp_csum_check,
419 .state_transition = udp_state_transition,
420 .state_name = udp_state_name,
421 .register_app = udp_register_app,
422 .unregister_app = udp_unregister_app,
423 .app_conn_bind = udp_app_conn_bind,
424 .debug_packet = ip_vs_tcpudp_debug_packet,
425 .timeout_change = NULL,
426 .set_state_timeout = udp_set_state_timeout,
427};
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
new file mode 100644
index 000000000000..b23bab231cab
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_rr.c
@@ -0,0 +1,118 @@
1/*
2 * IPVS: Round-Robin Scheduling module
3 *
4 * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Peter Kese <peter.kese@ijs.si>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Fixes/Changes:
15 * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
16 * Julian Anastasov : fixed the NULL pointer access bug in debugging
17 * Wensong Zhang : changed some cosmetic things for debugging
18 * Wensong Zhang : changed for the d-linked destination list
19 * Wensong Zhang : added the ip_vs_rr_update_svc
20 * Wensong Zhang : added any dest with weight=0 is quiesced
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26
27#include <net/ip_vs.h>
28
29
30static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
31{
32 svc->sched_data = &svc->destinations;
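	/* sched_data keeps the list position of the last destination handed out; start at the head */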
33 return 0;
34}
35
36
37static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
38{
39 return 0;
40}
41
42
43static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
44{
45 svc->sched_data = &svc->destinations;
46 return 0;
47}
48
49
50/*
51 * Round-Robin Scheduling
52 */
53static struct ip_vs_dest *
54ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
55{
56 struct list_head *p, *q;
57 struct ip_vs_dest *dest;
58
59 IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
60
61 write_lock(&svc->sched_lock);
62 p = (struct list_head *)svc->sched_data;
63 p = p->next;
64 q = p;
65 do {
66 /* skip list head */
67 if (q == &svc->destinations) {
68 q = q->next;
69 continue;
70 }
71
72 dest = list_entry(q, struct ip_vs_dest, n_list);
73 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
74 atomic_read(&dest->weight) > 0)
75 /* HIT */
76 goto out;
77 q = q->next;
78 } while (q != p);
79 write_unlock(&svc->sched_lock);
80 return NULL;
81
82 out:
83 svc->sched_data = q;
84 write_unlock(&svc->sched_lock);
85 IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
86 "activeconns %d refcnt %d weight %d\n",
87 NIPQUAD(dest->addr), ntohs(dest->port),
88 atomic_read(&dest->activeconns),
89 atomic_read(&dest->refcnt), atomic_read(&dest->weight));
90
91 return dest;
92}
93
94
95static struct ip_vs_scheduler ip_vs_rr_scheduler = {
96 .name = "rr", /* name */
97 .refcnt = ATOMIC_INIT(0),
98 .module = THIS_MODULE,
99 .init_service = ip_vs_rr_init_svc,
100 .done_service = ip_vs_rr_done_svc,
101 .update_service = ip_vs_rr_update_svc,
102 .schedule = ip_vs_rr_schedule,
103};
104
105static int __init ip_vs_rr_init(void)
106{
107 INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
108 return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
109}
110
111static void __exit ip_vs_rr_cleanup(void)
112{
113 unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
114}
115
116module_init(ip_vs_rr_init);
117module_exit(ip_vs_rr_cleanup);
118MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c
new file mode 100644
index 000000000000..0f7c56a225bd
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sched.c
@@ -0,0 +1,251 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 *
18 * Changes:
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/sched.h>
24#include <linux/spinlock.h>
25#include <asm/string.h>
26#include <linux/kmod.h>
27
28#include <net/ip_vs.h>
29
30/*
31 * IPVS scheduler list
32 */
33static LIST_HEAD(ip_vs_schedulers);
34
35/* lock for service table */
36static DEFINE_RWLOCK(__ip_vs_sched_lock);
37
38
39/*
40 * Bind a service with a scheduler
41 */
42int ip_vs_bind_scheduler(struct ip_vs_service *svc,
43 struct ip_vs_scheduler *scheduler)
44{
45 int ret;
46
47 if (svc == NULL) {
48 IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
49 return -EINVAL;
50 }
51 if (scheduler == NULL) {
52 IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
53 return -EINVAL;
54 }
55
56 svc->scheduler = scheduler;
57
58 if (scheduler->init_service) {
59 ret = scheduler->init_service(svc);
60 if (ret) {
61 IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
62 return ret;
63 }
64 }
65
66 return 0;
67}
68
69
70/*
71 * Unbind a service with its scheduler
72 */
73int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
74{
75 struct ip_vs_scheduler *sched;
76
77 if (svc == NULL) {
78 IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
79 return -EINVAL;
80 }
81
82 sched = svc->scheduler;
83 if (sched == NULL) {
84 IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
85 return -EINVAL;
86 }
87
88 if (sched->done_service) {
89 if (sched->done_service(svc) != 0) {
90 IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
91 return -EINVAL;
92 }
93 }
94
95 svc->scheduler = NULL;
96 return 0;
97}
98
99
100/*
101 * Get scheduler in the scheduler list by name
102 */
103static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
104{
105 struct ip_vs_scheduler *sched;
106
107 IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
108 sched_name);
109
110 read_lock_bh(&__ip_vs_sched_lock);
111
112 list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
113 /*
114 * Test and get the modules atomically
115 */
116 if (sched->module && !try_module_get(sched->module)) {
117 /*
118 * This scheduler is just deleted
119 */
120 continue;
121 }
122 if (strcmp(sched_name, sched->name)==0) {
123 /* HIT */
124 read_unlock_bh(&__ip_vs_sched_lock);
125 return sched;
126 }
127 if (sched->module)
128 module_put(sched->module);
129 }
130
131 read_unlock_bh(&__ip_vs_sched_lock);
132 return NULL;
133}
134
135
136/*
137 * Lookup scheduler and try to load it if it doesn't exist
138 */
139struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
140{
141 struct ip_vs_scheduler *sched;
142
143 /*
144 * Search for the scheduler by sched_name
145 */
146 sched = ip_vs_sched_getbyname(sched_name);
147
148 /*
149 * If scheduler not found, load the module and search again
150 */
151 if (sched == NULL) {
152 request_module("ip_vs_%s", sched_name);
153 sched = ip_vs_sched_getbyname(sched_name);
154 }
155
156 return sched;
157}
158
159void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
160{
161 if (scheduler->module)
162 module_put(scheduler->module);
163}
164
165
166/*
167 * Register a scheduler in the scheduler list
168 */
169int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
170{
171 struct ip_vs_scheduler *sched;
172
173 if (!scheduler) {
174 IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
175 return -EINVAL;
176 }
177
178 if (!scheduler->name) {
179 IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
180 return -EINVAL;
181 }
182
183 /* increase the module use count */
184 ip_vs_use_count_inc();
185
186 /*
187 * Make sure that the scheduler with this name doesn't exist
188 * in the scheduler list.
189 */
190 sched = ip_vs_sched_getbyname(scheduler->name);
191 if (sched) {
192 ip_vs_scheduler_put(sched);
193 ip_vs_use_count_dec();
194 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
195 "already exists in the system\n", scheduler->name);
196 return -EINVAL;
197 }
198
199 write_lock_bh(&__ip_vs_sched_lock);
200
201 if (scheduler->n_list.next != &scheduler->n_list) {
202 write_unlock_bh(&__ip_vs_sched_lock);
203 ip_vs_use_count_dec();
204 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
205 "already linked\n", scheduler->name);
206 return -EINVAL;
207 }
208
209 /*
210 * Add it into the d-linked scheduler list
211 */
212 list_add(&scheduler->n_list, &ip_vs_schedulers);
213 write_unlock_bh(&__ip_vs_sched_lock);
214
215 IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
216
217 return 0;
218}
219
220
221/*
222 * Unregister a scheduler from the scheduler list
223 */
224int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
225{
226 if (!scheduler) {
227 IP_VS_ERR("unregister_ip_vs_scheduler(): NULL arg\n");
228 return -EINVAL;
229 }
230
231 write_lock_bh(&__ip_vs_sched_lock);
232 if (scheduler->n_list.next == &scheduler->n_list) {
233 write_unlock_bh(&__ip_vs_sched_lock);
234 IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
235 "is not in the list. failed\n", scheduler->name);
236 return -EINVAL;
237 }
238
239 /*
240 * Remove it from the d-linked scheduler list
241 */
242 list_del(&scheduler->n_list);
243 write_unlock_bh(&__ip_vs_sched_lock);
244
245 /* decrease the module use count */
246 ip_vs_use_count_dec();
247
248 IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
249
250 return 0;
251}
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
new file mode 100644
index 000000000000..ff366f7390d9
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sed.c
@@ -0,0 +1,163 @@
1/*
2 * IPVS: Shortest Expected Delay scheduling module
3 *
4 * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16
17/*
18 * The SED algorithm attempts to minimize each job's expected delay until
19 * completion. The expected delay that the job will experience is
20 * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
 21 * jobs on the ith server and Ui is the fixed service rate (weight) of
 22 * the ith server. The SED algorithm adopts a greedy policy in which each
 23 * job acts in its own best interest, i.e. it joins the queue that would
 24 * minimize its expected delay of completion.
25 *
26 * See the following paper for more information:
27 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
28 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
29 * pages 986-994, 1988.
30 *
31 * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
32 *
33 * The difference between SED and WLC is that SED includes the incoming
34 * job in the cost function (the increment of 1). SED may outperform
 35 * WLC when scheduling big jobs in large heterogeneous systems
 36 * (where the server weights vary a lot).
37 *
38 */
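/*
 * A small worked illustration (hypothetical numbers): with two servers
 * A (Ca = 0 active jobs, weight Ua = 1) and B (Cb = 1, Ub = 3), a
 * WLC-style comparison of Ca/Ua = 0 with Cb/Ub = 1/3 picks A, while SED
 * compares (Ca + 1)/Ua = 1 with (Cb + 1)/Ub = 2/3 and picks B, because
 * the incoming job itself is counted in the expected delay.
 */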
39
40#include <linux/module.h>
41#include <linux/kernel.h>
42
43#include <net/ip_vs.h>
44
45
46static int
47ip_vs_sed_init_svc(struct ip_vs_service *svc)
48{
49 return 0;
50}
51
52
53static int
54ip_vs_sed_done_svc(struct ip_vs_service *svc)
55{
56 return 0;
57}
58
59
60static int
61ip_vs_sed_update_svc(struct ip_vs_service *svc)
62{
63 return 0;
64}
65
66
67static inline unsigned int
68ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
69{
70 /*
71 * We only use the active connection number in the cost
72 * calculation here.
73 */
74 return atomic_read(&dest->activeconns) + 1;
75}
76
77
78/*
 79 * Shortest Expected Delay scheduling
80 */
81static struct ip_vs_dest *
82ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
83{
84 struct ip_vs_dest *dest, *least;
85 unsigned int loh, doh;
86
87 IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
88
89 /*
90 * We calculate the load of each dest server as follows:
91 * (server expected overhead) / dest->weight
92 *
93 * Remember -- no floats in kernel mode!!!
94 * The comparison of h1*w2 > h2*w1 is equivalent to that of
95 * h1/w1 > h2/w2
96 * if every weight is larger than zero.
97 *
98 * The server with weight=0 is quiesced and will not receive any
99 * new connections.
100 */
101
102 list_for_each_entry(dest, &svc->destinations, n_list) {
103 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
104 atomic_read(&dest->weight) > 0) {
105 least = dest;
106 loh = ip_vs_sed_dest_overhead(least);
107 goto nextstage;
108 }
109 }
110 return NULL;
111
112 /*
113 * Find the destination with the least load.
114 */
115 nextstage:
116 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
117 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
118 continue;
119 doh = ip_vs_sed_dest_overhead(dest);
120 if (loh * atomic_read(&dest->weight) >
121 doh * atomic_read(&least->weight)) {
122 least = dest;
123 loh = doh;
124 }
125 }
126
127 IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u "
128 "activeconns %d refcnt %d weight %d overhead %d\n",
129 NIPQUAD(least->addr), ntohs(least->port),
130 atomic_read(&least->activeconns),
131 atomic_read(&least->refcnt),
132 atomic_read(&least->weight), loh);
133
134 return least;
135}
136
137
138static struct ip_vs_scheduler ip_vs_sed_scheduler =
139{
140 .name = "sed",
141 .refcnt = ATOMIC_INIT(0),
142 .module = THIS_MODULE,
143 .init_service = ip_vs_sed_init_svc,
144 .done_service = ip_vs_sed_done_svc,
145 .update_service = ip_vs_sed_update_svc,
146 .schedule = ip_vs_sed_schedule,
147};
148
149
150static int __init ip_vs_sed_init(void)
151{
152 INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list);
153 return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
154}
155
156static void __exit ip_vs_sed_cleanup(void)
157{
158 unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
159}
160
161module_init(ip_vs_sed_init);
162module_exit(ip_vs_sed_cleanup);
163MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
new file mode 100644
index 000000000000..6f7c50e44a39
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -0,0 +1,255 @@
1/*
2 * IPVS: Source Hashing scheduling module
3 *
4 * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16
17/*
18 * The sh algorithm is to select server by the hash key of source IP
19 * address. The pseudo code is as follows:
20 *
21 * n <- servernode[src_ip];
22 * if (n is dead) OR
23 * (n is overloaded) or (n.weight <= 0) then
24 * return NULL;
25 *
26 * return n;
27 *
 28 * Note that servernode is a 256-bucket hash table that maps the hash
 29 * index derived from the packet source IP address to the current server
 30 * array. If the sh scheduler is used in a cache cluster, it is good to
 31 * combine it with the cache_bypass feature. When the statically assigned
32 * server is dead or overloaded, the load balancer can bypass the cache
33 * server and send requests to the original server directly.
34 *
35 */
36
37#include <linux/module.h>
38#include <linux/kernel.h>
39
40#include <net/ip_vs.h>
41
42
43/*
44 * IPVS SH bucket
45 */
46struct ip_vs_sh_bucket {
47 struct ip_vs_dest *dest; /* real server (cache) */
48};
49
50/*
51 * for IPVS SH entry hash table
52 */
53#ifndef CONFIG_IP_VS_SH_TAB_BITS
54#define CONFIG_IP_VS_SH_TAB_BITS 8
55#endif
56#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
57#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
58#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
59
60
61/*
62 * Returns hash value for IPVS SH entry
63 */
64static inline unsigned ip_vs_sh_hashkey(__u32 addr)
65{
66 return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
67}
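/*
 * The multiplier 2654435761 is the classic golden-ratio (Fibonacci)
 * multiplicative hashing constant, roughly 2^32 / phi, so nearby source
 * addresses are spread fairly evenly over the table.  A minimal
 * user-space sketch of the same mapping, assuming the default 8-bit
 * (256-bucket) table, for illustration only:
 *
 *	unsigned int sh_bucket(unsigned int saddr_host_order)
 *	{
 *		return (saddr_host_order * 2654435761UL) & 255;
 *	}
 */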
68
69
70/*
71 * Get ip_vs_dest associated with supplied parameters.
72 */
73static inline struct ip_vs_dest *
74ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr)
75{
76 return (tbl[ip_vs_sh_hashkey(addr)]).dest;
77}
78
79
80/*
81 * Assign all the hash buckets of the specified table with the service.
82 */
83static int
84ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
85{
86 int i;
87 struct ip_vs_sh_bucket *b;
88 struct list_head *p;
89 struct ip_vs_dest *dest;
90
91 b = tbl;
92 p = &svc->destinations;
93 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
94 if (list_empty(p)) {
95 b->dest = NULL;
96 } else {
97 if (p == &svc->destinations)
98 p = p->next;
99
100 dest = list_entry(p, struct ip_vs_dest, n_list);
101 atomic_inc(&dest->refcnt);
102 b->dest = dest;
103
104 p = p->next;
105 }
106 b++;
107 }
108 return 0;
109}
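/*
 * For illustration: with, say, three destinations D1, D2, D3 on the
 * list, the loop above fills the buckets cyclically as D1, D2, D3,
 * D1, D2, D3, ... and takes one reference on the destination for each
 * bucket it occupies; with an empty destination list every bucket is
 * left NULL and ip_vs_sh_schedule() will simply return NULL.
 */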
110
111
112/*
113 * Flush all the hash buckets of the specified table.
114 */
115static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
116{
117 int i;
118 struct ip_vs_sh_bucket *b;
119
120 b = tbl;
121 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
122 if (b->dest) {
123 atomic_dec(&b->dest->refcnt);
124 b->dest = NULL;
125 }
126 b++;
127 }
128}
129
130
131static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
132{
133 struct ip_vs_sh_bucket *tbl;
134
135 /* allocate the SH table for this service */
136 tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
137 GFP_ATOMIC);
138 if (tbl == NULL) {
139 IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
140 return -ENOMEM;
141 }
142 svc->sched_data = tbl;
143 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
144 "current service\n",
145 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
146
147 /* assign the hash buckets with the updated service */
148 ip_vs_sh_assign(tbl, svc);
149
150 return 0;
151}
152
153
154static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
155{
156 struct ip_vs_sh_bucket *tbl = svc->sched_data;
157
158 /* got to clean up hash buckets here */
159 ip_vs_sh_flush(tbl);
160
161 /* release the table itself */
162 kfree(svc->sched_data);
163 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
164 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
165
166 return 0;
167}
168
169
170static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
171{
172 struct ip_vs_sh_bucket *tbl = svc->sched_data;
173
174 /* got to clean up hash buckets here */
175 ip_vs_sh_flush(tbl);
176
177 /* assign the hash buckets with the updated service */
178 ip_vs_sh_assign(tbl, svc);
179
180 return 0;
181}
182
183
184/*
185 * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
186 * consider that the server is overloaded here.
187 */
188static inline int is_overloaded(struct ip_vs_dest *dest)
189{
190 return dest->flags & IP_VS_DEST_F_OVERLOAD;
191}
192
193
194/*
195 * Source Hashing scheduling
196 */
197static struct ip_vs_dest *
198ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
199{
200 struct ip_vs_dest *dest;
201 struct ip_vs_sh_bucket *tbl;
202 struct iphdr *iph = skb->nh.iph;
203
204 IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
205
206 tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
207 dest = ip_vs_sh_get(tbl, iph->saddr);
208 if (!dest
209 || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
210 || atomic_read(&dest->weight) <= 0
211 || is_overloaded(dest)) {
212 return NULL;
213 }
214
215 IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
216 "--> server %u.%u.%u.%u:%d\n",
217 NIPQUAD(iph->saddr),
218 NIPQUAD(dest->addr),
219 ntohs(dest->port));
220
221 return dest;
222}
223
224
225/*
226 * IPVS SH Scheduler structure
227 */
228static struct ip_vs_scheduler ip_vs_sh_scheduler =
229{
230 .name = "sh",
231 .refcnt = ATOMIC_INIT(0),
232 .module = THIS_MODULE,
233 .init_service = ip_vs_sh_init_svc,
234 .done_service = ip_vs_sh_done_svc,
235 .update_service = ip_vs_sh_update_svc,
236 .schedule = ip_vs_sh_schedule,
237};
238
239
240static int __init ip_vs_sh_init(void)
241{
242 INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list);
243 return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
244}
245
246
247static void __exit ip_vs_sh_cleanup(void)
248{
249 unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
250}
251
252
253module_init(ip_vs_sh_init);
254module_exit(ip_vs_sh_cleanup);
255MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
new file mode 100644
index 000000000000..25c479550a32
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -0,0 +1,892 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 *
12 * ip_vs_sync: sync connection info from master load balancer to backups
13 * through multicast
14 *
15 * Changes:
16 * Alexandre Cassen : Added master & backup support at a time.
17 * Alexandre Cassen : Added SyncID support for incoming sync
18 * messages filtering.
19 * Justin Ossevoort : Fix endian problem on sync message size.
20 */
21
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/net.h>
25#include <linux/completion.h>
26#include <linux/delay.h>
27#include <linux/skbuff.h>
28#include <linux/in.h>
29#include <linux/igmp.h> /* for ip_mc_join_group */
30
31#include <net/ip.h>
32#include <net/sock.h>
33#include <asm/uaccess.h> /* for get_fs and set_fs */
34
35#include <net/ip_vs.h>
36
37#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
38#define IP_VS_SYNC_PORT 8848 /* multicast port */
39
40
41/*
42 * IPVS sync connection entry
43 */
44struct ip_vs_sync_conn {
45 __u8 reserved;
46
47 /* Protocol, addresses and port numbers */
48 __u8 protocol; /* Which protocol (TCP/UDP) */
49 __u16 cport;
50 __u16 vport;
51 __u16 dport;
52 __u32 caddr; /* client address */
53 __u32 vaddr; /* virtual address */
54 __u32 daddr; /* destination address */
55
56 /* Flags and state transition */
57 __u16 flags; /* status flags */
58 __u16 state; /* state info */
59
60 /* The sequence options start here */
61};
62
63struct ip_vs_sync_conn_options {
64 struct ip_vs_seq in_seq; /* incoming seq. struct */
65 struct ip_vs_seq out_seq; /* outgoing seq. struct */
66};
67
68#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
69#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
70#define FULL_CONN_SIZE \
71(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
72
73
74/*
75 The master multicasts messages to the backup load balancers in the
76 following format.
77
78 0 1 2 3
79 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
80 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
81 | Count Conns | SyncID | Size |
82 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83 | |
84 | IPVS Sync Connection (1) |
85 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
86 | . |
87 | . |
88 | . |
89 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 | |
91 | IPVS Sync Connection (n) |
92 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
93*/
94
95#define SYNC_MESG_HEADER_LEN 4
96
97struct ip_vs_sync_mesg {
98 __u8 nr_conns;
99 __u8 syncid;
100 __u16 size;
101
102 /* ip_vs_sync_conn entries start here */
103};
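/*
 * Note on byte order: only the 16-bit size field of this header is
 * converted with htons()/ntohs() (see ip_vs_send_sync_msg() and
 * ip_vs_process_message() below); nr_conns and syncid are single
 * bytes.  Within each ip_vs_sync_conn entry, flags and state are
 * likewise converted, while the addresses and ports are copied
 * without conversion.
 */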
104
105/* the maximum length of sync (sending/receiving) message */
106static int sync_send_mesg_maxlen;
107static int sync_recv_mesg_maxlen;
108
109struct ip_vs_sync_buff {
110 struct list_head list;
111 unsigned long firstuse;
112
113 /* pointers for the message data */
114 struct ip_vs_sync_mesg *mesg;
115 unsigned char *head;
116 unsigned char *end;
117};
118
119
120/* the sync_buff list head and the lock */
121static LIST_HEAD(ip_vs_sync_queue);
122static DEFINE_SPINLOCK(ip_vs_sync_lock);
123
124/* current sync_buff for accepting new conn entries */
125static struct ip_vs_sync_buff *curr_sb = NULL;
126static DEFINE_SPINLOCK(curr_sb_lock);
127
128/* ipvs sync daemon state */
129volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
130volatile int ip_vs_master_syncid = 0;
131volatile int ip_vs_backup_syncid = 0;
132
133/* multicast interface name */
134char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
135char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
136
137/* multicast addr */
138static struct sockaddr_in mcast_addr;
139
140
141static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
142{
143 spin_lock(&ip_vs_sync_lock);
144 list_add_tail(&sb->list, &ip_vs_sync_queue);
145 spin_unlock(&ip_vs_sync_lock);
146}
147
148static inline struct ip_vs_sync_buff * sb_dequeue(void)
149{
150 struct ip_vs_sync_buff *sb;
151
152 spin_lock_bh(&ip_vs_sync_lock);
153 if (list_empty(&ip_vs_sync_queue)) {
154 sb = NULL;
155 } else {
156 sb = list_entry(ip_vs_sync_queue.next,
157 struct ip_vs_sync_buff,
158 list);
159 list_del(&sb->list);
160 }
161 spin_unlock_bh(&ip_vs_sync_lock);
162
163 return sb;
164}
165
166static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
167{
168 struct ip_vs_sync_buff *sb;
169
170 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
171 return NULL;
172
173 if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
174 kfree(sb);
175 return NULL;
176 }
177 sb->mesg->nr_conns = 0;
178 sb->mesg->syncid = ip_vs_master_syncid;
179 sb->mesg->size = 4;
180 sb->head = (unsigned char *)sb->mesg + 4;
181 sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
182 sb->firstuse = jiffies;
183 return sb;
184}
185
186static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
187{
188 kfree(sb->mesg);
189 kfree(sb);
190}
191
192/*
193 * Get the current sync buffer if it has been created for more
194 * than the specified time or the specified time is zero.
195 */
196static inline struct ip_vs_sync_buff *
197get_curr_sync_buff(unsigned long time)
198{
199 struct ip_vs_sync_buff *sb;
200
201 spin_lock_bh(&curr_sb_lock);
202 if (curr_sb && (time == 0 ||
203 time_before(jiffies - curr_sb->firstuse, time))) {
204 sb = curr_sb;
205 curr_sb = NULL;
206 } else
207 sb = NULL;
208 spin_unlock_bh(&curr_sb_lock);
209 return sb;
210}
211
212
213/*
214 * Add an ip_vs_conn information into the current sync_buff.
215 * Called by ip_vs_in.
216 */
217void ip_vs_sync_conn(struct ip_vs_conn *cp)
218{
219 struct ip_vs_sync_mesg *m;
220 struct ip_vs_sync_conn *s;
221 int len;
222
223 spin_lock(&curr_sb_lock);
224 if (!curr_sb) {
225 if (!(curr_sb=ip_vs_sync_buff_create())) {
226 spin_unlock(&curr_sb_lock);
227 IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
228 return;
229 }
230 }
231
232 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
233 SIMPLE_CONN_SIZE;
234 m = curr_sb->mesg;
235 s = (struct ip_vs_sync_conn *)curr_sb->head;
236
237 /* copy members */
238 s->protocol = cp->protocol;
239 s->cport = cp->cport;
240 s->vport = cp->vport;
241 s->dport = cp->dport;
242 s->caddr = cp->caddr;
243 s->vaddr = cp->vaddr;
244 s->daddr = cp->daddr;
245 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
246 s->state = htons(cp->state);
247 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
248 struct ip_vs_sync_conn_options *opt =
249 (struct ip_vs_sync_conn_options *)&s[1];
250 memcpy(opt, &cp->in_seq, sizeof(*opt));
251 }
252
253 m->nr_conns++;
254 m->size += len;
255 curr_sb->head += len;
256
257 /* check if there is a space for next one */
258 if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
259 sb_queue_tail(curr_sb);
260 curr_sb = NULL;
261 }
262 spin_unlock(&curr_sb_lock);
263
264 /* synchronize its controller if it has */
265 if (cp->control)
266 ip_vs_sync_conn(cp->control);
267}
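/*
 * Each call above appends one SIMPLE_CONN_SIZE or FULL_CONN_SIZE
 * record to the current buffer, and the buffer is queued for
 * transmission as soon as another full-size record would no longer
 * fit.  Note the tail recursion: syncing a controlled connection
 * (e.g. an FTP data connection) also syncs its controlling
 * connection via cp->control.
 */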
268
269
270/*
271 * Process received multicast message and create the corresponding
272 * ip_vs_conn entries.
273 */
274static void ip_vs_process_message(const char *buffer, const size_t buflen)
275{
276 struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
277 struct ip_vs_sync_conn *s;
278 struct ip_vs_sync_conn_options *opt;
279 struct ip_vs_conn *cp;
280 char *p;
281 int i;
282
283 /* Convert size back to host byte order */
284 m->size = ntohs(m->size);
285
286 if (buflen != m->size) {
287 IP_VS_ERR("bogus message\n");
288 return;
289 }
290
291 /* SyncID sanity check */
292 if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
293 IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
294 m->syncid);
295 return;
296 }
297
298 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
299 for (i=0; i<m->nr_conns; i++) {
300 s = (struct ip_vs_sync_conn *)p;
301 cp = ip_vs_conn_in_get(s->protocol,
302 s->caddr, s->cport,
303 s->vaddr, s->vport);
304 if (!cp) {
305 cp = ip_vs_conn_new(s->protocol,
306 s->caddr, s->cport,
307 s->vaddr, s->vport,
308 s->daddr, s->dport,
309 ntohs(s->flags), NULL);
310 if (!cp) {
311 IP_VS_ERR("ip_vs_conn_new failed\n");
312 return;
313 }
314 cp->state = ntohs(s->state);
315 } else if (!cp->dest) {
316 /* it is an entry created by the synchronization */
317 cp->state = ntohs(s->state);
318 cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED;
319 } /* Note that we don't touch its state and flags
320 if it is a normal entry. */
321
322 if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) {
323 opt = (struct ip_vs_sync_conn_options *)&s[1];
324 memcpy(&cp->in_seq, opt, sizeof(*opt));
325 p += FULL_CONN_SIZE;
326 } else
327 p += SIMPLE_CONN_SIZE;
328
329 atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
330 cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
331 ip_vs_conn_put(cp);
332
333 if (p > buffer+buflen) {
334 IP_VS_ERR("bogus message\n");
335 return;
336 }
337 }
338}
339
340
341/*
342 * Setup loopback of outgoing multicasts on a sending socket
343 */
344static void set_mcast_loop(struct sock *sk, u_char loop)
345{
346 struct inet_sock *inet = inet_sk(sk);
347
348 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
349 lock_sock(sk);
350 inet->mc_loop = loop ? 1 : 0;
351 release_sock(sk);
352}
353
354/*
355 * Specify TTL for outgoing multicasts on a sending socket
356 */
357static void set_mcast_ttl(struct sock *sk, u_char ttl)
358{
359 struct inet_sock *inet = inet_sk(sk);
360
361 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
362 lock_sock(sk);
363 inet->mc_ttl = ttl;
364 release_sock(sk);
365}
366
367/*
368 * Specify the default interface for outgoing multicasts
369 */
370static int set_mcast_if(struct sock *sk, char *ifname)
371{
372 struct net_device *dev;
373 struct inet_sock *inet = inet_sk(sk);
374
375 if ((dev = __dev_get_by_name(ifname)) == NULL)
376 return -ENODEV;
377
378 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
379 return -EINVAL;
380
381 lock_sock(sk);
382 inet->mc_index = dev->ifindex;
383 /* inet->mc_addr = 0; */
384 release_sock(sk);
385
386 return 0;
387}
388
389
390/*
391 * Set the maximum length of sync message according to the
392 * specified interface's MTU.
393 */
394static int set_sync_mesg_maxlen(int sync_state)
395{
396 struct net_device *dev;
397 int num;
398
399 if (sync_state == IP_VS_STATE_MASTER) {
400 if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
401 return -ENODEV;
402
403 num = (dev->mtu - sizeof(struct iphdr) -
404 sizeof(struct udphdr) -
405 SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
406 sync_send_mesg_maxlen =
407 SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
408 IP_VS_DBG(7, "setting the maximum length of sync sending "
409 "message %d.\n", sync_send_mesg_maxlen);
410 } else if (sync_state == IP_VS_STATE_BACKUP) {
411 if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
412 return -ENODEV;
413
414 sync_recv_mesg_maxlen = dev->mtu -
415 sizeof(struct iphdr) - sizeof(struct udphdr);
416 IP_VS_DBG(7, "setting the maximum length of sync receiving "
417 "message %d.\n", sync_recv_mesg_maxlen);
418 }
419
420 return 0;
421}
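/*
 * A worked example (illustration only), assuming a 1500-byte Ethernet
 * MTU and a SIMPLE_CONN_SIZE of 24 bytes:
 *
 *	master:	num = (1500 - 20 - 8 - 4 - 20) / 24 = 60
 *		sync_send_mesg_maxlen = 4 + 24 * 60 = 1444 bytes,
 *		i.e. at most 60 simple entries per multicast datagram
 *	backup:	sync_recv_mesg_maxlen = 1500 - 20 - 8 = 1472 bytes
 */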
422
423
424/*
425 * Join a multicast group.
426 * the group is specified by a class D multicast address (224.0.0.0/4)
427 * in the in_addr structure passed in as a parameter.
428 */
429static int
430join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
431{
432 struct ip_mreqn mreq;
433 struct net_device *dev;
434 int ret;
435
436 memset(&mreq, 0, sizeof(mreq));
437 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
438
439 if ((dev = __dev_get_by_name(ifname)) == NULL)
440 return -ENODEV;
441 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
442 return -EINVAL;
443
444 mreq.imr_ifindex = dev->ifindex;
445
446 lock_sock(sk);
447 ret = ip_mc_join_group(sk, &mreq);
448 release_sock(sk);
449
450 return ret;
451}
452
453
454static int bind_mcastif_addr(struct socket *sock, char *ifname)
455{
456 struct net_device *dev;
457 u32 addr;
458 struct sockaddr_in sin;
459
460 if ((dev = __dev_get_by_name(ifname)) == NULL)
461 return -ENODEV;
462
463 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
464 if (!addr)
465 IP_VS_ERR("You probably need to specify IP address on "
466 "multicast interface.\n");
467
468 IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
469 ifname, NIPQUAD(addr));
470
471 /* Now bind the socket with the address of multicast interface */
472 sin.sin_family = AF_INET;
473 sin.sin_addr.s_addr = addr;
474 sin.sin_port = 0;
475
476 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
477}
478
479/*
480 * Set up sending multicast socket over UDP
481 */
482static struct socket * make_send_sock(void)
483{
484 struct socket *sock;
485
486 /* First create a socket */
487 if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
488 IP_VS_ERR("Error during creation of socket; terminating\n");
489 return NULL;
490 }
491
492 if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
493 IP_VS_ERR("Error setting outbound mcast interface\n");
494 goto error;
495 }
496
497 set_mcast_loop(sock->sk, 0);
498 set_mcast_ttl(sock->sk, 1);
499
500 if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
501 IP_VS_ERR("Error binding address of the mcast interface\n");
502 goto error;
503 }
504
505 if (sock->ops->connect(sock,
506 (struct sockaddr*)&mcast_addr,
507 sizeof(struct sockaddr), 0) < 0) {
508 IP_VS_ERR("Error connecting to the multicast addr\n");
509 goto error;
510 }
511
512 return sock;
513
514 error:
515 sock_release(sock);
516 return NULL;
517}
518
519
520/*
521 * Set up receiving multicast socket over UDP
522 */
523static struct socket * make_receive_sock(void)
524{
525 struct socket *sock;
526
527 /* First create a socket */
528 if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
529 IP_VS_ERR("Error during creation of socket; terminating\n");
530 return NULL;
531 }
532
533 /* it is equivalent to the REUSEADDR option in user-space */
534 sock->sk->sk_reuse = 1;
535
536 if (sock->ops->bind(sock,
537 (struct sockaddr*)&mcast_addr,
538 sizeof(struct sockaddr)) < 0) {
539 IP_VS_ERR("Error binding to the multicast addr\n");
540 goto error;
541 }
542
543 /* join the multicast group */
544 if (join_mcast_group(sock->sk,
545 (struct in_addr*)&mcast_addr.sin_addr,
546 ip_vs_backup_mcast_ifn) < 0) {
547 IP_VS_ERR("Error joining to the multicast group\n");
548 goto error;
549 }
550
551 return sock;
552
553 error:
554 sock_release(sock);
555 return NULL;
556}
557
558
559static int
560ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
561{
562 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
563 struct kvec iov;
564 int len;
565
566 EnterFunction(7);
567 iov.iov_base = (void *)buffer;
568 iov.iov_len = length;
569
570 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
571
572 LeaveFunction(7);
573 return len;
574}
575
576static void
577ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
578{
579 int msize;
580
581 msize = msg->size;
582
583 /* Put size in network byte order */
584 msg->size = htons(msg->size);
585
586 if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
587 IP_VS_ERR("ip_vs_send_async error\n");
588}
589
590static int
591ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
592{
593 struct msghdr msg = {NULL,};
594 struct kvec iov;
595 int len;
596
597 EnterFunction(7);
598
599 /* Receive a packet */
600 iov.iov_base = buffer;
601 iov.iov_len = (size_t)buflen;
602
603 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
604
605 if (len < 0)
606 return -1;
607
608 LeaveFunction(7);
609 return len;
610}
611
612
613static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
614static pid_t sync_master_pid = 0;
615static pid_t sync_backup_pid = 0;
616
617static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
618static int stop_master_sync = 0;
619static int stop_backup_sync = 0;
620
621static void sync_master_loop(void)
622{
623 struct socket *sock;
624 struct ip_vs_sync_buff *sb;
625
626 /* create the sending multicast socket */
627 sock = make_send_sock();
628 if (!sock)
629 return;
630
631 IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
632 "syncid = %d\n",
633 ip_vs_master_mcast_ifn, ip_vs_master_syncid);
634
635 for (;;) {
636 while ((sb=sb_dequeue())) {
637 ip_vs_send_sync_msg(sock, sb->mesg);
638 ip_vs_sync_buff_release(sb);
639 }
640
641 /* check if entries stay in curr_sb for 2 seconds */
642 if ((sb = get_curr_sync_buff(2*HZ))) {
643 ip_vs_send_sync_msg(sock, sb->mesg);
644 ip_vs_sync_buff_release(sb);
645 }
646
647 if (stop_master_sync)
648 break;
649
650 ssleep(1);
651 }
652
653 /* clean up the sync_buff queue */
654 while ((sb=sb_dequeue())) {
655 ip_vs_sync_buff_release(sb);
656 }
657
658 /* clean up the current sync_buff */
659 if ((sb = get_curr_sync_buff(0))) {
660 ip_vs_sync_buff_release(sb);
661 }
662
663 /* release the sending multicast socket */
664 sock_release(sock);
665}
666
667
668static void sync_backup_loop(void)
669{
670 struct socket *sock;
671 char *buf;
672 int len;
673
674 if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
675 IP_VS_ERR("sync_backup_loop: kmalloc error\n");
676 return;
677 }
678
679 /* create the receiving multicast socket */
680 sock = make_receive_sock();
681 if (!sock)
682 goto out;
683
684 IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
685 "syncid = %d\n",
686 ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
687
688 for (;;) {
689 /* do you have data now? */
690 while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
691 if ((len =
692 ip_vs_receive(sock, buf,
693 sync_recv_mesg_maxlen)) <= 0) {
694 IP_VS_ERR("receiving message error\n");
695 break;
696 }
697 /* disable bottom halves, because this code accesses data
698 shared with softirq context while getting/creating conns */
699 local_bh_disable();
700 ip_vs_process_message(buf, len);
701 local_bh_enable();
702 }
703
704 if (stop_backup_sync)
705 break;
706
707 ssleep(1);
708 }
709
710 /* release the sending multicast socket */
711 sock_release(sock);
712
713 out:
714 kfree(buf);
715}
716
717
718static void set_sync_pid(int sync_state, pid_t sync_pid)
719{
720 if (sync_state == IP_VS_STATE_MASTER)
721 sync_master_pid = sync_pid;
722 else if (sync_state == IP_VS_STATE_BACKUP)
723 sync_backup_pid = sync_pid;
724}
725
726static void set_stop_sync(int sync_state, int set)
727{
728 if (sync_state == IP_VS_STATE_MASTER)
729 stop_master_sync = set;
730 else if (sync_state == IP_VS_STATE_BACKUP)
731 stop_backup_sync = set;
732 else {
733 stop_master_sync = set;
734 stop_backup_sync = set;
735 }
736}
737
738static int sync_thread(void *startup)
739{
740 DECLARE_WAITQUEUE(wait, current);
741 mm_segment_t oldmm;
742 int state;
743 const char *name;
744
745 /* increase the module use count */
746 ip_vs_use_count_inc();
747
748 if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
749 state = IP_VS_STATE_MASTER;
750 name = "ipvs_syncmaster";
751 } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
752 state = IP_VS_STATE_BACKUP;
753 name = "ipvs_syncbackup";
754 } else {
755 IP_VS_BUG();
756 ip_vs_use_count_dec();
757 return -EINVAL;
758 }
759
760 daemonize(name);
761
762 oldmm = get_fs();
763 set_fs(KERNEL_DS);
764
765 /* Block all signals */
766 spin_lock_irq(&current->sighand->siglock);
767 siginitsetinv(&current->blocked, 0);
768 recalc_sigpending();
769 spin_unlock_irq(&current->sighand->siglock);
770
771 /* set the maximum length of sync message */
772 set_sync_mesg_maxlen(state);
773
774 /* set up multicast address */
775 mcast_addr.sin_family = AF_INET;
776 mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
777 mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
778
779 add_wait_queue(&sync_wait, &wait);
780
781 set_sync_pid(state, current->pid);
782 complete((struct completion *)startup);
783
784 /* processing master/backup loop here */
785 if (state == IP_VS_STATE_MASTER)
786 sync_master_loop();
787 else if (state == IP_VS_STATE_BACKUP)
788 sync_backup_loop();
789 else IP_VS_BUG();
790
791 remove_wait_queue(&sync_wait, &wait);
792
793 /* thread exits */
794 set_sync_pid(state, 0);
795 IP_VS_INFO("sync thread stopped!\n");
796
797 set_fs(oldmm);
798
799 /* decrease the module use count */
800 ip_vs_use_count_dec();
801
802 set_stop_sync(state, 0);
803 wake_up(&stop_sync_wait);
804
805 return 0;
806}
807
808
809static int fork_sync_thread(void *startup)
810{
811 pid_t pid;
812
813 /* fork the sync thread here; after this thread exits, the
814 sync thread is reparented to the init process. */
815 repeat:
816 if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) {
817 IP_VS_ERR("could not create sync_thread due to %d... "
818 "retrying.\n", pid);
819 ssleep(1);
820 goto repeat;
821 }
822
823 return 0;
824}
825
826
827int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
828{
829 DECLARE_COMPLETION(startup);
830 pid_t pid;
831
832 if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
833 (state == IP_VS_STATE_BACKUP && sync_backup_pid))
834 return -EEXIST;
835
836 IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
837 IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n",
838 sizeof(struct ip_vs_sync_conn));
839
840 ip_vs_sync_state |= state;
841 if (state == IP_VS_STATE_MASTER) {
842 strcpy(ip_vs_master_mcast_ifn, mcast_ifn);
843 ip_vs_master_syncid = syncid;
844 } else {
845 strcpy(ip_vs_backup_mcast_ifn, mcast_ifn);
846 ip_vs_backup_syncid = syncid;
847 }
848
849 repeat:
850 if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) {
851 IP_VS_ERR("could not create fork_sync_thread due to %d... "
852 "retrying.\n", pid);
853 ssleep(1);
854 goto repeat;
855 }
856
857 wait_for_completion(&startup);
858
859 return 0;
860}
861
862
863int stop_sync_thread(int state)
864{
865 DECLARE_WAITQUEUE(wait, current);
866
867 if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
868 (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
869 return -ESRCH;
870
871 IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
872 IP_VS_INFO("stopping sync thread %d ...\n",
873 (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);
874
875 __set_current_state(TASK_UNINTERRUPTIBLE);
876 add_wait_queue(&stop_sync_wait, &wait);
877 set_stop_sync(state, 1);
878 ip_vs_sync_state -= state;
879 wake_up(&sync_wait);
880 schedule();
881 __set_current_state(TASK_RUNNING);
882 remove_wait_queue(&stop_sync_wait, &wait);
883
884 /* Note: no need to reap the sync thread, because its parent
885 process is the init process */
886
887 if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
888 (state == IP_VS_STATE_BACKUP && stop_backup_sync))
889 IP_VS_BUG();
890
891 return 0;
892}
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
new file mode 100644
index 000000000000..8a9d913261d8
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_wlc.c
@@ -0,0 +1,151 @@
1/*
2 * IPVS: Weighted Least-Connection Scheduling module
3 *
4 * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Peter Kese <peter.kese@ijs.si>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
16 * Wensong Zhang : changed to use the inactconns in scheduling
 17 * Wensong Zhang : changed some cosmetic things for debugging
18 * Wensong Zhang : changed for the d-linked destination list
19 * Wensong Zhang : added the ip_vs_wlc_update_svc
20 * Wensong Zhang : added any dest with weight=0 is quiesced
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26
27#include <net/ip_vs.h>
28
29
30static int
31ip_vs_wlc_init_svc(struct ip_vs_service *svc)
32{
33 return 0;
34}
35
36
37static int
38ip_vs_wlc_done_svc(struct ip_vs_service *svc)
39{
40 return 0;
41}
42
43
44static int
45ip_vs_wlc_update_svc(struct ip_vs_service *svc)
46{
47 return 0;
48}
49
50
51static inline unsigned int
52ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
53{
54 /*
 55 * We think the overhead of processing active connections is 256
 56 * times higher than that of inactive connections on average. (This
 57 * factor of 256 may not be accurate; we will change it later.) We
 58 * use the following formula to estimate the overhead now:
59 * dest->activeconns*256 + dest->inactconns
60 */
61 return (atomic_read(&dest->activeconns) << 8) +
62 atomic_read(&dest->inactconns);
63}
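/*
 * For example (with equal weights): a destination with 3 active and
 * 10 inactive connections gets an overhead of 3*256 + 10 = 778, while
 * one with 1 active and 600 inactive connections gets 1*256 + 600 =
 * 856, so the first is still preferred; inactive connections only
 * dominate the estimate in large numbers.
 */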
64
65
66/*
67 * Weighted Least Connection scheduling
68 */
69static struct ip_vs_dest *
70ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
71{
72 struct ip_vs_dest *dest, *least;
73 unsigned int loh, doh;
74
75 IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
76
77 /*
78 * We calculate the load of each dest server as follows:
79 * (dest overhead) / dest->weight
80 *
81 * Remember -- no floats in kernel mode!!!
82 * The comparison of h1*w2 > h2*w1 is equivalent to that of
83 * h1/w1 > h2/w2
84 * if every weight is larger than zero.
85 *
86 * The server with weight=0 is quiesced and will not receive any
87 * new connections.
88 */
89
90 list_for_each_entry(dest, &svc->destinations, n_list) {
91 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
92 atomic_read(&dest->weight) > 0) {
93 least = dest;
94 loh = ip_vs_wlc_dest_overhead(least);
95 goto nextstage;
96 }
97 }
98 return NULL;
99
100 /*
101 * Find the destination with the least load.
102 */
103 nextstage:
104 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
105 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
106 continue;
107 doh = ip_vs_wlc_dest_overhead(dest);
108 if (loh * atomic_read(&dest->weight) >
109 doh * atomic_read(&least->weight)) {
110 least = dest;
111 loh = doh;
112 }
113 }
114
115 IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u "
116 "activeconns %d refcnt %d weight %d overhead %d\n",
117 NIPQUAD(least->addr), ntohs(least->port),
118 atomic_read(&least->activeconns),
119 atomic_read(&least->refcnt),
120 atomic_read(&least->weight), loh);
121
122 return least;
123}
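/*
 * Sketch of the integer-only comparison above (hypothetical numbers):
 * testing loh/least->weight > doh/dest->weight as
 * loh*dest->weight > doh*least->weight, e.g. with loh = 512,
 * least->weight = 1 and doh = 900, dest->weight = 2 we get
 * 512*2 = 1024 > 900*1 = 900, so the candidate (450 overhead per unit
 * of weight) replaces the current choice (512 per unit of weight).
 */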
124
125
126static struct ip_vs_scheduler ip_vs_wlc_scheduler =
127{
128 .name = "wlc",
129 .refcnt = ATOMIC_INIT(0),
130 .module = THIS_MODULE,
131 .init_service = ip_vs_wlc_init_svc,
132 .done_service = ip_vs_wlc_done_svc,
133 .update_service = ip_vs_wlc_update_svc,
134 .schedule = ip_vs_wlc_schedule,
135};
136
137
138static int __init ip_vs_wlc_init(void)
139{
140 INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
141 return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
142}
143
144static void __exit ip_vs_wlc_cleanup(void)
145{
146 unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
147}
148
149module_init(ip_vs_wlc_init);
150module_exit(ip_vs_wlc_cleanup);
151MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
new file mode 100644
index 000000000000..749fa044eca5
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_wrr.c
@@ -0,0 +1,235 @@
1/*
2 * IPVS: Weighted Round-Robin Scheduling module
3 *
4 * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
 15 * Wensong Zhang : changed some cosmetic things for debugging
16 * Wensong Zhang : changed for the d-linked destination list
17 * Wensong Zhang : added the ip_vs_wrr_update_svc
18 * Julian Anastasov : fixed the bug of returning destination
19 * with weight 0 when all weights are zero
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25
26#include <net/ip_vs.h>
27
28/*
29 * current destination pointer for weighted round-robin scheduling
30 */
31struct ip_vs_wrr_mark {
32 struct list_head *cl; /* current list head */
33 int cw; /* current weight */
34 int mw; /* maximum weight */
35 int di; /* decreasing interval */
36};
37
38
39/*
40 * Get the gcd of server weights
41 */
42static int gcd(int a, int b)
43{
44 int c;
45
46 while ((c = a % b)) {
47 a = b;
48 b = c;
49 }
50 return b;
51}
52
53static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
54{
55 struct ip_vs_dest *dest;
56 int weight;
57 int g = 0;
58
59 list_for_each_entry(dest, &svc->destinations, n_list) {
60 weight = atomic_read(&dest->weight);
61 if (weight > 0) {
62 if (g > 0)
63 g = gcd(weight, g);
64 else
65 g = weight;
66 }
67 }
68 return g ? g : 1;
69}
70
71
72/*
73 * Get the maximum weight of the service destinations.
74 */
75static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
76{
77 struct ip_vs_dest *dest;
78 int weight = 0;
79
80 list_for_each_entry(dest, &svc->destinations, n_list) {
81 if (atomic_read(&dest->weight) > weight)
82 weight = atomic_read(&dest->weight);
83 }
84
85 return weight;
86}
87
88
89static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
90{
91 struct ip_vs_wrr_mark *mark;
92
93 /*
94 * Allocate the mark variable for WRR scheduling
95 */
96 mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
97 if (mark == NULL) {
98 IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
99 return -ENOMEM;
100 }
101 mark->cl = &svc->destinations;
102 mark->cw = 0;
103 mark->mw = ip_vs_wrr_max_weight(svc);
104 mark->di = ip_vs_wrr_gcd_weight(svc);
105 svc->sched_data = mark;
106
107 return 0;
108}
109
110
111static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
112{
113 /*
114 * Release the mark variable
115 */
116 kfree(svc->sched_data);
117
118 return 0;
119}
120
121
122static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
123{
124 struct ip_vs_wrr_mark *mark = svc->sched_data;
125
126 mark->cl = &svc->destinations;
127 mark->mw = ip_vs_wrr_max_weight(svc);
128 mark->di = ip_vs_wrr_gcd_weight(svc);
129 if (mark->cw > mark->mw)
130 mark->cw = 0;
131 return 0;
132}
133
134
135/*
136 * Weighted Round-Robin Scheduling
137 */
138static struct ip_vs_dest *
139ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
140{
141 struct ip_vs_dest *dest;
142 struct ip_vs_wrr_mark *mark = svc->sched_data;
143 struct list_head *p;
144
145 IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
146
147 /*
148 * This loop will always terminate, because mark->cw is in (0, max_weight]
149 * and at least one server has its weight equal to max_weight.
150 */
151 write_lock(&svc->sched_lock);
152 p = mark->cl;
153 while (1) {
154 if (mark->cl == &svc->destinations) {
155 /* it is at the head of the destination list */
156
157 if (mark->cl == mark->cl->next) {
158 /* no dest entry */
159 dest = NULL;
160 goto out;
161 }
162
163 mark->cl = svc->destinations.next;
164 mark->cw -= mark->di;
165 if (mark->cw <= 0) {
166 mark->cw = mark->mw;
167 /*
168 * Still zero, which means no available servers.
169 */
170 if (mark->cw == 0) {
171 mark->cl = &svc->destinations;
172 IP_VS_INFO("ip_vs_wrr_schedule(): "
173 "no available servers\n");
174 dest = NULL;
175 goto out;
176 }
177 }
178 } else
179 mark->cl = mark->cl->next;
180
181 if (mark->cl != &svc->destinations) {
182 /* not at the head of the list */
183 dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
184 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
185 atomic_read(&dest->weight) >= mark->cw) {
186 /* got it */
187 break;
188 }
189 }
190
191 if (mark->cl == p && mark->cw == mark->di) {
192 /* back to the start, and no dest is found.
193 It is only possible when all dests are OVERLOADED */
194 dest = NULL;
195 goto out;
196 }
197 }
198
199 IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
200 "activeconns %d refcnt %d weight %d\n",
201 NIPQUAD(dest->addr), ntohs(dest->port),
202 atomic_read(&dest->activeconns),
203 atomic_read(&dest->refcnt),
204 atomic_read(&dest->weight));
205
206 out:
207 write_unlock(&svc->sched_lock);
208 return dest;
209}
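/*
 * A worked trace (illustration only), assuming three destinations in
 * list order A, B, C with weights 4, 3, 2: then mw = 4, di = gcd = 1,
 * and successive calls pick A A B A B C A B C before the 9-slot cycle
 * repeats, i.e. each server gets a share proportional to its weight
 * and the heavier servers are served earlier within each cycle.
 */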
210
211
212static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
213 .name = "wrr",
214 .refcnt = ATOMIC_INIT(0),
215 .module = THIS_MODULE,
216 .init_service = ip_vs_wrr_init_svc,
217 .done_service = ip_vs_wrr_done_svc,
218 .update_service = ip_vs_wrr_update_svc,
219 .schedule = ip_vs_wrr_schedule,
220};
221
222static int __init ip_vs_wrr_init(void)
223{
224 INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
225 return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
226}
227
228static void __exit ip_vs_wrr_cleanup(void)
229{
230 unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
231}
232
233module_init(ip_vs_wrr_init);
234module_exit(ip_vs_wrr_cleanup);
235MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
new file mode 100644
index 000000000000..faa6176bbeb1
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -0,0 +1,563 @@
1/*
2 * ip_vs_xmit.c: various packet transmitters for IPVS
3 *
4 * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/ip.h>
20#include <linux/tcp.h> /* for tcphdr */
21#include <net/tcp.h> /* for csum_tcpudp_magic */
22#include <net/udp.h>
23#include <net/icmp.h> /* for icmp_send */
24#include <net/route.h> /* for ip_route_output */
25#include <linux/netfilter.h>
26#include <linux/netfilter_ipv4.h>
27
28#include <net/ip_vs.h>
29
30
31/*
32 * Destination cache to speed up outgoing route lookup
33 */
34static inline void
35__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
36{
37 struct dst_entry *old_dst;
38
39 old_dst = dest->dst_cache;
40 dest->dst_cache = dst;
41 dest->dst_rtos = rtos;
42 dst_release(old_dst);
43}
44
45static inline struct dst_entry *
46__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
47{
48 struct dst_entry *dst = dest->dst_cache;
49
50 if (!dst)
51 return NULL;
52 if ((dst->obsolete || rtos != dest->dst_rtos) &&
53 dst->ops->check(dst, cookie) == NULL) {
54 dest->dst_cache = NULL;
55 dst_release(dst);
56 return NULL;
57 }
58 dst_hold(dst);
59 return dst;
60}
61
62static inline struct rtable *
63__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
64{
65 struct rtable *rt; /* Route to the other host */
66 struct ip_vs_dest *dest = cp->dest;
67
68 if (dest) {
69 spin_lock(&dest->dst_lock);
70 if (!(rt = (struct rtable *)
71 __ip_vs_dst_check(dest, rtos, 0))) {
72 struct flowi fl = {
73 .oif = 0,
74 .nl_u = {
75 .ip4_u = {
76 .daddr = dest->addr,
77 .saddr = 0,
78 .tos = rtos, } },
79 };
80
81 if (ip_route_output_key(&rt, &fl)) {
82 spin_unlock(&dest->dst_lock);
83 IP_VS_DBG_RL("ip_route_output error, "
84 "dest: %u.%u.%u.%u\n",
85 NIPQUAD(dest->addr));
86 return NULL;
87 }
88 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
89 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
90 NIPQUAD(dest->addr),
91 atomic_read(&rt->u.dst.__refcnt), rtos);
92 }
93 spin_unlock(&dest->dst_lock);
94 } else {
95 struct flowi fl = {
96 .oif = 0,
97 .nl_u = {
98 .ip4_u = {
99 .daddr = cp->daddr,
100 .saddr = 0,
101 .tos = rtos, } },
102 };
103
104 if (ip_route_output_key(&rt, &fl)) {
105 IP_VS_DBG_RL("ip_route_output error, dest: "
106 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
107 return NULL;
108 }
109 }
110
111 return rt;
112}
113
114
115/*
116 * Release dest->dst_cache before a dest is removed
117 */
118void
119ip_vs_dst_reset(struct ip_vs_dest *dest)
120{
121 struct dst_entry *old_dst;
122
123 old_dst = dest->dst_cache;
124 dest->dst_cache = NULL;
125 dst_release(old_dst);
126}
127
128#define IP_VS_XMIT(skb, rt) \
129do { \
130 nf_reset_debug(skb); \
131 (skb)->nfcache |= NFC_IPVS_PROPERTY; \
132 (skb)->ip_summed = CHECKSUM_NONE; \
133 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
134 (rt)->u.dst.dev, dst_output); \
135} while (0)
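/*
 * The macro tags the skb with NFC_IPVS_PROPERTY so that the IPVS
 * netfilter hooks can recognize packets already handled by IPVS,
 * clears any stale checksum state and then reinjects the packet at
 * the LOCAL_OUT hook, letting dst_output() send it via the route
 * attached just before.
 */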
136
137
138/*
139 * NULL transmitter (do nothing except return NF_ACCEPT)
140 */
141int
142ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
143 struct ip_vs_protocol *pp)
144{
145 /* we do not touch skb and do not need pskb ptr */
146 return NF_ACCEPT;
147}
148
149
150/*
151 * Bypass transmitter
152 * Let packets bypass the destination when the destination is not
153 * available; it may only be used in a transparent cache cluster.
154 */
155int
156ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
157 struct ip_vs_protocol *pp)
158{
159 struct rtable *rt; /* Route to the other host */
160 struct iphdr *iph = skb->nh.iph;
161 u8 tos = iph->tos;
162 int mtu;
163 struct flowi fl = {
164 .oif = 0,
165 .nl_u = {
166 .ip4_u = {
167 .daddr = iph->daddr,
168 .saddr = 0,
169 .tos = RT_TOS(tos), } },
170 };
171
172 EnterFunction(10);
173
174 if (ip_route_output_key(&rt, &fl)) {
175 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
176 "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
177 goto tx_error_icmp;
178 }
179
180 /* MTU checking */
181 mtu = dst_mtu(&rt->u.dst);
182 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
183 ip_rt_put(rt);
184 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
185 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
186 goto tx_error;
187 }
188
189 /*
190 * Call ip_send_check because we are not sure it is called
191 * after ip_defrag. Is copy-on-write needed?
192 */
193 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
194 ip_rt_put(rt);
195 return NF_STOLEN;
196 }
197 ip_send_check(skb->nh.iph);
198
199 /* drop old route */
200 dst_release(skb->dst);
201 skb->dst = &rt->u.dst;
202
203 /* Another hack: avoid icmp_send in ip_fragment */
204 skb->local_df = 1;
205
206 IP_VS_XMIT(skb, rt);
207
208 LeaveFunction(10);
209 return NF_STOLEN;
210
211 tx_error_icmp:
212 dst_link_failure(skb);
213 tx_error:
214 kfree_skb(skb);
215 LeaveFunction(10);
216 return NF_STOLEN;
217}
218
219
220/*
221 * NAT transmitter (only for outside-to-inside nat forwarding)
222 * Not used for related ICMP
223 */
224int
225ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
226 struct ip_vs_protocol *pp)
227{
228 struct rtable *rt; /* Route to the other host */
229 int mtu;
230 struct iphdr *iph = skb->nh.iph;
231
232 EnterFunction(10);
233
234 /* check if it is a connection of no-client-port */
235 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
236 __u16 _pt, *p;
237 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
238 if (p == NULL)
239 goto tx_error;
240 ip_vs_conn_fill_cport(cp, *p);
241 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
242 }
243
244 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
245 goto tx_error_icmp;
246
247 /* MTU checking */
248 mtu = dst_mtu(&rt->u.dst);
249 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
250 ip_rt_put(rt);
251 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
252 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
253 goto tx_error;
254 }
255
256 /* copy-on-write the packet before mangling it */
257 if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
258 goto tx_error_put;
259
260 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
261 goto tx_error_put;
262
263 /* drop old route */
264 dst_release(skb->dst);
265 skb->dst = &rt->u.dst;
266
267 /* mangle the packet */
268 if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
269 goto tx_error;
270 skb->nh.iph->daddr = cp->daddr;
271 ip_send_check(skb->nh.iph);
272
273 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
274
275 /* FIXME: when the application helper enlarges the packet and the length
276 is larger than the MTU of the outgoing device, there will still
277 be an MTU problem. */
278
279 /* Another hack: avoid icmp_send in ip_fragment */
280 skb->local_df = 1;
281
282 IP_VS_XMIT(skb, rt);
283
284 LeaveFunction(10);
285 return NF_STOLEN;
286
287 tx_error_icmp:
288 dst_link_failure(skb);
289 tx_error:
290 LeaveFunction(10);
291 kfree_skb(skb);
292 return NF_STOLEN;
293 tx_error_put:
294 ip_rt_put(rt);
295 goto tx_error;
296}
297
298
299/*
300 * IP Tunneling transmitter
301 *
302 * This function encapsulates the packet in a new IP packet whose
303 * destination will be set to cp->daddr. Most of the code in this
304 * function is taken from ipip.c.
305 *
306 * It is used in VS/TUN cluster. The load balancer selects a real
307 * server from a cluster based on a scheduling algorithm,
308 * encapsulates the request packet and forwards it to the selected
309 * server. For example, all real servers are configured with
310 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
311 * the encapsulated packet, it will decapsulate the packet, process
312 * the request and return the response packets directly to the client
313 * without passing through the load balancer. This can greatly increase
314 * the scalability of the virtual server.
315 *
316 * Used for ANY protocol
317 */
318int
319ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
320 struct ip_vs_protocol *pp)
321{
322 struct rtable *rt; /* Route to the other host */
323 struct net_device *tdev; /* Device to other host */
324 struct iphdr *old_iph = skb->nh.iph;
325 u8 tos = old_iph->tos;
326 u16 df = old_iph->frag_off;
327 struct iphdr *iph; /* Our new IP header */
328 int max_headroom; /* The extra header space needed */
329 int mtu;
330
331 EnterFunction(10);
332
333 if (skb->protocol != __constant_htons(ETH_P_IP)) {
334 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
335 "ETH_P_IP: %d, skb protocol: %d\n",
336 __constant_htons(ETH_P_IP), skb->protocol);
337 goto tx_error;
338 }
339
340 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
341 goto tx_error_icmp;
342
343 tdev = rt->u.dst.dev;
344
345 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
346 if (mtu < 68) {
347 ip_rt_put(rt);
348 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
349 goto tx_error;
350 }
351 if (skb->dst)
352 skb->dst->ops->update_pmtu(skb->dst, mtu);
353
354 df |= (old_iph->frag_off&__constant_htons(IP_DF));
355
356 if ((old_iph->frag_off&__constant_htons(IP_DF))
357 && mtu < ntohs(old_iph->tot_len)) {
358 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
359 ip_rt_put(rt);
360 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
361 goto tx_error;
362 }
363
364 /*
365 * Okay, now see if we can stuff it in the buffer as-is.
366 */
367 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
368
369 if (skb_headroom(skb) < max_headroom
370 || skb_cloned(skb) || skb_shared(skb)) {
371 struct sk_buff *new_skb =
372 skb_realloc_headroom(skb, max_headroom);
373 if (!new_skb) {
374 ip_rt_put(rt);
375 kfree_skb(skb);
376 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
377 return NF_STOLEN;
378 }
379 kfree_skb(skb);
380 skb = new_skb;
381 old_iph = skb->nh.iph;
382 }
383
384 skb->h.raw = (void *) old_iph;
385
386 /* fix old IP header checksum */
387 ip_send_check(old_iph);
388
389 skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
390 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
391
392 /* drop old route */
393 dst_release(skb->dst);
394 skb->dst = &rt->u.dst;
395
396 /*
397 * Push down and install the IPIP header.
398 */
399 iph = skb->nh.iph;
400 iph->version = 4;
401 iph->ihl = sizeof(struct iphdr)>>2;
402 iph->frag_off = df;
403 iph->protocol = IPPROTO_IPIP;
404 iph->tos = tos;
405 iph->daddr = rt->rt_dst;
406 iph->saddr = rt->rt_src;
407 iph->ttl = old_iph->ttl;
408 iph->tot_len = htons(skb->len);
409 ip_select_ident(iph, &rt->u.dst, NULL);
410 ip_send_check(iph);
411
412 /* Another hack: avoid icmp_send in ip_fragment */
413 skb->local_df = 1;
414
415 IP_VS_XMIT(skb, rt);
416
417 LeaveFunction(10);
418
419 return NF_STOLEN;
420
421 tx_error_icmp:
422 dst_link_failure(skb);
423 tx_error:
424 kfree_skb(skb);
425 LeaveFunction(10);
426 return NF_STOLEN;
427}
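/*
 * Sketch of the VS/TUN packet built above:
 *
 *   [ outer iphdr: saddr = rt->rt_src, daddr = rt->rt_dst,
 *     protocol = IPPROTO_IPIP, tot_len = skb->len ]
 *   [ original iphdr (checksum refreshed) + payload ]
 *
 * Encapsulation costs sizeof(struct iphdr) of path MTU, which is why
 * mtu is reduced by that amount before the DF/fragmentation check.
 */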
428
429
430/*
431 * Direct Routing transmitter
432 * Used for ANY protocol
433 */
434int
435ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
436 struct ip_vs_protocol *pp)
437{
438 struct rtable *rt; /* Route to the other host */
439 struct iphdr *iph = skb->nh.iph;
440 int mtu;
441
442 EnterFunction(10);
443
444 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
445 goto tx_error_icmp;
446
447 /* MTU checking */
448 mtu = dst_mtu(&rt->u.dst);
449 if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
450 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
451 ip_rt_put(rt);
452 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
453 goto tx_error;
454 }
455
456 /*
457 * Call ip_send_check because we are not sure it is called
458 * after ip_defrag. Is copy-on-write needed?
459 */
460 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
461 ip_rt_put(rt);
462 return NF_STOLEN;
463 }
464 ip_send_check(skb->nh.iph);
465
466 /* drop old route */
467 dst_release(skb->dst);
468 skb->dst = &rt->u.dst;
469
470 /* Another hack: avoid icmp_send in ip_fragment */
471 skb->local_df = 1;
472
473 IP_VS_XMIT(skb, rt);
474
475 LeaveFunction(10);
476 return NF_STOLEN;
477
478 tx_error_icmp:
479 dst_link_failure(skb);
480 tx_error:
481 kfree_skb(skb);
482 LeaveFunction(10);
483 return NF_STOLEN;
484}
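/*
 * Note on VS/DR above: the IP header is forwarded unmodified (only its
 * checksum is refreshed by ip_send_check()); the packet is simply
 * re-routed towards the chosen real server, which is commonly set up to
 * accept traffic for the virtual IP on a non-ARPing local interface.
 */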
485
486
487/*
488 * ICMP packet transmitter
489 * called by the ip_vs_in_icmp
490 */
491int
492ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
493 struct ip_vs_protocol *pp, int offset)
494{
495 struct rtable *rt; /* Route to the other host */
496 int mtu;
497 int rc;
498
499 EnterFunction(10);
500
501 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
502 forwarded directly here, because there is no need to
503 translate address/port back */
504 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
505 if (cp->packet_xmit)
506 rc = cp->packet_xmit(skb, cp, pp);
507 else
508 rc = NF_ACCEPT;
509 /* do not touch skb anymore */
510 atomic_inc(&cp->in_pkts);
511 __ip_vs_conn_put(cp);
512 goto out;
513 }
514
515 /*
516 * mangle and send the packet here (only for VS/NAT)
517 */
518
519 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
520 goto tx_error_icmp;
521
522 /* MTU checking */
523 mtu = dst_mtu(&rt->u.dst);
524 if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
525 ip_rt_put(rt);
526 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
527 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
528 goto tx_error;
529 }
530
531 /* copy-on-write the packet before mangling it */
532 if (!ip_vs_make_skb_writable(&skb, offset))
533 goto tx_error_put;
534
535 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
536 goto tx_error_put;
537
538 /* drop the old route when skb is not shared */
539 dst_release(skb->dst);
540 skb->dst = &rt->u.dst;
541
542 ip_vs_nat_icmp(skb, pp, cp, 0);
543
544 /* Another hack: avoid icmp_send in ip_fragment */
545 skb->local_df = 1;
546
547 IP_VS_XMIT(skb, rt);
548
549 rc = NF_STOLEN;
550 goto out;
551
552 tx_error_icmp:
553 dst_link_failure(skb);
554 tx_error:
555 dev_kfree_skb(skb);
556 rc = NF_STOLEN;
557 out:
558 LeaveFunction(10);
559 return rc;
560 tx_error_put:
561 ip_rt_put(rt);
562 goto tx_error;
563}
diff --git a/net/ipv4/multipath.c b/net/ipv4/multipath.c
new file mode 100644
index 000000000000..4e9ca7c76407
--- /dev/null
+++ b/net/ipv4/multipath.c
@@ -0,0 +1,55 @@
1/* multipath.c: IPV4 multipath algorithm support.
2 *
3 * Copyright (C) 2004, 2005 Einar Lueck <elueck@de.ibm.com>
4 * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
5 */
6
7#include <linux/module.h>
8#include <linux/errno.h>
9#include <linux/netdevice.h>
10#include <linux/spinlock.h>
11
12#include <net/ip_mp_alg.h>
13
14static DEFINE_SPINLOCK(alg_table_lock);
15struct ip_mp_alg_ops *ip_mp_alg_table[IP_MP_ALG_MAX + 1];
16
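/*
 * Register @ops in slot @n of ip_mp_alg_table.  Returns -EINVAL for an
 * out-of-range slot or a missing mp_alg_select_route hook, -EBUSY if
 * another algorithm already occupies the slot, 0 on success.
 */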
17int multipath_alg_register(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
18{
19 struct ip_mp_alg_ops **slot;
20 int err;
21
22 if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX ||
23 !ops->mp_alg_select_route)
24 return -EINVAL;
25
26 spin_lock(&alg_table_lock);
27 slot = &ip_mp_alg_table[n];
28 if (*slot != NULL) {
29 err = -EBUSY;
30 } else {
31 *slot = ops;
32 err = 0;
33 }
34 spin_unlock(&alg_table_lock);
35
36 return err;
37}
38EXPORT_SYMBOL(multipath_alg_register);
39
40void multipath_alg_unregister(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
41{
42 struct ip_mp_alg_ops **slot;
43
44 if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX)
45 return;
46
47 spin_lock(&alg_table_lock);
48 slot = &ip_mp_alg_table[n];
49 if (*slot == ops)
50 *slot = NULL;
51 spin_unlock(&alg_table_lock);
52
53 synchronize_net();
54}
55EXPORT_SYMBOL(multipath_alg_unregister);
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
new file mode 100644
index 000000000000..9349686131fc
--- /dev/null
+++ b/net/ipv4/multipath_drr.c
@@ -0,0 +1,265 @@
1/*
2 * Device round robin policy for multipath.
3 *
4 *
5 * Version: $Id: multipath_drr.c,v 1.1.2.1 2004/09/16 07:42:34 elueck Exp $
6 *
7 * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <asm/system.h>
17#include <asm/uaccess.h>
18#include <linux/types.h>
19#include <linux/sched.h>
20#include <linux/errno.h>
21#include <linux/timer.h>
22#include <linux/mm.h>
23#include <linux/kernel.h>
24#include <linux/fcntl.h>
25#include <linux/stat.h>
26#include <linux/socket.h>
27#include <linux/in.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/inetdevice.h>
31#include <linux/igmp.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/mroute.h>
35#include <linux/init.h>
36#include <net/ip.h>
37#include <net/protocol.h>
38#include <linux/skbuff.h>
39#include <net/sock.h>
40#include <net/icmp.h>
41#include <net/udp.h>
42#include <net/raw.h>
43#include <linux/notifier.h>
44#include <linux/if_arp.h>
45#include <linux/netfilter_ipv4.h>
46#include <net/ipip.h>
47#include <net/checksum.h>
48#include <net/ip_mp_alg.h>
49
50struct multipath_device {
51 int ifi; /* interface index of device */
52 atomic_t usecount;
53 int allocated;
54};
55
56#define MULTIPATH_MAX_DEVICECANDIDATES 10
57
58static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES];
59static DEFINE_SPINLOCK(state_lock);
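/*
 * Route handed out by the previous selection; reused when a flow sets
 * FLOWI_FLAG_MULTIPATHOLDROUTE and cleared again in drr_remove().
 * Note that it is read and written without taking state_lock.
 */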
60static struct rtable *last_selection = NULL;
61
62static int inline __multipath_findslot(void)
63{
64 int i;
65
66 for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
67 if (state[i].allocated == 0)
68 return i;
69 }
70 return -1;
71}
72
73static int inline __multipath_finddev(int ifindex)
74{
75 int i;
76
77 for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
78 if (state[i].allocated != 0 &&
79 state[i].ifi == ifindex)
80 return i;
81 }
82 return -1;
83}
84
85static int drr_dev_event(struct notifier_block *this,
86 unsigned long event, void *ptr)
87{
88 struct net_device *dev = ptr;
89 int devidx;
90
91 switch (event) {
92 case NETDEV_UNREGISTER:
93 case NETDEV_DOWN:
94 spin_lock_bh(&state_lock);
95
96 devidx = __multipath_finddev(dev->ifindex);
97 if (devidx != -1) {
98 state[devidx].allocated = 0;
99 state[devidx].ifi = 0;
100 atomic_set(&state[devidx].usecount, 0);
101 }
102
103 spin_unlock_bh(&state_lock);
104 break;
105 };
106
107 return NOTIFY_DONE;
108}
109
110struct notifier_block drr_dev_notifier = {
111 .notifier_call = drr_dev_event,
112};
113
114static void drr_remove(struct rtable *rt)
115{
116 if (last_selection == rt)
117 last_selection = NULL;
118}
119
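/*
 * Bump a device's use counter; if the value has overflowed (reads as
 * non-positive), reset every counter so the minimum-use comparison in
 * drr_select_route() stays meaningful.
 */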
120static void drr_safe_inc(atomic_t *usecount)
121{
122 int n;
123
124 atomic_inc(usecount);
125
126 n = atomic_read(usecount);
127 if (n <= 0) {
128 int i;
129
130 spin_lock_bh(&state_lock);
131
132 for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++)
133 atomic_set(&state[i].usecount, 0);
134
135 spin_unlock_bh(&state_lock);
136 }
137}
138
139static void drr_select_route(const struct flowi *flp,
140 struct rtable *first, struct rtable **rp)
141{
142 struct rtable *nh, *result, *cur_min;
143 int min_usecount = -1;
144 int devidx = -1;
145 int cur_min_devidx = -1;
146
147 /* if necessary and possible utilize the old alternative */
148 if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 &&
149 last_selection != NULL) {
150 result = last_selection;
151 *rp = result;
152 return;
153 }
154
155 /* 1. make sure all alt. nexthops have the same GC related data */
156 /* 2. determine the new candidate to be returned */
157 result = NULL;
158 cur_min = NULL;
159 for (nh = rcu_dereference(first); nh;
160 nh = rcu_dereference(nh->u.rt_next)) {
161 if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
162 multipath_comparekeys(&nh->fl, flp)) {
163 int nh_ifidx = nh->u.dst.dev->ifindex;
164
165 nh->u.dst.lastuse = jiffies;
166 nh->u.dst.__use++;
167 if (result != NULL)
168 continue;
169
170 /* search for the output interface */
171
172 /* this is not SMP safe, only add/remove are
173 * SMP safe as wrong usecount updates have no big
174 * impact
175 */
176 devidx = __multipath_finddev(nh_ifidx);
177 if (devidx == -1) {
178 /* add the interface to the array
179 * SMP safe
180 */
181 spin_lock_bh(&state_lock);
182
183 /* due to SMP: search again */
184 devidx = __multipath_finddev(nh_ifidx);
185 if (devidx == -1) {
186 /* add entry for device */
187 devidx = __multipath_findslot();
188 if (devidx == -1) {
189 /* unlikely but possible */
190 continue;
191 }
192
193 state[devidx].allocated = 1;
194 state[devidx].ifi = nh_ifidx;
195 atomic_set(&state[devidx].usecount, 0);
196 min_usecount = 0;
197 }
198
199 spin_unlock_bh(&state_lock);
200 }
201
202 if (min_usecount == 0) {
203 /* if the device has not been used it is
204 * the primary target
205 */
206 drr_safe_inc(&state[devidx].usecount);
207 result = nh;
208 } else {
209 int count =
210 atomic_read(&state[devidx].usecount);
211
212 if (min_usecount == -1 ||
213 count < min_usecount) {
214 cur_min = nh;
215 cur_min_devidx = devidx;
216 min_usecount = count;
217 }
218 }
219 }
220 }
221
222 if (!result) {
223 if (cur_min) {
224 drr_safe_inc(&state[cur_min_devidx].usecount);
225 result = cur_min;
226 } else {
227 result = first;
228 }
229 }
230
231 *rp = result;
232 last_selection = result;
233}
234
235static struct ip_mp_alg_ops drr_ops = {
236 .mp_alg_select_route = drr_select_route,
237 .mp_alg_remove = drr_remove,
238};
239
240static int __init drr_init(void)
241{
242 int err = register_netdevice_notifier(&drr_dev_notifier);
243
244 if (err)
245 return err;
246
247 err = multipath_alg_register(&drr_ops, IP_MP_ALG_RR);
248 if (err)
249 goto fail;
250
251 return 0;
252
253fail:
254 unregister_netdevice_notifier(&drr_dev_notifier);
255 return err;
256}
257
258static void __exit drr_exit(void)
259{
260 unregister_netdevice_notifier(&drr_dev_notifier);
261 multipath_alg_unregister(&drr_ops, IP_MP_ALG_DRR);
262}
263
264module_init(drr_init);
265module_exit(drr_exit);
diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c
new file mode 100644
index 000000000000..805a16e47de5
--- /dev/null
+++ b/net/ipv4/multipath_random.c
@@ -0,0 +1,128 @@
1/*
2 * Random policy for multipath.
3 *
4 *
5 * Version: $Id: multipath_random.c,v 1.1.2.3 2004/09/21 08:42:11 elueck Exp $
6 *
7 * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <asm/system.h>
17#include <asm/uaccess.h>
18#include <linux/types.h>
19#include <linux/sched.h>
20#include <linux/errno.h>
21#include <linux/timer.h>
22#include <linux/mm.h>
23#include <linux/kernel.h>
24#include <linux/fcntl.h>
25#include <linux/stat.h>
26#include <linux/socket.h>
27#include <linux/in.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/inetdevice.h>
31#include <linux/igmp.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/mroute.h>
35#include <linux/init.h>
36#include <net/ip.h>
37#include <net/protocol.h>
38#include <linux/skbuff.h>
39#include <net/sock.h>
40#include <net/icmp.h>
41#include <net/udp.h>
42#include <net/raw.h>
43#include <linux/notifier.h>
44#include <linux/if_arp.h>
45#include <linux/netfilter_ipv4.h>
46#include <net/ipip.h>
47#include <net/checksum.h>
48#include <net/ip_mp_alg.h>
49
50#define MULTIPATH_MAX_CANDIDATES 40
51
52/* interface to random number generation */
53static unsigned int RANDOM_SEED = 93186752;
54
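/*
 * random() below is a small multiplicative-congruential generator: the
 * constants satisfy a*q + r = 2^32 - 5, so the update roughly computes
 * seed = a * seed mod (2^32 - 5) (a Schrage-style split, ignoring 32-bit
 * overflow).  It starts from a fixed seed and only aims to spread route
 * choices, not to be unpredictable.
 */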
55static inline unsigned int random(unsigned int ubound)
56{
57 static unsigned int a = 1588635695,
58 q = 2,
59 r = 1117695901;
60
61 RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q);
62
63 return RANDOM_SEED % ubound;
64}
65
66
67static void random_select_route(const struct flowi *flp,
68 struct rtable *first,
69 struct rtable **rp)
70{
71 struct rtable *rt;
72 struct rtable *decision;
73 unsigned char candidate_count = 0;
74
75 /* count all candidate */
76 for (rt = rcu_dereference(first); rt;
77 rt = rcu_dereference(rt->u.rt_next)) {
78 if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
79 multipath_comparekeys(&rt->fl, flp))
80 ++candidate_count;
81 }
82
83 /* choose a random candidate */
84 decision = first;
85 if (candidate_count > 1) {
86 unsigned char i = 0;
87 unsigned char candidate_no = (unsigned char)
88 random(candidate_count);
89
90 /* find chosen candidate and adjust GC data for all candidates
91 * to ensure they stay in cache
92 */
93 for (rt = first; rt; rt = rt->u.rt_next) {
94 if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
95 multipath_comparekeys(&rt->fl, flp)) {
96 rt->u.dst.lastuse = jiffies;
97
98 if (i == candidate_no)
99 decision = rt;
100
101 if (i >= candidate_count)
102 break;
103
104 i++;
105 }
106 }
107 }
108
109 decision->u.dst.__use++;
110 *rp = decision;
111}
112
113static struct ip_mp_alg_ops random_ops = {
114 .mp_alg_select_route = random_select_route,
115};
116
117static int __init random_init(void)
118{
119 return multipath_alg_register(&random_ops, IP_MP_ALG_RANDOM);
120}
121
122static void __exit random_exit(void)
123{
124 multipath_alg_unregister(&random_ops, IP_MP_ALG_RANDOM);
125}
126
127module_init(random_init);
128module_exit(random_exit);
diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c
new file mode 100644
index 000000000000..554a82568160
--- /dev/null
+++ b/net/ipv4/multipath_rr.c
@@ -0,0 +1,115 @@
1/*
2 * Round robin policy for multipath.
3 *
4 *
5 * Version: $Id: multipath_rr.c,v 1.1.2.2 2004/09/16 07:42:34 elueck Exp $
6 *
7 * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <asm/system.h>
17#include <asm/uaccess.h>
18#include <linux/types.h>
19#include <linux/sched.h>
20#include <linux/errno.h>
21#include <linux/timer.h>
22#include <linux/mm.h>
23#include <linux/kernel.h>
24#include <linux/fcntl.h>
25#include <linux/stat.h>
26#include <linux/socket.h>
27#include <linux/in.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/inetdevice.h>
31#include <linux/igmp.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/mroute.h>
35#include <linux/init.h>
36#include <net/ip.h>
37#include <net/protocol.h>
38#include <linux/skbuff.h>
39#include <net/sock.h>
40#include <net/icmp.h>
41#include <net/udp.h>
42#include <net/raw.h>
43#include <linux/notifier.h>
44#include <linux/if_arp.h>
45#include <linux/netfilter_ipv4.h>
46#include <net/ipip.h>
47#include <net/checksum.h>
48#include <net/ip_mp_alg.h>
49
50#define MULTIPATH_MAX_CANDIDATES 40
51
52static struct rtable* last_used = NULL;
53
54static void rr_remove(struct rtable *rt)
55{
56 if (last_used == rt)
57 last_used = NULL;
58}
59
60static void rr_select_route(const struct flowi *flp,
61 struct rtable *first, struct rtable **rp)
62{
63 struct rtable *nh, *result, *min_use_cand = NULL;
64 int min_use = -1;
65
66 /* if necessary and possible utilize the old alternative */
67 if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 &&
68 last_used != NULL) {
69 result = last_used;
70 goto out;
71 }
72
73 /* 1. make sure all alt. nexthops have the same GC related data
74 * 2. determine the new candidate to be returned
75 */
76 result = NULL;
77 for (nh = rcu_dereference(first); nh;
78 nh = rcu_dereference(nh->u.rt_next)) {
79 if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
80 multipath_comparekeys(&nh->fl, flp)) {
81 nh->u.dst.lastuse = jiffies;
82
83 if (min_use == -1 || nh->u.dst.__use < min_use) {
84 min_use = nh->u.dst.__use;
85 min_use_cand = nh;
86 }
87 }
88 }
89 result = min_use_cand;
90 if (!result)
91 result = first;
92
93out:
94 last_used = result;
95 result->u.dst.__use++;
96 *rp = result;
97}
98
99static struct ip_mp_alg_ops rr_ops = {
100 .mp_alg_select_route = rr_select_route,
101 .mp_alg_remove = rr_remove,
102};
103
104static int __init rr_init(void)
105{
106 return multipath_alg_register(&rr_ops, IP_MP_ALG_RR);
107}
108
109static void __exit rr_exit(void)
110{
111 multipath_alg_unregister(&rr_ops, IP_MP_ALG_RR);
112}
113
114module_init(rr_init);
115module_exit(rr_exit);
diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c
new file mode 100644
index 000000000000..10b23e1bece6
--- /dev/null
+++ b/net/ipv4/multipath_wrandom.c
@@ -0,0 +1,344 @@
1/*
2 * Weighted random policy for multipath.
3 *
4 *
5 * Version: $Id: multipath_wrandom.c,v 1.1.2.3 2004/09/22 07:51:40 elueck Exp $
6 *
7 * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <asm/system.h>
17#include <asm/uaccess.h>
18#include <linux/types.h>
19#include <linux/sched.h>
20#include <linux/errno.h>
21#include <linux/timer.h>
22#include <linux/mm.h>
23#include <linux/kernel.h>
24#include <linux/fcntl.h>
25#include <linux/stat.h>
26#include <linux/socket.h>
27#include <linux/in.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/inetdevice.h>
31#include <linux/igmp.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/mroute.h>
35#include <linux/init.h>
36#include <net/ip.h>
37#include <net/protocol.h>
38#include <linux/skbuff.h>
39#include <net/sock.h>
40#include <net/icmp.h>
41#include <net/udp.h>
42#include <net/raw.h>
43#include <linux/notifier.h>
44#include <linux/if_arp.h>
45#include <linux/netfilter_ipv4.h>
46#include <net/ipip.h>
47#include <net/checksum.h>
48#include <net/ip_fib.h>
49#include <net/ip_mp_alg.h>
50
51#define MULTIPATH_STATE_SIZE 15
52
53struct multipath_candidate {
54 struct multipath_candidate *next;
55 int power;
56 struct rtable *rt;
57};
58
59struct multipath_dest {
60 struct list_head list;
61
62 const struct fib_nh *nh_info;
63 __u32 netmask;
64 __u32 network;
65 unsigned char prefixlen;
66
67 struct rcu_head rcu;
68};
69
70struct multipath_bucket {
71 struct list_head head;
72 spinlock_t lock;
73};
74
75struct multipath_route {
76 struct list_head list;
77
78 int oif;
79 __u32 gw;
80 struct list_head dests;
81
82 struct rcu_head rcu;
83};
84
85/* state: primarily weight per route information */
86static struct multipath_bucket state[MULTIPATH_STATE_SIZE];
87
88/* interface to random number generation */
89static unsigned int RANDOM_SEED = 93186752;
90
91static inline unsigned int random(unsigned int ubound)
92{
93 static unsigned int a = 1588635695,
94 q = 2,
95 r = 1117695901;
96 RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q);
97 return RANDOM_SEED % ubound;
98}
99
100static unsigned char __multipath_lookup_weight(const struct flowi *fl,
101 const struct rtable *rt)
102{
103 const int state_idx = rt->idev->dev->ifindex % MULTIPATH_STATE_SIZE;
104 struct multipath_route *r;
105 struct multipath_route *target_route = NULL;
106 struct multipath_dest *d;
107 int weight = 1;
108
109 /* lookup the weight information for a certain route */
110 rcu_read_lock();
111
112 /* find state entry for gateway or add one if necessary */
113 list_for_each_entry_rcu(r, &state[state_idx].head, list) {
114 if (r->gw == rt->rt_gateway &&
115 r->oif == rt->idev->dev->ifindex) {
116 target_route = r;
117 break;
118 }
119 }
120
121 if (!target_route) {
122 /* this should not happen... but we are prepared */
123 printk( KERN_CRIT"%s: missing state for gateway: %u and " \
124 "device %d\n", __FUNCTION__, rt->rt_gateway,
125 rt->idev->dev->ifindex);
126 goto out;
127 }
128
129 /* find state entry for destination */
130 list_for_each_entry_rcu(d, &target_route->dests, list) {
131 __u32 targetnetwork = fl->fl4_dst &
132 (0xFFFFFFFF >> (32 - d->prefixlen));
133
134 if ((targetnetwork & d->netmask) == d->network) {
135 weight = d->nh_info->nh_weight;
136 goto out;
137 }
138 }
139
140out:
141 rcu_read_unlock();
142 return weight;
143}
144
145static void wrandom_init_state(void)
146{
147 int i;
148
149 for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
150 INIT_LIST_HEAD(&state[i].head);
151 spin_lock_init(&state[i].lock);
152 }
153}
154
155static void wrandom_select_route(const struct flowi *flp,
156 struct rtable *first,
157 struct rtable **rp)
158{
159 struct rtable *rt;
160 struct rtable *decision;
161 struct multipath_candidate *first_mpc = NULL;
162 struct multipath_candidate *mpc, *last_mpc = NULL;
163 int power = 0;
164 int last_power;
165 int selector;
166 const size_t size_mpc = sizeof(struct multipath_candidate);
167
168 /* collect all candidates and identify their weights */
169 for (rt = rcu_dereference(first); rt;
170 rt = rcu_dereference(rt->u.rt_next)) {
171 if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
172 multipath_comparekeys(&rt->fl, flp)) {
173 struct multipath_candidate* mpc =
174 (struct multipath_candidate*)
175 kmalloc(size_mpc, GFP_KERNEL);
176
177 if (!mpc)
178 return;
179
180 power += __multipath_lookup_weight(flp, rt) * 10000;
181
182 mpc->power = power;
183 mpc->rt = rt;
184 mpc->next = NULL;
185
186 if (!first_mpc)
187 first_mpc = mpc;
188 else
189 last_mpc->next = mpc;
190
191 last_mpc = mpc;
192 }
193 }
194
195 /* choose a weighted random candidate */
196 decision = first;
197 selector = random(power);
198 last_power = 0;
199
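	/*
	 * Each candidate's ->power holds the running total of the weights
	 * seen so far, so (given at least one candidate) the selector drawn
	 * from [0, power) falls into exactly one [last_power, mpc->power)
	 * interval; that candidate is picked below.
	 */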
200 /* select candidate, adjust GC data and cleanup local state */
201 decision = first;
202 last_mpc = NULL;
203 for (mpc = first_mpc; mpc; mpc = mpc->next) {
204 mpc->rt->u.dst.lastuse = jiffies;
205 if (last_power <= selector && selector < mpc->power)
206 decision = mpc->rt;
207
208 last_power = mpc->power;
209 if (last_mpc)
210 kfree(last_mpc);
211
212 last_mpc = mpc;
213 }
214
215 if (last_mpc) {
216 /* concurrent __multipath_flush may lead to !last_mpc */
217 kfree(last_mpc);
218 }
219
220 decision->u.dst.__use++;
221 *rp = decision;
222}
223
224static void wrandom_set_nhinfo(__u32 network,
225 __u32 netmask,
226 unsigned char prefixlen,
227 const struct fib_nh *nh)
228{
229 const int state_idx = nh->nh_oif % MULTIPATH_STATE_SIZE;
230 struct multipath_route *r, *target_route = NULL;
231 struct multipath_dest *d, *target_dest = NULL;
232
233 /* store the weight information for a certain route */
234 spin_lock(&state[state_idx].lock);
235
236 /* find state entry for gateway or add one if necessary */
237 list_for_each_entry_rcu(r, &state[state_idx].head, list) {
238 if (r->gw == nh->nh_gw && r->oif == nh->nh_oif) {
239 target_route = r;
240 break;
241 }
242 }
243
244 if (!target_route) {
245 const size_t size_rt = sizeof(struct multipath_route);
246 target_route = (struct multipath_route *)
247 kmalloc(size_rt, GFP_KERNEL);
248
249 target_route->gw = nh->nh_gw;
250 target_route->oif = nh->nh_oif;
251 memset(&target_route->rcu, 0, sizeof(struct rcu_head));
252 INIT_LIST_HEAD(&target_route->dests);
253
254 list_add_rcu(&target_route->list, &state[state_idx].head);
255 }
256
257 /* find state entry for destination or add one if necessary */
258 list_for_each_entry_rcu(d, &target_route->dests, list) {
259 if (d->nh_info == nh) {
260 target_dest = d;
261 break;
262 }
263 }
264
265 if (!target_dest) {
266 const size_t size_dst = sizeof(struct multipath_dest);
267 target_dest = (struct multipath_dest*)
268 kmalloc(size_dst, GFP_KERNEL);
269
270 target_dest->nh_info = nh;
271 target_dest->network = network;
272 target_dest->netmask = netmask;
273 target_dest->prefixlen = prefixlen;
274 memset(&target_dest->rcu, 0, sizeof(struct rcu_head));
275
276 list_add_rcu(&target_dest->list, &target_route->dests);
277 }
278 /* else: we already stored this info for another destination =>
279 * we are finished
280 */
281
282 spin_unlock(&state[state_idx].lock);
283}
284
285static void __multipath_free(struct rcu_head *head)
286{
287 struct multipath_route *rt = container_of(head, struct multipath_route,
288 rcu);
289 kfree(rt);
290}
291
292static void __multipath_free_dst(struct rcu_head *head)
293{
294 struct multipath_dest *dst = container_of(head,
295 struct multipath_dest,
296 rcu);
297 kfree(dst);
298}
299
300static void wrandom_flush(void)
301{
302 int i;
303
304	/* defer deletion of all entries */
305 for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
306 struct multipath_route *r;
307
308 spin_lock(&state[i].lock);
309 list_for_each_entry_rcu(r, &state[i].head, list) {
310 struct multipath_dest *d;
311 list_for_each_entry_rcu(d, &r->dests, list) {
312 list_del_rcu(&d->list);
313 call_rcu(&d->rcu,
314 __multipath_free_dst);
315 }
316 list_del_rcu(&r->list);
317 call_rcu(&r->rcu,
318 __multipath_free);
319 }
320
321 spin_unlock(&state[i].lock);
322 }
323}
324
325static struct ip_mp_alg_ops wrandom_ops = {
326 .mp_alg_select_route = wrandom_select_route,
327 .mp_alg_flush = wrandom_flush,
328 .mp_alg_set_nhinfo = wrandom_set_nhinfo,
329};
330
331static int __init wrandom_init(void)
332{
333 wrandom_init_state();
334
335 return multipath_alg_register(&wrandom_ops, IP_MP_ALG_WRANDOM);
336}
337
338static void __exit wrandom_exit(void)
339{
340 multipath_alg_unregister(&wrandom_ops, IP_MP_ALG_WRANDOM);
341}
342
343module_init(wrandom_init);
344module_exit(wrandom_exit);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
new file mode 100644
index 000000000000..46d4cb1c06f0
--- /dev/null
+++ b/net/ipv4/netfilter/Kconfig
@@ -0,0 +1,696 @@
1#
2# IP netfilter configuration
3#
4
5menu "IP: Netfilter Configuration"
6 depends on INET && NETFILTER
7
8# connection tracking, helpers and protocols
9config IP_NF_CONNTRACK
10 tristate "Connection tracking (required for masq/NAT)"
11 ---help---
12 Connection tracking keeps a record of what packets have passed
13 through your machine, in order to figure out how they are related
14 into connections.
15
16 This is required to do Masquerading or other kinds of Network
17 Address Translation (except for Fast NAT). It can also be used to
18 enhance packet filtering (see `Connection state match support'
19 below).
20
21 To compile it as a module, choose M here. If unsure, say N.
22
23config IP_NF_CT_ACCT
24 bool "Connection tracking flow accounting"
25 depends on IP_NF_CONNTRACK
26 help
27 If this option is enabled, the connection tracking code will
28 keep per-flow packet and byte counters.
29
30 Those counters can be used for flow-based accounting or the
31 `connbytes' match.
32
33 If unsure, say `N'.
34
35config IP_NF_CONNTRACK_MARK
36 bool 'Connection mark tracking support'
37 help
38 This option enables support for connection marks, used by the
39 `CONNMARK' target and `connmark' match. Similar to the mark value
40 of packets, but this mark value is kept in the conntrack session
41 instead of the individual packets.
42
43config IP_NF_CT_PROTO_SCTP
44 tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
45 depends on IP_NF_CONNTRACK && EXPERIMENTAL
46 help
47 With this option enabled, the connection tracking code will
48 be able to do state tracking on SCTP connections.
49
50 If you want to compile it as a module, say M here and read
51 <file:Documentation/modules.txt>. If unsure, say `N'.
52
53config IP_NF_FTP
54 tristate "FTP protocol support"
55 depends on IP_NF_CONNTRACK
56 help
57 Tracking FTP connections is problematic: special helpers are
58 required for tracking them, and doing masquerading and other forms
59 of Network Address Translation on them.
60
61 To compile it as a module, choose M here. If unsure, say Y.
62
63config IP_NF_IRC
64 tristate "IRC protocol support"
65 depends on IP_NF_CONNTRACK
66 ---help---
67 There is a commonly-used extension to IRC called
68 Direct Client-to-Client Protocol (DCC). This enables users to send
69 files to each other, and also chat to each other without the need
70 of a server. DCC Sending is used anywhere you send files over IRC,
71 and DCC Chat is most commonly used by Eggdrop bots. If you are
72 using NAT, this extension will enable you to send files and initiate
73 chats. Note that you do NOT need this extension to get files or
74 have others initiate chats, or for anything else in IRC.
75
76 To compile it as a module, choose M here. If unsure, say Y.
77
78config IP_NF_TFTP
79 tristate "TFTP protocol support"
80 depends on IP_NF_CONNTRACK
81 help
82 TFTP connection tracking helper; this is required depending
83 on how restrictive your ruleset is.
84 If you are using a tftp client behind -j SNAT or -j MASQUERADE,
85 you will need this.
86
87 To compile it as a module, choose M here. If unsure, say Y.
88
89config IP_NF_AMANDA
90 tristate "Amanda backup protocol support"
91 depends on IP_NF_CONNTRACK
92 help
93 If you are running the Amanda backup package <http://www.amanda.org/>
94 on this machine or machines that will be MASQUERADED through this
95 machine, then you may want to enable this feature. This allows the
96 connection tracking and natting code to allow the sub-channels that
97 Amanda requires for communication of the backup data, messages and
98 index.
99
100 To compile it as a module, choose M here. If unsure, say Y.
101
102config IP_NF_QUEUE
103 tristate "Userspace queueing via NETLINK"
104 help
105 Netfilter has the ability to queue packets to user space: the
106 netlink device can be used to access them using this driver.
107
108 To compile it as a module, choose M here. If unsure, say N.
109
110config IP_NF_IPTABLES
111 tristate "IP tables support (required for filtering/masq/NAT)"
112 help
113 iptables is a general, extensible packet identification framework.
114 The packet filtering and full NAT (masquerading, port forwarding,
115 etc) subsystems now use this: say `Y' or `M' here if you want to use
116 either of those.
117
118 To compile it as a module, choose M here. If unsure, say N.
119
120# The matches.
121config IP_NF_MATCH_LIMIT
122 tristate "limit match support"
123 depends on IP_NF_IPTABLES
124 help
125 limit matching allows you to control the rate at which a rule can be
126 matched: mainly useful in combination with the LOG target ("LOG
127 target support", below) and to avoid some Denial of Service attacks.
128
129 To compile it as a module, choose M here. If unsure, say N.
130
131config IP_NF_MATCH_IPRANGE
132 tristate "IP range match support"
133 depends on IP_NF_IPTABLES
134 help
135 This option makes it possible to match IP addresses against IP address
136 ranges.
137
138 To compile it as a module, choose M here. If unsure, say N.
139
140config IP_NF_MATCH_MAC
141 tristate "MAC address match support"
142 depends on IP_NF_IPTABLES
143 help
144 MAC matching allows you to match packets based on the source
145 Ethernet address of the packet.
146
147 To compile it as a module, choose M here. If unsure, say N.
148
149config IP_NF_MATCH_PKTTYPE
150 tristate "Packet type match support"
151 depends on IP_NF_IPTABLES
152 help
153 Packet type matching allows you to match a packet by
154 its "class", e.g. BROADCAST, MULTICAST, ...
155
156 Typical usage:
157 iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG
158
159 To compile it as a module, choose M here. If unsure, say N.
160
161config IP_NF_MATCH_MARK
162 tristate "netfilter MARK match support"
163 depends on IP_NF_IPTABLES
164 help
165 Netfilter mark matching allows you to match packets based on the
166 `nfmark' value in the packet. This can be set by the MARK target
167 (see below).
168
169 To compile it as a module, choose M here. If unsure, say N.
170
171config IP_NF_MATCH_MULTIPORT
172 tristate "Multiple port match support"
173 depends on IP_NF_IPTABLES
174 help
175 Multiport matching allows you to match TCP or UDP packets based on
176 a series of source or destination ports: normally a rule can only
177 match a single range of ports.
178
179 To compile it as a module, choose M here. If unsure, say N.
180
181config IP_NF_MATCH_TOS
182 tristate "TOS match support"
183 depends on IP_NF_IPTABLES
184 help
185 TOS matching allows you to match packets based on the Type Of
186 Service fields of the IP packet.
187
188 To compile it as a module, choose M here. If unsure, say N.
189
190config IP_NF_MATCH_RECENT
191 tristate "recent match support"
192 depends on IP_NF_IPTABLES
193 help
194 This match is used for creating one or many lists of recently
195 used addresses and then matching against that/those list(s).
196
197 Short options are available by using 'iptables -m recent -h'
198 Official Website: <http://snowman.net/projects/ipt_recent/>
199
200 To compile it as a module, choose M here. If unsure, say N.
201
202config IP_NF_MATCH_ECN
203 tristate "ECN match support"
204 depends on IP_NF_IPTABLES
205 help
206 This option adds a `ECN' match, which allows you to match against
207 the IPv4 and TCP header ECN fields.
208
209 To compile it as a module, choose M here. If unsure, say N.
210
211config IP_NF_MATCH_DSCP
212 tristate "DSCP match support"
213 depends on IP_NF_IPTABLES
214 help
215 This option adds a `DSCP' match, which allows you to match against
216 the IPv4 header DSCP field (DSCP codepoint).
217
218 The DSCP codepoint can have any value between 0x0 and 0x3f.
219
220 To compile it as a module, choose M here. If unsure, say N.
221
222config IP_NF_MATCH_AH_ESP
223 tristate "AH/ESP match support"
224 depends on IP_NF_IPTABLES
225 help
226 These two match extensions (`ah' and `esp') allow you to match a
227 range of SPIs inside AH or ESP headers of IPSec packets.
228
229 To compile it as a module, choose M here. If unsure, say N.
230
231config IP_NF_MATCH_LENGTH
232 tristate "LENGTH match support"
233 depends on IP_NF_IPTABLES
234 help
235 This option allows you to match the length of a packet against a
236 specific value or range of values.
237
238 To compile it as a module, choose M here. If unsure, say N.
239
240config IP_NF_MATCH_TTL
241 tristate "TTL match support"
242 depends on IP_NF_IPTABLES
243 help
244 This adds the CONFIG_IP_NF_MATCH_TTL option, which enables the user
245 to match packets by their TTL value.
246
247 To compile it as a module, choose M here. If unsure, say N.
248
249config IP_NF_MATCH_TCPMSS
250 tristate "tcpmss match support"
251 depends on IP_NF_IPTABLES
252 help
253 This option adds a `tcpmss' match, which allows you to examine the
254 MSS value of TCP SYN packets, which control the maximum packet size
255 for that connection.
256
257 To compile it as a module, choose M here. If unsure, say N.
258
259config IP_NF_MATCH_HELPER
260 tristate "Helper match support"
261 depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
262 help
263 Helper matching allows you to match packets in dynamic connections
264 tracked by a conntrack-helper, i.e. ip_conntrack_ftp
265
266 To compile it as a module, choose M here. If unsure, say Y.
267
268config IP_NF_MATCH_STATE
269 tristate "Connection state match support"
270 depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
271 help
272 Connection state matching allows you to match packets based on their
273 relationship to a tracked connection (ie. previous packets). This
274 is a powerful tool for packet classification.
275
276 To compile it as a module, choose M here. If unsure, say N.
277
278config IP_NF_MATCH_CONNTRACK
279 tristate "Connection tracking match support"
280 depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
281 help
282 This is a general conntrack match module, a superset of the state match.
283
284 It allows matching on additional conntrack information, which is
285 useful in complex configurations, such as NAT gateways with multiple
286 internet links or tunnels.
287
288 To compile it as a module, choose M here. If unsure, say N.
289
290config IP_NF_MATCH_OWNER
291 tristate "Owner match support"
292 depends on IP_NF_IPTABLES
293 help
294 Packet owner matching allows you to match locally-generated packets
295 based on who created them: the user, group, process or session.
296
297 To compile it as a module, choose M here. If unsure, say N.
298
299config IP_NF_MATCH_PHYSDEV
300 tristate "Physdev match support"
301 depends on IP_NF_IPTABLES && BRIDGE_NETFILTER
302 help
303 Physdev packet matching matches against the physical bridge ports
304 the IP packet arrived on or will leave by.
305
306 To compile it as a module, choose M here. If unsure, say N.
307
308config IP_NF_MATCH_ADDRTYPE
309 tristate 'address type match support'
310 depends on IP_NF_IPTABLES
311 help
312 This option allows you to match what routing thinks of an address,
313 e.g. UNICAST, LOCAL, BROADCAST, ...
314
315 If you want to compile it as a module, say M here and read
316 <file:Documentation/modules.txt>. If unsure, say `N'.
317
318config IP_NF_MATCH_REALM
319 tristate 'realm match support'
320 depends on IP_NF_IPTABLES
321 select NET_CLS_ROUTE
322 help
323 This option adds a `realm' match, which allows you to use the realm
324 key from the routing subsystem inside iptables.
325
326 This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option
327 in tc world.
328
329 If you want to compile it as a module, say M here and read
330 <file:Documentation/modules.txt>. If unsure, say `N'.
331
332config IP_NF_MATCH_SCTP
333 tristate 'SCTP protocol match support'
334 depends on IP_NF_IPTABLES
335 help
336 With this option enabled, you will be able to use the iptables
337 `sctp' match in order to match on SCTP source/destination ports
338 and SCTP chunk types.
339
340 If you want to compile it as a module, say M here and read
341 <file:Documentation/modules.txt>. If unsure, say `N'.
342
343config IP_NF_MATCH_COMMENT
344 tristate 'comment match support'
345 depends on IP_NF_IPTABLES
346 help
347 This option adds a `comment' dummy-match, which allows you to put
348 comments in your iptables ruleset.
349
350 If you want to compile it as a module, say M here and read
351 <file:Documentation/modules.txt>. If unsure, say `N'.
352
353config IP_NF_MATCH_CONNMARK
354 tristate 'Connection mark match support'
355 depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES
356 help
357 This option adds a `connmark' match, which allows you to match the
358 connection mark value previously set for the session by `CONNMARK'.
359
360 If you want to compile it as a module, say M here and read
361 <file:Documentation/modules.txt>. The module will be called
362 ipt_connmark.o. If unsure, say `N'.
363
364config IP_NF_MATCH_HASHLIMIT
365 tristate 'hashlimit match support'
366 depends on IP_NF_IPTABLES
367 help
368 This option adds a new iptables `hashlimit' match.
369
370 As opposed to `limit', this match dynamically creates a hash table
371 of limit buckets, based on your selection of source/destination
372 ip addresses and/or ports.
373
374 It enables you to express policies like `10kpps for any given
375 destination IP' or `500pps from any given source IP' with a single
377 iptables rule.
377
378# `filter', generic and specific targets
379config IP_NF_FILTER
380 tristate "Packet filtering"
381 depends on IP_NF_IPTABLES
382 help
383 Packet filtering defines a table `filter', which has a series of
384 rules for simple packet filtering at local input, forwarding and
385 local output. See the man page for iptables(8).
386
387 To compile it as a module, choose M here. If unsure, say N.
388
389config IP_NF_TARGET_REJECT
390 tristate "REJECT target support"
391 depends on IP_NF_FILTER
392 help
393 The REJECT target allows a filtering rule to specify that an ICMP
394 error should be issued in response to an incoming packet, rather
395 than silently being dropped.
396
397 To compile it as a module, choose M here. If unsure, say N.
398
399config IP_NF_TARGET_LOG
400 tristate "LOG target support"
401 depends on IP_NF_IPTABLES
402 help
403 This option adds a `LOG' target, which allows you to create rules in
404 any iptables table which records the packet header to the syslog.
405
406 To compile it as a module, choose M here. If unsure, say N.
407
408config IP_NF_TARGET_ULOG
409 tristate "ULOG target support"
410 depends on IP_NF_IPTABLES
411 ---help---
412 This option adds a `ULOG' target, which allows you to create rules in
413 any iptables table. The packet is passed to a userspace logging
414 daemon using netlink multicast sockets, unlike the LOG target,
415 whose output can only be viewed through syslog.
416
417 The appropriate userspace logging daemon (ulogd) may be obtained from
418 <http://www.gnumonks.org/projects/ulogd/>
419
420 To compile it as a module, choose M here. If unsure, say N.
421
422config IP_NF_TARGET_TCPMSS
423 tristate "TCPMSS target support"
424 depends on IP_NF_IPTABLES
425 ---help---
426 This option adds a `TCPMSS' target, which allows you to alter the
427 MSS value of TCP SYN packets, to control the maximum size for that
428 connection (usually limiting it to your outgoing interface's MTU
429 minus 40).
430
431 This is used to overcome criminally braindead ISPs or servers which
432 block ICMP Fragmentation Needed packets. The symptoms of this
433 problem are that everything works fine from your Linux
434 firewall/router, but machines behind it can never exchange large
435 packets:
436 1) Web browsers connect, then hang with no data received.
437 2) Small mail works fine, but large emails hang.
438 3) ssh works fine, but scp hangs after initial handshaking.
439
440 Workaround: activate this option and add a rule to your firewall
441 configuration like:
442
443 iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \
444 -j TCPMSS --clamp-mss-to-pmtu
445
446 To compile it as a module, choose M here. If unsure, say N.
447
448# NAT + specific targets
449config IP_NF_NAT
450 tristate "Full NAT"
451 depends on IP_NF_IPTABLES && IP_NF_CONNTRACK
452 help
453 The Full NAT option allows masquerading, port forwarding and other
454 forms of full Network Address Port Translation. It is controlled by
455 the `nat' table in iptables: see the man page for iptables(8).
456
457 To compile it as a module, choose M here. If unsure, say N.
458
459config IP_NF_NAT_NEEDED
460 bool
461 depends on IP_NF_NAT != n
462 default y
463
464config IP_NF_TARGET_MASQUERADE
465 tristate "MASQUERADE target support"
466 depends on IP_NF_NAT
467 help
468 Masquerading is a special case of NAT: all outgoing connections are
469 changed to seem to come from a particular interface's address, and
470 if the interface goes down, those connections are lost. This is
471 only useful for dialup accounts with dynamic IP address (ie. your IP
472 address will be different on next dialup).
473
474 To compile it as a module, choose M here. If unsure, say N.
475
476config IP_NF_TARGET_REDIRECT
477 tristate "REDIRECT target support"
478 depends on IP_NF_NAT
479 help
480 REDIRECT is a special case of NAT: all incoming connections are
481 mapped onto the incoming interface's address, causing the packets to
482 come to the local machine instead of passing through. This is
483 useful for transparent proxies.
484
485 To compile it as a module, choose M here. If unsure, say N.
486
487config IP_NF_TARGET_NETMAP
488 tristate "NETMAP target support"
489 depends on IP_NF_NAT
490 help
491 NETMAP is an implementation of static 1:1 NAT mapping of network
492 addresses. It maps the network address part, while keeping the host
493 address part intact. It is similar to Fast NAT, except that
494 Netfilter's connection tracking doesn't work well with Fast NAT.
495
496 To compile it as a module, choose M here. If unsure, say N.
497
498config IP_NF_TARGET_SAME
499 tristate "SAME target support"
500 depends on IP_NF_NAT
501 help
502 This option adds a `SAME' target, which works like the standard SNAT
503 target, but attempts to give clients the same IP for all connections.
504
505 To compile it as a module, choose M here. If unsure, say N.
506
507config IP_NF_NAT_SNMP_BASIC
508 tristate "Basic SNMP-ALG support (EXPERIMENTAL)"
509 depends on EXPERIMENTAL && IP_NF_NAT
510 ---help---
511
512 This module implements an Application Layer Gateway (ALG) for
513 SNMP payloads. In conjunction with NAT, it allows a network
514 management system to access multiple private networks with
515 conflicting addresses. It works by modifying IP addresses
516 inside SNMP payloads to match IP-layer NAT mapping.
517
518 This is the "basic" form of SNMP-ALG, as described in RFC 2962.
519
520 To compile it as a module, choose M here. If unsure, say N.
521
522config IP_NF_NAT_IRC
523 tristate
524 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
525 default IP_NF_NAT if IP_NF_IRC=y
526 default m if IP_NF_IRC=m
527
528# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y),
529# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker. Argh.
530config IP_NF_NAT_FTP
531 tristate
532 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
533 default IP_NF_NAT if IP_NF_FTP=y
534 default m if IP_NF_FTP=m
535
536config IP_NF_NAT_TFTP
537 tristate
538 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
539 default IP_NF_NAT if IP_NF_TFTP=y
540 default m if IP_NF_TFTP=m
541
542config IP_NF_NAT_AMANDA
543 tristate
544 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
545 default IP_NF_NAT if IP_NF_AMANDA=y
546 default m if IP_NF_AMANDA=m
547
548# mangle + specific targets
549config IP_NF_MANGLE
550 tristate "Packet mangling"
551 depends on IP_NF_IPTABLES
552 help
553 This option adds a `mangle' table to iptables: see the man page for
554 iptables(8). This table is used for various packet alterations
555 which can affect how the packet is routed.
556
557 To compile it as a module, choose M here. If unsure, say N.
558
559config IP_NF_TARGET_TOS
560 tristate "TOS target support"
561 depends on IP_NF_MANGLE
562 help
563 This option adds a `TOS' target, which allows you to create rules in
564 the `mangle' table which alter the Type Of Service field of an IP
565 packet prior to routing.
566
567 To compile it as a module, choose M here. If unsure, say N.
568
569config IP_NF_TARGET_ECN
570 tristate "ECN target support"
571 depends on IP_NF_MANGLE
572 ---help---
573 This option adds a `ECN' target, which can be used in the iptables mangle
574 table.
575
576 You can use this target to remove the ECN bits from the IPv4 header of
577 an IP packet. This is particularly useful, if you need to work around
578 existing ECN blackholes on the internet, but don't want to disable
579 ECN support in general.
580
581 To compile it as a module, choose M here. If unsure, say N.
582
583config IP_NF_TARGET_DSCP
584 tristate "DSCP target support"
585 depends on IP_NF_MANGLE
586 help
587 This option adds a `DSCP' target, which allows you to alter the
588 value of the IPv4 header DSCP field (the DSCP codepoint).
589
590 The DSCP codepoint can have any value between 0x0 and 0x3f.
591
592 To compile it as a module, choose M here. If unsure, say N.
593
594config IP_NF_TARGET_MARK
595 tristate "MARK target support"
596 depends on IP_NF_MANGLE
597 help
598 This option adds a `MARK' target, which allows you to create rules
599 in the `mangle' table which alter the netfilter mark (nfmark) field
600 associated with the packet prior to routing. This can change
601 the routing method (see `Use netfilter MARK value as routing
602 key') and can also be used by other subsystems to change their
603 behavior.
604
605 To compile it as a module, choose M here. If unsure, say N.
606
607config IP_NF_TARGET_CLASSIFY
608 tristate "CLASSIFY target support"
609 depends on IP_NF_MANGLE
610 help
611 This option adds a `CLASSIFY' target, which enables the user to set
612 the priority of a packet. Some qdiscs can use this value for
613 classification, among these are:
614
615 atm, cbq, dsmark, pfifo_fast, htb, prio
616
617 To compile it as a module, choose M here. If unsure, say N.
618
619config IP_NF_TARGET_CONNMARK
620 tristate 'CONNMARK target support'
621 depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
622 help
623 This option adds a `CONNMARK' target, which allows one to manipulate
624 the connection mark value. Similar to the MARK target, but
625 affects the connection mark value rather than the packet mark value.
626
627 If you want to compile it as a module, say M here and read
628 <file:Documentation/modules.txt>. The module will be called
629 ipt_CONNMARK.o. If unsure, say `N'.
630
631config IP_NF_TARGET_CLUSTERIP
632 tristate "CLUSTERIP target support (EXPERIMENTAL)"
633 depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL
634 help
635 The CLUSTERIP target allows you to build load-balancing clusters of
636 network servers without having a dedicated load-balancing
637 router/server/switch.
638
639 To compile it as a module, choose M here. If unsure, say N.
640
641# raw + specific targets
642config IP_NF_RAW
643 tristate 'raw table support (required for NOTRACK/TRACE)'
644 depends on IP_NF_IPTABLES
645 help
646 This option adds a `raw' table to iptables. This table is the very
647 first in the netfilter framework and hooks in at the PREROUTING
648 and OUTPUT chains.
649
650 If you want to compile it as a module, say M here and read
651 <file:Documentation/modules.txt>. If unsure, say `N'.
652
653config IP_NF_TARGET_NOTRACK
654 tristate 'NOTRACK target support'
655 depends on IP_NF_RAW
656 depends on IP_NF_CONNTRACK
657 help
658 The NOTRACK target allows rules in the raw table to specify
659 which packets should *not* enter the conntrack/NAT
660 subsystem, with all the consequences (no ICMP error tracking,
661 no protocol helpers for the selected packets).
662
663 If you want to compile it as a module, say M here and read
664 <file:Documentation/modules.txt>. If unsure, say `N'.
665
666
667# ARP tables
668config IP_NF_ARPTABLES
669 tristate "ARP tables support"
670 help
671 arptables is a general, extensible packet identification framework.
672 The ARP packet filtering and mangling (manipulation) subsystems
673 use this: say Y or M here if you want to use either of those.
674
675 To compile it as a module, choose M here. If unsure, say N.
676
677config IP_NF_ARPFILTER
678 tristate "ARP packet filtering"
679 depends on IP_NF_ARPTABLES
680 help
681 ARP packet filtering defines a table `filter', which has a series of
682 rules for simple ARP packet filtering at local input and
683 local output. On a bridge, you can also specify filtering rules
684 for forwarded ARP packets. See the man page for arptables(8).
685
686 To compile it as a module, choose M here. If unsure, say N.
687
688config IP_NF_ARP_MANGLE
689 tristate "ARP payload mangling"
690 depends on IP_NF_ARPTABLES
691 help
692 Allows altering the ARP packet payload: source and destination
693 hardware and network addresses.
694
695endmenu
696
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
new file mode 100644
index 000000000000..45796d5924dd
--- /dev/null
+++ b/net/ipv4/netfilter/Makefile
@@ -0,0 +1,89 @@
1#
2# Makefile for the netfilter modules on top of IPv4.
3#
4
5# objects for the standalone - connection tracking / NAT
6ip_conntrack-objs := ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
7iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
8
9# connection tracking
10obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
11
12# SCTP protocol connection tracking
13obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
14
15# connection tracking helpers
16obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
17obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
18obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
19obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
20
21# NAT helpers
22obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
23obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
24obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
25obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
26
27# generic IP tables
28obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
29
30# the three instances of ip_tables
31obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
32obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
33obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
34obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
35
36# matches
37obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
38obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
39obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
40obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
41obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
42obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
43obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
44obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o
45obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o
46obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o
47obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o
48obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
49obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
50obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o
51obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o
52obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o
53obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
54obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
55obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o
56obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
57obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
58obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
59obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
60obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
61obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
62
63# targets
64obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
65obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o
66obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
67obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o
68obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o
69obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
70obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
71obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
72obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o
73obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o
74obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o
75obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
76obj-$(CONFIG_IP_NF_TARGET_CONNMARK) += ipt_CONNMARK.o
77obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
78obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
79obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
80obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
81
82# generic ARP tables
83obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
84obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
85
86# just filtering instance of ARP tables for now
87obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
88
89obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
new file mode 100644
index 000000000000..df79f5ed6a0a
--- /dev/null
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -0,0 +1,1333 @@
1/*
2 * Packet matching code for ARP packets.
3 *
4 * Based heavily, if not almost entirely, upon ip_tables.c framework.
5 *
6 * Some ARP specific bits are:
7 *
8 * Copyright (C) 2002 David S. Miller (davem@redhat.com)
9 *
10 */
11
12#include <linux/config.h>
13#include <linux/kernel.h>
14#include <linux/skbuff.h>
15#include <linux/netdevice.h>
16#include <linux/if_arp.h>
17#include <linux/kmod.h>
18#include <linux/vmalloc.h>
19#include <linux/proc_fs.h>
20#include <linux/module.h>
21#include <linux/init.h>
22
23#include <asm/uaccess.h>
24#include <asm/semaphore.h>
25
26#include <linux/netfilter_arp/arp_tables.h>
27
28MODULE_LICENSE("GPL");
29MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
30MODULE_DESCRIPTION("arptables core");
31
32/*#define DEBUG_ARP_TABLES*/
33/*#define DEBUG_ARP_TABLES_USER*/
34
35#ifdef DEBUG_ARP_TABLES
36#define dprintf(format, args...) printk(format , ## args)
37#else
38#define dprintf(format, args...)
39#endif
40
41#ifdef DEBUG_ARP_TABLES_USER
42#define duprintf(format, args...) printk(format , ## args)
43#else
44#define duprintf(format, args...)
45#endif
46
47#ifdef CONFIG_NETFILTER_DEBUG
48#define ARP_NF_ASSERT(x) \
49do { \
50 if (!(x)) \
51 printk("ARP_NF_ASSERT: %s:%s:%u\n", \
52 __FUNCTION__, __FILE__, __LINE__); \
53} while(0)
54#else
55#define ARP_NF_ASSERT(x)
56#endif
57#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
58
59static DECLARE_MUTEX(arpt_mutex);
60
61#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
62#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
63#include <linux/netfilter_ipv4/lockhelp.h>
64#include <linux/netfilter_ipv4/listhelp.h>
65
66struct arpt_table_info {
67 unsigned int size;
68 unsigned int number;
69 unsigned int initial_entries;
70 unsigned int hook_entry[NF_ARP_NUMHOOKS];
71 unsigned int underflow[NF_ARP_NUMHOOKS];
72 char entries[0] __attribute__((aligned(SMP_CACHE_BYTES)));
73};
74
75static LIST_HEAD(arpt_target);
76static LIST_HEAD(arpt_tables);
77#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
78
79#ifdef CONFIG_SMP
80#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
81#else
82#define TABLE_OFFSET(t,p) 0
83#endif
84
85static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
86 char *hdr_addr, int len)
87{
88 int i, ret;
89
90 if (len > ARPT_DEV_ADDR_LEN_MAX)
91 len = ARPT_DEV_ADDR_LEN_MAX;
92
93 ret = 0;
94 for (i = 0; i < len; i++)
95 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
96
97 return (ret != 0);
98}
99
100/* Returns whether packet matches rule or not. */
101static inline int arp_packet_match(const struct arphdr *arphdr,
102 struct net_device *dev,
103 const char *indev,
104 const char *outdev,
105 const struct arpt_arp *arpinfo)
106{
107 char *arpptr = (char *)(arphdr + 1);
108 char *src_devaddr, *tgt_devaddr;
109 u32 src_ipaddr, tgt_ipaddr;
110 int i, ret;
111
112#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg))
113
114 if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
115 ARPT_INV_ARPOP)) {
116 dprintf("ARP operation field mismatch.\n");
117 dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n",
118 arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask);
119 return 0;
120 }
121
122 if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd,
123 ARPT_INV_ARPHRD)) {
124 dprintf("ARP hardware address format mismatch.\n");
125 dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n",
126 arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask);
127 return 0;
128 }
129
130 if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro,
131 ARPT_INV_ARPPRO)) {
132 dprintf("ARP protocol address format mismatch.\n");
133 dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n",
134 arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask);
135 return 0;
136 }
137
138 if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln,
139 ARPT_INV_ARPHLN)) {
140 dprintf("ARP hardware address length mismatch.\n");
141 dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n",
142 arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask);
143 return 0;
144 }
145
146 src_devaddr = arpptr;
147 arpptr += dev->addr_len;
148 memcpy(&src_ipaddr, arpptr, sizeof(u32));
149 arpptr += sizeof(u32);
150 tgt_devaddr = arpptr;
151 arpptr += dev->addr_len;
152 memcpy(&tgt_ipaddr, arpptr, sizeof(u32));
153
154 if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len),
155 ARPT_INV_SRCDEVADDR) ||
156 FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len),
157 ARPT_INV_TGTDEVADDR)) {
158 dprintf("Source or target device address mismatch.\n");
159
160 return 0;
161 }
162
163 if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr,
164 ARPT_INV_SRCIP) ||
165 FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr),
166 ARPT_INV_TGTIP)) {
167 dprintf("Source or target IP address mismatch.\n");
168
169 dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
170 NIPQUAD(src_ipaddr),
171 NIPQUAD(arpinfo->smsk.s_addr),
172 NIPQUAD(arpinfo->src.s_addr),
173 arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : "");
174 dprintf("TGT: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
175 NIPQUAD(tgt_ipaddr),
176 NIPQUAD(arpinfo->tmsk.s_addr),
177 NIPQUAD(arpinfo->tgt.s_addr),
178 arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : "");
179 return 0;
180 }
181
182 /* Look for ifname matches. */
183 for (i = 0, ret = 0; i < IFNAMSIZ; i++) {
184 ret |= (indev[i] ^ arpinfo->iniface[i])
185 & arpinfo->iniface_mask[i];
186 }
187
188 if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
189 dprintf("VIA in mismatch (%s vs %s).%s\n",
190 indev, arpinfo->iniface,
191 arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
192 return 0;
193 }
194
195 for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
196 unsigned long odev;
197 memcpy(&odev, outdev + i*sizeof(unsigned long),
198 sizeof(unsigned long));
199 ret |= (odev
200 ^ ((const unsigned long *)arpinfo->outiface)[i])
201 & ((const unsigned long *)arpinfo->outiface_mask)[i];
202 }
203
204 if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
205 dprintf("VIA out mismatch (%s vs %s).%s\n",
206 outdev, arpinfo->outiface,
207 arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
208 return 0;
209 }
210
211 return 1;
212}
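
arp_packet_match() applies each comparison under a per-field mask and feeds the result through FWINV(), which XORs in the matching ARPT_INV_* flag so a rule can be negated without writing the comparison twice. A minimal userspace sketch of that idiom, with made-up names rather than the kernel structures:

#include <stdio.h>
#include <stdint.h>

/* Compare value against want under mask; invert flips accept/reject,
 * which is what the FWINV() macro does with the ARPT_INV_* flags. */
static int masked_match(uint32_t value, uint32_t want, uint32_t mask,
                        int invert)
{
    int mismatch = (value & mask) != want;
    return !(mismatch ^ !!invert);
}

int main(void)
{
    uint32_t src = 0x0a0000fe;            /* 10.0.0.254, host byte order */

    /* match sources in 10.0.0.0/8, then the same rule negated */
    printf("plain:    %d\n", masked_match(src, 0x0a000000, 0xff000000, 0));
    printf("inverted: %d\n", masked_match(src, 0x0a000000, 0xff000000, 1));
    return 0;
}
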
213
214static inline int arp_checkentry(const struct arpt_arp *arp)
215{
216 if (arp->flags & ~ARPT_F_MASK) {
217 duprintf("Unknown flag bits set: %08X\n",
218 arp->flags & ~ARPT_F_MASK);
219 return 0;
220 }
221 if (arp->invflags & ~ARPT_INV_MASK) {
222 duprintf("Unknown invflag bits set: %08X\n",
223 arp->invflags & ~ARPT_INV_MASK);
224 return 0;
225 }
226
227 return 1;
228}
229
230static unsigned int arpt_error(struct sk_buff **pskb,
231 unsigned int hooknum,
232 const struct net_device *in,
233 const struct net_device *out,
234 const void *targinfo,
235 void *userinfo)
236{
237 if (net_ratelimit())
238 printk("arp_tables: error: '%s'\n", (char *)targinfo);
239
240 return NF_DROP;
241}
242
243static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
244{
245 return (struct arpt_entry *)(base + offset);
246}
247
248unsigned int arpt_do_table(struct sk_buff **pskb,
249 unsigned int hook,
250 const struct net_device *in,
251 const struct net_device *out,
252 struct arpt_table *table,
253 void *userdata)
254{
255 static const char nulldevname[IFNAMSIZ];
256 unsigned int verdict = NF_DROP;
257 struct arphdr *arp;
258 int hotdrop = 0;
259 struct arpt_entry *e, *back;
260 const char *indev, *outdev;
261 void *table_base;
262
263 /* ARP header, plus 2 device addresses, plus 2 IP addresses. */
264 if (!pskb_may_pull((*pskb), (sizeof(struct arphdr) +
265 (2 * (*pskb)->dev->addr_len) +
266 (2 * sizeof(u32)))))
267 return NF_DROP;
268
269 indev = in ? in->name : nulldevname;
270 outdev = out ? out->name : nulldevname;
271
272 read_lock_bh(&table->lock);
273 table_base = (void *)table->private->entries
274 + TABLE_OFFSET(table->private,
275 smp_processor_id());
276 e = get_entry(table_base, table->private->hook_entry[hook]);
277 back = get_entry(table_base, table->private->underflow[hook]);
278
279 arp = (*pskb)->nh.arph;
280 do {
281 if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) {
282 struct arpt_entry_target *t;
283 int hdr_len;
284
285 hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
286 (2 * (*pskb)->dev->addr_len);
287 ADD_COUNTER(e->counters, hdr_len, 1);
288
289 t = arpt_get_target(e);
290
291 /* Standard target? */
292 if (!t->u.kernel.target->target) {
293 int v;
294
295 v = ((struct arpt_standard_target *)t)->verdict;
296 if (v < 0) {
297 /* Pop from stack? */
298 if (v != ARPT_RETURN) {
299 verdict = (unsigned)(-v) - 1;
300 break;
301 }
302 e = back;
303 back = get_entry(table_base,
304 back->comefrom);
305 continue;
306 }
307 if (table_base + v
308 != (void *)e + e->next_offset) {
309 /* Save old back ptr in next entry */
310 struct arpt_entry *next
311 = (void *)e + e->next_offset;
312 next->comefrom =
313 (void *)back - table_base;
314
315 /* set back pointer to next entry */
316 back = next;
317 }
318
319 e = get_entry(table_base, v);
320 } else {
321 /* Targets which reenter must return
322 * abs. verdicts
323 */
324 verdict = t->u.kernel.target->target(pskb,
325 hook,
326 in, out,
327 t->data,
328 userdata);
329
330 /* Target might have changed stuff. */
331 arp = (*pskb)->nh.arph;
332
333 if (verdict == ARPT_CONTINUE)
334 e = (void *)e + e->next_offset;
335 else
336 /* Verdict */
337 break;
338 }
339 } else {
340 e = (void *)e + e->next_offset;
341 }
342 } while (!hotdrop);
343 read_unlock_bh(&table->lock);
344
345 if (hotdrop)
346 return NF_DROP;
347 else
348 return verdict;
349}
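
arpt_do_table() does not walk an array of fixed-size rules: the table is one flat blob in which every entry carries its own next_offset, and a standard target's verdict is either a non-negative byte offset to jump to or a negative value encoding RETURN or a final NF_* verdict. A much simplified userspace sketch of walking such a blob (fixed-size toy entries, no jump or RETURN handling):

#include <stdio.h>

/* Toy fixed-size entry standing in for a variable-size arpt_entry. */
struct toy_entry {
    unsigned int next_offset;   /* bytes from this entry to the next one */
    int verdict;                /* <0: final verdict, >=0: keep walking  */
};

static struct toy_entry *get_entry(void *base, unsigned int offset)
{
    return (struct toy_entry *)((char *)base + offset);
}

int main(void)
{
    struct toy_entry rules[3] = {
        { sizeof(struct toy_entry), 0 },    /* no match: fall through */
        { sizeof(struct toy_entry), 0 },    /* no match: fall through */
        { sizeof(struct toy_entry), -2 },   /* final verdict          */
    };
    unsigned int off = 0;
    struct toy_entry *e = get_entry(rules, 0);

    while (e->verdict >= 0) {               /* walk the blob by offsets */
        off += e->next_offset;
        e = get_entry(rules, off);
    }
    printf("verdict %d at offset %u\n", e->verdict, off);
    return 0;
}
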
350
351static inline void *find_inlist_lock_noload(struct list_head *head,
352 const char *name,
353 int *error,
354 struct semaphore *mutex)
355{
356 void *ret;
357
358 *error = down_interruptible(mutex);
359 if (*error != 0)
360 return NULL;
361
362 ret = list_named_find(head, name);
363 if (!ret) {
364 *error = -ENOENT;
365 up(mutex);
366 }
367 return ret;
368}
369
370#ifndef CONFIG_KMOD
371#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m))
372#else
373static void *
374find_inlist_lock(struct list_head *head,
375 const char *name,
376 const char *prefix,
377 int *error,
378 struct semaphore *mutex)
379{
380 void *ret;
381
382 ret = find_inlist_lock_noload(head, name, error, mutex);
383 if (!ret) {
384 duprintf("find_inlist: loading `%s%s'.\n", prefix, name);
385 request_module("%s%s", prefix, name);
386 ret = find_inlist_lock_noload(head, name, error, mutex);
387 }
388
389 return ret;
390}
391#endif
392
393static inline struct arpt_table *arpt_find_table_lock(const char *name, int *error, struct semaphore *mutex)
394{
395 return find_inlist_lock(&arpt_tables, name, "arptable_", error, mutex);
396}
397
398static struct arpt_target *arpt_find_target_lock(const char *name, int *error, struct semaphore *mutex)
399{
400 return find_inlist_lock(&arpt_target, name, "arpt_", error, mutex);
401}
402
403/* All zeroes == unconditional rule. */
404static inline int unconditional(const struct arpt_arp *arp)
405{
406 unsigned int i;
407
408 for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++)
409 if (((__u32 *)arp)[i])
410 return 0;
411
412 return 1;
413}
414
415/* Figures out from what hook each rule can be called: returns 0 if
416 * there are loops. Puts hook bitmask in comefrom.
417 */
418static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks)
419{
420 unsigned int hook;
421
422 /* No recursion; use packet counter to save back ptrs (reset
423 * to 0 as we leave), and comefrom to save source hook bitmask.
424 */
425 for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
426 unsigned int pos = newinfo->hook_entry[hook];
427 struct arpt_entry *e
428 = (struct arpt_entry *)(newinfo->entries + pos);
429
430 if (!(valid_hooks & (1 << hook)))
431 continue;
432
433 /* Set initial back pointer. */
434 e->counters.pcnt = pos;
435
436 for (;;) {
437 struct arpt_standard_target *t
438 = (void *)arpt_get_target(e);
439
440 if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
441 printk("arptables: loop hook %u pos %u %08X.\n",
442 hook, pos, e->comefrom);
443 return 0;
444 }
445 e->comefrom
446 |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
447
448 /* Unconditional return/END. */
449 if (e->target_offset == sizeof(struct arpt_entry)
450 && (strcmp(t->target.u.user.name,
451 ARPT_STANDARD_TARGET) == 0)
452 && t->verdict < 0
453 && unconditional(&e->arp)) {
454 unsigned int oldpos, size;
455
456 /* Return: backtrack through the last
457 * big jump.
458 */
459 do {
460 e->comefrom ^= (1<<NF_ARP_NUMHOOKS);
461 oldpos = pos;
462 pos = e->counters.pcnt;
463 e->counters.pcnt = 0;
464
465 /* We're at the start. */
466 if (pos == oldpos)
467 goto next;
468
469 e = (struct arpt_entry *)
470 (newinfo->entries + pos);
471 } while (oldpos == pos + e->next_offset);
472
473 /* Move along one */
474 size = e->next_offset;
475 e = (struct arpt_entry *)
476 (newinfo->entries + pos + size);
477 e->counters.pcnt = pos;
478 pos += size;
479 } else {
480 int newpos = t->verdict;
481
482 if (strcmp(t->target.u.user.name,
483 ARPT_STANDARD_TARGET) == 0
484 && newpos >= 0) {
485 /* This a jump; chase it. */
486 duprintf("Jump rule %u -> %u\n",
487 pos, newpos);
488 } else {
489 /* ... this is a fallthru */
490 newpos = pos + e->next_offset;
491 }
492 e = (struct arpt_entry *)
493 (newinfo->entries + newpos);
494 e->counters.pcnt = pos;
495 pos = newpos;
496 }
497 }
498 next:
499 duprintf("Finished chain %u\n", hook);
500 }
501 return 1;
502}
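
mark_source_chains() checks, for every hook, that following jumps and fallthroughs from the hook entry can never revisit a rule, and refuses the table otherwise; to avoid recursion it parks back pointers in the not-yet-used packet counters. A simplified userspace illustration of the question it answers (a plain visited array instead of the kernel's backtracking):

#include <stdio.h>

#define NRULES 4

/* Each toy rule either jumps to another rule (>= 0) or returns (-1). */
static int jump_to[NRULES] = { 2, -1, 3, 0 };   /* 0 -> 2 -> 3 -> 0: a loop */

static int chain_loops(int start)
{
    int seen[NRULES] = { 0 };
    int pos = start;

    while (pos >= 0) {
        if (seen[pos])
            return 1;       /* revisited a rule: the chain loops */
        seen[pos] = 1;
        pos = jump_to[pos];
    }
    return 0;
}

int main(void)
{
    printf("loop from rule 0: %d\n", chain_loops(0));
    printf("loop from rule 1: %d\n", chain_loops(1));
    return 0;
}
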
503
504static inline int standard_check(const struct arpt_entry_target *t,
505 unsigned int max_offset)
506{
507 struct arpt_standard_target *targ = (void *)t;
508
509 /* Check standard info. */
510 if (t->u.target_size
511 != ARPT_ALIGN(sizeof(struct arpt_standard_target))) {
512 duprintf("arpt_standard_check: target size %u != %Zu\n",
513 t->u.target_size,
514 ARPT_ALIGN(sizeof(struct arpt_standard_target)));
515 return 0;
516 }
517
518 if (targ->verdict >= 0
519 && targ->verdict > max_offset - sizeof(struct arpt_entry)) {
520 duprintf("arpt_standard_check: bad verdict (%i)\n",
521 targ->verdict);
522 return 0;
523 }
524
525 if (targ->verdict < -NF_MAX_VERDICT - 1) {
526 duprintf("arpt_standard_check: bad negative verdict (%i)\n",
527 targ->verdict);
528 return 0;
529 }
530 return 1;
531}
532
533static struct arpt_target arpt_standard_target;
534
535static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size,
536 unsigned int *i)
537{
538 struct arpt_entry_target *t;
539 struct arpt_target *target;
540 int ret;
541
542 if (!arp_checkentry(&e->arp)) {
543 duprintf("arp_tables: arp check failed %p %s.\n", e, name);
544 return -EINVAL;
545 }
546
547 t = arpt_get_target(e);
548 target = arpt_find_target_lock(t->u.user.name, &ret, &arpt_mutex);
549 if (!target) {
550 duprintf("check_entry: `%s' not found\n", t->u.user.name);
551 goto out;
552 }
553 if (!try_module_get((target->me))) {
554 ret = -ENOENT;
555 goto out_unlock;
556 }
557 t->u.kernel.target = target;
558 up(&arpt_mutex);
559
560 if (t->u.kernel.target == &arpt_standard_target) {
561 if (!standard_check(t, size)) {
562 ret = -EINVAL;
563 goto out;
564 }
565 } else if (t->u.kernel.target->checkentry
566 && !t->u.kernel.target->checkentry(name, e, t->data,
567 t->u.target_size
568 - sizeof(*t),
569 e->comefrom)) {
570 module_put(t->u.kernel.target->me);
571 duprintf("arp_tables: check failed for `%s'.\n",
572 t->u.kernel.target->name);
573 ret = -EINVAL;
574 goto out;
575 }
576
577 (*i)++;
578 return 0;
579
580out_unlock:
581 up(&arpt_mutex);
582out:
583 return ret;
584}
585
586static inline int check_entry_size_and_hooks(struct arpt_entry *e,
587 struct arpt_table_info *newinfo,
588 unsigned char *base,
589 unsigned char *limit,
590 const unsigned int *hook_entries,
591 const unsigned int *underflows,
592 unsigned int *i)
593{
594 unsigned int h;
595
596 if ((unsigned long)e % __alignof__(struct arpt_entry) != 0
597 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
598 duprintf("Bad offset %p\n", e);
599 return -EINVAL;
600 }
601
602 if (e->next_offset
603 < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) {
604 duprintf("checking: element %p size %u\n",
605 e, e->next_offset);
606 return -EINVAL;
607 }
608
609 /* Check hooks & underflows */
610 for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
611 if ((unsigned char *)e - base == hook_entries[h])
612 newinfo->hook_entry[h] = hook_entries[h];
613 if ((unsigned char *)e - base == underflows[h])
614 newinfo->underflow[h] = underflows[h];
615 }
616
617 /* FIXME: underflows must be unconditional, standard verdicts
618 < 0 (not ARPT_RETURN). --RR */
619
620 /* Clear counters and comefrom */
621 e->counters = ((struct arpt_counters) { 0, 0 });
622 e->comefrom = 0;
623
624 (*i)++;
625 return 0;
626}
627
628static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
629{
630 struct arpt_entry_target *t;
631
632 if (i && (*i)-- == 0)
633 return 1;
634
635 t = arpt_get_target(e);
636 if (t->u.kernel.target->destroy)
637 t->u.kernel.target->destroy(t->data,
638 t->u.target_size - sizeof(*t));
639 module_put(t->u.kernel.target->me);
640 return 0;
641}
642
643/* Checks and translates the user-supplied table segment (held in
644 * newinfo).
645 */
646static int translate_table(const char *name,
647 unsigned int valid_hooks,
648 struct arpt_table_info *newinfo,
649 unsigned int size,
650 unsigned int number,
651 const unsigned int *hook_entries,
652 const unsigned int *underflows)
653{
654 unsigned int i;
655 int ret;
656
657 newinfo->size = size;
658 newinfo->number = number;
659
660 /* Init all hooks to impossible value. */
661 for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
662 newinfo->hook_entry[i] = 0xFFFFFFFF;
663 newinfo->underflow[i] = 0xFFFFFFFF;
664 }
665
666 duprintf("translate_table: size %u\n", newinfo->size);
667 i = 0;
668
669 /* Walk through entries, checking offsets. */
670 ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
671 check_entry_size_and_hooks,
672 newinfo,
673 newinfo->entries,
674 newinfo->entries + size,
675 hook_entries, underflows, &i);
676 duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
677 if (ret != 0)
678 return ret;
679
680 if (i != number) {
681 duprintf("translate_table: %u not %u entries\n",
682 i, number);
683 return -EINVAL;
684 }
685
686 /* Check hooks all assigned */
687 for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
688 /* Only hooks which are valid */
689 if (!(valid_hooks & (1 << i)))
690 continue;
691 if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
692 duprintf("Invalid hook entry %u %u\n",
693 i, hook_entries[i]);
694 return -EINVAL;
695 }
696 if (newinfo->underflow[i] == 0xFFFFFFFF) {
697 duprintf("Invalid underflow %u %u\n",
698 i, underflows[i]);
699 return -EINVAL;
700 }
701 }
702
703 if (!mark_source_chains(newinfo, valid_hooks)) {
704 duprintf("Looping hook\n");
705 return -ELOOP;
706 }
707
708 /* Finally, each sanity check must pass */
709 i = 0;
710 ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
711 check_entry, name, size, &i);
712
713 if (ret != 0) {
714 ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
715 cleanup_entry, &i);
716 return ret;
717 }
718
719 /* And one copy for every other CPU */
720 for (i = 1; i < num_possible_cpus(); i++) {
721 memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
722 newinfo->entries,
723 SMP_ALIGN(newinfo->size));
724 }
725
726 return ret;
727}
728
729static struct arpt_table_info *replace_table(struct arpt_table *table,
730 unsigned int num_counters,
731 struct arpt_table_info *newinfo,
732 int *error)
733{
734 struct arpt_table_info *oldinfo;
735
736 /* Do the substitution. */
737 write_lock_bh(&table->lock);
738 /* Check inside lock: is the old number correct? */
739 if (num_counters != table->private->number) {
740 duprintf("num_counters != table->private->number (%u/%u)\n",
741 num_counters, table->private->number);
742 write_unlock_bh(&table->lock);
743 *error = -EAGAIN;
744 return NULL;
745 }
746 oldinfo = table->private;
747 table->private = newinfo;
748 newinfo->initial_entries = oldinfo->initial_entries;
749 write_unlock_bh(&table->lock);
750
751 return oldinfo;
752}
753
754/* Gets counters. */
755static inline int add_entry_to_counter(const struct arpt_entry *e,
756 struct arpt_counters total[],
757 unsigned int *i)
758{
759 ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
760
761 (*i)++;
762 return 0;
763}
764
765static void get_counters(const struct arpt_table_info *t,
766 struct arpt_counters counters[])
767{
768 unsigned int cpu;
769 unsigned int i;
770
771 for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
772 i = 0;
773 ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
774 t->size,
775 add_entry_to_counter,
776 counters,
777 &i);
778 }
779}
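
translate_table() keeps one copy of the rule blob per possible CPU, so each CPU bumps the counters in its own copy under the read lock, and get_counters() totals a rule by summing that slot across all copies. A small userspace sketch of the same bookkeeping:

#include <stdio.h>

#define NCPUS  4
#define NRULES 2

/* One counter pair per rule, replicated per CPU as translate_table() does. */
struct counters { unsigned long bcnt, pcnt; };
static struct counters percpu[NCPUS][NRULES];

static void hit(int cpu, int rule, unsigned long bytes)
{
    percpu[cpu][rule].bcnt += bytes;    /* ADD_COUNTER() on this CPU's copy */
    percpu[cpu][rule].pcnt += 1;
}

static struct counters total(int rule)
{
    struct counters sum = { 0, 0 };
    for (int cpu = 0; cpu < NCPUS; cpu++) {
        sum.bcnt += percpu[cpu][rule].bcnt;
        sum.pcnt += percpu[cpu][rule].pcnt;
    }
    return sum;
}

int main(void)
{
    hit(0, 1, 60);
    hit(3, 1, 40);
    struct counters t = total(1);
    printf("rule 1: %lu packets, %lu bytes\n", t.pcnt, t.bcnt);
    return 0;
}
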
780
781static int copy_entries_to_user(unsigned int total_size,
782 struct arpt_table *table,
783 void __user *userptr)
784{
785 unsigned int off, num, countersize;
786 struct arpt_entry *e;
787 struct arpt_counters *counters;
788 int ret = 0;
789
790 /* We need atomic snapshot of counters: rest doesn't change
791 * (other than comefrom, which userspace doesn't care
792 * about).
793 */
794 countersize = sizeof(struct arpt_counters) * table->private->number;
795 counters = vmalloc(countersize);
796
797 if (counters == NULL)
798 return -ENOMEM;
799
800 /* First, sum counters... */
801 memset(counters, 0, countersize);
802 write_lock_bh(&table->lock);
803 get_counters(table->private, counters);
804 write_unlock_bh(&table->lock);
805
806 /* ... then copy entire thing from CPU 0... */
807 if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
808 ret = -EFAULT;
809 goto free_counters;
810 }
811
812 /* FIXME: use iterator macros --RR */
813 /* ... then go back and fix counters and names */
814 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
815 struct arpt_entry_target *t;
816
817 e = (struct arpt_entry *)(table->private->entries + off);
818 if (copy_to_user(userptr + off
819 + offsetof(struct arpt_entry, counters),
820 &counters[num],
821 sizeof(counters[num])) != 0) {
822 ret = -EFAULT;
823 goto free_counters;
824 }
825
826 t = arpt_get_target(e);
827 if (copy_to_user(userptr + off + e->target_offset
828 + offsetof(struct arpt_entry_target,
829 u.user.name),
830 t->u.kernel.target->name,
831 strlen(t->u.kernel.target->name)+1) != 0) {
832 ret = -EFAULT;
833 goto free_counters;
834 }
835 }
836
837 free_counters:
838 vfree(counters);
839 return ret;
840}
841
842static int get_entries(const struct arpt_get_entries *entries,
843 struct arpt_get_entries __user *uptr)
844{
845 int ret;
846 struct arpt_table *t;
847
848 t = arpt_find_table_lock(entries->name, &ret, &arpt_mutex);
849 if (t) {
850 duprintf("t->private->number = %u\n",
851 t->private->number);
852 if (entries->size == t->private->size)
853 ret = copy_entries_to_user(t->private->size,
854 t, uptr->entrytable);
855 else {
856 duprintf("get_entries: I've got %u not %u!\n",
857 t->private->size,
858 entries->size);
859 ret = -EINVAL;
860 }
861 up(&arpt_mutex);
862 } else
863 duprintf("get_entries: Can't find %s!\n",
864 entries->name);
865
866 return ret;
867}
868
869static int do_replace(void __user *user, unsigned int len)
870{
871 int ret;
872 struct arpt_replace tmp;
873 struct arpt_table *t;
874 struct arpt_table_info *newinfo, *oldinfo;
875 struct arpt_counters *counters;
876
877 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
878 return -EFAULT;
879
880 /* Hack: Causes ipchains to give correct error msg --RR */
881 if (len != sizeof(tmp) + tmp.size)
882 return -ENOPROTOOPT;
883
884 /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
885 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
886 return -ENOMEM;
887
888 newinfo = vmalloc(sizeof(struct arpt_table_info)
889 + SMP_ALIGN(tmp.size) * num_possible_cpus());
890 if (!newinfo)
891 return -ENOMEM;
892
893 if (copy_from_user(newinfo->entries, user + sizeof(tmp),
894 tmp.size) != 0) {
895 ret = -EFAULT;
896 goto free_newinfo;
897 }
898
899 counters = vmalloc(tmp.num_counters * sizeof(struct arpt_counters));
900 if (!counters) {
901 ret = -ENOMEM;
902 goto free_newinfo;
903 }
904 memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters));
905
906 ret = translate_table(tmp.name, tmp.valid_hooks,
907 newinfo, tmp.size, tmp.num_entries,
908 tmp.hook_entry, tmp.underflow);
909 if (ret != 0)
910 goto free_newinfo_counters;
911
912 duprintf("arp_tables: Translated table\n");
913
914 t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex);
915 if (!t)
916 goto free_newinfo_counters_untrans;
917
918 /* You lied! */
919 if (tmp.valid_hooks != t->valid_hooks) {
920 duprintf("Valid hook crap: %08X vs %08X\n",
921 tmp.valid_hooks, t->valid_hooks);
922 ret = -EINVAL;
923 goto free_newinfo_counters_untrans_unlock;
924 }
925
926 /* Get a reference in advance, we're not allowed to fail later */
927 if (!try_module_get(t->me)) {
928 ret = -EBUSY;
929 goto free_newinfo_counters_untrans_unlock;
930 }
931
932 oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
933 if (!oldinfo)
934 goto put_module;
935
936 /* Update module usage count based on number of rules */
937 duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
938 oldinfo->number, oldinfo->initial_entries, newinfo->number);
939 if ((oldinfo->number > oldinfo->initial_entries) ||
940 (newinfo->number <= oldinfo->initial_entries))
941 module_put(t->me);
942 if ((oldinfo->number > oldinfo->initial_entries) &&
943 (newinfo->number <= oldinfo->initial_entries))
944 module_put(t->me);
945
946 /* Get the old counters. */
947 get_counters(oldinfo, counters);
948 /* Decrease module usage counts and free resource */
949 ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
950 vfree(oldinfo);
951 if (copy_to_user(tmp.counters, counters,
952 sizeof(struct arpt_counters) * tmp.num_counters) != 0)
953 ret = -EFAULT;
954 vfree(counters);
955 up(&arpt_mutex);
956 return ret;
957
958 put_module:
959 module_put(t->me);
960 free_newinfo_counters_untrans_unlock:
961 up(&arpt_mutex);
962 free_newinfo_counters_untrans:
963 ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL);
964 free_newinfo_counters:
965 vfree(counters);
966 free_newinfo:
967 vfree(newinfo);
968 return ret;
969}
970
971/* We're lazy, and add to the first CPU; overflow works its fey magic
972 * and everything is OK.
973 */
974static inline int add_counter_to_entry(struct arpt_entry *e,
975 const struct arpt_counters addme[],
976 unsigned int *i)
977{
978
979 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
980
981 (*i)++;
982 return 0;
983}
984
985static int do_add_counters(void __user *user, unsigned int len)
986{
987 unsigned int i;
988 struct arpt_counters_info tmp, *paddc;
989 struct arpt_table *t;
990 int ret;
991
992 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
993 return -EFAULT;
994
995 if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct arpt_counters))
996 return -EINVAL;
997
998 paddc = vmalloc(len);
999 if (!paddc)
1000 return -ENOMEM;
1001
1002 if (copy_from_user(paddc, user, len) != 0) {
1003 ret = -EFAULT;
1004 goto free;
1005 }
1006
1007 t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex);
1008 if (!t)
1009 goto free;
1010
1011 write_lock_bh(&t->lock);
1012 if (t->private->number != paddc->num_counters) {
1013 ret = -EINVAL;
1014 goto unlock_up_free;
1015 }
1016
1017 i = 0;
1018 ARPT_ENTRY_ITERATE(t->private->entries,
1019 t->private->size,
1020 add_counter_to_entry,
1021 paddc->counters,
1022 &i);
1023 unlock_up_free:
1024 write_unlock_bh(&t->lock);
1025 up(&arpt_mutex);
1026 free:
1027 vfree(paddc);
1028
1029 return ret;
1030}
1031
1032static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1033{
1034 int ret;
1035
1036 if (!capable(CAP_NET_ADMIN))
1037 return -EPERM;
1038
1039 switch (cmd) {
1040 case ARPT_SO_SET_REPLACE:
1041 ret = do_replace(user, len);
1042 break;
1043
1044 case ARPT_SO_SET_ADD_COUNTERS:
1045 ret = do_add_counters(user, len);
1046 break;
1047
1048 default:
1049 duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
1050 ret = -EINVAL;
1051 }
1052
1053 return ret;
1054}
1055
1056static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
1057{
1058 int ret;
1059
1060 if (!capable(CAP_NET_ADMIN))
1061 return -EPERM;
1062
1063 switch (cmd) {
1064 case ARPT_SO_GET_INFO: {
1065 char name[ARPT_TABLE_MAXNAMELEN];
1066 struct arpt_table *t;
1067
1068 if (*len != sizeof(struct arpt_getinfo)) {
1069 duprintf("length %u != %Zu\n", *len,
1070 sizeof(struct arpt_getinfo));
1071 ret = -EINVAL;
1072 break;
1073 }
1074
1075 if (copy_from_user(name, user, sizeof(name)) != 0) {
1076 ret = -EFAULT;
1077 break;
1078 }
1079 name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
1080 t = arpt_find_table_lock(name, &ret, &arpt_mutex);
1081 if (t) {
1082 struct arpt_getinfo info;
1083
1084 info.valid_hooks = t->valid_hooks;
1085 memcpy(info.hook_entry, t->private->hook_entry,
1086 sizeof(info.hook_entry));
1087 memcpy(info.underflow, t->private->underflow,
1088 sizeof(info.underflow));
1089 info.num_entries = t->private->number;
1090 info.size = t->private->size;
1091 strcpy(info.name, name);
1092
1093 if (copy_to_user(user, &info, *len) != 0)
1094 ret = -EFAULT;
1095 else
1096 ret = 0;
1097
1098 up(&arpt_mutex);
1099 }
1100 }
1101 break;
1102
1103 case ARPT_SO_GET_ENTRIES: {
1104 struct arpt_get_entries get;
1105
1106 if (*len < sizeof(get)) {
1107 duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
1108 ret = -EINVAL;
1109 } else if (copy_from_user(&get, user, sizeof(get)) != 0) {
1110 ret = -EFAULT;
1111 } else if (*len != sizeof(struct arpt_get_entries) + get.size) {
1112 duprintf("get_entries: %u != %Zu\n", *len,
1113 sizeof(struct arpt_get_entries) + get.size);
1114 ret = -EINVAL;
1115 } else
1116 ret = get_entries(&get, user);
1117 break;
1118 }
1119
1120 default:
1121 duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
1122 ret = -EINVAL;
1123 }
1124
1125 return ret;
1126}
1127
1128/* Registration hooks for targets. */
1129int arpt_register_target(struct arpt_target *target)
1130{
1131 int ret;
1132
1133 ret = down_interruptible(&arpt_mutex);
1134 if (ret != 0)
1135 return ret;
1136
1137 if (!list_named_insert(&arpt_target, target)) {
1138 duprintf("arpt_register_target: `%s' already in list!\n",
1139 target->name);
1140 ret = -EINVAL;
1141 }
1142 up(&arpt_mutex);
1143 return ret;
1144}
1145
1146void arpt_unregister_target(struct arpt_target *target)
1147{
1148 down(&arpt_mutex);
1149 LIST_DELETE(&arpt_target, target);
1150 up(&arpt_mutex);
1151}
1152
1153int arpt_register_table(struct arpt_table *table,
1154 const struct arpt_replace *repl)
1155{
1156 int ret;
1157 struct arpt_table_info *newinfo;
1158 static struct arpt_table_info bootstrap
1159 = { 0, 0, 0, { 0 }, { 0 }, { } };
1160
1161 newinfo = vmalloc(sizeof(struct arpt_table_info)
1162 + SMP_ALIGN(repl->size) * num_possible_cpus());
1163 if (!newinfo) {
1164 ret = -ENOMEM;
1165 return ret;
1166 }
1167 memcpy(newinfo->entries, repl->entries, repl->size);
1168
1169 ret = translate_table(table->name, table->valid_hooks,
1170 newinfo, repl->size,
1171 repl->num_entries,
1172 repl->hook_entry,
1173 repl->underflow);
1174 duprintf("arpt_register_table: translate table gives %d\n", ret);
1175 if (ret != 0) {
1176 vfree(newinfo);
1177 return ret;
1178 }
1179
1180 ret = down_interruptible(&arpt_mutex);
1181 if (ret != 0) {
1182 vfree(newinfo);
1183 return ret;
1184 }
1185
1186 /* Don't autoload: we'd eat our tail... */
1187 if (list_named_find(&arpt_tables, table->name)) {
1188 ret = -EEXIST;
1189 goto free_unlock;
1190 }
1191
1192 /* Simplifies replace_table code. */
1193 table->private = &bootstrap;
1194 if (!replace_table(table, 0, newinfo, &ret))
1195 goto free_unlock;
1196
1197 duprintf("table->private->number = %u\n",
1198 table->private->number);
1199
1200 /* save number of initial entries */
1201 table->private->initial_entries = table->private->number;
1202
1203 rwlock_init(&table->lock);
1204 list_prepend(&arpt_tables, table);
1205
1206 unlock:
1207 up(&arpt_mutex);
1208 return ret;
1209
1210 free_unlock:
1211 vfree(newinfo);
1212 goto unlock;
1213}
1214
1215void arpt_unregister_table(struct arpt_table *table)
1216{
1217 down(&arpt_mutex);
1218 LIST_DELETE(&arpt_tables, table);
1219 up(&arpt_mutex);
1220
1221 /* Decrease module usage counts and free resources */
1222 ARPT_ENTRY_ITERATE(table->private->entries, table->private->size,
1223 cleanup_entry, NULL);
1224 vfree(table->private);
1225}
1226
1227/* The built-in targets: standard (NULL) and error. */
1228static struct arpt_target arpt_standard_target = {
1229 .name = ARPT_STANDARD_TARGET,
1230};
1231
1232static struct arpt_target arpt_error_target = {
1233 .name = ARPT_ERROR_TARGET,
1234 .target = arpt_error,
1235};
1236
1237static struct nf_sockopt_ops arpt_sockopts = {
1238 .pf = PF_INET,
1239 .set_optmin = ARPT_BASE_CTL,
1240 .set_optmax = ARPT_SO_SET_MAX+1,
1241 .set = do_arpt_set_ctl,
1242 .get_optmin = ARPT_BASE_CTL,
1243 .get_optmax = ARPT_SO_GET_MAX+1,
1244 .get = do_arpt_get_ctl,
1245};
1246
1247#ifdef CONFIG_PROC_FS
1248static inline int print_name(const struct arpt_table *t,
1249 off_t start_offset, char *buffer, int length,
1250 off_t *pos, unsigned int *count)
1251{
1252 if ((*count)++ >= start_offset) {
1253 unsigned int namelen;
1254
1255 namelen = sprintf(buffer + *pos, "%s\n", t->name);
1256 if (*pos + namelen > length) {
1257 /* Stop iterating */
1258 return 1;
1259 }
1260 *pos += namelen;
1261 }
1262 return 0;
1263}
1264
1265static int arpt_get_tables(char *buffer, char **start, off_t offset, int length)
1266{
1267 off_t pos = 0;
1268 unsigned int count = 0;
1269
1270 if (down_interruptible(&arpt_mutex) != 0)
1271 return 0;
1272
1273 LIST_FIND(&arpt_tables, print_name, struct arpt_table *,
1274 offset, buffer, length, &pos, &count);
1275
1276 up(&arpt_mutex);
1277
1278 /* `start' hack - see fs/proc/generic.c line ~105 */
1279 *start=(char *)((unsigned long)count-offset);
1280 return pos;
1281}
1282#endif /*CONFIG_PROC_FS*/
1283
1284static int __init init(void)
1285{
1286 int ret;
1287
1288 /* No one else will be downing the sem now, so we won't sleep */
1289 down(&arpt_mutex);
1290 list_append(&arpt_target, &arpt_standard_target);
1291 list_append(&arpt_target, &arpt_error_target);
1292 up(&arpt_mutex);
1293
1294 /* Register setsockopt */
1295 ret = nf_register_sockopt(&arpt_sockopts);
1296 if (ret < 0) {
1297 duprintf("Unable to register sockopts.\n");
1298 return ret;
1299 }
1300
1301#ifdef CONFIG_PROC_FS
1302 {
1303 struct proc_dir_entry *proc;
1304
1305 proc = proc_net_create("arp_tables_names", 0, arpt_get_tables);
1306 if (!proc) {
1307 nf_unregister_sockopt(&arpt_sockopts);
1308 return -ENOMEM;
1309 }
1310 proc->owner = THIS_MODULE;
1311 }
1312#endif
1313
1314 printk("arp_tables: (C) 2002 David S. Miller\n");
1315 return 0;
1316}
1317
1318static void __exit fini(void)
1319{
1320 nf_unregister_sockopt(&arpt_sockopts);
1321#ifdef CONFIG_PROC_FS
1322 proc_net_remove("arp_tables_names");
1323#endif
1324}
1325
1326EXPORT_SYMBOL(arpt_register_table);
1327EXPORT_SYMBOL(arpt_unregister_table);
1328EXPORT_SYMBOL(arpt_do_table);
1329EXPORT_SYMBOL(arpt_register_target);
1330EXPORT_SYMBOL(arpt_unregister_target);
1331
1332module_init(init);
1333module_exit(fini);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
new file mode 100644
index 000000000000..3e592ec86482
--- /dev/null
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -0,0 +1,104 @@
1/* module that allows mangling of the arp payload */
2#include <linux/module.h>
3#include <linux/netfilter_arp/arpt_mangle.h>
4#include <net/sock.h>
5
6MODULE_LICENSE("GPL");
7MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
8MODULE_DESCRIPTION("arptables arp payload mangle target");
9
10static unsigned int
11target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in,
12 const struct net_device *out, const void *targinfo, void *userinfo)
13{
14 const struct arpt_mangle *mangle = targinfo;
15 struct arphdr *arp;
16 unsigned char *arpptr;
17 int pln, hln;
18
19 if (skb_shared(*pskb) || skb_cloned(*pskb)) {
20 struct sk_buff *nskb;
21
22 nskb = skb_copy(*pskb, GFP_ATOMIC);
23 if (!nskb)
24 return NF_DROP;
25 if ((*pskb)->sk)
26 skb_set_owner_w(nskb, (*pskb)->sk);
27 kfree_skb(*pskb);
28 *pskb = nskb;
29 }
30
31 arp = (*pskb)->nh.arph;
32 arpptr = (*pskb)->nh.raw + sizeof(*arp);
33 pln = arp->ar_pln;
34 hln = arp->ar_hln;
35 /* We assume that pln and hln were checked in the match */
36 if (mangle->flags & ARPT_MANGLE_SDEV) {
37 if (ARPT_DEV_ADDR_LEN_MAX < hln ||
38 (arpptr + hln > (**pskb).tail))
39 return NF_DROP;
40 memcpy(arpptr, mangle->src_devaddr, hln);
41 }
42 arpptr += hln;
43 if (mangle->flags & ARPT_MANGLE_SIP) {
44 if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
45 (arpptr + pln > (**pskb).tail))
46 return NF_DROP;
47 memcpy(arpptr, &mangle->u_s.src_ip, pln);
48 }
49 arpptr += pln;
50 if (mangle->flags & ARPT_MANGLE_TDEV) {
51 if (ARPT_DEV_ADDR_LEN_MAX < hln ||
52 (arpptr + hln > (**pskb).tail))
53 return NF_DROP;
54 memcpy(arpptr, mangle->tgt_devaddr, hln);
55 }
56 arpptr += hln;
57 if (mangle->flags & ARPT_MANGLE_TIP) {
58 if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
59 (arpptr + pln > (**pskb).tail))
60 return NF_DROP;
61 memcpy(arpptr, &mangle->u_t.tgt_ip, pln);
62 }
63 return mangle->target;
64}
65
66static int
67checkentry(const char *tablename, const struct arpt_entry *e, void *targinfo,
68 unsigned int targinfosize, unsigned int hook_mask)
69{
70 const struct arpt_mangle *mangle = targinfo;
71
72 if (mangle->flags & ~ARPT_MANGLE_MASK ||
73 !(mangle->flags & ARPT_MANGLE_MASK))
74 return 0;
75
76 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
77 mangle->target != ARPT_CONTINUE)
78 return 0;
79 return 1;
80}
81
82static struct arpt_target arpt_mangle_reg
83= {
84 .name = "mangle",
85 .target = target,
86 .checkentry = checkentry,
87 .me = THIS_MODULE,
88};
89
90static int __init init(void)
91{
92 if (arpt_register_target(&arpt_mangle_reg))
93 return -EINVAL;
94
95 return 0;
96}
97
98static void __exit fini(void)
99{
100 arpt_unregister_target(&arpt_mangle_reg);
101}
102
103module_init(init);
104module_exit(fini);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
new file mode 100644
index 000000000000..0d759f5a4ef0
--- /dev/null
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -0,0 +1,214 @@
1/*
2 * Filtering ARP tables module.
3 *
4 * Copyright (C) 2002 David S. Miller (davem@redhat.com)
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/netfilter_arp/arp_tables.h>
10
11MODULE_LICENSE("GPL");
12MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
13MODULE_DESCRIPTION("arptables filter table");
14
15#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
16 (1 << NF_ARP_FORWARD))
17
18/* Standard entry. */
19struct arpt_standard
20{
21 struct arpt_entry entry;
22 struct arpt_standard_target target;
23};
24
25struct arpt_error_target
26{
27 struct arpt_entry_target target;
28 char errorname[ARPT_FUNCTION_MAXNAMELEN];
29};
30
31struct arpt_error
32{
33 struct arpt_entry entry;
34 struct arpt_error_target target;
35};
36
37static struct
38{
39 struct arpt_replace repl;
40 struct arpt_standard entries[3];
41 struct arpt_error term;
42} initial_table __initdata
43= { { "filter", FILTER_VALID_HOOKS, 4,
44 sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error),
45 { [NF_ARP_IN] = 0,
46 [NF_ARP_OUT] = sizeof(struct arpt_standard),
47 [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), },
48 { [NF_ARP_IN] = 0,
49 [NF_ARP_OUT] = sizeof(struct arpt_standard),
50 [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), },
51 0, NULL, { } },
52 {
53 /* ARP_IN */
54 {
55 {
56 {
57 { 0 }, { 0 }, { 0 }, { 0 },
58 0, 0,
59 { { 0, }, { 0, } },
60 { { 0, }, { 0, } },
61 0, 0,
62 0, 0,
63 0, 0,
64 "", "", { 0 }, { 0 },
65 0, 0
66 },
67 sizeof(struct arpt_entry),
68 sizeof(struct arpt_standard),
69 0,
70 { 0, 0 }, { } },
71 { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
72 -NF_ACCEPT - 1 }
73 },
74 /* ARP_OUT */
75 {
76 {
77 {
78 { 0 }, { 0 }, { 0 }, { 0 },
79 0, 0,
80 { { 0, }, { 0, } },
81 { { 0, }, { 0, } },
82 0, 0,
83 0, 0,
84 0, 0,
85 "", "", { 0 }, { 0 },
86 0, 0
87 },
88 sizeof(struct arpt_entry),
89 sizeof(struct arpt_standard),
90 0,
91 { 0, 0 }, { } },
92 { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
93 -NF_ACCEPT - 1 }
94 },
95 /* ARP_FORWARD */
96 {
97 {
98 {
99 { 0 }, { 0 }, { 0 }, { 0 },
100 0, 0,
101 { { 0, }, { 0, } },
102 { { 0, }, { 0, } },
103 0, 0,
104 0, 0,
105 0, 0,
106 "", "", { 0 }, { 0 },
107 0, 0
108 },
109 sizeof(struct arpt_entry),
110 sizeof(struct arpt_standard),
111 0,
112 { 0, 0 }, { } },
113 { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
114 -NF_ACCEPT - 1 }
115 }
116 },
117 /* ERROR */
118 {
119 {
120 {
121 { 0 }, { 0 }, { 0 }, { 0 },
122 0, 0,
123 { { 0, }, { 0, } },
124 { { 0, }, { 0, } },
125 0, 0,
126 0, 0,
127 0, 0,
128 "", "", { 0 }, { 0 },
129 0, 0
130 },
131 sizeof(struct arpt_entry),
132 sizeof(struct arpt_error),
133 0,
134 { 0, 0 }, { } },
135 { { { { ARPT_ALIGN(sizeof(struct arpt_error_target)), ARPT_ERROR_TARGET } },
136 { } },
137 "ERROR"
138 }
139 }
140};
141
142static struct arpt_table packet_filter = {
143 .name = "filter",
144 .valid_hooks = FILTER_VALID_HOOKS,
145 .lock = RW_LOCK_UNLOCKED,
146 .private = NULL,
147 .me = THIS_MODULE,
148};
149
150/* The work comes in here from netfilter.c */
151static unsigned int arpt_hook(unsigned int hook,
152 struct sk_buff **pskb,
153 const struct net_device *in,
154 const struct net_device *out,
155 int (*okfn)(struct sk_buff *))
156{
157 return arpt_do_table(pskb, hook, in, out, &packet_filter, NULL);
158}
159
160static struct nf_hook_ops arpt_ops[] = {
161 {
162 .hook = arpt_hook,
163 .owner = THIS_MODULE,
164 .pf = NF_ARP,
165 .hooknum = NF_ARP_IN,
166 },
167 {
168 .hook = arpt_hook,
169 .owner = THIS_MODULE,
170 .pf = NF_ARP,
171 .hooknum = NF_ARP_OUT,
172 },
173 {
174 .hook = arpt_hook,
175 .owner = THIS_MODULE,
176 .pf = NF_ARP,
177 .hooknum = NF_ARP_FORWARD,
178 },
179};
180
181static int __init init(void)
182{
183 int ret, i;
184
185 /* Register table */
186 ret = arpt_register_table(&packet_filter, &initial_table.repl);
187 if (ret < 0)
188 return ret;
189
190 for (i = 0; i < ARRAY_SIZE(arpt_ops); i++)
191 if ((ret = nf_register_hook(&arpt_ops[i])) < 0)
192 goto cleanup_hooks;
193 return ret;
194
195cleanup_hooks:
196 while (--i >= 0)
197 nf_unregister_hook(&arpt_ops[i]);
198
199 arpt_unregister_table(&packet_filter);
200 return ret;
201}
202
203static void __exit fini(void)
204{
205 unsigned int i;
206
207 for (i = 0; i < ARRAY_SIZE(arpt_ops); i++)
208 nf_unregister_hook(&arpt_ops[i]);
209
210 arpt_unregister_table(&packet_filter);
211}
212
213module_init(init);
214module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
new file mode 100644
index 000000000000..3dbddd062605
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -0,0 +1,167 @@
1/* Amanda extension for IP connection tracking, Version 0.2
2 * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
3 * based on HW's ip_conntrack_irc.c as well as other modules
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 *
10 * Module load syntax:
11 * insmod ip_conntrack_amanda.o [master_timeout=n]
12 *
13 * Where master_timeout is the timeout (in seconds) of the master
14 * connection (port 10080). This defaults to 5 minutes but if
15 * your clients take longer than 5 minutes to do their work
16 * before getting back to the Amanda server, you can increase
17 * this value.
18 *
19 */
20
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/netfilter.h>
24#include <linux/ip.h>
25#include <linux/moduleparam.h>
26#include <net/checksum.h>
27#include <net/udp.h>
28
29#include <linux/netfilter_ipv4/lockhelp.h>
30#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
31#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
32
33static unsigned int master_timeout = 300;
34
35MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
36MODULE_DESCRIPTION("Amanda connection tracking module");
37MODULE_LICENSE("GPL");
38module_param(master_timeout, int, 0600);
39MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
40
41static char *conns[] = { "DATA ", "MESG ", "INDEX " };
42
43/* This is slow, but it's simple. --RR */
44static char amanda_buffer[65536];
45static DECLARE_LOCK(amanda_buffer_lock);
46
47unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
48 enum ip_conntrack_info ctinfo,
49 unsigned int matchoff,
50 unsigned int matchlen,
51 struct ip_conntrack_expect *exp);
52EXPORT_SYMBOL_GPL(ip_nat_amanda_hook);
53
54static int help(struct sk_buff **pskb,
55 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
56{
57 struct ip_conntrack_expect *exp;
58 char *data, *data_limit, *tmp;
59 unsigned int dataoff, i;
60 u_int16_t port, len;
61 int ret = NF_ACCEPT;
62
63 /* Only look at packets from the Amanda server */
64 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
65 return NF_ACCEPT;
66
67 /* increase the UDP timeout of the master connection as replies from
68 * Amanda clients to the server can be quite delayed */
69 ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ);
70
71 /* No data? */
72 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
73 if (dataoff >= (*pskb)->len) {
74 if (net_ratelimit())
75 printk("amanda_help: skblen = %u\n", (*pskb)->len);
76 return NF_ACCEPT;
77 }
78
79 LOCK_BH(&amanda_buffer_lock);
80 skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
81 data = amanda_buffer;
82 data_limit = amanda_buffer + (*pskb)->len - dataoff;
83 *data_limit = '\0';
84
85 /* Search for the CONNECT string */
86 data = strstr(data, "CONNECT ");
87 if (!data)
88 goto out;
89 data += strlen("CONNECT ");
90
91 /* Only search first line. */
92 if ((tmp = strchr(data, '\n')))
93 *tmp = '\0';
94
95 for (i = 0; i < ARRAY_SIZE(conns); i++) {
96 char *match = strstr(data, conns[i]);
97 if (!match)
98 continue;
99 tmp = data = match + strlen(conns[i]);
100 port = simple_strtoul(data, &data, 10);
101 len = data - tmp;
102 if (port == 0 || len > 5)
103 break;
104
105 exp = ip_conntrack_expect_alloc();
106 if (exp == NULL) {
107 ret = NF_DROP;
108 goto out;
109 }
110
111 exp->expectfn = NULL;
112 exp->master = ct;
113
114 exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
115 exp->tuple.src.u.tcp.port = 0;
116 exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
117 exp->tuple.dst.protonum = IPPROTO_TCP;
118 exp->tuple.dst.u.tcp.port = htons(port);
119
120 exp->mask.src.ip = 0xFFFFFFFF;
121 exp->mask.src.u.tcp.port = 0;
122 exp->mask.dst.ip = 0xFFFFFFFF;
123 exp->mask.dst.protonum = 0xFF;
124 exp->mask.dst.u.tcp.port = 0xFFFF;
125
126 if (ip_nat_amanda_hook)
127 ret = ip_nat_amanda_hook(pskb, ctinfo,
128 tmp - amanda_buffer,
129 len, exp);
130 else if (ip_conntrack_expect_related(exp) != 0) {
131 ip_conntrack_expect_free(exp);
132 ret = NF_DROP;
133 }
134 }
135
136out:
137 UNLOCK_BH(&amanda_buffer_lock);
138 return ret;
139}
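
The help() function above copies the UDP payload into a static buffer, finds "CONNECT ", restricts itself to the first line, and for each of DATA/MESG/INDEX parses the decimal port that follows so an expectation can be registered for the coming TCP connection. A userspace sketch of just that parsing step, on a made-up payload:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char *conns[] = { "DATA ", "MESG ", "INDEX " };

int main(void)
{
    /* Hypothetical reply fragment; real Amanda payloads carry more fields. */
    char payload[] = "CONNECT DATA 50100 MESG 50101 INDEX 50102\n";
    char *data = strstr(payload, "CONNECT ");
    char *nl;

    if (!data)
        return 1;
    data += strlen("CONNECT ");
    if ((nl = strchr(data, '\n')))
        *nl = '\0';                       /* only look at the first line */

    for (size_t i = 0; i < sizeof(conns) / sizeof(conns[0]); i++) {
        char *match = strstr(data, conns[i]);
        if (!match)
            continue;
        long port = strtol(match + strlen(conns[i]), NULL, 10);
        printf("%swould be expected on TCP port %ld\n", conns[i], port);
    }
    return 0;
}
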
140
141static struct ip_conntrack_helper amanda_helper = {
142 .max_expected = ARRAY_SIZE(conns),
143 .timeout = 180,
144 .me = THIS_MODULE,
145 .help = help,
146 .name = "amanda",
147
148 .tuple = { .src = { .u = { __constant_htons(10080) } },
149 .dst = { .protonum = IPPROTO_UDP },
150 },
151 .mask = { .src = { .u = { 0xFFFF } },
152 .dst = { .protonum = 0xFF },
153 },
154};
155
156static void __exit fini(void)
157{
158 ip_conntrack_helper_unregister(&amanda_helper);
159}
160
161static int __init init(void)
162{
163 return ip_conntrack_helper_register(&amanda_helper);
164}
165
166module_init(init);
167module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
new file mode 100644
index 000000000000..28d9425d5c39
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -0,0 +1,1247 @@
1/* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
20#include <linux/config.h>
21#include <linux/types.h>
22#include <linux/icmp.h>
23#include <linux/ip.h>
24#include <linux/netfilter.h>
25#include <linux/netfilter_ipv4.h>
26#include <linux/module.h>
27#include <linux/skbuff.h>
28#include <linux/proc_fs.h>
29#include <linux/vmalloc.h>
30#include <net/checksum.h>
31#include <net/ip.h>
32#include <linux/stddef.h>
33#include <linux/sysctl.h>
34#include <linux/slab.h>
35#include <linux/random.h>
36#include <linux/jhash.h>
37#include <linux/err.h>
38#include <linux/percpu.h>
39#include <linux/moduleparam.h>
40
41/* This rwlock protects the main hash table, protocol/helper/expected
42 registrations, and conntrack timers. */
43#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
44#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
45
46#include <linux/netfilter_ipv4/ip_conntrack.h>
47#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49#include <linux/netfilter_ipv4/ip_conntrack_core.h>
50#include <linux/netfilter_ipv4/listhelp.h>
51
52#define IP_CONNTRACK_VERSION "2.1"
53
54#if 0
55#define DEBUGP printk
56#else
57#define DEBUGP(format, args...)
58#endif
59
60DECLARE_RWLOCK(ip_conntrack_lock);
61
62/* ip_conntrack_standalone needs this */
63atomic_t ip_conntrack_count = ATOMIC_INIT(0);
64
65void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
66LIST_HEAD(ip_conntrack_expect_list);
67struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
68static LIST_HEAD(helpers);
69unsigned int ip_conntrack_htable_size = 0;
70int ip_conntrack_max;
71struct list_head *ip_conntrack_hash;
72static kmem_cache_t *ip_conntrack_cachep;
73static kmem_cache_t *ip_conntrack_expect_cachep;
74struct ip_conntrack ip_conntrack_untracked;
75unsigned int ip_ct_log_invalid;
76static LIST_HEAD(unconfirmed);
77static int ip_conntrack_vmalloc;
78
79DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
80
81void
82ip_conntrack_put(struct ip_conntrack *ct)
83{
84 IP_NF_ASSERT(ct);
85 nf_conntrack_put(&ct->ct_general);
86}
87
88static int ip_conntrack_hash_rnd_initted;
89static unsigned int ip_conntrack_hash_rnd;
90
91static u_int32_t
92hash_conntrack(const struct ip_conntrack_tuple *tuple)
93{
94#if 0
95 dump_tuple(tuple);
96#endif
97 return (jhash_3words(tuple->src.ip,
98 (tuple->dst.ip ^ tuple->dst.protonum),
99 (tuple->src.u.all | (tuple->dst.u.all << 16)),
100 ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
101}
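
Conntrack entries live in a hash table keyed by tuple: hash_conntrack() mixes the source address, the destination address XOR protocol, and both port fields through jhash_3words() with a random seed, then reduces modulo ip_conntrack_htable_size. A toy userspace version of the same idea (a deliberately simple mixing function, not the kernel's jhash):

#include <stdio.h>
#include <stdint.h>

#define HTABLE_SIZE 512

struct tuple { uint32_t src_ip, dst_ip; uint16_t src_port, dst_port; uint8_t proto; };

/* Simple multiplicative mix; the kernel uses jhash_3words() plus a random seed. */
static unsigned int hash_tuple(const struct tuple *t, uint32_t seed)
{
    uint32_t h = seed;
    h = h * 2654435761u ^ t->src_ip;
    h = h * 2654435761u ^ (t->dst_ip ^ t->proto);
    h = h * 2654435761u ^ ((uint32_t)t->src_port | ((uint32_t)t->dst_port << 16));
    return h % HTABLE_SIZE;
}

int main(void)
{
    struct tuple t = { 0x0a000001, 0x0a000002, 12345, 80, 6 };
    printf("bucket %u of %u\n", hash_tuple(&t, 0xdeadbeef), HTABLE_SIZE);
    return 0;
}
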
102
103int
104ip_ct_get_tuple(const struct iphdr *iph,
105 const struct sk_buff *skb,
106 unsigned int dataoff,
107 struct ip_conntrack_tuple *tuple,
108 const struct ip_conntrack_protocol *protocol)
109{
110 /* Should never happen. */
111 if (iph->frag_off & htons(IP_OFFSET)) {
112 printk("ip_conntrack_core: Frag of proto %u.\n",
113 iph->protocol);
114 return 0;
115 }
116
117 tuple->src.ip = iph->saddr;
118 tuple->dst.ip = iph->daddr;
119 tuple->dst.protonum = iph->protocol;
120 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
121
122 return protocol->pkt_to_tuple(skb, dataoff, tuple);
123}
124
125int
126ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
127 const struct ip_conntrack_tuple *orig,
128 const struct ip_conntrack_protocol *protocol)
129{
130 inverse->src.ip = orig->dst.ip;
131 inverse->dst.ip = orig->src.ip;
132 inverse->dst.protonum = orig->dst.protonum;
133 inverse->dst.dir = !orig->dst.dir;
134
135 return protocol->invert_tuple(inverse, orig);
136}
137
138
139/* ip_conntrack_expect helper functions */
140static void destroy_expect(struct ip_conntrack_expect *exp)
141{
142 ip_conntrack_put(exp->master);
143 IP_NF_ASSERT(!timer_pending(&exp->timeout));
144 kmem_cache_free(ip_conntrack_expect_cachep, exp);
145 CONNTRACK_STAT_INC(expect_delete);
146}
147
148static void unlink_expect(struct ip_conntrack_expect *exp)
149{
150 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
151 list_del(&exp->list);
152 /* Logically in destroy_expect, but we hold the lock here. */
153 exp->master->expecting--;
154}
155
156static void expectation_timed_out(unsigned long ul_expect)
157{
158 struct ip_conntrack_expect *exp = (void *)ul_expect;
159
160 WRITE_LOCK(&ip_conntrack_lock);
161 unlink_expect(exp);
162 WRITE_UNLOCK(&ip_conntrack_lock);
163 destroy_expect(exp);
164}
165
166/* If an expectation for this connection is found, it gets deleted from
167 * the global list and then returned. */
168static struct ip_conntrack_expect *
169find_expectation(const struct ip_conntrack_tuple *tuple)
170{
171 struct ip_conntrack_expect *i;
172
173 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
174 /* If master is not in the hash table yet (i.e. the packet hasn't left
175 this machine yet), how can the other end know about the expectation?
176 Hence these are not the droids you are looking for (if
177 master ct never got confirmed, we'd hold a reference to it
178 and weird things would happen to future packets). */
179 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
180 && is_confirmed(i->master)
181 && del_timer(&i->timeout)) {
182 unlink_expect(i);
183 return i;
184 }
185 }
186 return NULL;
187}
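
find_expectation() compares the incoming tuple with each pending expectation under the expectation's mask, so a helper can leave unpredictable fields unconstrained (the Amanda helper above, for instance, leaves the source port as a wildcard). A minimal sketch of that masked tuple comparison with toy structures:

#include <stdio.h>
#include <stdint.h>

struct toy_tuple { uint32_t src_ip, dst_ip; uint16_t dst_port; };

/* Match when tuple and expectation agree on every bit the mask keeps. */
static int mask_cmp(const struct toy_tuple *t, const struct toy_tuple *exp,
                    const struct toy_tuple *mask)
{
    return ((t->src_ip ^ exp->src_ip) & mask->src_ip) == 0 &&
           ((t->dst_ip ^ exp->dst_ip) & mask->dst_ip) == 0 &&
           ((t->dst_port ^ exp->dst_port) & mask->dst_port) == 0;
}

int main(void)
{
    struct toy_tuple exp  = { 0x0a000001, 0x0a000002, 50100 };
    struct toy_tuple mask = { 0xffffffff, 0xffffffff, 0xffff };
    struct toy_tuple pkt  = { 0x0a000001, 0x0a000002, 50100 };

    printf("expected packet:  %d\n", mask_cmp(&pkt, &exp, &mask));
    pkt.dst_port = 22;
    printf("unrelated packet: %d\n", mask_cmp(&pkt, &exp, &mask));
    return 0;
}
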
188
189/* delete all expectations for this conntrack */
190static void remove_expectations(struct ip_conntrack *ct)
191{
192 struct ip_conntrack_expect *i, *tmp;
193
194 /* Optimization: most connections never expect any others. */
195 if (ct->expecting == 0)
196 return;
197
198 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
199 if (i->master == ct && del_timer(&i->timeout)) {
200 unlink_expect(i);
201 destroy_expect(i);
202 }
203 }
204}
205
206static void
207clean_from_lists(struct ip_conntrack *ct)
208{
209 unsigned int ho, hr;
210
211 DEBUGP("clean_from_lists(%p)\n", ct);
212 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
213
214 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
215 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
216 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
217 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
218
219 /* Destroy all pending expectations */
220 remove_expectations(ct);
221}
222
223static void
224destroy_conntrack(struct nf_conntrack *nfct)
225{
226 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
227 struct ip_conntrack_protocol *proto;
228
229 DEBUGP("destroy_conntrack(%p)\n", ct);
230 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
231 IP_NF_ASSERT(!timer_pending(&ct->timeout));
232
233 /* To make sure we don't get any weird locking issues here:
234 * destroy_conntrack() MUST NOT be called with a write lock
235 * to ip_conntrack_lock!!! -HW */
236 proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
237 if (proto && proto->destroy)
238 proto->destroy(ct);
239
240 if (ip_conntrack_destroyed)
241 ip_conntrack_destroyed(ct);
242
243 WRITE_LOCK(&ip_conntrack_lock);
244 /* Expectations will have been removed in clean_from_lists,
245 * except TFTP can create an expectation on the first packet,
246 * before the connection is in the list, so we need to clean here,
247 * too. */
248 remove_expectations(ct);
249
250 /* We overload first tuple to link into unconfirmed list. */
251 if (!is_confirmed(ct)) {
252 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
253 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
254 }
255
256 CONNTRACK_STAT_INC(delete);
257 WRITE_UNLOCK(&ip_conntrack_lock);
258
259 if (ct->master)
260 ip_conntrack_put(ct->master);
261
262 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
263 kmem_cache_free(ip_conntrack_cachep, ct);
264 atomic_dec(&ip_conntrack_count);
265}
266
267static void death_by_timeout(unsigned long ul_conntrack)
268{
269 struct ip_conntrack *ct = (void *)ul_conntrack;
270
271 WRITE_LOCK(&ip_conntrack_lock);
272 /* Inside lock so preempt is disabled on module removal path.
273 * Otherwise we can get spurious warnings. */
274 CONNTRACK_STAT_INC(delete_list);
275 clean_from_lists(ct);
276 WRITE_UNLOCK(&ip_conntrack_lock);
277 ip_conntrack_put(ct);
278}
279
280static inline int
281conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
282 const struct ip_conntrack_tuple *tuple,
283 const struct ip_conntrack *ignored_conntrack)
284{
285 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
286 return tuplehash_to_ctrack(i) != ignored_conntrack
287 && ip_ct_tuple_equal(tuple, &i->tuple);
288}
289
290static struct ip_conntrack_tuple_hash *
291__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
292 const struct ip_conntrack *ignored_conntrack)
293{
294 struct ip_conntrack_tuple_hash *h;
295 unsigned int hash = hash_conntrack(tuple);
296
297 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
298 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
299 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
300 CONNTRACK_STAT_INC(found);
301 return h;
302 }
303 CONNTRACK_STAT_INC(searched);
304 }
305
306 return NULL;
307}
308
309/* Find a connection corresponding to a tuple. */
310struct ip_conntrack_tuple_hash *
311ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
312 const struct ip_conntrack *ignored_conntrack)
313{
314 struct ip_conntrack_tuple_hash *h;
315
316 READ_LOCK(&ip_conntrack_lock);
317 h = __ip_conntrack_find(tuple, ignored_conntrack);
318 if (h)
319 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
320 READ_UNLOCK(&ip_conntrack_lock);
321
322 return h;
323}
324
325/* Confirm a connection given skb; places it in hash table */
326int
327__ip_conntrack_confirm(struct sk_buff **pskb)
328{
329 unsigned int hash, repl_hash;
330 struct ip_conntrack *ct;
331 enum ip_conntrack_info ctinfo;
332
333 ct = ip_conntrack_get(*pskb, &ctinfo);
334
335 /* ipt_REJECT uses ip_conntrack_attach to attach related
336 ICMP/TCP RST packets in other direction. Actual packet
337 which created connection will be IP_CT_NEW or for an
338 expected connection, IP_CT_RELATED. */
339 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
340 return NF_ACCEPT;
341
342 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
343 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
344
345 /* We're not in hash table, and we refuse to set up related
346 connections for unconfirmed conns. But packet copies and
347 REJECT will give spurious warnings here. */
348 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
349
350 /* No external references means no one else could have
351 confirmed us. */
352 IP_NF_ASSERT(!is_confirmed(ct));
353 DEBUGP("Confirming conntrack %p\n", ct);
354
355 WRITE_LOCK(&ip_conntrack_lock);
356
357 /* See if there's one in the list already, including reverse:
358 NAT could have grabbed it without realizing, since we're
359 not in the hash. If there is, we lost the race. */
360 if (!LIST_FIND(&ip_conntrack_hash[hash],
361 conntrack_tuple_cmp,
362 struct ip_conntrack_tuple_hash *,
363 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
364 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
365 conntrack_tuple_cmp,
366 struct ip_conntrack_tuple_hash *,
367 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
368 /* Remove from unconfirmed list */
369 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
370
371 list_prepend(&ip_conntrack_hash[hash],
372 &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
373 list_prepend(&ip_conntrack_hash[repl_hash],
374 &ct->tuplehash[IP_CT_DIR_REPLY]);
375 /* Timer relative to confirmation time, not original
376 setting time, otherwise we'd get timer wrap in
377 weird delay cases. */
378 ct->timeout.expires += jiffies;
379 add_timer(&ct->timeout);
380 atomic_inc(&ct->ct_general.use);
381 set_bit(IPS_CONFIRMED_BIT, &ct->status);
382 CONNTRACK_STAT_INC(insert);
383 WRITE_UNLOCK(&ip_conntrack_lock);
384 return NF_ACCEPT;
385 }
386
387 CONNTRACK_STAT_INC(insert_failed);
388 WRITE_UNLOCK(&ip_conntrack_lock);
389
390 return NF_DROP;
391}
392
393/* Returns true if a connection corresponds to the tuple (required
394 for NAT). */
395int
396ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
397 const struct ip_conntrack *ignored_conntrack)
398{
399 struct ip_conntrack_tuple_hash *h;
400
401 READ_LOCK(&ip_conntrack_lock);
402 h = __ip_conntrack_find(tuple, ignored_conntrack);
403 READ_UNLOCK(&ip_conntrack_lock);
404
405 return h != NULL;
406}
407
408/* There's a small race here where we may free a just-assured
409 connection. Too bad: we're in trouble anyway. */
410static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
411{
412 return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
413}
414
415static int early_drop(struct list_head *chain)
416{
417 /* Traverse backwards: gives us oldest, which is roughly LRU */
418 struct ip_conntrack_tuple_hash *h;
419 struct ip_conntrack *ct = NULL;
420 int dropped = 0;
421
422 READ_LOCK(&ip_conntrack_lock);
423 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
424 if (h) {
425 ct = tuplehash_to_ctrack(h);
426 atomic_inc(&ct->ct_general.use);
427 }
428 READ_UNLOCK(&ip_conntrack_lock);
429
430 if (!ct)
431 return dropped;
432
433 if (del_timer(&ct->timeout)) {
434 death_by_timeout((unsigned long)ct);
435 dropped = 1;
436 CONNTRACK_STAT_INC(early_drop);
437 }
438 ip_conntrack_put(ct);
439 return dropped;
440}
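/* Note: early_drop() only ever evicts entries that the protocol tracker has
 * not yet marked IPS_ASSURED (see unreplied() above), so a full table sheds
 * new/unreplied flows first and leaves assured, established connections
 * untouched. */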
441
442static inline int helper_cmp(const struct ip_conntrack_helper *i,
443 const struct ip_conntrack_tuple *rtuple)
444{
445 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
446}
447
448static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
449{
450 return LIST_FIND(&helpers, helper_cmp,
451 struct ip_conntrack_helper *,
452 tuple);
453}
454
455/* Allocate a new conntrack: we return -ENOMEM if classification
456 failed due to stress. Otherwise it really is unclassifiable. */
457static struct ip_conntrack_tuple_hash *
458init_conntrack(const struct ip_conntrack_tuple *tuple,
459 struct ip_conntrack_protocol *protocol,
460 struct sk_buff *skb)
461{
462 struct ip_conntrack *conntrack;
463 struct ip_conntrack_tuple repl_tuple;
464 size_t hash;
465 struct ip_conntrack_expect *exp;
466
467 if (!ip_conntrack_hash_rnd_initted) {
468 get_random_bytes(&ip_conntrack_hash_rnd, 4);
469 ip_conntrack_hash_rnd_initted = 1;
470 }
471
472 hash = hash_conntrack(tuple);
473
474 if (ip_conntrack_max
475 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
476 /* Try dropping from this hash chain. */
477 if (!early_drop(&ip_conntrack_hash[hash])) {
478 if (net_ratelimit())
479 printk(KERN_WARNING
480 "ip_conntrack: table full, dropping"
481 " packet.\n");
482 return ERR_PTR(-ENOMEM);
483 }
484 }
485
486 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
487 DEBUGP("Can't invert tuple.\n");
488 return NULL;
489 }
490
491 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
492 if (!conntrack) {
493 DEBUGP("Can't allocate conntrack.\n");
494 return ERR_PTR(-ENOMEM);
495 }
496
497 memset(conntrack, 0, sizeof(*conntrack));
498 atomic_set(&conntrack->ct_general.use, 1);
499 conntrack->ct_general.destroy = destroy_conntrack;
500 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
501 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
502 if (!protocol->new(conntrack, skb)) {
503 kmem_cache_free(ip_conntrack_cachep, conntrack);
504 return NULL;
505 }
506 /* Don't set timer yet: wait for confirmation */
507 init_timer(&conntrack->timeout);
508 conntrack->timeout.data = (unsigned long)conntrack;
509 conntrack->timeout.function = death_by_timeout;
510
511 WRITE_LOCK(&ip_conntrack_lock);
512 exp = find_expectation(tuple);
513
514 if (exp) {
515 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
516 conntrack, exp);
517 /* Welcome, Mr. Bond. We've been expecting you... */
518 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
519 conntrack->master = exp->master;
520#ifdef CONFIG_IP_NF_CONNTRACK_MARK
521 conntrack->mark = exp->master->mark;
522#endif
523 nf_conntrack_get(&conntrack->master->ct_general);
524 CONNTRACK_STAT_INC(expect_new);
525 } else {
526 conntrack->helper = ip_ct_find_helper(&repl_tuple);
527
528 CONNTRACK_STAT_INC(new);
529 }
530
531 /* Overload tuple linked list to put us in unconfirmed list. */
532 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
533
534 atomic_inc(&ip_conntrack_count);
535 WRITE_UNLOCK(&ip_conntrack_lock);
536
537 if (exp) {
538 if (exp->expectfn)
539 exp->expectfn(conntrack, exp);
540 destroy_expect(exp);
541 }
542
543 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
544}
545
546/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
547static inline struct ip_conntrack *
548resolve_normal_ct(struct sk_buff *skb,
549 struct ip_conntrack_protocol *proto,
550 int *set_reply,
551 unsigned int hooknum,
552 enum ip_conntrack_info *ctinfo)
553{
554 struct ip_conntrack_tuple tuple;
555 struct ip_conntrack_tuple_hash *h;
556 struct ip_conntrack *ct;
557
558 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
559
560 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
561 &tuple,proto))
562 return NULL;
563
564 /* look for tuple match */
565 h = ip_conntrack_find_get(&tuple, NULL);
566 if (!h) {
567 h = init_conntrack(&tuple, proto, skb);
568 if (!h)
569 return NULL;
570 if (IS_ERR(h))
571 return (void *)h;
572 }
573 ct = tuplehash_to_ctrack(h);
574
575 /* It exists; we have (non-exclusive) reference. */
576 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
577 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
578 /* Please set reply bit if this packet OK */
579 *set_reply = 1;
580 } else {
581 /* Once we've had two way comms, always ESTABLISHED. */
582 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
583 DEBUGP("ip_conntrack_in: normal packet for %p\n",
584 ct);
585 *ctinfo = IP_CT_ESTABLISHED;
586 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
587 DEBUGP("ip_conntrack_in: related packet for %p\n",
588 ct);
589 *ctinfo = IP_CT_RELATED;
590 } else {
591 DEBUGP("ip_conntrack_in: new packet for %p\n",
592 ct);
593 *ctinfo = IP_CT_NEW;
594 }
595 *set_reply = 0;
596 }
597 skb->nfct = &ct->ct_general;
598 skb->nfctinfo = *ctinfo;
599 return ct;
600}
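/* Summary of the ctinfo values set above:
 *   reply direction              -> IP_CT_ESTABLISHED + IP_CT_IS_REPLY
 *   original dir, reply seen     -> IP_CT_ESTABLISHED
 *   original dir, was expected   -> IP_CT_RELATED
 *   original dir, otherwise      -> IP_CT_NEW
 * These are what the iptables `state' match reports as
 * NEW/RELATED/ESTABLISHED. */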
601
602/* Netfilter hook itself. */
603unsigned int ip_conntrack_in(unsigned int hooknum,
604 struct sk_buff **pskb,
605 const struct net_device *in,
606 const struct net_device *out,
607 int (*okfn)(struct sk_buff *))
608{
609 struct ip_conntrack *ct;
610 enum ip_conntrack_info ctinfo;
611 struct ip_conntrack_protocol *proto;
612 int set_reply;
613 int ret;
614
615 /* Previously seen (loopback or untracked)? Ignore. */
616 if ((*pskb)->nfct) {
617 CONNTRACK_STAT_INC(ignore);
618 return NF_ACCEPT;
619 }
620
621 /* Should never happen: fragments are reassembled before conntrack sees them. */
622 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
623 if (net_ratelimit()) {
624 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
625 (*pskb)->nh.iph->protocol, hooknum);
626 }
627 return NF_DROP;
628 }
629
630 /* FIXME: Do this right please. --RR */
631 (*pskb)->nfcache |= NFC_UNKNOWN;
632
633/* Doesn't cover locally-generated broadcast, so not worth it. */
634#if 0
635 /* Ignore broadcast: no `connection'. */
636 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
637 printk("Broadcast packet!\n");
638 return NF_ACCEPT;
639 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
640 == htonl(0x000000FF)) {
641 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
642 NIPQUAD((*pskb)->nh.iph->saddr),
643 NIPQUAD((*pskb)->nh.iph->daddr),
644 (*pskb)->sk, (*pskb)->pkt_type);
645 }
646#endif
647
648 proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
649
650 /* It may be a special packet: error, unclean...
651 * The inverse of the return code tells the netfilter
652 * core what to do with the packet. */
653 if (proto->error != NULL
654 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
655 CONNTRACK_STAT_INC(error);
656 CONNTRACK_STAT_INC(invalid);
657 return -ret;
658 }
659
660 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
661 /* Not valid part of a connection */
662 CONNTRACK_STAT_INC(invalid);
663 return NF_ACCEPT;
664 }
665
666 if (IS_ERR(ct)) {
667 /* Too stressed to deal. */
668 CONNTRACK_STAT_INC(drop);
669 return NF_DROP;
670 }
671
672 IP_NF_ASSERT((*pskb)->nfct);
673
674 ret = proto->packet(ct, *pskb, ctinfo);
675 if (ret < 0) {
676 /* Invalid: inverse of the return code tells
677 * the netfilter core what to do*/
678 nf_conntrack_put((*pskb)->nfct);
679 (*pskb)->nfct = NULL;
680 CONNTRACK_STAT_INC(invalid);
681 return -ret;
682 }
683
684 if (set_reply)
685 set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
686
687 return ret;
688}
689
690int invert_tuplepr(struct ip_conntrack_tuple *inverse,
691 const struct ip_conntrack_tuple *orig)
692{
693 return ip_ct_invert_tuple(inverse, orig,
694 ip_ct_find_proto(orig->dst.protonum));
695}
696
697/* Would two expected things clash? */
698static inline int expect_clash(const struct ip_conntrack_expect *a,
699 const struct ip_conntrack_expect *b)
700{
701 /* Part covered by intersection of masks must be unequal,
702 otherwise they clash */
703 struct ip_conntrack_tuple intersect_mask
704 = { { a->mask.src.ip & b->mask.src.ip,
705 { a->mask.src.u.all & b->mask.src.u.all } },
706 { a->mask.dst.ip & b->mask.dst.ip,
707 { a->mask.dst.u.all & b->mask.dst.u.all },
708 a->mask.dst.protonum & b->mask.dst.protonum } };
709
710 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
711}
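/* Worked example (illustrative values): expectation A covers
 * 10.0.0.1 -> 10.0.0.2:2021/tcp with a full mask, while expectation B
 * covers 10.0.0.1 -> 10.0.0.2:any (dst port mask 0).  The intersection
 * mask has the port bits cleared, so only the addresses and protocol are
 * compared; they are equal, hence the two expectations clash. */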
712
713static inline int expect_matches(const struct ip_conntrack_expect *a,
714 const struct ip_conntrack_expect *b)
715{
716 return a->master == b->master
717 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
718 && ip_ct_tuple_equal(&a->mask, &b->mask);
719}
720
721/* Generally a bad idea to call this: could have matched already. */
722void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
723{
724 struct ip_conntrack_expect *i;
725
726 WRITE_LOCK(&ip_conntrack_lock);
727 /* choose the oldest expectation to evict */
728 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
729 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
730 unlink_expect(i);
731 WRITE_UNLOCK(&ip_conntrack_lock);
732 destroy_expect(i);
733 return;
734 }
735 }
736 WRITE_UNLOCK(&ip_conntrack_lock);
737}
738
739struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
740{
741 struct ip_conntrack_expect *new;
742
743 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
744 if (!new) {
745 DEBUGP("expect_related: OOM allocating expect\n");
746 return NULL;
747 }
748 new->master = NULL;
749 return new;
750}
751
752void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
753{
754 kmem_cache_free(ip_conntrack_expect_cachep, expect);
755}
756
757static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
758{
759 atomic_inc(&exp->master->ct_general.use);
760 exp->master->expecting++;
761 list_add(&exp->list, &ip_conntrack_expect_list);
762
763 if (exp->master->helper->timeout) {
764 init_timer(&exp->timeout);
765 exp->timeout.data = (unsigned long)exp;
766 exp->timeout.function = expectation_timed_out;
767 exp->timeout.expires
768 = jiffies + exp->master->helper->timeout * HZ;
769 add_timer(&exp->timeout);
770 } else
771 exp->timeout.function = NULL;
772
773 CONNTRACK_STAT_INC(expect_create);
774}
775
776/* Race with expectations being used means we could have none to find; OK. */
777static void evict_oldest_expect(struct ip_conntrack *master)
778{
779 struct ip_conntrack_expect *i;
780
781 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
782 if (i->master == master) {
783 if (del_timer(&i->timeout)) {
784 unlink_expect(i);
785 destroy_expect(i);
786 }
787 break;
788 }
789 }
790}
791
792static inline int refresh_timer(struct ip_conntrack_expect *i)
793{
794 if (!del_timer(&i->timeout))
795 return 0;
796
797 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
798 add_timer(&i->timeout);
799 return 1;
800}
801
802int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
803{
804 struct ip_conntrack_expect *i;
805 int ret;
806
807 DEBUGP("ip_conntrack_expect_related %p\n", expect);
808 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
809 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
810
811 WRITE_LOCK(&ip_conntrack_lock);
812 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
813 if (expect_matches(i, expect)) {
814 /* Refresh timer: if it's dying, ignore.. */
815 if (refresh_timer(i)) {
816 ret = 0;
817 /* We don't need the one they've given us. */
818 ip_conntrack_expect_free(expect);
819 goto out;
820 }
821 } else if (expect_clash(i, expect)) {
822 ret = -EBUSY;
823 goto out;
824 }
825 }
826
827 /* Will be over limit? */
828 if (expect->master->helper->max_expected &&
829 expect->master->expecting >= expect->master->helper->max_expected)
830 evict_oldest_expect(expect->master);
831
832 ip_conntrack_expect_insert(expect);
833 ret = 0;
834out:
835 WRITE_UNLOCK(&ip_conntrack_lock);
836 return ret;
837}
838
839/* Alter reply tuple (maybe alter helper). This is for NAT, and is
840 implicitly racy: see __ip_conntrack_confirm */
841void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
842 const struct ip_conntrack_tuple *newreply)
843{
844 WRITE_LOCK(&ip_conntrack_lock);
845 /* Should be unconfirmed, so not in hash table yet */
846 IP_NF_ASSERT(!is_confirmed(conntrack));
847
848 DEBUGP("Altering reply tuple of %p to ", conntrack);
849 DUMP_TUPLE(newreply);
850
851 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
852 if (!conntrack->master && conntrack->expecting == 0)
853 conntrack->helper = ip_ct_find_helper(newreply);
854 WRITE_UNLOCK(&ip_conntrack_lock);
855}
856
857int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
858{
859 BUG_ON(me->timeout == 0);
860 WRITE_LOCK(&ip_conntrack_lock);
861 list_prepend(&helpers, me);
862 WRITE_UNLOCK(&ip_conntrack_lock);
863
864 return 0;
865}
866
867static inline int unhelp(struct ip_conntrack_tuple_hash *i,
868 const struct ip_conntrack_helper *me)
869{
870 if (tuplehash_to_ctrack(i)->helper == me)
871 tuplehash_to_ctrack(i)->helper = NULL;
872 return 0;
873}
874
875void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
876{
877 unsigned int i;
878 struct ip_conntrack_expect *exp, *tmp;
879
880 /* Need write lock here, to delete helper. */
881 WRITE_LOCK(&ip_conntrack_lock);
882 LIST_DELETE(&helpers, me);
883
884 /* Get rid of expectations */
885 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
886 if (exp->master->helper == me && del_timer(&exp->timeout)) {
887 unlink_expect(exp);
888 destroy_expect(exp);
889 }
890 }
891 /* Clear the helper pointer of any remaining conntracks (unconfirmed and hashed). */
892 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
893 for (i = 0; i < ip_conntrack_htable_size; i++)
894 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
895 struct ip_conntrack_tuple_hash *, me);
896 WRITE_UNLOCK(&ip_conntrack_lock);
897
898 /* Someone could be still looking at the helper in a bh. */
899 synchronize_net();
900}
901
902static inline void ct_add_counters(struct ip_conntrack *ct,
903 enum ip_conntrack_info ctinfo,
904 const struct sk_buff *skb)
905{
906#ifdef CONFIG_IP_NF_CT_ACCT
907 if (skb) {
908 ct->counters[CTINFO2DIR(ctinfo)].packets++;
909 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
910 ntohs(skb->nh.iph->tot_len);
911 }
912#endif
913}
914
915/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
916void ip_ct_refresh_acct(struct ip_conntrack *ct,
917 enum ip_conntrack_info ctinfo,
918 const struct sk_buff *skb,
919 unsigned long extra_jiffies)
920{
921 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
922
923 /* If not in hash table, timer will not be active yet */
924 if (!is_confirmed(ct)) {
925 ct->timeout.expires = extra_jiffies;
926 ct_add_counters(ct, ctinfo, skb);
927 } else {
928 WRITE_LOCK(&ip_conntrack_lock);
929 /* Need del_timer for race avoidance (may already be dying). */
930 if (del_timer(&ct->timeout)) {
931 ct->timeout.expires = jiffies + extra_jiffies;
932 add_timer(&ct->timeout);
933 }
934 ct_add_counters(ct, ctinfo, skb);
935 WRITE_UNLOCK(&ip_conntrack_lock);
936 }
937}
938
939/* Returns new sk_buff, or NULL */
940struct sk_buff *
941ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
942{
943 struct sock *sk = skb->sk;
944#ifdef CONFIG_NETFILTER_DEBUG
945 unsigned int olddebug = skb->nf_debug;
946#endif
947
948 if (sk) {
949 sock_hold(sk);
950 skb_orphan(skb);
951 }
952
953 local_bh_disable();
954 skb = ip_defrag(skb, user);
955 local_bh_enable();
956
957 if (!skb) {
958 if (sk)
959 sock_put(sk);
960 return skb;
961 }
962
963 if (sk) {
964 skb_set_owner_w(skb, sk);
965 sock_put(sk);
966 }
967
968 ip_send_check(skb->nh.iph);
969 skb->nfcache |= NFC_ALTERED;
970#ifdef CONFIG_NETFILTER_DEBUG
971 /* Packet path as if nothing had happened. */
972 skb->nf_debug = olddebug;
973#endif
974 return skb;
975}
976
977/* Used by ipt_REJECT. */
978static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
979{
980 struct ip_conntrack *ct;
981 enum ip_conntrack_info ctinfo;
982
983 /* This ICMP is in reverse direction to the packet which caused it */
984 ct = ip_conntrack_get(skb, &ctinfo);
985
986 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
987 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
988 else
989 ctinfo = IP_CT_RELATED;
990
991 /* Attach to new skbuff, and increment count */
992 nskb->nfct = &ct->ct_general;
993 nskb->nfctinfo = ctinfo;
994 nf_conntrack_get(nskb->nfct);
995}
996
997static inline int
998do_iter(const struct ip_conntrack_tuple_hash *i,
999 int (*iter)(struct ip_conntrack *i, void *data),
1000 void *data)
1001{
1002 return iter(tuplehash_to_ctrack(i), data);
1003}
1004
1005/* Bring out ya dead! */
1006static struct ip_conntrack_tuple_hash *
1007get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1008 void *data, unsigned int *bucket)
1009{
1010 struct ip_conntrack_tuple_hash *h = NULL;
1011
1012 WRITE_LOCK(&ip_conntrack_lock);
1013 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1014 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1015 struct ip_conntrack_tuple_hash *, iter, data);
1016 if (h)
1017 break;
1018 }
1019 if (!h)
1020 h = LIST_FIND_W(&unconfirmed, do_iter,
1021 struct ip_conntrack_tuple_hash *, iter, data);
1022 if (h)
1023 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1024 WRITE_UNLOCK(&ip_conntrack_lock);
1025
1026 return h;
1027}
1028
1029void
1030ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1031{
1032 struct ip_conntrack_tuple_hash *h;
1033 unsigned int bucket = 0;
1034
1035 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1036 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1037 /* Time to push up daises... */
1038 if (del_timer(&ct->timeout))
1039 death_by_timeout((unsigned long)ct);
1040 /* ... else the timer will get him soon. */
1041
1042 ip_conntrack_put(ct);
1043 }
1044}
1045
1046/* Fast function for those who don't want to parse /proc (and I don't
1047 blame them). */
1048/* Reversing the socket's dst/src point of view gives us the reply
1049 mapping. */
1050static int
1051getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1052{
1053 struct inet_sock *inet = inet_sk(sk);
1054 struct ip_conntrack_tuple_hash *h;
1055 struct ip_conntrack_tuple tuple;
1056
1057 IP_CT_TUPLE_U_BLANK(&tuple);
1058 tuple.src.ip = inet->rcv_saddr;
1059 tuple.src.u.tcp.port = inet->sport;
1060 tuple.dst.ip = inet->daddr;
1061 tuple.dst.u.tcp.port = inet->dport;
1062 tuple.dst.protonum = IPPROTO_TCP;
1063
1064 /* We only do TCP at the moment: is there a better way? */
1065 if (strcmp(sk->sk_prot->name, "TCP")) {
1066 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1067 return -ENOPROTOOPT;
1068 }
1069
1070 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1071 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1072 *len, sizeof(struct sockaddr_in));
1073 return -EINVAL;
1074 }
1075
1076 h = ip_conntrack_find_get(&tuple, NULL);
1077 if (h) {
1078 struct sockaddr_in sin;
1079 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1080
1081 sin.sin_family = AF_INET;
1082 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1083 .tuple.dst.u.tcp.port;
1084 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1085 .tuple.dst.ip;
1086
1087 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1088 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1089 ip_conntrack_put(ct);
1090 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1091 return -EFAULT;
1092 else
1093 return 0;
1094 }
1095 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1096 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1097 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1098 return -ENOENT;
1099}
1100
1101static struct nf_sockopt_ops so_getorigdst = {
1102 .pf = PF_INET,
1103 .get_optmin = SO_ORIGINAL_DST,
1104 .get_optmax = SO_ORIGINAL_DST+1,
1105 .get = &getorigdst,
1106};
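/* Typical userspace use of this socket option (sketch, not part of this
 * file): a transparent proxy that accepted a REDIRECTed connection can
 * recover the pre-NAT destination with something like
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *	getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len);
 *
 * which reaches getorigdst() above through the netfilter sockopt
 * dispatcher. */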
1107
1108static int kill_all(struct ip_conntrack *i, void *data)
1109{
1110 return 1;
1111}
1112
1113static void free_conntrack_hash(void)
1114{
1115 if (ip_conntrack_vmalloc)
1116 vfree(ip_conntrack_hash);
1117 else
1118 free_pages((unsigned long)ip_conntrack_hash,
1119 get_order(sizeof(struct list_head)
1120 * ip_conntrack_htable_size));
1121}
1122
1123/* Mishearing the voices in his head, our hero wonders how he's
1124 supposed to kill the mall. */
1125void ip_conntrack_cleanup(void)
1126{
1127 ip_ct_attach = NULL;
1128 /* This makes sure all current packets have passed through
1129 netfilter framework. Roll on, two-stage module
1130 delete... */
1131 synchronize_net();
1132
1133 i_see_dead_people:
1134 ip_ct_iterate_cleanup(kill_all, NULL);
1135 if (atomic_read(&ip_conntrack_count) != 0) {
1136 schedule();
1137 goto i_see_dead_people;
1138 }
1139
1140 kmem_cache_destroy(ip_conntrack_cachep);
1141 kmem_cache_destroy(ip_conntrack_expect_cachep);
1142 free_conntrack_hash();
1143 nf_unregister_sockopt(&so_getorigdst);
1144}
1145
1146static int hashsize;
1147module_param(hashsize, int, 0400);
1148
1149int __init ip_conntrack_init(void)
1150{
1151 unsigned int i;
1152 int ret;
1153
1154 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1155 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1156 if (hashsize) {
1157 ip_conntrack_htable_size = hashsize;
1158 } else {
1159 ip_conntrack_htable_size
1160 = (((num_physpages << PAGE_SHIFT) / 16384)
1161 / sizeof(struct list_head));
1162 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1163 ip_conntrack_htable_size = 8192;
1164 if (ip_conntrack_htable_size < 16)
1165 ip_conntrack_htable_size = 16;
1166 }
1167 ip_conntrack_max = 8 * ip_conntrack_htable_size;
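/* Worked example of the sizing above: a 32 MB i386 box has
 * num_physpages << PAGE_SHIFT == 32 MB; 32 MB / 16384 == 2048 bytes of
 * bucket space; with an 8-byte struct list_head that is 256 buckets, and
 * ip_conntrack_max == 8 * 256 == 2048 tracked connections, matching the
 * comment above. */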
1168
1169 printk("ip_conntrack version %s (%u buckets, %d max)"
1170 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1171 ip_conntrack_htable_size, ip_conntrack_max,
1172 sizeof(struct ip_conntrack));
1173
1174 ret = nf_register_sockopt(&so_getorigdst);
1175 if (ret != 0) {
1176 printk(KERN_ERR "Unable to register netfilter socket option\n");
1177 return ret;
1178 }
1179
1180 /* AK: the hash table is twice as big as needed because it
1181 uses list_head. It would be much friendlier to the caches to
1182 use a single-pointer list head here. */
1183 ip_conntrack_vmalloc = 0;
1184 ip_conntrack_hash
1185 =(void*)__get_free_pages(GFP_KERNEL,
1186 get_order(sizeof(struct list_head)
1187 *ip_conntrack_htable_size));
1188 if (!ip_conntrack_hash) {
1189 ip_conntrack_vmalloc = 1;
1190 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1191 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1192 * ip_conntrack_htable_size);
1193 }
1194 if (!ip_conntrack_hash) {
1195 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1196 goto err_unreg_sockopt;
1197 }
1198
1199 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1200 sizeof(struct ip_conntrack), 0,
1201 0, NULL, NULL);
1202 if (!ip_conntrack_cachep) {
1203 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1204 goto err_free_hash;
1205 }
1206
1207 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1208 sizeof(struct ip_conntrack_expect),
1209 0, 0, NULL, NULL);
1210 if (!ip_conntrack_expect_cachep) {
1211 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1212 goto err_free_conntrack_slab;
1213 }
1214
1215 /* Don't NEED lock here, but good form anyway. */
1216 WRITE_LOCK(&ip_conntrack_lock);
1217 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1218 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1219 /* Sew in builtin protocols. */
1220 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1221 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1222 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1223 WRITE_UNLOCK(&ip_conntrack_lock);
1224
1225 for (i = 0; i < ip_conntrack_htable_size; i++)
1226 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1227
1228 /* For use by ipt_REJECT */
1229 ip_ct_attach = ip_conntrack_attach;
1230
1231 /* Set up fake conntrack:
1232 - to never be deleted, not in any hashes */
1233 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1234 /* - and make it look like a confirmed connection */
1235 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1236
1237 return ret;
1238
1239err_free_conntrack_slab:
1240 kmem_cache_destroy(ip_conntrack_cachep);
1241err_free_hash:
1242 free_conntrack_hash();
1243err_unreg_sockopt:
1244 nf_unregister_sockopt(&so_getorigdst);
1245
1246 return -ENOMEM;
1247}
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
new file mode 100644
index 000000000000..12b88cbb11db
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -0,0 +1,501 @@
1/* FTP extension for IP connection tracking. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/config.h>
12#include <linux/module.h>
13#include <linux/netfilter.h>
14#include <linux/ip.h>
15#include <linux/ctype.h>
16#include <net/checksum.h>
17#include <net/tcp.h>
18
19#include <linux/netfilter_ipv4/lockhelp.h>
20#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
21#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
22#include <linux/moduleparam.h>
23
24MODULE_LICENSE("GPL");
25MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
26MODULE_DESCRIPTION("ftp connection tracking helper");
27
28/* This is slow, but it's simple. --RR */
29static char ftp_buffer[65536];
30
31static DECLARE_LOCK(ip_ftp_lock);
32
33#define MAX_PORTS 8
34static int ports[MAX_PORTS];
35static int ports_c;
36module_param_array(ports, int, &ports_c, 0400);
37
38static int loose;
39module_param(loose, int, 0600);
40
41unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb,
42 enum ip_conntrack_info ctinfo,
43 enum ip_ct_ftp_type type,
44 unsigned int matchoff,
45 unsigned int matchlen,
46 struct ip_conntrack_expect *exp,
47 u32 *seq);
48EXPORT_SYMBOL_GPL(ip_nat_ftp_hook);
49
50#if 0
51#define DEBUGP printk
52#else
53#define DEBUGP(format, args...)
54#endif
55
56static int try_rfc959(const char *, size_t, u_int32_t [], char);
57static int try_eprt(const char *, size_t, u_int32_t [], char);
58static int try_epsv_response(const char *, size_t, u_int32_t [], char);
59
60static struct ftp_search {
61 enum ip_conntrack_dir dir;
62 const char *pattern;
63 size_t plen;
64 char skip;
65 char term;
66 enum ip_ct_ftp_type ftptype;
67 int (*getnum)(const char *, size_t, u_int32_t[], char);
68} search[] = {
69 {
70 IP_CT_DIR_ORIGINAL,
71 "PORT", sizeof("PORT") - 1, ' ', '\r',
72 IP_CT_FTP_PORT,
73 try_rfc959,
74 },
75 {
76 IP_CT_DIR_REPLY,
77 "227 ", sizeof("227 ") - 1, '(', ')',
78 IP_CT_FTP_PASV,
79 try_rfc959,
80 },
81 {
82 IP_CT_DIR_ORIGINAL,
83 "EPRT", sizeof("EPRT") - 1, ' ', '\r',
84 IP_CT_FTP_EPRT,
85 try_eprt,
86 },
87 {
88 IP_CT_DIR_REPLY,
89 "229 ", sizeof("229 ") - 1, '(', ')',
90 IP_CT_FTP_EPSV,
91 try_epsv_response,
92 },
93};
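/* Example matches for the table above (illustrative traffic):
 *   client: "PORT 10,0,0,1,31,64\r\n"                         -> IP_CT_FTP_PORT
 *   server: "227 Entering Passive Mode (10,0,0,1,31,64)"      -> IP_CT_FTP_PASV
 *   client: "EPRT |1|10.0.0.1|8000|\r\n"                      -> IP_CT_FTP_EPRT
 *   server: "229 Entering Extended Passive Mode (|||8000|)"   -> IP_CT_FTP_EPSV
 * In each case the text between `skip' and `term' is handed to the entry's
 * getnum() parser. */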
94
95static int try_number(const char *data, size_t dlen, u_int32_t array[],
96 int array_size, char sep, char term)
97{
98 u_int32_t i, len;
99
100 memset(array, 0, sizeof(array[0])*array_size);
101
102 /* Keep data pointing at next char. */
103 for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) {
104 if (*data >= '0' && *data <= '9') {
105 array[i] = array[i]*10 + *data - '0';
106 }
107 else if (*data == sep)
108 i++;
109 else {
110 /* Unexpected character; true if it's the
111 terminator and we're finished. */
112 if (*data == term && i == array_size - 1)
113 return len;
114
115 DEBUGP("Char %u (got %u nums) `%u' unexpected\n",
116 len, i, *data);
117 return 0;
118 }
119 }
120 DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep);
121
122 return 0;
123}
124
125/* Returns 0, or length of numbers: 192,168,1,1,5,6 */
126static int try_rfc959(const char *data, size_t dlen, u_int32_t array[6],
127 char term)
128{
129 return try_number(data, dlen, array, 6, ',', term);
130}
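/* Example: for "192,168,1,1,5,6" the array becomes
 * { 192, 168, 1, 1, 5, 6 }: array[0..3] is the IP address and the port is
 * array[4] * 256 + array[5] == 1286, which is how help() below rebuilds
 * the expected tuple. */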
131
132/* Grab port: number up to delimiter */
133static int get_port(const char *data, int start, size_t dlen, char delim,
134 u_int32_t array[2])
135{
136 u_int16_t port = 0;
137 int i;
138
139 for (i = start; i < dlen; i++) {
140 /* Finished? */
141 if (data[i] == delim) {
142 if (port == 0)
143 break;
144 array[0] = port >> 8;
145 array[1] = port;
146 return i + 1;
147 }
148 else if (data[i] >= '0' && data[i] <= '9')
149 port = port*10 + data[i] - '0';
150 else /* Some other crap */
151 break;
152 }
153 return 0;
154}
155
156/* Returns 0, or length of numbers: |1|132.235.1.2|6275| */
157static int try_eprt(const char *data, size_t dlen, u_int32_t array[6],
158 char term)
159{
160 char delim;
161 int length;
162
163 /* First character is delimiter, then "1" for IPv4, then
164 delimiter again. */
165 if (dlen <= 3) return 0;
166 delim = data[0];
167 if (isdigit(delim) || delim < 33 || delim > 126
168 || data[1] != '1' || data[2] != delim)
169 return 0;
170
171 DEBUGP("EPRT: Got |1|!\n");
172 /* Now we have IP address. */
173 length = try_number(data + 3, dlen - 3, array, 4, '.', delim);
174 if (length == 0)
175 return 0;
176
177 DEBUGP("EPRT: Got IP address!\n");
178 /* Start offset includes initial "|1|", and trailing delimiter */
179 return get_port(data, 3 + length + 1, dlen, delim, array+4);
180}
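/* Example: "|1|132.235.1.2|6275|" yields array[0..3] = { 132, 235, 1, 2 },
 * and get_port() stores 6275 as array[4] = 24, array[5] = 131
 * (24 * 256 + 131 == 6275). */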
181
182/* Returns 0, or length of numbers: |||6446| */
183static int try_epsv_response(const char *data, size_t dlen, u_int32_t array[6],
184 char term)
185{
186 char delim;
187
188 /* Three delimiters. */
189 if (dlen <= 3) return 0;
190 delim = data[0];
191 if (isdigit(delim) || delim < 33 || delim > 126
192 || data[1] != delim || data[2] != delim)
193 return 0;
194
195 return get_port(data, 3, dlen, delim, array+4);
196}
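/* Example: "|||6446|" only carries a port, so get_port() fills
 * array[4] = 25, array[5] = 46 (25 * 256 + 46 == 6446); the IP part of the
 * array is pre-filled from the connection tuple in help() below, since
 * EPSV responses do not mention an address. */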
197
198/* Return 1 for match, 0 for no match (caller accepts), -1 for partial match. */
199static int find_pattern(const char *data, size_t dlen,
200 const char *pattern, size_t plen,
201 char skip, char term,
202 unsigned int *numoff,
203 unsigned int *numlen,
204 u_int32_t array[6],
205 int (*getnum)(const char *, size_t, u_int32_t[], char))
206{
207 size_t i;
208
209 DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen);
210 if (dlen == 0)
211 return 0;
212
213 if (dlen <= plen) {
214 /* Short packet: try for partial? */
215 if (strnicmp(data, pattern, dlen) == 0)
216 return -1;
217 else return 0;
218 }
219
220 if (strnicmp(data, pattern, plen) != 0) {
221#if 0
222 size_t i;
223
224 DEBUGP("ftp: string mismatch\n");
225 for (i = 0; i < plen; i++) {
226 DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
227 i, data[i], data[i],
228 pattern[i], pattern[i]);
229 }
230#endif
231 return 0;
232 }
233
234 DEBUGP("Pattern matches!\n");
235 /* Now we've found the constant string, try to skip
236 to the 'skip' character */
237 for (i = plen; data[i] != skip; i++)
238 if (i == dlen - 1) return -1;
239
240 /* Skip over the last character */
241 i++;
242
243 DEBUGP("Skipped up to `%c'!\n", skip);
244
245 *numoff = i;
246 *numlen = getnum(data + i, dlen - i, array, term);
247 if (!*numlen)
248 return -1;
249
250 DEBUGP("Match succeeded!\n");
251 return 1;
252}
253
254/* Look up to see if we're just after a \n. */
255static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
256{
257 unsigned int i;
258
259 for (i = 0; i < info->seq_aft_nl_num[dir]; i++)
260 if (info->seq_aft_nl[dir][i] == seq)
261 return 1;
262 return 0;
263}
264
265/* We don't update if it's older than what we have. */
266static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
267{
268 unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
269
270 /* Look for oldest: if we find exact match, we're done. */
271 for (i = 0; i < info->seq_aft_nl_num[dir]; i++) {
272 if (info->seq_aft_nl[dir][i] == nl_seq)
273 return;
274
275 if (oldest == info->seq_aft_nl_num[dir]
276 || before(info->seq_aft_nl[dir][i], oldest))
277 oldest = i;
278 }
279
280 if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER)
281 info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
282 else if (oldest != NUM_SEQ_TO_REMEMBER)
283 info->seq_aft_nl[dir][oldest] = nl_seq;
284}
285
286static int help(struct sk_buff **pskb,
287 struct ip_conntrack *ct,
288 enum ip_conntrack_info ctinfo)
289{
290 unsigned int dataoff, datalen;
291 struct tcphdr _tcph, *th;
292 char *fb_ptr;
293 int ret;
294 u32 seq, array[6] = { 0 };
295 int dir = CTINFO2DIR(ctinfo);
296 unsigned int matchlen, matchoff;
297 struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info;
298 struct ip_conntrack_expect *exp;
299 unsigned int i;
300 int found = 0, ends_in_nl;
301
302 /* Until there's been traffic both ways, don't look in packets. */
303 if (ctinfo != IP_CT_ESTABLISHED
304 && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
305 DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
306 return NF_ACCEPT;
307 }
308
309 th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
310 sizeof(_tcph), &_tcph);
311 if (th == NULL)
312 return NF_ACCEPT;
313
314 dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
315 /* No data? */
316 if (dataoff >= (*pskb)->len) {
317 DEBUGP("ftp: pskblen = %u\n", (*pskb)->len);
318 return NF_ACCEPT;
319 }
320 datalen = (*pskb)->len - dataoff;
321
322 LOCK_BH(&ip_ftp_lock);
323 fb_ptr = skb_header_pointer(*pskb, dataoff,
324 (*pskb)->len - dataoff, ftp_buffer);
325 BUG_ON(fb_ptr == NULL);
326
327 ends_in_nl = (fb_ptr[datalen - 1] == '\n');
328 seq = ntohl(th->seq) + datalen;
329
330 /* Look up to see if we're just after a \n. */
331 if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
332 /* Now if this ends in \n, update ftp info. */
333 DEBUGP("ip_conntrack_ftp_help: seq %u is not right after "
334 "a cached newline for this direction; ignoring\n",
335 ntohl(th->seq));
336 ret = NF_ACCEPT;
337 goto out_update_nl;
338 }
339
340 /* Initialize IP array to expected address (it's not mentioned
341 in EPSV responses) */
342 array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF;
343 array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF;
344 array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF;
345 array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF;
346
347 for (i = 0; i < ARRAY_SIZE(search); i++) {
348 if (search[i].dir != dir) continue;
349
350 found = find_pattern(fb_ptr, (*pskb)->len - dataoff,
351 search[i].pattern,
352 search[i].plen,
353 search[i].skip,
354 search[i].term,
355 &matchoff, &matchlen,
356 array,
357 search[i].getnum);
358 if (found) break;
359 }
360 if (found == -1) {
361 /* We don't usually drop packets. After all, this is
362 connection tracking, not packet filtering.
363 However, it is necessary for accurate tracking in
364 this case. */
365 if (net_ratelimit())
366 printk("conntrack_ftp: partial %s %u+%u\n",
367 search[i].pattern,
368 ntohl(th->seq), datalen);
369 ret = NF_DROP;
370 goto out;
371 } else if (found == 0) { /* No match */
372 ret = NF_ACCEPT;
373 goto out_update_nl;
374 }
375
376 DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n",
377 fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
378
379 /* Allocate expectation which will be inserted */
380 exp = ip_conntrack_expect_alloc();
381 if (exp == NULL) {
382 ret = NF_DROP;
383 goto out;
384 }
385
386 /* We refer to the reverse direction ("!dir") tuples here,
387 * because we're expecting something in the other direction.
388 * Doesn't matter unless NAT is happening. */
389 exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
390
391 if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3])
392 != ct->tuplehash[dir].tuple.src.ip) {
393 /* Enrico Scholz's passive FTP to partially RNAT'd ftp
394 server: it really wants us to connect to a
395 different IP address. Simply don't record it for
396 NAT. */
397 DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n",
398 array[0], array[1], array[2], array[3],
399 NIPQUAD(ct->tuplehash[dir].tuple.src.ip));
400
401 /* Thanks to Cristiano Lincoln Mattos
402 <lincoln@cesar.org.br> for reporting this potential
403 problem (DMZ machines opening holes to internal
404 networks, or the packet filter itself). */
405 if (!loose) {
406 ret = NF_ACCEPT;
407 ip_conntrack_expect_free(exp);
408 goto out_update_nl;
409 }
410 exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
411 | (array[2] << 8) | array[3]);
412 }
413
414 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
415 exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]);
416 exp->tuple.src.u.tcp.port = 0; /* Don't care. */
417 exp->tuple.dst.protonum = IPPROTO_TCP;
418 exp->mask = ((struct ip_conntrack_tuple)
419 { { 0xFFFFFFFF, { 0 } },
420 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
421
422 exp->expectfn = NULL;
423 exp->master = ct;
424
425 /* Now, NAT might want to mangle the packet, and register the
426 * (possibly changed) expectation itself. */
427 if (ip_nat_ftp_hook)
428 ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
429 matchoff, matchlen, exp, &seq);
430 else {
431 /* Can't expect this? Best to drop packet now. */
432 if (ip_conntrack_expect_related(exp) != 0) {
433 ip_conntrack_expect_free(exp);
434 ret = NF_DROP;
435 } else
436 ret = NF_ACCEPT;
437 }
438
439out_update_nl:
440 /* Now if this ends in \n, update ftp info. Seq may have been
441 * adjusted by NAT code. */
442 if (ends_in_nl)
443 update_nl_seq(seq, ct_ftp_info,dir);
444 out:
445 UNLOCK_BH(&ip_ftp_lock);
446 return ret;
447}
448
449static struct ip_conntrack_helper ftp[MAX_PORTS];
450static char ftp_names[MAX_PORTS][10];
451
452/* Not __exit: called from init() */
453static void fini(void)
454{
455 int i;
456 for (i = 0; i < ports_c; i++) {
457 DEBUGP("ip_ct_ftp: unregistering helper for port %d\n",
458 ports[i]);
459 ip_conntrack_helper_unregister(&ftp[i]);
460 }
461}
462
463static int __init init(void)
464{
465 int i, ret;
466 char *tmpname;
467
468 if (ports_c == 0)
469 ports[ports_c++] = FTP_PORT;
470
471 for (i = 0; i < ports_c; i++) {
472 ftp[i].tuple.src.u.tcp.port = htons(ports[i]);
473 ftp[i].tuple.dst.protonum = IPPROTO_TCP;
474 ftp[i].mask.src.u.tcp.port = 0xFFFF;
475 ftp[i].mask.dst.protonum = 0xFF;
476 ftp[i].max_expected = 1;
477 ftp[i].timeout = 5 * 60; /* 5 minutes */
478 ftp[i].me = THIS_MODULE;
479 ftp[i].help = help;
480
481 tmpname = &ftp_names[i][0];
482 if (ports[i] == FTP_PORT)
483 sprintf(tmpname, "ftp");
484 else
485 sprintf(tmpname, "ftp-%d", ports[i]);
486 ftp[i].name = tmpname;
487
488 DEBUGP("ip_ct_ftp: registering helper for port %d\n",
489 ports[i]);
490 ret = ip_conntrack_helper_register(&ftp[i]);
491
492 if (ret) {
493 fini();
494 return ret;
495 }
496 }
497 return 0;
498}
499
500module_init(init);
501module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
new file mode 100644
index 000000000000..33cc7348b6ee
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -0,0 +1,313 @@
1/* IRC extension for IP connection tracking, Version 1.21
2 * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
3 * based on RR's ip_conntrack_ftp.c
4 *
5 * ip_conntrack_irc.c,v 1.21 2002/02/05 14:49:26 laforge Exp
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Module load syntax:
13 * insmod ip_conntrack_irc.o ports=port1,port2,...port<MAX_PORTS>
14 * max_dcc_channels=n dcc_timeout=secs
15 *
16 * Please give the ports of all IRC servers you wish to connect to.
17 * If you don't specify ports, the default will be port 6667.
18 * With max_dcc_channels you can define the maximum number of
19 * not-yet-answered DCC channels per IRC session (default 8).
20 * With dcc_timeout you can specify how long the system waits for
21 * an expected DCC channel (default 300 seconds).
22 *
23 */
24
25#include <linux/config.h>
26#include <linux/module.h>
27#include <linux/netfilter.h>
28#include <linux/ip.h>
29#include <net/checksum.h>
30#include <net/tcp.h>
31
32#include <linux/netfilter_ipv4/lockhelp.h>
33#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
34#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
35#include <linux/moduleparam.h>
36
37#define MAX_PORTS 8
38static int ports[MAX_PORTS];
39static int ports_c;
40static int max_dcc_channels = 8;
41static unsigned int dcc_timeout = 300;
42/* This is slow, but it's simple. --RR */
43static char irc_buffer[65536];
44static DECLARE_LOCK(irc_buffer_lock);
45
46unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
47 enum ip_conntrack_info ctinfo,
48 unsigned int matchoff,
49 unsigned int matchlen,
50 struct ip_conntrack_expect *exp);
51EXPORT_SYMBOL_GPL(ip_nat_irc_hook);
52
53MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
54MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
55MODULE_LICENSE("GPL");
56module_param_array(ports, int, &ports_c, 0400);
57MODULE_PARM_DESC(ports, "port numbers of IRC servers");
58module_param(max_dcc_channels, int, 0400);
59MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session");
60module_param(dcc_timeout, int, 0400);
61MODULE_PARM_DESC(dcc_timeout, "timeout (in seconds) for unestablished DCC channels");
62
63static char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " };
64#define MINMATCHLEN 5
65
66#if 0
67#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s:" format, \
68 __FILE__, __FUNCTION__ , ## args)
69#else
70#define DEBUGP(format, args...)
71#endif
72
73static int parse_dcc(char *data, char *data_end, u_int32_t *ip,
74 u_int16_t *port, char **ad_beg_p, char **ad_end_p)
75/* tries to get the ip_addr and port out of a dcc command
76 return value: -1 on failure, 0 on success
77 data pointer to first byte of DCC command data
78 data_end pointer to last byte of dcc command data
79 ip returns parsed ip of dcc command
80 port returns parsed port of dcc command
81 ad_beg_p returns pointer to first byte of addr data
82 ad_end_p returns pointer to last byte of addr data */
83{
84
85 /* at least 12: "AAAAAAAA P\1\n" */
86 while (*data++ != ' ')
87 if (data > data_end - 12)
88 return -1;
89
90 *ad_beg_p = data;
91 *ip = simple_strtoul(data, &data, 10);
92
93 /* skip blanks between ip and port */
94 while (*data == ' ') {
95 if (data >= data_end)
96 return -1;
97 data++;
98 }
99
100 *port = simple_strtoul(data, &data, 10);
101 *ad_end_p = data;
102
103 return 0;
104}
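/* Example (illustrative values): for the request
 * "\1DCC SEND file.txt 3232235778 6881\1" the helper below hands this
 * function the text after "SEND "; it skips the file name, then returns
 * ip = 3232235778 (192.168.1.2 in host byte order) and port = 6881, with
 * *ad_beg_p/*ad_end_p bracketing "3232235778 6881" so NAT can rewrite it
 * in place. */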
105
106static int help(struct sk_buff **pskb,
107 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
108{
109 unsigned int dataoff;
110 struct tcphdr _tcph, *th;
111 char *data, *data_limit, *ib_ptr;
112 int dir = CTINFO2DIR(ctinfo);
113 struct ip_conntrack_expect *exp;
114 u32 seq;
115 u_int32_t dcc_ip;
116 u_int16_t dcc_port;
117 int i, ret = NF_ACCEPT;
118 char *addr_beg_p, *addr_end_p;
119
120 DEBUGP("entered\n");
121
122 /* If packet is coming from IRC server */
123 if (dir == IP_CT_DIR_REPLY)
124 return NF_ACCEPT;
125
126 /* Until there's been traffic both ways, don't look in packets. */
127 if (ctinfo != IP_CT_ESTABLISHED
128 && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
129 DEBUGP("Conntrackinfo = %u\n", ctinfo);
130 return NF_ACCEPT;
131 }
132
133 /* Not a full tcp header? */
134 th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
135 sizeof(_tcph), &_tcph);
136 if (th == NULL)
137 return NF_ACCEPT;
138
139 /* No data? */
140 dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
141 if (dataoff >= (*pskb)->len)
142 return NF_ACCEPT;
143
144 LOCK_BH(&irc_buffer_lock);
145 ib_ptr = skb_header_pointer(*pskb, dataoff,
146 (*pskb)->len - dataoff, irc_buffer);
147 BUG_ON(ib_ptr == NULL);
148
149 data = ib_ptr;
150 data_limit = ib_ptr + (*pskb)->len - dataoff;
151
152 /* strlen("\1DCC SEND t AAAAAAAA P\1\n") == 24,
153 * i.e. 5 ("\1DCC ") + MINMATCHLEN + strlen("t AAAAAAAA P\1\n") == 5+5+14 */
154 while (data < (data_limit - (19 + MINMATCHLEN))) {
155 if (memcmp(data, "\1DCC ", 5)) {
156 data++;
157 continue;
158 }
159
160 data += 5;
161 /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
162
163 DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n",
164 NIPQUAD((*pskb)->nh.iph->saddr), ntohs(th->source),
165 NIPQUAD((*pskb)->nh.iph->daddr), ntohs(th->dest));
166
167 for (i = 0; i < ARRAY_SIZE(dccprotos); i++) {
168 if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) {
169 /* no match */
170 continue;
171 }
172
173 DEBUGP("DCC %s detected\n", dccprotos[i]);
174 data += strlen(dccprotos[i]);
175 /* we have at least
176 * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
177 * data left (== 14/13 bytes) */
178 if (parse_dcc((char *)data, data_limit, &dcc_ip,
179 &dcc_port, &addr_beg_p, &addr_end_p)) {
180 /* unable to parse */
181 DEBUGP("unable to parse dcc command\n");
182 continue;
183 }
184 DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n",
185 HIPQUAD(dcc_ip), dcc_port);
186
187 /* dcc_ip can be the internal OR external (NAT'ed) IP
188 * Tiago Sousa <mirage@kaotik.org> */
189 if (ct->tuplehash[dir].tuple.src.ip != htonl(dcc_ip)
190 && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip != htonl(dcc_ip)) {
191 if (net_ratelimit())
192 printk(KERN_WARNING
193 "Forged DCC command from "
194 "%u.%u.%u.%u: %u.%u.%u.%u:%u\n",
195 NIPQUAD(ct->tuplehash[dir].tuple.src.ip),
196 HIPQUAD(dcc_ip), dcc_port);
197
198 continue;
199 }
200
201 exp = ip_conntrack_expect_alloc();
202 if (exp == NULL) {
203 ret = NF_DROP;
204 goto out;
205 }
206
207 /* save position of address in dcc string,
208 * necessary for NAT */
209 DEBUGP("tcph->seq = %u\n", th->seq);
210 seq = ntohl(th->seq) + (addr_beg_p - ib_ptr);
211
212 /* We refer to the reverse direction ("!dir")
213 * tuples here, because we're expecting
214 * something in the other direction.
215 * Doesn't matter unless NAT is happening. */
216 exp->tuple = ((struct ip_conntrack_tuple)
217 { { 0, { 0 } },
218 { ct->tuplehash[!dir].tuple.dst.ip,
219 { .tcp = { htons(dcc_port) } },
220 IPPROTO_TCP }});
221 exp->mask = ((struct ip_conntrack_tuple)
222 { { 0, { 0 } },
223 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
224 exp->expectfn = NULL;
225 exp->master = ct;
226 if (ip_nat_irc_hook)
227 ret = ip_nat_irc_hook(pskb, ctinfo,
228 addr_beg_p - ib_ptr,
229 addr_end_p - addr_beg_p,
230 exp);
231 else if (ip_conntrack_expect_related(exp) != 0) {
232 ip_conntrack_expect_free(exp);
233 ret = NF_DROP;
234 }
235 goto out;
236 } /* for .. NUM_DCCPROTO */
237 } /* while data < ... */
238
239 out:
240 UNLOCK_BH(&irc_buffer_lock);
241 return ret;
242}
243
244static struct ip_conntrack_helper irc_helpers[MAX_PORTS];
245static char irc_names[MAX_PORTS][10];
246
247static void fini(void);
248
249static int __init init(void)
250{
251 int i, ret;
252 struct ip_conntrack_helper *hlpr;
253 char *tmpname;
254
255 if (max_dcc_channels < 1) {
256 printk("ip_conntrack_irc: max_dcc_channels must be a positive integer\n");
257 return -EBUSY;
258 }
259 if (dcc_timeout < 1) {
260 printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n");
261 return -EBUSY;
262 }
263
264 /* If no port given, default to standard irc port */
265 if (ports_c == 0)
266 ports[ports_c++] = IRC_PORT;
267
268 for (i = 0; i < ports_c; i++) {
269 hlpr = &irc_helpers[i];
270 hlpr->tuple.src.u.tcp.port = htons(ports[i]);
271 hlpr->tuple.dst.protonum = IPPROTO_TCP;
272 hlpr->mask.src.u.tcp.port = 0xFFFF;
273 hlpr->mask.dst.protonum = 0xFF;
274 hlpr->max_expected = max_dcc_channels;
275 hlpr->timeout = dcc_timeout;
276 hlpr->me = THIS_MODULE;
277 hlpr->help = help;
278
279 tmpname = &irc_names[i][0];
280 if (ports[i] == IRC_PORT)
281 sprintf(tmpname, "irc");
282 else
283 sprintf(tmpname, "irc-%d", ports[i]);
284 hlpr->name = tmpname;
285
286 DEBUGP("port #%d: %d\n", i, ports[i]);
287
288 ret = ip_conntrack_helper_register(hlpr);
289
290 if (ret) {
291 printk("ip_conntrack_irc: ERROR registering port %d\n",
292 ports[i]);
293 fini();
294 return -EBUSY;
295 }
296 }
297 return 0;
298}
299
300/* This function is intentionally _NOT_ defined as __exit, because
301 * it is needed by the init function */
302static void fini(void)
303{
304 int i;
305 for (i = 0; i < ports_c; i++) {
306 DEBUGP("unregistering port %d\n",
307 ports[i]);
308 ip_conntrack_helper_unregister(&irc_helpers[i]);
309 }
310}
311
312module_init(init);
313module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
new file mode 100644
index 000000000000..88c3712bd251
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
@@ -0,0 +1,75 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/sched.h>
11#include <linux/timer.h>
12#include <linux/netfilter.h>
13#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
14
15unsigned long ip_ct_generic_timeout = 600*HZ;
16
17static int generic_pkt_to_tuple(const struct sk_buff *skb,
18 unsigned int dataoff,
19 struct ip_conntrack_tuple *tuple)
20{
21 tuple->src.u.all = 0;
22 tuple->dst.u.all = 0;
23
24 return 1;
25}
26
27static int generic_invert_tuple(struct ip_conntrack_tuple *tuple,
28 const struct ip_conntrack_tuple *orig)
29{
30 tuple->src.u.all = 0;
31 tuple->dst.u.all = 0;
32
33 return 1;
34}
35
36/* Print out the per-protocol part of the tuple. */
37static int generic_print_tuple(struct seq_file *s,
38 const struct ip_conntrack_tuple *tuple)
39{
40 return 0;
41}
42
43/* Print out the private part of the conntrack. */
44static int generic_print_conntrack(struct seq_file *s,
45 const struct ip_conntrack *state)
46{
47 return 0;
48}
49
50/* Returns verdict for packet, or -1 for invalid. */
51static int packet(struct ip_conntrack *conntrack,
52 const struct sk_buff *skb,
53 enum ip_conntrack_info ctinfo)
54{
55 ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout);
56 return NF_ACCEPT;
57}
58
59/* Called when a new connection for this protocol found. */
60static int new(struct ip_conntrack *conntrack, const struct sk_buff *skb)
61{
62 return 1;
63}
64
65struct ip_conntrack_protocol ip_conntrack_generic_protocol =
66{
67 .proto = 0,
68 .name = "unknown",
69 .pkt_to_tuple = generic_pkt_to_tuple,
70 .invert_tuple = generic_invert_tuple,
71 .print_tuple = generic_print_tuple,
72 .print_conntrack = generic_print_conntrack,
73 .packet = packet,
74 .new = new,
75};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
new file mode 100644
index 000000000000..602c74db3252
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -0,0 +1,279 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/sched.h>
11#include <linux/timer.h>
12#include <linux/netfilter.h>
13#include <linux/in.h>
14#include <linux/icmp.h>
15#include <linux/seq_file.h>
16#include <net/ip.h>
17#include <net/checksum.h>
18#include <linux/netfilter.h>
19#include <linux/netfilter_ipv4.h>
20#include <linux/netfilter_ipv4/ip_conntrack.h>
21#include <linux/netfilter_ipv4/ip_conntrack_core.h>
22#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
23
24unsigned long ip_ct_icmp_timeout = 30*HZ;
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(format, args...)
30#endif
31
32static int icmp_pkt_to_tuple(const struct sk_buff *skb,
33 unsigned int dataoff,
34 struct ip_conntrack_tuple *tuple)
35{
36 struct icmphdr _hdr, *hp;
37
38 hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
39 if (hp == NULL)
40 return 0;
41
42 tuple->dst.u.icmp.type = hp->type;
43 tuple->src.u.icmp.id = hp->un.echo.id;
44 tuple->dst.u.icmp.code = hp->code;
45
46 return 1;
47}
48
49static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple,
50 const struct ip_conntrack_tuple *orig)
51{
52 /* Add 1; spaces filled with 0. */
53 static u_int8_t invmap[]
54 = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
55 [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
56 [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
57 [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
58 [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
59 [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
60 [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
61 [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1};
62
63 if (orig->dst.u.icmp.type >= sizeof(invmap)
64 || !invmap[orig->dst.u.icmp.type])
65 return 0;
66
67 tuple->src.u.icmp.id = orig->src.u.icmp.id;
68 tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
69 tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
70 return 1;
71}
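/*
 * Worked example of the "+ 1" encoding above (a sketch, not in the patch):
 * ICMP_ECHOREPLY is 0, so a plain invmap[type] == 0 could mean either
 * "maps to echo reply" or "no mapping".  Storing the target + 1 keeps 0
 * free as the "no mapping" marker:
 *
 *   invmap[ICMP_ECHO] == ICMP_ECHOREPLY + 1 == 1
 *       -> inverted type = 1 - 1 = ICMP_ECHOREPLY
 *   invmap[ICMP_DEST_UNREACH] == 0
 *       -> icmp_invert_tuple() returns 0 (not invertible)
 */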
72
73/* Print out the per-protocol part of the tuple. */
74static int icmp_print_tuple(struct seq_file *s,
75 const struct ip_conntrack_tuple *tuple)
76{
77 return seq_printf(s, "type=%u code=%u id=%u ",
78 tuple->dst.u.icmp.type,
79 tuple->dst.u.icmp.code,
80 ntohs(tuple->src.u.icmp.id));
81}
82
83/* Print out the private part of the conntrack. */
84static int icmp_print_conntrack(struct seq_file *s,
85 const struct ip_conntrack *conntrack)
86{
87 return 0;
88}
89
90/* Returns verdict for packet, or -1 for invalid. */
91static int icmp_packet(struct ip_conntrack *ct,
92 const struct sk_buff *skb,
93 enum ip_conntrack_info ctinfo)
94{
95 /* Try to delete connection immediately after all replies:
96 won't actually vanish as we still have skb, and del_timer
97 means this will only run once even if count hits zero twice
98 (theoretically possible with SMP) */
99 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
100 if (atomic_dec_and_test(&ct->proto.icmp.count)
101 && del_timer(&ct->timeout))
102 ct->timeout.function((unsigned long)ct);
103 } else {
104 atomic_inc(&ct->proto.icmp.count);
105 ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
106 }
107
108 return NF_ACCEPT;
109}
110
111/* Called when a new connection for this protocol found. */
112static int icmp_new(struct ip_conntrack *conntrack,
113 const struct sk_buff *skb)
114{
115 static u_int8_t valid_new[]
116 = { [ICMP_ECHO] = 1,
117 [ICMP_TIMESTAMP] = 1,
118 [ICMP_INFO_REQUEST] = 1,
119 [ICMP_ADDRESS] = 1 };
120
121 if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
122 || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
123 /* Can't create a new ICMP `conn' with this. */
124 DEBUGP("icmp: can't create new conn with type %u\n",
125 conntrack->tuplehash[0].tuple.dst.u.icmp.type);
126 DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
127 return 0;
128 }
129 atomic_set(&conntrack->proto.icmp.count, 0);
130 return 1;
131}
132
133static int
134icmp_error_message(struct sk_buff *skb,
135 enum ip_conntrack_info *ctinfo,
136 unsigned int hooknum)
137{
138 struct ip_conntrack_tuple innertuple, origtuple;
139 struct {
140 struct icmphdr icmp;
141 struct iphdr ip;
142 } _in, *inside;
143 struct ip_conntrack_protocol *innerproto;
144 struct ip_conntrack_tuple_hash *h;
145 int dataoff;
146
147 IP_NF_ASSERT(skb->nfct == NULL);
148
149 /* Not enough header? */
150 inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in);
151 if (inside == NULL)
152 return NF_ACCEPT;
153
154 /* Ignore ICMP's containing fragments (shouldn't happen) */
155 if (inside->ip.frag_off & htons(IP_OFFSET)) {
156 DEBUGP("icmp_error_track: fragment of proto %u\n",
157 inside->ip.protocol);
158 return NF_ACCEPT;
159 }
160
161 innerproto = ip_ct_find_proto(inside->ip.protocol);
162 dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
163 /* Are they talking about one of our connections? */
164 if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
165 DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
166 return NF_ACCEPT;
167 }
168
169 /* Ordinarily, we'd expect the inverted tupleproto, but it's
170 been preserved inside the ICMP. */
171 if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
172 DEBUGP("icmp_error_track: Can't invert tuple\n");
173 return NF_ACCEPT;
174 }
175
176 *ctinfo = IP_CT_RELATED;
177
178 h = ip_conntrack_find_get(&innertuple, NULL);
179 if (!h) {
180 /* Locally generated ICMPs will match inverted if they
181 haven't been SNAT'ed yet */
182 /* FIXME: NAT code has to handle half-done double NAT --RR */
183 if (hooknum == NF_IP_LOCAL_OUT)
184 h = ip_conntrack_find_get(&origtuple, NULL);
185
186 if (!h) {
187 DEBUGP("icmp_error_track: no match\n");
188 return NF_ACCEPT;
189 }
190 /* Reverse direction from that found */
191 if (DIRECTION(h) != IP_CT_DIR_REPLY)
192 *ctinfo += IP_CT_IS_REPLY;
193 } else {
194 if (DIRECTION(h) == IP_CT_DIR_REPLY)
195 *ctinfo += IP_CT_IS_REPLY;
196 }
197
198 /* Update skb to refer to this connection */
199 skb->nfct = &tuplehash_to_ctrack(h)->ct_general;
200 skb->nfctinfo = *ctinfo;
201 return -NF_ACCEPT;
202}
203
204/* Small and modified version of icmp_rcv */
205static int
206icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
207 unsigned int hooknum)
208{
209 struct icmphdr _ih, *icmph;
210
211 /* Not enough header? */
212 icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
213 if (icmph == NULL) {
214 if (LOG_INVALID(IPPROTO_ICMP))
215 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
216 "ip_ct_icmp: short packet ");
217 return -NF_ACCEPT;
218 }
219
220 /* See ip_conntrack_proto_tcp.c */
221 if (hooknum != NF_IP_PRE_ROUTING)
222 goto checksum_skipped;
223
224 switch (skb->ip_summed) {
225 case CHECKSUM_HW:
226 if (!(u16)csum_fold(skb->csum))
227 break;
228 if (LOG_INVALID(IPPROTO_ICMP))
229 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
230 "ip_ct_icmp: bad HW ICMP checksum ");
231 return -NF_ACCEPT;
232 case CHECKSUM_NONE:
233 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
234 if (LOG_INVALID(IPPROTO_ICMP))
235 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
236 "ip_ct_icmp: bad ICMP checksum ");
237 return -NF_ACCEPT;
238 }
239 default:
240 break;
241 }
242
243checksum_skipped:
244 /*
245 * 18 is the highest 'known' ICMP type. Anything else is a mystery
246 *
247	 * RFC 1122: 3.2.2 Unknown ICMP message types MUST be silently
248 * discarded.
249 */
250 if (icmph->type > NR_ICMP_TYPES) {
251 if (LOG_INVALID(IPPROTO_ICMP))
252 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
253 "ip_ct_icmp: invalid ICMP type ");
254 return -NF_ACCEPT;
255 }
256
257 /* Need to track icmp error message? */
258 if (icmph->type != ICMP_DEST_UNREACH
259 && icmph->type != ICMP_SOURCE_QUENCH
260 && icmph->type != ICMP_TIME_EXCEEDED
261 && icmph->type != ICMP_PARAMETERPROB
262 && icmph->type != ICMP_REDIRECT)
263 return NF_ACCEPT;
264
265 return icmp_error_message(skb, ctinfo, hooknum);
266}
267
268struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
269{
270 .proto = IPPROTO_ICMP,
271 .name = "icmp",
272 .pkt_to_tuple = icmp_pkt_to_tuple,
273 .invert_tuple = icmp_invert_tuple,
274 .print_tuple = icmp_print_tuple,
275 .print_conntrack = icmp_print_conntrack,
276 .packet = icmp_packet,
277 .new = icmp_new,
278 .error = icmp_error,
279};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
new file mode 100644
index 000000000000..ff8c34a860ff
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -0,0 +1,649 @@
1/*
2 * Connection tracking protocol helper module for SCTP.
3 *
4 * SCTP is defined in RFC 2960. References to various sections in this code
5 * are to this RFC.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12/*
13 * Added support for proc manipulation of timeouts.
14 */
15
16#include <linux/types.h>
17#include <linux/sched.h>
18#include <linux/timer.h>
19#include <linux/netfilter.h>
20#include <linux/module.h>
21#include <linux/in.h>
22#include <linux/ip.h>
23#include <linux/sctp.h>
24#include <linux/string.h>
25#include <linux/seq_file.h>
26
27#include <linux/netfilter_ipv4/ip_conntrack.h>
28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
29#include <linux/netfilter_ipv4/lockhelp.h>
30
31#if 0
32#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
33#else
34#define DEBUGP(format, args...)
35#endif
36
37/* Protects conntrack->proto.sctp */
38static DECLARE_RWLOCK(sctp_lock);
39
40/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
41 closely. They're more complex. --RR
42
43 And so for me for SCTP :D -Kiran */
44
45static const char *sctp_conntrack_names[] = {
46 "NONE",
47 "CLOSED",
48 "COOKIE_WAIT",
49 "COOKIE_ECHOED",
50 "ESTABLISHED",
51 "SHUTDOWN_SENT",
52 "SHUTDOWN_RECD",
53 "SHUTDOWN_ACK_SENT",
54};
55
56#define SECS * HZ
57#define MINS * 60 SECS
58#define HOURS * 60 MINS
59#define DAYS * 24 HOURS
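/*
 * Expansion example (sketch): "3 SECS" becomes "3 * HZ" and "5 DAYS"
 * becomes "5 * 24 * 60 * 60 * HZ", so the timeout values below are plain
 * jiffies counts.
 */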
60
61static unsigned long ip_ct_sctp_timeout_closed = 10 SECS;
62static unsigned long ip_ct_sctp_timeout_cookie_wait = 3 SECS;
63static unsigned long ip_ct_sctp_timeout_cookie_echoed = 3 SECS;
64static unsigned long ip_ct_sctp_timeout_established = 5 DAYS;
65static unsigned long ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000;
66static unsigned long ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000;
67static unsigned long ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS;
68
69static unsigned long * sctp_timeouts[]
70= { NULL, /* SCTP_CONNTRACK_NONE */
71 &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */
72 &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */
73 &ip_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */
74 &ip_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */
75 &ip_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */
76 &ip_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */
77 &ip_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */
78 };
79
80#define sNO SCTP_CONNTRACK_NONE
81#define sCL SCTP_CONNTRACK_CLOSED
82#define sCW SCTP_CONNTRACK_COOKIE_WAIT
83#define sCE SCTP_CONNTRACK_COOKIE_ECHOED
84#define sES SCTP_CONNTRACK_ESTABLISHED
85#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT
86#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
87#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
88#define sIV SCTP_CONNTRACK_MAX
89
90/*
91 These are the descriptions of the states:
92
93NOTE: These state names are tantalizingly similar to the states of an
94SCTP endpoint. But the interpretation of the states is a little different,
95considering that these are the states of the connection and not of an end
96point. Please note the subtleties. -Kiran
97
98NONE - Nothing so far.
99COOKIE WAIT - We have seen an INIT chunk in the original direction, or also
100 an INIT_ACK chunk in the reply direction.
101COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction.
102ESTABLISHED - We have seen a COOKIE_ACK in the reply direction.
103SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction.
104SHUTDOWN_RECD     - We have seen a SHUTDOWN chunk in the reply direction.
105SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
106 to that of the SHUTDOWN chunk.
107CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
108 the SHUTDOWN chunk. Connection is closed.
109*/
110
111/* TODO
112 - I have assumed that the first INIT is in the original direction.
113	This messes things up when an INIT comes in the reply direction in CLOSED
114 state.
115 - Check the error type in the reply dir before transitioning from
116cookie echoed to closed.
117 - Sec 5.2.4 of RFC 2960
118 - Multi Homing support.
119*/
120
121/* SCTP conntrack state transitions */
122static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
123 {
124/* ORIGINAL */
125/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
126/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
127/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
128/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
129/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
130/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
131/* error       */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/
132/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
133/* cookie_ack   */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */
134/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
135 },
136 {
137/* REPLY */
138/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
139/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
140/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
141/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
142/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
143/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
144/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
145/* cookie_echo  */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */
146/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
147/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
148 }
149};
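/*
 * Indexing example (sketch): an INIT chunk seen in the ORIGINAL direction
 * while no state exists yet looks up row 0 ("init") of the first table:
 *
 *   sctp_conntracks[IP_CT_DIR_ORIGINAL][0][SCTP_CONNTRACK_NONE] == sCW
 *
 * i.e. the connection moves to COOKIE_WAIT, matching the state
 * descriptions above.  new_state() maps chunk types to the row index.
 */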
150
151static int sctp_pkt_to_tuple(const struct sk_buff *skb,
152 unsigned int dataoff,
153 struct ip_conntrack_tuple *tuple)
154{
155 sctp_sctphdr_t _hdr, *hp;
156
157 DEBUGP(__FUNCTION__);
158 DEBUGP("\n");
159
160 /* Actually only need first 8 bytes. */
161 hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
162 if (hp == NULL)
163 return 0;
164
165 tuple->src.u.sctp.port = hp->source;
166 tuple->dst.u.sctp.port = hp->dest;
167 return 1;
168}
169
170static int sctp_invert_tuple(struct ip_conntrack_tuple *tuple,
171 const struct ip_conntrack_tuple *orig)
172{
173 DEBUGP(__FUNCTION__);
174 DEBUGP("\n");
175
176 tuple->src.u.sctp.port = orig->dst.u.sctp.port;
177 tuple->dst.u.sctp.port = orig->src.u.sctp.port;
178 return 1;
179}
180
181/* Print out the per-protocol part of the tuple. */
182static int sctp_print_tuple(struct seq_file *s,
183 const struct ip_conntrack_tuple *tuple)
184{
185 DEBUGP(__FUNCTION__);
186 DEBUGP("\n");
187
188 return seq_printf(s, "sport=%hu dport=%hu ",
189 ntohs(tuple->src.u.sctp.port),
190 ntohs(tuple->dst.u.sctp.port));
191}
192
193/* Print out the private part of the conntrack. */
194static int sctp_print_conntrack(struct seq_file *s,
195 const struct ip_conntrack *conntrack)
196{
197 enum sctp_conntrack state;
198
199 DEBUGP(__FUNCTION__);
200 DEBUGP("\n");
201
202 READ_LOCK(&sctp_lock);
203 state = conntrack->proto.sctp.state;
204 READ_UNLOCK(&sctp_lock);
205
206 return seq_printf(s, "%s ", sctp_conntrack_names[state]);
207}
208
209#define for_each_sctp_chunk(skb, sch, _sch, offset, count) \
210for (offset = skb->nh.iph->ihl * 4 + sizeof(sctp_sctphdr_t), count = 0; \
211 offset < skb->len && \
212 (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \
213 offset += (htons(sch->length) + 3) & ~3, count++)
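/*
 * Offset arithmetic example (sketch): once the on-wire length field is
 * converted, chunk lengths are padded to a 4-byte boundary before
 * advancing, so a chunk of length 7 steps the offset by
 * (7 + 3) & ~3 == 8 bytes to reach the next chunk header.
 */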
214
215/* Some validity checks to make sure the chunks are fine */
216static int do_basic_checks(struct ip_conntrack *conntrack,
217 const struct sk_buff *skb,
218 char *map)
219{
220 u_int32_t offset, count;
221 sctp_chunkhdr_t _sch, *sch;
222 int flag;
223
224 DEBUGP(__FUNCTION__);
225 DEBUGP("\n");
226
227 flag = 0;
228
229 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
230 DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type);
231
232 if (sch->type == SCTP_CID_INIT
233 || sch->type == SCTP_CID_INIT_ACK
234 || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
235 flag = 1;
236 }
237
238 /* Cookie Ack/Echo chunks not the first OR
239 Init / Init Ack / Shutdown compl chunks not the only chunks */
240 if ((sch->type == SCTP_CID_COOKIE_ACK
241 || sch->type == SCTP_CID_COOKIE_ECHO
242 || flag)
243 && count !=0 ) {
244 DEBUGP("Basic checks failed\n");
245 return 1;
246 }
247
248 if (map) {
249 set_bit(sch->type, (void *)map);
250 }
251 }
252
253 DEBUGP("Basic checks passed\n");
254 return 0;
255}
256
257static int new_state(enum ip_conntrack_dir dir,
258 enum sctp_conntrack cur_state,
259 int chunk_type)
260{
261 int i;
262
263 DEBUGP(__FUNCTION__);
264 DEBUGP("\n");
265
266 DEBUGP("Chunk type: %d\n", chunk_type);
267
268 switch (chunk_type) {
269 case SCTP_CID_INIT:
270 DEBUGP("SCTP_CID_INIT\n");
271 i = 0; break;
272 case SCTP_CID_INIT_ACK:
273 DEBUGP("SCTP_CID_INIT_ACK\n");
274 i = 1; break;
275 case SCTP_CID_ABORT:
276 DEBUGP("SCTP_CID_ABORT\n");
277 i = 2; break;
278 case SCTP_CID_SHUTDOWN:
279 DEBUGP("SCTP_CID_SHUTDOWN\n");
280 i = 3; break;
281 case SCTP_CID_SHUTDOWN_ACK:
282 DEBUGP("SCTP_CID_SHUTDOWN_ACK\n");
283 i = 4; break;
284 case SCTP_CID_ERROR:
285 DEBUGP("SCTP_CID_ERROR\n");
286 i = 5; break;
287 case SCTP_CID_COOKIE_ECHO:
288 DEBUGP("SCTP_CID_COOKIE_ECHO\n");
289 i = 6; break;
290 case SCTP_CID_COOKIE_ACK:
291 DEBUGP("SCTP_CID_COOKIE_ACK\n");
292 i = 7; break;
293 case SCTP_CID_SHUTDOWN_COMPLETE:
294 DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n");
295 i = 8; break;
296 default:
297 /* Other chunks like DATA, SACK, HEARTBEAT and
298 its ACK do not cause a change in state */
299 DEBUGP("Unknown chunk type, Will stay in %s\n",
300 sctp_conntrack_names[cur_state]);
301 return cur_state;
302 }
303
304 DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
305 dir, sctp_conntrack_names[cur_state], chunk_type,
306 sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
307
308 return sctp_conntracks[dir][i][cur_state];
309}
310
311/* Returns verdict for packet, or -1 for invalid. */
312static int sctp_packet(struct ip_conntrack *conntrack,
313 const struct sk_buff *skb,
314 enum ip_conntrack_info ctinfo)
315{
316 enum sctp_conntrack newconntrack, oldsctpstate;
317 struct iphdr *iph = skb->nh.iph;
318 sctp_sctphdr_t _sctph, *sh;
319 sctp_chunkhdr_t _sch, *sch;
320 u_int32_t offset, count;
321 char map[256 / sizeof (char)] = {0};
322
323 DEBUGP(__FUNCTION__);
324 DEBUGP("\n");
325
326 sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
327 if (sh == NULL)
328 return -1;
329
330 if (do_basic_checks(conntrack, skb, map) != 0)
331 return -1;
332
333 /* Check the verification tag (Sec 8.5) */
334 if (!test_bit(SCTP_CID_INIT, (void *)map)
335 && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)
336 && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map)
337 && !test_bit(SCTP_CID_ABORT, (void *)map)
338 && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map)
339 && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
340 DEBUGP("Verification tag check failed\n");
341 return -1;
342 }
343
344 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
345 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
346 WRITE_LOCK(&sctp_lock);
347
348 /* Special cases of Verification tag check (Sec 8.5.1) */
349 if (sch->type == SCTP_CID_INIT) {
350 /* Sec 8.5.1 (A) */
351 if (sh->vtag != 0) {
352 WRITE_UNLOCK(&sctp_lock);
353 return -1;
354 }
355 } else if (sch->type == SCTP_CID_ABORT) {
356 /* Sec 8.5.1 (B) */
357 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
358 && !(sh->vtag == conntrack->proto.sctp.vtag
359 [1 - CTINFO2DIR(ctinfo)])) {
360 WRITE_UNLOCK(&sctp_lock);
361 return -1;
362 }
363 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
364 /* Sec 8.5.1 (C) */
365 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
366 && !(sh->vtag == conntrack->proto.sctp.vtag
367 [1 - CTINFO2DIR(ctinfo)]
368 && (sch->flags & 1))) {
369 WRITE_UNLOCK(&sctp_lock);
370 return -1;
371 }
372 } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
373 /* Sec 8.5.1 (D) */
374 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
375 WRITE_UNLOCK(&sctp_lock);
376 return -1;
377 }
378 }
379
380 oldsctpstate = conntrack->proto.sctp.state;
381 newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type);
382
383 /* Invalid */
384 if (newconntrack == SCTP_CONNTRACK_MAX) {
385 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
386 CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
387 WRITE_UNLOCK(&sctp_lock);
388 return -1;
389 }
390
391 /* If it is an INIT or an INIT ACK note down the vtag */
392 if (sch->type == SCTP_CID_INIT
393 || sch->type == SCTP_CID_INIT_ACK) {
394 sctp_inithdr_t _inithdr, *ih;
395
396 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
397 sizeof(_inithdr), &_inithdr);
398 if (ih == NULL) {
399 WRITE_UNLOCK(&sctp_lock);
400 return -1;
401 }
402 DEBUGP("Setting vtag %x for dir %d\n",
403 ih->init_tag, !CTINFO2DIR(ctinfo));
404 conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag;
405 }
406
407 conntrack->proto.sctp.state = newconntrack;
408 WRITE_UNLOCK(&sctp_lock);
409 }
410
411 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
412
413 if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
414 && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY
415 && newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
416 DEBUGP("Setting assured bit\n");
417 set_bit(IPS_ASSURED_BIT, &conntrack->status);
418 }
419
420 return NF_ACCEPT;
421}
422
423/* Called when a new connection for this protocol found. */
424static int sctp_new(struct ip_conntrack *conntrack,
425 const struct sk_buff *skb)
426{
427 enum sctp_conntrack newconntrack;
428 struct iphdr *iph = skb->nh.iph;
429 sctp_sctphdr_t _sctph, *sh;
430 sctp_chunkhdr_t _sch, *sch;
431 u_int32_t offset, count;
432 char map[256 / sizeof (char)] = {0};
433
434 DEBUGP(__FUNCTION__);
435 DEBUGP("\n");
436
437 sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
438 if (sh == NULL)
439 return 0;
440
441 if (do_basic_checks(conntrack, skb, map) != 0)
442 return 0;
443
444 /* If an OOTB packet has any of these chunks discard (Sec 8.4) */
445 if ((test_bit (SCTP_CID_ABORT, (void *)map))
446 || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map))
447 || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) {
448 return 0;
449 }
450
451 newconntrack = SCTP_CONNTRACK_MAX;
452 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
453 /* Don't need lock here: this conntrack not in circulation yet */
454 newconntrack = new_state (IP_CT_DIR_ORIGINAL,
455 SCTP_CONNTRACK_NONE, sch->type);
456
457 /* Invalid: delete conntrack */
458 if (newconntrack == SCTP_CONNTRACK_MAX) {
459 DEBUGP("ip_conntrack_sctp: invalid new deleting.\n");
460 return 0;
461 }
462
463 /* Copy the vtag into the state info */
464 if (sch->type == SCTP_CID_INIT) {
465 if (sh->vtag == 0) {
466 sctp_inithdr_t _inithdr, *ih;
467
468 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
469 sizeof(_inithdr), &_inithdr);
470 if (ih == NULL)
471 return 0;
472
473 DEBUGP("Setting vtag %x for new conn\n",
474 ih->init_tag);
475
476 conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] =
477 ih->init_tag;
478 } else {
479 /* Sec 8.5.1 (A) */
480 return 0;
481 }
482 }
483 /* If it is a shutdown ack OOTB packet, we expect a return
484 shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
485 else {
486 DEBUGP("Setting vtag %x for new conn OOTB\n",
487 sh->vtag);
488 conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
489 }
490
491 conntrack->proto.sctp.state = newconntrack;
492 }
493
494 return 1;
495}
496
497static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = {
498 .proto = IPPROTO_SCTP,
499 .name = "sctp",
500 .pkt_to_tuple = sctp_pkt_to_tuple,
501 .invert_tuple = sctp_invert_tuple,
502 .print_tuple = sctp_print_tuple,
503 .print_conntrack = sctp_print_conntrack,
504 .packet = sctp_packet,
505 .new = sctp_new,
506 .destroy = NULL,
507 .me = THIS_MODULE
508};
509
510#ifdef CONFIG_SYSCTL
511static ctl_table ip_ct_sysctl_table[] = {
512 {
513 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,
514 .procname = "ip_conntrack_sctp_timeout_closed",
515 .data = &ip_ct_sctp_timeout_closed,
516 .maxlen = sizeof(unsigned int),
517 .mode = 0644,
518 .proc_handler = &proc_dointvec_jiffies,
519 },
520 {
521 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,
522 .procname = "ip_conntrack_sctp_timeout_cookie_wait",
523 .data = &ip_ct_sctp_timeout_cookie_wait,
524 .maxlen = sizeof(unsigned int),
525 .mode = 0644,
526 .proc_handler = &proc_dointvec_jiffies,
527 },
528 {
529 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,
530 .procname = "ip_conntrack_sctp_timeout_cookie_echoed",
531 .data = &ip_ct_sctp_timeout_cookie_echoed,
532 .maxlen = sizeof(unsigned int),
533 .mode = 0644,
534 .proc_handler = &proc_dointvec_jiffies,
535 },
536 {
537 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,
538 .procname = "ip_conntrack_sctp_timeout_established",
539 .data = &ip_ct_sctp_timeout_established,
540 .maxlen = sizeof(unsigned int),
541 .mode = 0644,
542 .proc_handler = &proc_dointvec_jiffies,
543 },
544 {
545 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,
546 .procname = "ip_conntrack_sctp_timeout_shutdown_sent",
547 .data = &ip_ct_sctp_timeout_shutdown_sent,
548 .maxlen = sizeof(unsigned int),
549 .mode = 0644,
550 .proc_handler = &proc_dointvec_jiffies,
551 },
552 {
553 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,
554 .procname = "ip_conntrack_sctp_timeout_shutdown_recd",
555 .data = &ip_ct_sctp_timeout_shutdown_recd,
556 .maxlen = sizeof(unsigned int),
557 .mode = 0644,
558 .proc_handler = &proc_dointvec_jiffies,
559 },
560 {
561 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT,
562 .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent",
563 .data = &ip_ct_sctp_timeout_shutdown_ack_sent,
564 .maxlen = sizeof(unsigned int),
565 .mode = 0644,
566 .proc_handler = &proc_dointvec_jiffies,
567 },
568 { .ctl_name = 0 }
569};
570
571static ctl_table ip_ct_netfilter_table[] = {
572 {
573 .ctl_name = NET_IPV4_NETFILTER,
574 .procname = "netfilter",
575 .mode = 0555,
576 .child = ip_ct_sysctl_table,
577 },
578 { .ctl_name = 0 }
579};
580
581static ctl_table ip_ct_ipv4_table[] = {
582 {
583 .ctl_name = NET_IPV4,
584 .procname = "ipv4",
585 .mode = 0555,
586 .child = ip_ct_netfilter_table,
587 },
588 { .ctl_name = 0 }
589};
590
591static ctl_table ip_ct_net_table[] = {
592 {
593 .ctl_name = CTL_NET,
594 .procname = "net",
595 .mode = 0555,
596 .child = ip_ct_ipv4_table,
597 },
598 { .ctl_name = 0 }
599};
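/*
 * Resulting sysctl paths (sketch): the nested tables above register the
 * SCTP timeouts under net -> ipv4 -> netfilter, e.g.
 *
 *   /proc/sys/net/ipv4/netfilter/ip_conntrack_sctp_timeout_established
 *
 * and proc_dointvec_jiffies converts values written there from seconds
 * into jiffies.
 */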
600
601static struct ctl_table_header *ip_ct_sysctl_header;
602#endif
603
604static int __init init(void)
605{
606 int ret;
607
608 ret = ip_conntrack_protocol_register(&ip_conntrack_protocol_sctp);
609 if (ret) {
610 printk("ip_conntrack_proto_sctp: protocol register failed\n");
611 goto out;
612 }
613
614#ifdef CONFIG_SYSCTL
615 ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0);
616 if (ip_ct_sysctl_header == NULL) {
617 ret = -ENOMEM;
618 printk("ip_conntrack_proto_sctp: can't register to sysctl.\n");
619 goto cleanup;
620 }
621#endif
622
623 return ret;
624
625#ifdef CONFIG_SYSCTL
626 cleanup:
627 ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp);
628#endif
629 out:
630 DEBUGP("SCTP conntrack module loading %s\n",
631 ret ? "failed": "succeeded");
632 return ret;
633}
634
635static void __exit fini(void)
636{
637 ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp);
638#ifdef CONFIG_SYSCTL
639 unregister_sysctl_table(ip_ct_sysctl_header);
640#endif
641 DEBUGP("SCTP conntrack module unloaded\n");
642}
643
644module_init(init);
645module_exit(fini);
646
647MODULE_LICENSE("GPL");
648MODULE_AUTHOR("Kiran Kumar Immidi");
649MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
new file mode 100644
index 000000000000..e800b16fc920
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -0,0 +1,1098 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
9 * - Real stateful connection tracking
10 * - Modified state transitions table
11 * - Window scaling support added
12 * - SACK support added
13 *
14 * Willy Tarreau:
15 * - State table bugfixes
16 * - More robust state changes
17 * - Tuning timer parameters
18 *
19 * version 2.2
20 */
21
22#include <linux/config.h>
23#include <linux/types.h>
24#include <linux/sched.h>
25#include <linux/timer.h>
26#include <linux/netfilter.h>
27#include <linux/module.h>
28#include <linux/in.h>
29#include <linux/ip.h>
30#include <linux/tcp.h>
31#include <linux/spinlock.h>
32
33#include <net/tcp.h>
34
35#include <linux/netfilter.h>
36#include <linux/netfilter_ipv4.h>
37#include <linux/netfilter_ipv4/ip_conntrack.h>
38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
39#include <linux/netfilter_ipv4/lockhelp.h>
40
41#if 0
42#define DEBUGP printk
43#define DEBUGP_VARS
44#else
45#define DEBUGP(format, args...)
46#endif
47
48/* Protects conntrack->proto.tcp */
49static DECLARE_RWLOCK(tcp_lock);
50
51/* "Be conservative in what you do,
52 be liberal in what you accept from others."
53 If it's non-zero, we mark only out of window RST segments as INVALID. */
54int ip_ct_tcp_be_liberal = 0;
55
56/* When connection is picked up from the middle, how many packets are required
57 to pass in each direction when we assume we are in sync - if any side uses
58 window scaling, we lost the game.
59 If it is set to zero, we disable picking up already established
60 connections. */
61int ip_ct_tcp_loose = 3;
62
63/* Max number of the retransmitted packets without receiving an (acceptable)
64 ACK from the destination. If this number is reached, a shorter timer
65 will be started. */
66int ip_ct_tcp_max_retrans = 3;
67
68 /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
69 closely. They're more complex. --RR */
70
71static const char *tcp_conntrack_names[] = {
72 "NONE",
73 "SYN_SENT",
74 "SYN_RECV",
75 "ESTABLISHED",
76 "FIN_WAIT",
77 "CLOSE_WAIT",
78 "LAST_ACK",
79 "TIME_WAIT",
80 "CLOSE",
81 "LISTEN"
82};
83
84#define SECS * HZ
85#define MINS * 60 SECS
86#define HOURS * 60 MINS
87#define DAYS * 24 HOURS
88
89unsigned long ip_ct_tcp_timeout_syn_sent = 2 MINS;
90unsigned long ip_ct_tcp_timeout_syn_recv = 60 SECS;
91unsigned long ip_ct_tcp_timeout_established = 5 DAYS;
92unsigned long ip_ct_tcp_timeout_fin_wait = 2 MINS;
93unsigned long ip_ct_tcp_timeout_close_wait = 60 SECS;
94unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS;
95unsigned long ip_ct_tcp_timeout_time_wait = 2 MINS;
96unsigned long ip_ct_tcp_timeout_close = 10 SECS;
97
98/* RFC1122 says the R2 limit should be at least 100 seconds.
99 Linux uses 15 packets as limit, which corresponds
100 to ~13-30min depending on RTO. */
101unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS;
102
103static unsigned long * tcp_timeouts[]
104= { NULL, /* TCP_CONNTRACK_NONE */
105 &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */
106 &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */
107 &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */
108 &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */
109 &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */
110 &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */
111 &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */
112 &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */
113 NULL, /* TCP_CONNTRACK_LISTEN */
114 };
115
116#define sNO TCP_CONNTRACK_NONE
117#define sSS TCP_CONNTRACK_SYN_SENT
118#define sSR TCP_CONNTRACK_SYN_RECV
119#define sES TCP_CONNTRACK_ESTABLISHED
120#define sFW TCP_CONNTRACK_FIN_WAIT
121#define sCW TCP_CONNTRACK_CLOSE_WAIT
122#define sLA TCP_CONNTRACK_LAST_ACK
123#define sTW TCP_CONNTRACK_TIME_WAIT
124#define sCL TCP_CONNTRACK_CLOSE
125#define sLI TCP_CONNTRACK_LISTEN
126#define sIV TCP_CONNTRACK_MAX
127#define sIG TCP_CONNTRACK_IGNORE
128
129/* What TCP flags are set from RST/SYN/FIN/ACK. */
130enum tcp_bit_set {
131 TCP_SYN_SET,
132 TCP_SYNACK_SET,
133 TCP_FIN_SET,
134 TCP_ACK_SET,
135 TCP_RST_SET,
136 TCP_NONE_SET,
137};
138
139/*
140 * The TCP state transition table needs a few words...
141 *
142 * We are the man in the middle. All the packets go through us
143 * but might get lost in transit to the destination.
144 * It is assumed that the destinations can't receive segments
145 * we haven't seen.
146 *
147 * The checked segment is in window, but our windows are *not*
148 * equivalent with the ones of the sender/receiver. We always
149 * try to guess the state of the current sender.
150 *
151 * The meaning of the states are:
152 *
153 * NONE: initial state
154 * SYN_SENT: SYN-only packet seen
155 * SYN_RECV: SYN-ACK packet seen
156 * ESTABLISHED: ACK packet seen
157 * FIN_WAIT: FIN packet seen
158 * CLOSE_WAIT: ACK seen (after FIN)
159 * LAST_ACK: FIN seen (after FIN)
160 * TIME_WAIT: last ACK seen
161 * CLOSE: closed connection
162 *
163 * LISTEN state is not used.
164 *
165 * Packets marked as IGNORED (sIG):
166 * if they may be either invalid or valid
167 * and the receiver may send back a connection
168 * closing RST or a SYN/ACK.
169 *
170 * Packets marked as INVALID (sIV):
171 * if they are invalid
172 * or we do not support the request (simultaneous open)
173 */
174static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
175 {
176/* ORIGINAL */
177/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
178/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
179/*
180 * sNO -> sSS Initialize a new connection
181 * sSS -> sSS Retransmitted SYN
182 * sSR -> sIG Late retransmitted SYN?
183 * sES -> sIG Error: SYNs in window outside the SYN_SENT state
184 * are errors. Receiver will reply with RST
185 * and close the connection.
186 * Or we are not in sync and hold a dead connection.
187 * sFW -> sIG
188 * sCW -> sIG
189 * sLA -> sIG
190 * sTW -> sSS Reopened connection (RFC 1122).
191 * sCL -> sSS
192 */
193/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
194/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
195/*
196 * A SYN/ACK from the client is always invalid:
197 * - either it tries to set up a simultaneous open, which is
198 * not supported;
199 * - or the firewall has just been inserted between the two hosts
200 * during the session set-up. The SYN will be retransmitted
201 * by the true client (or it'll time out).
202 */
203/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
204/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
205/*
206 * sNO -> sIV Too late and no reason to do anything...
207 *	sSS -> sIV	Client might not send FIN in this state:
208 * we enforce waiting for a SYN/ACK reply first.
209 * sSR -> sFW Close started.
210 * sES -> sFW
211 * sFW -> sLA FIN seen in both directions, waiting for
212 * the last ACK.
213 *		Might be a retransmitted FIN as well...
214 * sCW -> sLA
215 * sLA -> sLA Retransmitted FIN. Remain in the same state.
216 * sTW -> sTW
217 * sCL -> sCL
218 */
219/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
220/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
221/*
222 * sNO -> sES Assumed.
223 * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
224 * sSR -> sES Established state is reached.
225 * sES -> sES :-)
226 * sFW -> sCW Normal close request answered by ACK.
227 * sCW -> sCW
228 * sLA -> sTW Last ACK detected.
229 * sTW -> sTW Retransmitted last ACK. Remain in the same state.
230 * sCL -> sCL
231 */
232/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
233/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
234/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
235 },
236 {
237/* REPLY */
238/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
239/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
240/*
241 * sNO -> sIV Never reached.
242 * sSS -> sIV Simultaneous open, not supported
243 * sSR -> sIV Simultaneous open, not supported.
244 * sES -> sIV Server may not initiate a connection.
245 * sFW -> sIV
246 * sCW -> sIV
247 * sLA -> sIV
248 * sTW -> sIV Reopened connection, but server may not do it.
249 * sCL -> sIV
250 */
251/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
252/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
253/*
254 * sSS -> sSR Standard open.
255 * sSR -> sSR Retransmitted SYN/ACK.
256 * sES -> sIG Late retransmitted SYN/ACK?
257 * sFW -> sIG Might be SYN/ACK answering ignored SYN
258 * sCW -> sIG
259 * sLA -> sIG
260 * sTW -> sIG
261 * sCL -> sIG
262 */
263/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
264/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
265/*
266 * sSS -> sIV Server might not send FIN in this state.
267 * sSR -> sFW Close started.
268 * sES -> sFW
269 * sFW -> sLA FIN seen in both directions.
270 * sCW -> sLA
271 * sLA -> sLA Retransmitted FIN.
272 * sTW -> sTW
273 * sCL -> sCL
274 */
275/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
276/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
277/*
278 * sSS -> sIV Might be a half-open connection.
279 * sSR -> sSR Might answer late resent SYN.
280 * sES -> sES :-)
281 * sFW -> sCW Normal close request answered by ACK.
282 * sCW -> sCW
283 * sLA -> sTW Last ACK detected.
284 * sTW -> sTW Retransmitted last ACK.
285 * sCL -> sCL
286 */
287/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
288/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
289/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
290 }
291};
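/*
 * Indexing example (sketch): tcp_packet() computes
 *
 *   new_state = tcp_conntracks[dir][get_conntrack_index(th)][old_state];
 *
 * so a client SYN on a fresh entry is
 * tcp_conntracks[IP_CT_DIR_ORIGINAL][TCP_SYN_SET][sNO] == sSS, and the
 * server's SYN/ACK reply is
 * tcp_conntracks[IP_CT_DIR_REPLY][TCP_SYNACK_SET][sSS] == sSR, giving the
 * normal NONE -> SYN_SENT -> SYN_RECV walk annotated above.
 */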
292
293static int tcp_pkt_to_tuple(const struct sk_buff *skb,
294 unsigned int dataoff,
295 struct ip_conntrack_tuple *tuple)
296{
297 struct tcphdr _hdr, *hp;
298
299 /* Actually only need first 8 bytes. */
300 hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
301 if (hp == NULL)
302 return 0;
303
304 tuple->src.u.tcp.port = hp->source;
305 tuple->dst.u.tcp.port = hp->dest;
306
307 return 1;
308}
309
310static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple,
311 const struct ip_conntrack_tuple *orig)
312{
313 tuple->src.u.tcp.port = orig->dst.u.tcp.port;
314 tuple->dst.u.tcp.port = orig->src.u.tcp.port;
315 return 1;
316}
317
318/* Print out the per-protocol part of the tuple. */
319static int tcp_print_tuple(struct seq_file *s,
320 const struct ip_conntrack_tuple *tuple)
321{
322 return seq_printf(s, "sport=%hu dport=%hu ",
323 ntohs(tuple->src.u.tcp.port),
324 ntohs(tuple->dst.u.tcp.port));
325}
326
327/* Print out the private part of the conntrack. */
328static int tcp_print_conntrack(struct seq_file *s,
329 const struct ip_conntrack *conntrack)
330{
331 enum tcp_conntrack state;
332
333 READ_LOCK(&tcp_lock);
334 state = conntrack->proto.tcp.state;
335 READ_UNLOCK(&tcp_lock);
336
337 return seq_printf(s, "%s ", tcp_conntrack_names[state]);
338}
339
340static unsigned int get_conntrack_index(const struct tcphdr *tcph)
341{
342 if (tcph->rst) return TCP_RST_SET;
343 else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
344 else if (tcph->fin) return TCP_FIN_SET;
345 else if (tcph->ack) return TCP_ACK_SET;
346 else return TCP_NONE_SET;
347}
348
349/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
350 in IP Filter' by Guido van Rooij.
351
352 http://www.nluug.nl/events/sane2000/papers.html
353 http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
354
355 The boundaries and the conditions are changed according to RFC793:
356 the packet must intersect the window (i.e. segments may be
357 after the right or before the left edge) and thus receivers may ACK
358 segments after the right edge of the window.
359
360 td_maxend = max(sack + max(win,1)) seen in reply packets
361 td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
362 td_maxwin += seq + len - sender.td_maxend
363 if seq + len > sender.td_maxend
364 td_end = max(seq + len) seen in sent packets
365
366 I. Upper bound for valid data: seq <= sender.td_maxend
367 II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin
368 III. Upper bound for valid ack: sack <= receiver.td_end
369 IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW
370
371 where sack is the highest right edge of sack block found in the packet.
372
373 The upper bound limit for a valid ack is not ignored -
374    we don't have to deal with fragments.
375*/
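/*
 * Numeric illustration of checks I-IV (sketch): suppose
 * sender.td_end = 1000, sender.td_maxend = 1500,
 * receiver.td_end = 2000, receiver.td_maxwin = 300.  A segment with
 * seq = 1100, 100 bytes of data (end = 1200) and ack = sack = 1900 passes:
 *   I:   1100 <= 1500
 *   II:  1200 >= 1000 - 300
 *   III: 1900 <= 2000
 *   IV:  1900 >= 2000 - MAXACKWINDOW
 * A segment with seq = 1600 would already fail check I.
 */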
376
377static inline __u32 segment_seq_plus_len(__u32 seq,
378 size_t len,
379 struct iphdr *iph,
380 struct tcphdr *tcph)
381{
382 return (seq + len - (iph->ihl + tcph->doff)*4
383 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
384}
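/*
 * Worked example (sketch): len is the full IP packet length, so for a bare
 * SYN with a 20-byte IP header and a 20-byte TCP header, len = 40,
 * (iph->ihl + tcph->doff) * 4 = 40, and end = seq + 0 + 1 (SYN) = seq + 1.
 * A pure data segment carrying 100 payload bytes gives end = seq + 100.
 */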
385
386/* Fixme: what about big packets? */
387#define MAXACKWINCONST 66000
388#define MAXACKWINDOW(sender) \
389 ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \
390 : MAXACKWINCONST)
391
392/*
393 * Simplified tcp_parse_options routine from tcp_input.c
394 */
395static void tcp_options(const struct sk_buff *skb,
396 struct iphdr *iph,
397 struct tcphdr *tcph,
398 struct ip_ct_tcp_state *state)
399{
400 unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
401 unsigned char *ptr;
402 int length = (tcph->doff*4) - sizeof(struct tcphdr);
403
404 if (!length)
405 return;
406
407 ptr = skb_header_pointer(skb,
408 (iph->ihl * 4) + sizeof(struct tcphdr),
409 length, buff);
410 BUG_ON(ptr == NULL);
411
412 state->td_scale =
413 state->flags = 0;
414
415 while (length > 0) {
416 int opcode=*ptr++;
417 int opsize;
418
419 switch (opcode) {
420 case TCPOPT_EOL:
421 return;
422 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
423 length--;
424 continue;
425 default:
426 opsize=*ptr++;
427 if (opsize < 2) /* "silly options" */
428 return;
429 if (opsize > length)
430 break; /* don't parse partial options */
431
432 if (opcode == TCPOPT_SACK_PERM
433 && opsize == TCPOLEN_SACK_PERM)
434 state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
435 else if (opcode == TCPOPT_WINDOW
436 && opsize == TCPOLEN_WINDOW) {
437 state->td_scale = *(u_int8_t *)ptr;
438
439 if (state->td_scale > 14) {
440 /* See RFC1323 */
441 state->td_scale = 14;
442 }
443 state->flags |=
444 IP_CT_TCP_FLAG_WINDOW_SCALE;
445 }
446 ptr += opsize - 2;
447 length -= opsize;
448 }
449 }
450}
451
452static void tcp_sack(const struct sk_buff *skb,
453 struct iphdr *iph,
454 struct tcphdr *tcph,
455 __u32 *sack)
456{
457 unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
458 unsigned char *ptr;
459 int length = (tcph->doff*4) - sizeof(struct tcphdr);
460 __u32 tmp;
461
462 if (!length)
463 return;
464
465 ptr = skb_header_pointer(skb,
466 (iph->ihl * 4) + sizeof(struct tcphdr),
467 length, buff);
468 BUG_ON(ptr == NULL);
469
470 /* Fast path for timestamp-only option */
471 if (length == TCPOLEN_TSTAMP_ALIGNED*4
472 && *(__u32 *)ptr ==
473 __constant_ntohl((TCPOPT_NOP << 24)
474 | (TCPOPT_NOP << 16)
475 | (TCPOPT_TIMESTAMP << 8)
476 | TCPOLEN_TIMESTAMP))
477 return;
478
479 while (length > 0) {
480 int opcode=*ptr++;
481 int opsize, i;
482
483 switch (opcode) {
484 case TCPOPT_EOL:
485 return;
486 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
487 length--;
488 continue;
489 default:
490 opsize=*ptr++;
491 if (opsize < 2) /* "silly options" */
492 return;
493 if (opsize > length)
494 break; /* don't parse partial options */
495
496 if (opcode == TCPOPT_SACK
497 && opsize >= (TCPOLEN_SACK_BASE
498 + TCPOLEN_SACK_PERBLOCK)
499 && !((opsize - TCPOLEN_SACK_BASE)
500 % TCPOLEN_SACK_PERBLOCK)) {
501 for (i = 0;
502 i < (opsize - TCPOLEN_SACK_BASE);
503 i += TCPOLEN_SACK_PERBLOCK) {
504 tmp = ntohl(*((u_int32_t *)(ptr+i)+1));
505
506 if (after(tmp, *sack))
507 *sack = tmp;
508 }
509 return;
510 }
511 ptr += opsize - 2;
512 length -= opsize;
513 }
514 }
515}
516
517static int tcp_in_window(struct ip_ct_tcp *state,
518 enum ip_conntrack_dir dir,
519 unsigned int index,
520 const struct sk_buff *skb,
521 struct iphdr *iph,
522 struct tcphdr *tcph)
523{
524 struct ip_ct_tcp_state *sender = &state->seen[dir];
525 struct ip_ct_tcp_state *receiver = &state->seen[!dir];
526 __u32 seq, ack, sack, end, win, swin;
527 int res;
528
529 /*
530 * Get the required data from the packet.
531 */
532 seq = ntohl(tcph->seq);
533 ack = sack = ntohl(tcph->ack_seq);
534 win = ntohs(tcph->window);
535 end = segment_seq_plus_len(seq, skb->len, iph, tcph);
536
537 if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
538 tcp_sack(skb, iph, tcph, &sack);
539
540 DEBUGP("tcp_in_window: START\n");
541 DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
542 "seq=%u ack=%u sack=%u win=%u end=%u\n",
543 NIPQUAD(iph->saddr), ntohs(tcph->source),
544 NIPQUAD(iph->daddr), ntohs(tcph->dest),
545 seq, ack, sack, win, end);
546 DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
547 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
548 sender->td_end, sender->td_maxend, sender->td_maxwin,
549 sender->td_scale,
550 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
551 receiver->td_scale);
552
553 if (sender->td_end == 0) {
554 /*
555 * Initialize sender data.
556 */
557 if (tcph->syn && tcph->ack) {
558 /*
559 * Outgoing SYN-ACK in reply to a SYN.
560 */
561 sender->td_end =
562 sender->td_maxend = end;
563 sender->td_maxwin = (win == 0 ? 1 : win);
564
565 tcp_options(skb, iph, tcph, sender);
566 /*
567 * RFC 1323:
568 * Both sides must send the Window Scale option
569 * to enable window scaling in either direction.
570 */
571 if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
572 && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
573 sender->td_scale =
574 receiver->td_scale = 0;
575 } else {
576 /*
577 * We are in the middle of a connection,
578		 * its history is lost to us.
579 * Let's try to use the data from the packet.
580 */
581 sender->td_end = end;
582 sender->td_maxwin = (win == 0 ? 1 : win);
583 sender->td_maxend = end + sender->td_maxwin;
584 }
585 } else if (((state->state == TCP_CONNTRACK_SYN_SENT
586 && dir == IP_CT_DIR_ORIGINAL)
587 || (state->state == TCP_CONNTRACK_SYN_RECV
588 && dir == IP_CT_DIR_REPLY))
589 && after(end, sender->td_end)) {
590 /*
591 * RFC 793: "if a TCP is reinitialized ... then it need
592 * not wait at all; it must only be sure to use sequence
593 * numbers larger than those recently used."
594 */
595 sender->td_end =
596 sender->td_maxend = end;
597 sender->td_maxwin = (win == 0 ? 1 : win);
598
599 tcp_options(skb, iph, tcph, sender);
600 }
601
602 if (!(tcph->ack)) {
603 /*
604 * If there is no ACK, just pretend it was set and OK.
605 */
606 ack = sack = receiver->td_end;
607 } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
608 (TCP_FLAG_ACK|TCP_FLAG_RST))
609 && (ack == 0)) {
610 /*
611		 * Broken TCP stacks that set ACK in RST packets as well,
612		 * with a zero ack value.
613 */
614 ack = sack = receiver->td_end;
615 }
616
617 if (seq == end
618 && (!tcph->rst
619 || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
620 /*
621		 * Packet contains no data: we assume it is valid
622		 * and check the ack value only.
623		 * However, RST segments are always validated by their
624		 * SEQ number, except when seq == 0 (reset sent answering
625		 * a SYN).
626 */
627 seq = end = sender->td_end;
628
629 DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
630 "seq=%u ack=%u sack =%u win=%u end=%u\n",
631 NIPQUAD(iph->saddr), ntohs(tcph->source),
632 NIPQUAD(iph->daddr), ntohs(tcph->dest),
633 seq, ack, sack, win, end);
634 DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
635 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
636 sender->td_end, sender->td_maxend, sender->td_maxwin,
637 sender->td_scale,
638 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
639 receiver->td_scale);
640
641 DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
642 before(seq, sender->td_maxend + 1),
643 after(end, sender->td_end - receiver->td_maxwin - 1),
644 before(sack, receiver->td_end + 1),
645 after(ack, receiver->td_end - MAXACKWINDOW(sender)));
646
647 if (sender->loose || receiver->loose ||
648 (before(seq, sender->td_maxend + 1) &&
649 after(end, sender->td_end - receiver->td_maxwin - 1) &&
650 before(sack, receiver->td_end + 1) &&
651 after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
652 /*
653 * Take into account window scaling (RFC 1323).
654 */
655 if (!tcph->syn)
656 win <<= sender->td_scale;
657
658 /*
659 * Update sender data.
660 */
661 swin = win + (sack - ack);
662 if (sender->td_maxwin < swin)
663 sender->td_maxwin = swin;
664 if (after(end, sender->td_end))
665 sender->td_end = end;
666 /*
667 * Update receiver data.
668 */
669 if (after(end, sender->td_maxend))
670 receiver->td_maxwin += end - sender->td_maxend;
671 if (after(sack + win, receiver->td_maxend - 1)) {
672 receiver->td_maxend = sack + win;
673 if (win == 0)
674 receiver->td_maxend++;
675 }
676
677 /*
678 * Check retransmissions.
679 */
680 if (index == TCP_ACK_SET) {
681 if (state->last_dir == dir
682 && state->last_seq == seq
683 && state->last_ack == ack
684 && state->last_end == end)
685 state->retrans++;
686 else {
687 state->last_dir = dir;
688 state->last_seq = seq;
689 state->last_ack = ack;
690 state->last_end = end;
691 state->retrans = 0;
692 }
693 }
694 /*
695 * Close the window of disabled window tracking :-)
696 */
697 if (sender->loose)
698 sender->loose--;
699
700 res = 1;
701 } else {
702 if (LOG_INVALID(IPPROTO_TCP))
703 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
704 "ip_ct_tcp: %s ",
705 before(seq, sender->td_maxend + 1) ?
706 after(end, sender->td_end - receiver->td_maxwin - 1) ?
707 before(sack, receiver->td_end + 1) ?
708 after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
709 : "ACK is under the lower bound (possible overly delayed ACK)"
710 : "ACK is over the upper bound (ACKed data not seen yet)"
711 : "SEQ is under the lower bound (already ACKed data retransmitted)"
712 : "SEQ is over the upper bound (over the window of the receiver)");
713
714 res = ip_ct_tcp_be_liberal;
715 }
716
717 DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
718 "receiver end=%u maxend=%u maxwin=%u\n",
719 res, sender->td_end, sender->td_maxend, sender->td_maxwin,
720 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
721
722 return res;
723}
724
725#ifdef CONFIG_IP_NF_NAT_NEEDED
726/* Update sender->td_end after NAT successfully mangled the packet */
727void ip_conntrack_tcp_update(struct sk_buff *skb,
728 struct ip_conntrack *conntrack,
729 enum ip_conntrack_dir dir)
730{
731 struct iphdr *iph = skb->nh.iph;
732 struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4;
733 __u32 end;
734#ifdef DEBUGP_VARS
735 struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
736 struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
737#endif
738
739 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
740
741 WRITE_LOCK(&tcp_lock);
742 /*
743 * We have to worry for the ack in the reply packet only...
744 */
745 if (after(end, conntrack->proto.tcp.seen[dir].td_end))
746 conntrack->proto.tcp.seen[dir].td_end = end;
747 conntrack->proto.tcp.last_end = end;
748 WRITE_UNLOCK(&tcp_lock);
749 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
750 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
751 sender->td_end, sender->td_maxend, sender->td_maxwin,
752 sender->td_scale,
753 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
754 receiver->td_scale);
755}
756
757#endif
758
759#define TH_FIN 0x01
760#define TH_SYN 0x02
761#define TH_RST 0x04
762#define TH_PUSH 0x08
763#define TH_ACK 0x10
764#define TH_URG 0x20
765#define TH_ECE 0x40
766#define TH_CWR 0x80
767
768/* table of valid flag combinations - ECE and CWR are always valid */
769static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
770{
771 [TH_SYN] = 1,
772 [TH_SYN|TH_ACK] = 1,
773 [TH_RST] = 1,
774 [TH_RST|TH_ACK] = 1,
775 [TH_RST|TH_ACK|TH_PUSH] = 1,
776 [TH_FIN|TH_ACK] = 1,
777 [TH_ACK] = 1,
778 [TH_ACK|TH_PUSH] = 1,
779 [TH_ACK|TH_URG] = 1,
780 [TH_ACK|TH_URG|TH_PUSH] = 1,
781 [TH_FIN|TH_ACK|TH_PUSH] = 1,
782 [TH_FIN|TH_ACK|TH_URG] = 1,
783 [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1,
784};
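/*
 * Lookup example (sketch): tcp_error() below masks byte 13 of the TCP
 * header with ~(TH_ECE|TH_CWR) and indexes this table, so an illegal
 * SYN|FIN segment yields tcp_valid_flags[TH_SYN|TH_FIN] == 0 and is
 * rejected, while SYN|ACK|ECE|CWR reduces to TH_SYN|TH_ACK and passes.
 */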
785
786/* Protect conntrack against broken packets. Code taken from ipt_unclean.c. */
787static int tcp_error(struct sk_buff *skb,
788 enum ip_conntrack_info *ctinfo,
789 unsigned int hooknum)
790{
791 struct iphdr *iph = skb->nh.iph;
792 struct tcphdr _tcph, *th;
793 unsigned int tcplen = skb->len - iph->ihl * 4;
794 u_int8_t tcpflags;
795
796	/* Smaller than minimal TCP header? */
797 th = skb_header_pointer(skb, iph->ihl * 4,
798 sizeof(_tcph), &_tcph);
799 if (th == NULL) {
800 if (LOG_INVALID(IPPROTO_TCP))
801 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
802 "ip_ct_tcp: short packet ");
803 return -NF_ACCEPT;
804 }
805
806 /* Not whole TCP header or malformed packet */
807 if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
808 if (LOG_INVALID(IPPROTO_TCP))
809 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
810 "ip_ct_tcp: truncated/malformed packet ");
811 return -NF_ACCEPT;
812 }
813
814 /* Checksum invalid? Ignore.
815 * We skip checking packets on the outgoing path
816 * because the semantic of CHECKSUM_HW is different there
817 * and moreover root might send raw packets.
818 */
819 /* FIXME: Source route IP option packets --RR */
820 if (hooknum == NF_IP_PRE_ROUTING
821 && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
822 skb->ip_summed == CHECKSUM_HW ? skb->csum
823 : skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
824 if (LOG_INVALID(IPPROTO_TCP))
825 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
826 "ip_ct_tcp: bad TCP checksum ");
827 return -NF_ACCEPT;
828 }
829
830 /* Check TCP flags. */
831 tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
832 if (!tcp_valid_flags[tcpflags]) {
833 if (LOG_INVALID(IPPROTO_TCP))
834 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
835 "ip_ct_tcp: invalid TCP flag combination ");
836 return -NF_ACCEPT;
837 }
838
839 return NF_ACCEPT;
840}
841
842/* Returns verdict for packet, or -1 for invalid. */
843static int tcp_packet(struct ip_conntrack *conntrack,
844 const struct sk_buff *skb,
845 enum ip_conntrack_info ctinfo)
846{
847 enum tcp_conntrack new_state, old_state;
848 enum ip_conntrack_dir dir;
849 struct iphdr *iph = skb->nh.iph;
850 struct tcphdr *th, _tcph;
851 unsigned long timeout;
852 unsigned int index;
853
854 th = skb_header_pointer(skb, iph->ihl * 4,
855 sizeof(_tcph), &_tcph);
856 BUG_ON(th == NULL);
857
858 WRITE_LOCK(&tcp_lock);
859 old_state = conntrack->proto.tcp.state;
860 dir = CTINFO2DIR(ctinfo);
861 index = get_conntrack_index(th);
862 new_state = tcp_conntracks[dir][index][old_state];
863
864 switch (new_state) {
865 case TCP_CONNTRACK_IGNORE:
866 /* Either SYN in ORIGINAL
867 * or SYN/ACK in REPLY. */
868 if (index == TCP_SYNACK_SET
869 && conntrack->proto.tcp.last_index == TCP_SYN_SET
870 && conntrack->proto.tcp.last_dir != dir
871 && ntohl(th->ack_seq) ==
872 conntrack->proto.tcp.last_end) {
873 /* This SYN/ACK acknowledges a SYN that we earlier
874 * ignored as invalid. This means that the client and
875 * the server are both in sync, while the firewall is
876 * not. We kill this session and block the SYN/ACK so
877 * that the client cannot but retransmit its SYN and
878 * thus initiate a clean new session.
879 */
880 WRITE_UNLOCK(&tcp_lock);
881 if (LOG_INVALID(IPPROTO_TCP))
882 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
883 "ip_ct_tcp: killing out of sync session ");
884 if (del_timer(&conntrack->timeout))
885 conntrack->timeout.function((unsigned long)
886 conntrack);
887 return -NF_DROP;
888 }
889 conntrack->proto.tcp.last_index = index;
890 conntrack->proto.tcp.last_dir = dir;
891 conntrack->proto.tcp.last_seq = ntohl(th->seq);
892 conntrack->proto.tcp.last_end =
893 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
894
895 WRITE_UNLOCK(&tcp_lock);
896 if (LOG_INVALID(IPPROTO_TCP))
897 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
898 "ip_ct_tcp: invalid packet ignored ");
899 return NF_ACCEPT;
900 case TCP_CONNTRACK_MAX:
901 /* Invalid packet */
902 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
903 dir, get_conntrack_index(th),
904 old_state);
905 WRITE_UNLOCK(&tcp_lock);
906 if (LOG_INVALID(IPPROTO_TCP))
907 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
908 "ip_ct_tcp: invalid state ");
909 return -NF_ACCEPT;
910 case TCP_CONNTRACK_SYN_SENT:
911 if (old_state < TCP_CONNTRACK_TIME_WAIT)
912 break;
913 if ((conntrack->proto.tcp.seen[dir].flags &
914 IP_CT_TCP_FLAG_CLOSE_INIT)
915 || after(ntohl(th->seq),
916 conntrack->proto.tcp.seen[dir].td_end)) {
917 /* Attempt to reopen a closed connection.
918 * Delete this connection and look up again. */
919 WRITE_UNLOCK(&tcp_lock);
920 if (del_timer(&conntrack->timeout))
921 conntrack->timeout.function((unsigned long)
922 conntrack);
923 return -NF_REPEAT;
924 } else {
925 WRITE_UNLOCK(&tcp_lock);
926 if (LOG_INVALID(IPPROTO_TCP))
927 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
928 "ip_ct_tcp: invalid SYN");
929 return -NF_ACCEPT;
930 }
931 case TCP_CONNTRACK_CLOSE:
932 if (index == TCP_RST_SET
933 && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
934 && conntrack->proto.tcp.last_index == TCP_SYN_SET
935 && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) {
 936		/* RST sent to an invalid SYN that we had let through:
 937		 * the SYN was in window then, so tear down the connection.
938 * We skip window checking, because packet might ACK
939 * segments we ignored in the SYN. */
940 goto in_window;
941 }
 942	/* Just fall through */
943 default:
944 /* Keep compilers happy. */
945 break;
946 }
947
948 if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
949 skb, iph, th)) {
950 WRITE_UNLOCK(&tcp_lock);
951 return -NF_ACCEPT;
952 }
953 in_window:
954 /* From now on we have got in-window packets */
955 conntrack->proto.tcp.last_index = index;
956
957 DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
958 "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
959 NIPQUAD(iph->saddr), ntohs(th->source),
960 NIPQUAD(iph->daddr), ntohs(th->dest),
961 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
962 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
963 old_state, new_state);
964
965 conntrack->proto.tcp.state = new_state;
966 if (old_state != new_state
967 && (new_state == TCP_CONNTRACK_FIN_WAIT
968 || new_state == TCP_CONNTRACK_CLOSE))
969 conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
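	/* Once enough retransmissions have been seen without progress,
	 * clamp the timeout: the (shorter) max_retrans timeout is used
	 * instead of the per-state one below. */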
970 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
971 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
972 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
973 WRITE_UNLOCK(&tcp_lock);
974
975 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
976 /* If only reply is a RST, we can consider ourselves not to
977 have an established connection: this is a fairly common
978 problem case, so we can delete the conntrack
979 immediately. --RR */
980 if (th->rst) {
981 if (del_timer(&conntrack->timeout))
982 conntrack->timeout.function((unsigned long)
983 conntrack);
984 return NF_ACCEPT;
985 }
986 } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
987 && (old_state == TCP_CONNTRACK_SYN_RECV
988 || old_state == TCP_CONNTRACK_ESTABLISHED)
989 && new_state == TCP_CONNTRACK_ESTABLISHED) {
 990		/* Set ASSURED if we see a valid ack in ESTABLISHED
991 after SYN_RECV or a valid answer for a picked up
992 connection. */
993 set_bit(IPS_ASSURED_BIT, &conntrack->status);
994 }
995 ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
996
997 return NF_ACCEPT;
998}
999
1000/* Called when a new connection for this protocol found. */
1001static int tcp_new(struct ip_conntrack *conntrack,
1002 const struct sk_buff *skb)
1003{
1004 enum tcp_conntrack new_state;
1005 struct iphdr *iph = skb->nh.iph;
1006 struct tcphdr *th, _tcph;
1007#ifdef DEBUGP_VARS
1008 struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
1009 struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
1010#endif
1011
1012 th = skb_header_pointer(skb, iph->ihl * 4,
1013 sizeof(_tcph), &_tcph);
1014 BUG_ON(th == NULL);
1015
1016 /* Don't need lock here: this conntrack not in circulation yet */
1017 new_state
1018 = tcp_conntracks[0][get_conntrack_index(th)]
1019 [TCP_CONNTRACK_NONE];
1020
1021 /* Invalid: delete conntrack */
1022 if (new_state >= TCP_CONNTRACK_MAX) {
1023 DEBUGP("ip_ct_tcp: invalid new deleting.\n");
1024 return 0;
1025 }
1026
1027 if (new_state == TCP_CONNTRACK_SYN_SENT) {
1028 /* SYN packet */
1029 conntrack->proto.tcp.seen[0].td_end =
1030 segment_seq_plus_len(ntohl(th->seq), skb->len,
1031 iph, th);
1032 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1033 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1034 conntrack->proto.tcp.seen[0].td_maxwin = 1;
1035 conntrack->proto.tcp.seen[0].td_maxend =
1036 conntrack->proto.tcp.seen[0].td_end;
1037
1038 tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]);
1039 conntrack->proto.tcp.seen[1].flags = 0;
1040 conntrack->proto.tcp.seen[0].loose =
1041 conntrack->proto.tcp.seen[1].loose = 0;
1042 } else if (ip_ct_tcp_loose == 0) {
1043 /* Don't try to pick up connections. */
1044 return 0;
1045 } else {
1046 /*
1047 * We are in the middle of a connection,
1048 * its history is lost for us.
1049 * Let's try to use the data from the packet.
1050 */
1051 conntrack->proto.tcp.seen[0].td_end =
1052 segment_seq_plus_len(ntohl(th->seq), skb->len,
1053 iph, th);
1054 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1055 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1056 conntrack->proto.tcp.seen[0].td_maxwin = 1;
1057 conntrack->proto.tcp.seen[0].td_maxend =
1058 conntrack->proto.tcp.seen[0].td_end +
1059 conntrack->proto.tcp.seen[0].td_maxwin;
1060 conntrack->proto.tcp.seen[0].td_scale = 0;
1061
1062 /* We assume SACK. Should we assume window scaling too? */
1063 conntrack->proto.tcp.seen[0].flags =
1064 conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
1065 conntrack->proto.tcp.seen[0].loose =
1066 conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose;
1067 }
1068
1069 conntrack->proto.tcp.seen[1].td_end = 0;
1070 conntrack->proto.tcp.seen[1].td_maxend = 0;
1071 conntrack->proto.tcp.seen[1].td_maxwin = 1;
1072 conntrack->proto.tcp.seen[1].td_scale = 0;
1073
1074 /* tcp_packet will set them */
1075 conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
1076 conntrack->proto.tcp.last_index = TCP_NONE_SET;
1077
1078 DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
1079 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
1080 sender->td_end, sender->td_maxend, sender->td_maxwin,
1081 sender->td_scale,
1082 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
1083 receiver->td_scale);
1084 return 1;
1085}
1086
1087struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
1088{
1089 .proto = IPPROTO_TCP,
1090 .name = "tcp",
1091 .pkt_to_tuple = tcp_pkt_to_tuple,
1092 .invert_tuple = tcp_invert_tuple,
1093 .print_tuple = tcp_print_tuple,
1094 .print_conntrack = tcp_print_conntrack,
1095 .packet = tcp_packet,
1096 .new = tcp_new,
1097 .error = tcp_error,
1098};
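/* Editorial sketch (not part of this patch): the built-in TCP/UDP/ICMP
 * trackers are installed directly at conntrack init time, but the same
 * struct layout is how a loadable tracker would be plugged in through
 * ip_conntrack_protocol_register()/_unregister(), which are defined and
 * exported further down in ip_conntrack_standalone.c. The module below is
 * hypothetical; "my_ipproto_tracker" stands for a struct filled in like
 * ip_conntrack_protocol_tcp above. */
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>

extern struct ip_conntrack_protocol my_ipproto_tracker;	/* hypothetical */

static int __init my_tracker_init(void)
{
	/* Fails with -EBUSY if another tracker already owns the protocol. */
	return ip_conntrack_protocol_register(&my_ipproto_tracker);
}

static void __exit my_tracker_fini(void)
{
	/* Also flushes existing conntrack entries for this protocol. */
	ip_conntrack_protocol_unregister(&my_ipproto_tracker);
}

module_init(my_tracker_init);
module_exit(my_tracker_fini);
MODULE_LICENSE("GPL");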
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
new file mode 100644
index 000000000000..5bc28a224623
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -0,0 +1,146 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/sched.h>
11#include <linux/timer.h>
12#include <linux/netfilter.h>
13#include <linux/in.h>
14#include <linux/udp.h>
15#include <linux/seq_file.h>
16#include <net/checksum.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
20
21unsigned long ip_ct_udp_timeout = 30*HZ;
22unsigned long ip_ct_udp_timeout_stream = 180*HZ;
23
24static int udp_pkt_to_tuple(const struct sk_buff *skb,
25 unsigned int dataoff,
26 struct ip_conntrack_tuple *tuple)
27{
28 struct udphdr _hdr, *hp;
29
30 /* Actually only need first 8 bytes. */
31 hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
32 if (hp == NULL)
33 return 0;
34
35 tuple->src.u.udp.port = hp->source;
36 tuple->dst.u.udp.port = hp->dest;
37
38 return 1;
39}
40
41static int udp_invert_tuple(struct ip_conntrack_tuple *tuple,
42 const struct ip_conntrack_tuple *orig)
43{
44 tuple->src.u.udp.port = orig->dst.u.udp.port;
45 tuple->dst.u.udp.port = orig->src.u.udp.port;
46 return 1;
47}
48
49/* Print out the per-protocol part of the tuple. */
50static int udp_print_tuple(struct seq_file *s,
51 const struct ip_conntrack_tuple *tuple)
52{
53 return seq_printf(s, "sport=%hu dport=%hu ",
54 ntohs(tuple->src.u.udp.port),
55 ntohs(tuple->dst.u.udp.port));
56}
57
58/* Print out the private part of the conntrack. */
59static int udp_print_conntrack(struct seq_file *s,
60 const struct ip_conntrack *conntrack)
61{
62 return 0;
63}
64
 65/* Returns verdict for packet, and may modify conntrack type */
66static int udp_packet(struct ip_conntrack *conntrack,
67 const struct sk_buff *skb,
68 enum ip_conntrack_info ctinfo)
69{
70 /* If we've seen traffic both ways, this is some kind of UDP
71 stream. Extend timeout. */
72 if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
73 ip_ct_refresh_acct(conntrack, ctinfo, skb,
74 ip_ct_udp_timeout_stream);
75 /* Also, more likely to be important, and not a probe */
76 set_bit(IPS_ASSURED_BIT, &conntrack->status);
77 } else
78 ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
79
80 return NF_ACCEPT;
81}
82
83/* Called when a new connection for this protocol found. */
84static int udp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb)
85{
86 return 1;
87}
88
89static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
90 unsigned int hooknum)
91{
92 struct iphdr *iph = skb->nh.iph;
93 unsigned int udplen = skb->len - iph->ihl * 4;
94 struct udphdr _hdr, *hdr;
95
96 /* Header is too small? */
97 hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
98 if (hdr == NULL) {
99 if (LOG_INVALID(IPPROTO_UDP))
100 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
101 "ip_ct_udp: short packet ");
102 return -NF_ACCEPT;
103 }
104
105 /* Truncated/malformed packets */
106 if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
107 if (LOG_INVALID(IPPROTO_UDP))
108 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
109 "ip_ct_udp: truncated/malformed packet ");
110 return -NF_ACCEPT;
111 }
112
113 /* Packet with no checksum */
114 if (!hdr->check)
115 return NF_ACCEPT;
116
117 /* Checksum invalid? Ignore.
118 * We skip checking packets on the outgoing path
119 * because the semantic of CHECKSUM_HW is different there
120 * and moreover root might send raw packets.
121 * FIXME: Source route IP option packets --RR */
122 if (hooknum == NF_IP_PRE_ROUTING
123 && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP,
124 skb->ip_summed == CHECKSUM_HW ? skb->csum
125 : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
126 if (LOG_INVALID(IPPROTO_UDP))
127 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
128 "ip_ct_udp: bad UDP checksum ");
129 return -NF_ACCEPT;
130 }
131
132 return NF_ACCEPT;
133}
134
135struct ip_conntrack_protocol ip_conntrack_protocol_udp =
136{
137 .proto = IPPROTO_UDP,
138 .name = "udp",
139 .pkt_to_tuple = udp_pkt_to_tuple,
140 .invert_tuple = udp_invert_tuple,
141 .print_tuple = udp_print_tuple,
142 .print_conntrack = udp_print_conntrack,
143 .packet = udp_packet,
144 .new = udp_new,
145 .error = udp_error,
146};
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
new file mode 100644
index 000000000000..80a7bde2a57a
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -0,0 +1,961 @@
1/* This file contains all the functions required for the standalone
2 ip_conntrack module.
3
4 These are not required by the compatibility layer.
5*/
6
7/* (C) 1999-2001 Paul `Rusty' Russell
8 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 */
14
15#include <linux/config.h>
16#include <linux/types.h>
17#include <linux/ip.h>
18#include <linux/netfilter.h>
19#include <linux/netfilter_ipv4.h>
20#include <linux/module.h>
21#include <linux/skbuff.h>
22#include <linux/proc_fs.h>
23#include <linux/seq_file.h>
24#include <linux/percpu.h>
25#ifdef CONFIG_SYSCTL
26#include <linux/sysctl.h>
27#endif
28#include <net/checksum.h>
29#include <net/ip.h>
30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
33
34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
36#include <linux/netfilter_ipv4/ip_conntrack_core.h>
37#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
38#include <linux/netfilter_ipv4/listhelp.h>
39
40#if 0
41#define DEBUGP printk
42#else
43#define DEBUGP(format, args...)
44#endif
45
46MODULE_LICENSE("GPL");
47
48extern atomic_t ip_conntrack_count;
49DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
50
51static int kill_proto(struct ip_conntrack *i, void *data)
52{
53 return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum ==
54 *((u_int8_t *) data));
55}
56
57#ifdef CONFIG_PROC_FS
58static int
59print_tuple(struct seq_file *s, const struct ip_conntrack_tuple *tuple,
60 struct ip_conntrack_protocol *proto)
61{
62 seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ",
63 NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip));
64 return proto->print_tuple(s, tuple);
65}
66
67#ifdef CONFIG_IP_NF_CT_ACCT
68static unsigned int
69seq_print_counters(struct seq_file *s,
70 const struct ip_conntrack_counter *counter)
71{
72 return seq_printf(s, "packets=%llu bytes=%llu ",
73 (unsigned long long)counter->packets,
74 (unsigned long long)counter->bytes);
75}
76#else
77#define seq_print_counters(x, y) 0
78#endif
79
80struct ct_iter_state {
81 unsigned int bucket;
82};
83
84static struct list_head *ct_get_first(struct seq_file *seq)
85{
86 struct ct_iter_state *st = seq->private;
87
88 for (st->bucket = 0;
89 st->bucket < ip_conntrack_htable_size;
90 st->bucket++) {
91 if (!list_empty(&ip_conntrack_hash[st->bucket]))
92 return ip_conntrack_hash[st->bucket].next;
93 }
94 return NULL;
95}
96
97static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head)
98{
99 struct ct_iter_state *st = seq->private;
100
101 head = head->next;
102 while (head == &ip_conntrack_hash[st->bucket]) {
103 if (++st->bucket >= ip_conntrack_htable_size)
104 return NULL;
105 head = ip_conntrack_hash[st->bucket].next;
106 }
107 return head;
108}
109
110static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
111{
112 struct list_head *head = ct_get_first(seq);
113
114 if (head)
115 while (pos && (head = ct_get_next(seq, head)))
116 pos--;
117 return pos ? NULL : head;
118}
119
120static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
121{
122 READ_LOCK(&ip_conntrack_lock);
123 return ct_get_idx(seq, *pos);
124}
125
126static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
127{
128 (*pos)++;
129 return ct_get_next(s, v);
130}
131
132static void ct_seq_stop(struct seq_file *s, void *v)
133{
134 READ_UNLOCK(&ip_conntrack_lock);
135}
136
137static int ct_seq_show(struct seq_file *s, void *v)
138{
139 const struct ip_conntrack_tuple_hash *hash = v;
140 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
141 struct ip_conntrack_protocol *proto;
142
143 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
144 IP_NF_ASSERT(conntrack);
145
146 /* we only want to print DIR_ORIGINAL */
147 if (DIRECTION(hash))
148 return 0;
149
150 proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
151 .tuple.dst.protonum);
152 IP_NF_ASSERT(proto);
153
154 if (seq_printf(s, "%-8s %u %ld ",
155 proto->name,
156 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum,
157 timer_pending(&conntrack->timeout)
158 ? (long)(conntrack->timeout.expires - jiffies)/HZ
159 : 0) != 0)
160 return -ENOSPC;
161
162 if (proto->print_conntrack(s, conntrack))
163 return -ENOSPC;
164
165 if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
166 proto))
167 return -ENOSPC;
168
169 if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL]))
170 return -ENOSPC;
171
172 if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)))
173 if (seq_printf(s, "[UNREPLIED] "))
174 return -ENOSPC;
175
176 if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple,
177 proto))
178 return -ENOSPC;
179
180 if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY]))
181 return -ENOSPC;
182
183 if (test_bit(IPS_ASSURED_BIT, &conntrack->status))
184 if (seq_printf(s, "[ASSURED] "))
185 return -ENOSPC;
186
187#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
188 if (seq_printf(s, "mark=%lu ", conntrack->mark))
189 return -ENOSPC;
190#endif
191
192 if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
193 return -ENOSPC;
194
195 return 0;
196}
197
198static struct seq_operations ct_seq_ops = {
199 .start = ct_seq_start,
200 .next = ct_seq_next,
201 .stop = ct_seq_stop,
202 .show = ct_seq_show
203};
204
205static int ct_open(struct inode *inode, struct file *file)
206{
207 struct seq_file *seq;
208 struct ct_iter_state *st;
209 int ret;
210
211 st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL);
212 if (st == NULL)
213 return -ENOMEM;
214 ret = seq_open(file, &ct_seq_ops);
215 if (ret)
216 goto out_free;
217 seq = file->private_data;
218 seq->private = st;
219 memset(st, 0, sizeof(struct ct_iter_state));
220 return ret;
221out_free:
222 kfree(st);
223 return ret;
224}
225
226static struct file_operations ct_file_ops = {
227 .owner = THIS_MODULE,
228 .open = ct_open,
229 .read = seq_read,
230 .llseek = seq_lseek,
231 .release = seq_release_private,
232};
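/* Editorial aside: ct_file_ops above backs the /proc/net/ip_conntrack file
 * created further down in init_or_cleanup(). A throwaway userspace reader
 * (an illustration, not part of this patch) only has to stream the file;
 * each line is one entry in the format produced by ct_seq_show(). The file
 * is created with mode 0440, so this needs the right privileges. */
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/ip_conntrack", "r");

	if (!f) {
		perror("/proc/net/ip_conntrack");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}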
233
234/* expects */
235static void *exp_seq_start(struct seq_file *s, loff_t *pos)
236{
237 struct list_head *e = &ip_conntrack_expect_list;
238 loff_t i;
239
 240	/* The seq_file API calls stop() even if start() fails,
 241	 * so we need to grab the lock here, since stop() unlocks. */
242 READ_LOCK(&ip_conntrack_lock);
243
244 if (list_empty(e))
245 return NULL;
246
247 for (i = 0; i <= *pos; i++) {
248 e = e->next;
249 if (e == &ip_conntrack_expect_list)
250 return NULL;
251 }
252 return e;
253}
254
255static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
256{
257 struct list_head *e = v;
258
259 e = e->next;
260
261 if (e == &ip_conntrack_expect_list)
262 return NULL;
263
264 return e;
265}
266
267static void exp_seq_stop(struct seq_file *s, void *v)
268{
269 READ_UNLOCK(&ip_conntrack_lock);
270}
271
272static int exp_seq_show(struct seq_file *s, void *v)
273{
274 struct ip_conntrack_expect *expect = v;
275
276 if (expect->timeout.function)
277 seq_printf(s, "%ld ", timer_pending(&expect->timeout)
278 ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
279 else
280 seq_printf(s, "- ");
281
282 seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
283
284 print_tuple(s, &expect->tuple,
285 ip_ct_find_proto(expect->tuple.dst.protonum));
286 return seq_putc(s, '\n');
287}
288
289static struct seq_operations exp_seq_ops = {
290 .start = exp_seq_start,
291 .next = exp_seq_next,
292 .stop = exp_seq_stop,
293 .show = exp_seq_show
294};
295
296static int exp_open(struct inode *inode, struct file *file)
297{
298 return seq_open(file, &exp_seq_ops);
299}
300
301static struct file_operations exp_file_ops = {
302 .owner = THIS_MODULE,
303 .open = exp_open,
304 .read = seq_read,
305 .llseek = seq_lseek,
306 .release = seq_release
307};
308
309static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
310{
311 int cpu;
312
313 if (*pos == 0)
314 return SEQ_START_TOKEN;
315
316 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
317 if (!cpu_possible(cpu))
318 continue;
319 *pos = cpu+1;
320 return &per_cpu(ip_conntrack_stat, cpu);
321 }
322
323 return NULL;
324}
325
326static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
327{
328 int cpu;
329
330 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
331 if (!cpu_possible(cpu))
332 continue;
333 *pos = cpu+1;
334 return &per_cpu(ip_conntrack_stat, cpu);
335 }
336
337 return NULL;
338}
339
340static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
341{
342}
343
344static int ct_cpu_seq_show(struct seq_file *seq, void *v)
345{
346 unsigned int nr_conntracks = atomic_read(&ip_conntrack_count);
347 struct ip_conntrack_stat *st = v;
348
349 if (v == SEQ_START_TOKEN) {
350 seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n");
351 return 0;
352 }
353
354 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
355 "%08x %08x %08x %08x %08x %08x %08x %08x \n",
356 nr_conntracks,
357 st->searched,
358 st->found,
359 st->new,
360 st->invalid,
361 st->ignore,
362 st->delete,
363 st->delete_list,
364 st->insert,
365 st->insert_failed,
366 st->drop,
367 st->early_drop,
368 st->error,
369
370 st->expect_new,
371 st->expect_create,
372 st->expect_delete
373 );
374 return 0;
375}
376
377static struct seq_operations ct_cpu_seq_ops = {
378 .start = ct_cpu_seq_start,
379 .next = ct_cpu_seq_next,
380 .stop = ct_cpu_seq_stop,
381 .show = ct_cpu_seq_show,
382};
383
384static int ct_cpu_seq_open(struct inode *inode, struct file *file)
385{
386 return seq_open(file, &ct_cpu_seq_ops);
387}
388
389static struct file_operations ct_cpu_seq_fops = {
390 .owner = THIS_MODULE,
391 .open = ct_cpu_seq_open,
392 .read = seq_read,
393 .llseek = seq_lseek,
394 .release = seq_release_private,
395};
396#endif
397
398static unsigned int ip_confirm(unsigned int hooknum,
399 struct sk_buff **pskb,
400 const struct net_device *in,
401 const struct net_device *out,
402 int (*okfn)(struct sk_buff *))
403{
404 struct ip_conntrack *ct;
405 enum ip_conntrack_info ctinfo;
406
407 /* This is where we call the helper: as the packet goes out. */
408 ct = ip_conntrack_get(*pskb, &ctinfo);
409 if (ct && ct->helper) {
410 unsigned int ret;
411 ret = ct->helper->help(pskb, ct, ctinfo);
412 if (ret != NF_ACCEPT)
413 return ret;
414 }
415
416 /* We've seen it coming out the other side: confirm it */
417 return ip_conntrack_confirm(pskb);
418}
419
420static unsigned int ip_conntrack_defrag(unsigned int hooknum,
421 struct sk_buff **pskb,
422 const struct net_device *in,
423 const struct net_device *out,
424 int (*okfn)(struct sk_buff *))
425{
426#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
427 /* Previously seen (loopback)? Ignore. Do this before
428 fragment check. */
429 if ((*pskb)->nfct)
430 return NF_ACCEPT;
431#endif
432
433 /* Gather fragments. */
434 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
435 *pskb = ip_ct_gather_frags(*pskb,
436 hooknum == NF_IP_PRE_ROUTING ?
437 IP_DEFRAG_CONNTRACK_IN :
438 IP_DEFRAG_CONNTRACK_OUT);
439 if (!*pskb)
440 return NF_STOLEN;
441 }
442 return NF_ACCEPT;
443}
444
445static unsigned int ip_refrag(unsigned int hooknum,
446 struct sk_buff **pskb,
447 const struct net_device *in,
448 const struct net_device *out,
449 int (*okfn)(struct sk_buff *))
450{
451 struct rtable *rt = (struct rtable *)(*pskb)->dst;
452
453 /* We've seen it coming out the other side: confirm */
454 if (ip_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT)
455 return NF_DROP;
456
457 /* Local packets are never produced too large for their
 458	   interface. We defragment them at LOCAL_OUT, however,
459 so we have to refragment them here. */
460 if ((*pskb)->len > dst_mtu(&rt->u.dst) &&
461 !skb_shinfo(*pskb)->tso_size) {
462 /* No hook can be after us, so this should be OK. */
463 ip_fragment(*pskb, okfn);
464 return NF_STOLEN;
465 }
466 return NF_ACCEPT;
467}
468
469static unsigned int ip_conntrack_local(unsigned int hooknum,
470 struct sk_buff **pskb,
471 const struct net_device *in,
472 const struct net_device *out,
473 int (*okfn)(struct sk_buff *))
474{
475 /* root is playing with raw sockets. */
476 if ((*pskb)->len < sizeof(struct iphdr)
477 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
478 if (net_ratelimit())
479 printk("ipt_hook: happy cracking.\n");
480 return NF_ACCEPT;
481 }
482 return ip_conntrack_in(hooknum, pskb, in, out, okfn);
483}
484
485/* Connection tracking may drop packets, but never alters them, so
486 make it the first hook. */
487static struct nf_hook_ops ip_conntrack_defrag_ops = {
488 .hook = ip_conntrack_defrag,
489 .owner = THIS_MODULE,
490 .pf = PF_INET,
491 .hooknum = NF_IP_PRE_ROUTING,
492 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
493};
494
495static struct nf_hook_ops ip_conntrack_in_ops = {
496 .hook = ip_conntrack_in,
497 .owner = THIS_MODULE,
498 .pf = PF_INET,
499 .hooknum = NF_IP_PRE_ROUTING,
500 .priority = NF_IP_PRI_CONNTRACK,
501};
502
503static struct nf_hook_ops ip_conntrack_defrag_local_out_ops = {
504 .hook = ip_conntrack_defrag,
505 .owner = THIS_MODULE,
506 .pf = PF_INET,
507 .hooknum = NF_IP_LOCAL_OUT,
508 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
509};
510
511static struct nf_hook_ops ip_conntrack_local_out_ops = {
512 .hook = ip_conntrack_local,
513 .owner = THIS_MODULE,
514 .pf = PF_INET,
515 .hooknum = NF_IP_LOCAL_OUT,
516 .priority = NF_IP_PRI_CONNTRACK,
517};
518
519/* Refragmenter; last chance. */
520static struct nf_hook_ops ip_conntrack_out_ops = {
521 .hook = ip_refrag,
522 .owner = THIS_MODULE,
523 .pf = PF_INET,
524 .hooknum = NF_IP_POST_ROUTING,
525 .priority = NF_IP_PRI_LAST,
526};
527
528static struct nf_hook_ops ip_conntrack_local_in_ops = {
529 .hook = ip_confirm,
530 .owner = THIS_MODULE,
531 .pf = PF_INET,
532 .hooknum = NF_IP_LOCAL_IN,
533 .priority = NF_IP_PRI_LAST-1,
534};
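/* Net effect of the six nf_hook_ops above: incoming packets are defragmented
 * and then tracked in PRE_ROUTING, locally generated ones in LOCAL_OUT, and
 * every connection is confirmed at the last hook its packets traverse -
 * LOCAL_IN for traffic addressed to this host, POST_ROUTING (where ip_refrag
 * also re-fragments) for forwarded and locally generated traffic. */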
535
536/* Sysctl support */
537
538#ifdef CONFIG_SYSCTL
539
540/* From ip_conntrack_core.c */
541extern int ip_conntrack_max;
542extern unsigned int ip_conntrack_htable_size;
543
544/* From ip_conntrack_proto_tcp.c */
545extern unsigned long ip_ct_tcp_timeout_syn_sent;
546extern unsigned long ip_ct_tcp_timeout_syn_recv;
547extern unsigned long ip_ct_tcp_timeout_established;
548extern unsigned long ip_ct_tcp_timeout_fin_wait;
549extern unsigned long ip_ct_tcp_timeout_close_wait;
550extern unsigned long ip_ct_tcp_timeout_last_ack;
551extern unsigned long ip_ct_tcp_timeout_time_wait;
552extern unsigned long ip_ct_tcp_timeout_close;
553extern unsigned long ip_ct_tcp_timeout_max_retrans;
554extern int ip_ct_tcp_loose;
555extern int ip_ct_tcp_be_liberal;
556extern int ip_ct_tcp_max_retrans;
557
558/* From ip_conntrack_proto_udp.c */
559extern unsigned long ip_ct_udp_timeout;
560extern unsigned long ip_ct_udp_timeout_stream;
561
562/* From ip_conntrack_proto_icmp.c */
563extern unsigned long ip_ct_icmp_timeout;
564
 565/* From ip_conntrack_proto_generic.c */
566extern unsigned long ip_ct_generic_timeout;
567
568/* Log invalid packets of a given protocol */
569static int log_invalid_proto_min = 0;
570static int log_invalid_proto_max = 255;
571
572static struct ctl_table_header *ip_ct_sysctl_header;
573
574static ctl_table ip_ct_sysctl_table[] = {
575 {
576 .ctl_name = NET_IPV4_NF_CONNTRACK_MAX,
577 .procname = "ip_conntrack_max",
578 .data = &ip_conntrack_max,
579 .maxlen = sizeof(int),
580 .mode = 0644,
581 .proc_handler = &proc_dointvec,
582 },
583 {
584 .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT,
585 .procname = "ip_conntrack_count",
586 .data = &ip_conntrack_count,
587 .maxlen = sizeof(int),
588 .mode = 0444,
589 .proc_handler = &proc_dointvec,
590 },
591 {
592 .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS,
593 .procname = "ip_conntrack_buckets",
594 .data = &ip_conntrack_htable_size,
595 .maxlen = sizeof(unsigned int),
596 .mode = 0444,
597 .proc_handler = &proc_dointvec,
598 },
599 {
600 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
601 .procname = "ip_conntrack_tcp_timeout_syn_sent",
602 .data = &ip_ct_tcp_timeout_syn_sent,
603 .maxlen = sizeof(unsigned int),
604 .mode = 0644,
605 .proc_handler = &proc_dointvec_jiffies,
606 },
607 {
608 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,
609 .procname = "ip_conntrack_tcp_timeout_syn_recv",
610 .data = &ip_ct_tcp_timeout_syn_recv,
611 .maxlen = sizeof(unsigned int),
612 .mode = 0644,
613 .proc_handler = &proc_dointvec_jiffies,
614 },
615 {
616 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,
617 .procname = "ip_conntrack_tcp_timeout_established",
618 .data = &ip_ct_tcp_timeout_established,
619 .maxlen = sizeof(unsigned int),
620 .mode = 0644,
621 .proc_handler = &proc_dointvec_jiffies,
622 },
623 {
624 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,
625 .procname = "ip_conntrack_tcp_timeout_fin_wait",
626 .data = &ip_ct_tcp_timeout_fin_wait,
627 .maxlen = sizeof(unsigned int),
628 .mode = 0644,
629 .proc_handler = &proc_dointvec_jiffies,
630 },
631 {
632 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,
633 .procname = "ip_conntrack_tcp_timeout_close_wait",
634 .data = &ip_ct_tcp_timeout_close_wait,
635 .maxlen = sizeof(unsigned int),
636 .mode = 0644,
637 .proc_handler = &proc_dointvec_jiffies,
638 },
639 {
640 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,
641 .procname = "ip_conntrack_tcp_timeout_last_ack",
642 .data = &ip_ct_tcp_timeout_last_ack,
643 .maxlen = sizeof(unsigned int),
644 .mode = 0644,
645 .proc_handler = &proc_dointvec_jiffies,
646 },
647 {
648 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,
649 .procname = "ip_conntrack_tcp_timeout_time_wait",
650 .data = &ip_ct_tcp_timeout_time_wait,
651 .maxlen = sizeof(unsigned int),
652 .mode = 0644,
653 .proc_handler = &proc_dointvec_jiffies,
654 },
655 {
656 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,
657 .procname = "ip_conntrack_tcp_timeout_close",
658 .data = &ip_ct_tcp_timeout_close,
659 .maxlen = sizeof(unsigned int),
660 .mode = 0644,
661 .proc_handler = &proc_dointvec_jiffies,
662 },
663 {
664 .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT,
665 .procname = "ip_conntrack_udp_timeout",
666 .data = &ip_ct_udp_timeout,
667 .maxlen = sizeof(unsigned int),
668 .mode = 0644,
669 .proc_handler = &proc_dointvec_jiffies,
670 },
671 {
672 .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM,
673 .procname = "ip_conntrack_udp_timeout_stream",
674 .data = &ip_ct_udp_timeout_stream,
675 .maxlen = sizeof(unsigned int),
676 .mode = 0644,
677 .proc_handler = &proc_dointvec_jiffies,
678 },
679 {
680 .ctl_name = NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT,
681 .procname = "ip_conntrack_icmp_timeout",
682 .data = &ip_ct_icmp_timeout,
683 .maxlen = sizeof(unsigned int),
684 .mode = 0644,
685 .proc_handler = &proc_dointvec_jiffies,
686 },
687 {
688 .ctl_name = NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT,
689 .procname = "ip_conntrack_generic_timeout",
690 .data = &ip_ct_generic_timeout,
691 .maxlen = sizeof(unsigned int),
692 .mode = 0644,
693 .proc_handler = &proc_dointvec_jiffies,
694 },
695 {
696 .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID,
697 .procname = "ip_conntrack_log_invalid",
698 .data = &ip_ct_log_invalid,
699 .maxlen = sizeof(unsigned int),
700 .mode = 0644,
701 .proc_handler = &proc_dointvec_minmax,
702 .strategy = &sysctl_intvec,
703 .extra1 = &log_invalid_proto_min,
704 .extra2 = &log_invalid_proto_max,
705 },
706 {
707 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,
708 .procname = "ip_conntrack_tcp_timeout_max_retrans",
709 .data = &ip_ct_tcp_timeout_max_retrans,
710 .maxlen = sizeof(unsigned int),
711 .mode = 0644,
712 .proc_handler = &proc_dointvec_jiffies,
713 },
714 {
715 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE,
716 .procname = "ip_conntrack_tcp_loose",
717 .data = &ip_ct_tcp_loose,
718 .maxlen = sizeof(unsigned int),
719 .mode = 0644,
720 .proc_handler = &proc_dointvec,
721 },
722 {
723 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,
724 .procname = "ip_conntrack_tcp_be_liberal",
725 .data = &ip_ct_tcp_be_liberal,
726 .maxlen = sizeof(unsigned int),
727 .mode = 0644,
728 .proc_handler = &proc_dointvec,
729 },
730 {
731 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,
732 .procname = "ip_conntrack_tcp_max_retrans",
733 .data = &ip_ct_tcp_max_retrans,
734 .maxlen = sizeof(unsigned int),
735 .mode = 0644,
736 .proc_handler = &proc_dointvec,
737 },
738 { .ctl_name = 0 }
739};
740
741#define NET_IP_CONNTRACK_MAX 2089
742
743static ctl_table ip_ct_netfilter_table[] = {
744 {
745 .ctl_name = NET_IPV4_NETFILTER,
746 .procname = "netfilter",
747 .mode = 0555,
748 .child = ip_ct_sysctl_table,
749 },
750 {
751 .ctl_name = NET_IP_CONNTRACK_MAX,
752 .procname = "ip_conntrack_max",
753 .data = &ip_conntrack_max,
754 .maxlen = sizeof(int),
755 .mode = 0644,
756 .proc_handler = &proc_dointvec
757 },
758 { .ctl_name = 0 }
759};
760
761static ctl_table ip_ct_ipv4_table[] = {
762 {
763 .ctl_name = NET_IPV4,
764 .procname = "ipv4",
765 .mode = 0555,
766 .child = ip_ct_netfilter_table,
767 },
768 { .ctl_name = 0 }
769};
770
771static ctl_table ip_ct_net_table[] = {
772 {
773 .ctl_name = CTL_NET,
774 .procname = "net",
775 .mode = 0555,
776 .child = ip_ct_ipv4_table,
777 },
778 { .ctl_name = 0 }
779};
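/* Editorial sketch (not from this patch): the nesting above lands the table
 * under /proc/sys/net/ipv4/netfilter/. Entries handled by
 * proc_dointvec_jiffies are read and written in seconds, so a privileged
 * process can retune, for example, the established-TCP timeout like this
 * (path and value are illustrative): */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/netfilter/"
			"ip_conntrack_tcp_timeout_established", "w");

	if (!f) {
		perror("sysctl open");
		return 1;
	}
	fprintf(f, "%d\n", 3600);	/* one hour, in seconds */
	return fclose(f) ? 1 : 0;
}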
780
781EXPORT_SYMBOL(ip_ct_log_invalid);
782#endif /* CONFIG_SYSCTL */
783
784static int init_or_cleanup(int init)
785{
786#ifdef CONFIG_PROC_FS
787 struct proc_dir_entry *proc, *proc_exp, *proc_stat;
788#endif
789 int ret = 0;
790
791 if (!init) goto cleanup;
792
793 ret = ip_conntrack_init();
794 if (ret < 0)
795 goto cleanup_nothing;
796
797#ifdef CONFIG_PROC_FS
798 ret = -ENOMEM;
799 proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops);
800 if (!proc) goto cleanup_init;
801
802 proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440,
803 &exp_file_ops);
804 if (!proc_exp) goto cleanup_proc;
805
806 proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat);
807 if (!proc_stat)
808 goto cleanup_proc_exp;
809
810 proc_stat->proc_fops = &ct_cpu_seq_fops;
811 proc_stat->owner = THIS_MODULE;
812#endif
813
814 ret = nf_register_hook(&ip_conntrack_defrag_ops);
815 if (ret < 0) {
816 printk("ip_conntrack: can't register pre-routing defrag hook.\n");
817 goto cleanup_proc_stat;
818 }
819 ret = nf_register_hook(&ip_conntrack_defrag_local_out_ops);
820 if (ret < 0) {
821 printk("ip_conntrack: can't register local_out defrag hook.\n");
822 goto cleanup_defragops;
823 }
824 ret = nf_register_hook(&ip_conntrack_in_ops);
825 if (ret < 0) {
826 printk("ip_conntrack: can't register pre-routing hook.\n");
827 goto cleanup_defraglocalops;
828 }
829 ret = nf_register_hook(&ip_conntrack_local_out_ops);
830 if (ret < 0) {
831 printk("ip_conntrack: can't register local out hook.\n");
832 goto cleanup_inops;
833 }
834 ret = nf_register_hook(&ip_conntrack_out_ops);
835 if (ret < 0) {
836 printk("ip_conntrack: can't register post-routing hook.\n");
837 goto cleanup_inandlocalops;
838 }
839 ret = nf_register_hook(&ip_conntrack_local_in_ops);
840 if (ret < 0) {
841 printk("ip_conntrack: can't register local in hook.\n");
842 goto cleanup_inoutandlocalops;
843 }
844#ifdef CONFIG_SYSCTL
845 ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0);
846 if (ip_ct_sysctl_header == NULL) {
847 printk("ip_conntrack: can't register to sysctl.\n");
848 ret = -ENOMEM;
849 goto cleanup_localinops;
850 }
851#endif
852
853 return ret;
854
855 cleanup:
856#ifdef CONFIG_SYSCTL
857 unregister_sysctl_table(ip_ct_sysctl_header);
858 cleanup_localinops:
859#endif
860 nf_unregister_hook(&ip_conntrack_local_in_ops);
861 cleanup_inoutandlocalops:
862 nf_unregister_hook(&ip_conntrack_out_ops);
863 cleanup_inandlocalops:
864 nf_unregister_hook(&ip_conntrack_local_out_ops);
865 cleanup_inops:
866 nf_unregister_hook(&ip_conntrack_in_ops);
867 cleanup_defraglocalops:
868 nf_unregister_hook(&ip_conntrack_defrag_local_out_ops);
869 cleanup_defragops:
870 nf_unregister_hook(&ip_conntrack_defrag_ops);
871 cleanup_proc_stat:
872#ifdef CONFIG_PROC_FS
873 remove_proc_entry("ip_conntrack", proc_net_stat);
874 cleanup_proc_exp:
875 proc_net_remove("ip_conntrack_expect");
876 cleanup_proc:
877 proc_net_remove("ip_conntrack");
878 cleanup_init:
879#endif /* CONFIG_PROC_FS */
880 ip_conntrack_cleanup();
881 cleanup_nothing:
882 return ret;
883}
884
885/* FIXME: Allow NULL functions and sub in pointers to generic for
886 them. --RR */
887int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
888{
889 int ret = 0;
890
891 WRITE_LOCK(&ip_conntrack_lock);
892 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
893 ret = -EBUSY;
894 goto out;
895 }
896 ip_ct_protos[proto->proto] = proto;
897 out:
898 WRITE_UNLOCK(&ip_conntrack_lock);
899 return ret;
900}
901
902void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
903{
904 WRITE_LOCK(&ip_conntrack_lock);
905 ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
906 WRITE_UNLOCK(&ip_conntrack_lock);
907
 908	/* Somebody could still be looking at the proto in bh. */
909 synchronize_net();
910
 911	/* Remove all conntrack entries for this protocol */
912 ip_ct_iterate_cleanup(kill_proto, &proto->proto);
913}
914
915static int __init init(void)
916{
917 return init_or_cleanup(1);
918}
919
920static void __exit fini(void)
921{
922 init_or_cleanup(0);
923}
924
925module_init(init);
926module_exit(fini);
927
928/* Some modules need us, but don't depend directly on any symbol.
929 They should call this. */
930void need_ip_conntrack(void)
931{
932}
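/* Editorial sketch (hypothetical module, not in this patch): a match or
 * target that only consults conntrack state may have no other symbol
 * reference into this module, so it calls the empty function above purely
 * to create a link-time dependency that pulls in ip_conntrack and keeps it
 * pinned while the caller is loaded. The declaration is assumed to come
 * from ip_conntrack.h. */
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>

static int __init my_ct_user_init(void)
{
	need_ip_conntrack();
	return 0;
}

static void __exit my_ct_user_fini(void)
{
}

module_init(my_ct_user_init);
module_exit(my_ct_user_fini);
MODULE_LICENSE("GPL");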
933
934EXPORT_SYMBOL(ip_conntrack_protocol_register);
935EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
936EXPORT_SYMBOL(ip_ct_get_tuple);
937EXPORT_SYMBOL(invert_tuplepr);
938EXPORT_SYMBOL(ip_conntrack_alter_reply);
939EXPORT_SYMBOL(ip_conntrack_destroyed);
940EXPORT_SYMBOL(need_ip_conntrack);
941EXPORT_SYMBOL(ip_conntrack_helper_register);
942EXPORT_SYMBOL(ip_conntrack_helper_unregister);
943EXPORT_SYMBOL(ip_ct_iterate_cleanup);
944EXPORT_SYMBOL(ip_ct_refresh_acct);
945EXPORT_SYMBOL(ip_ct_protos);
946EXPORT_SYMBOL(ip_ct_find_proto);
947EXPORT_SYMBOL(ip_conntrack_expect_alloc);
948EXPORT_SYMBOL(ip_conntrack_expect_free);
949EXPORT_SYMBOL(ip_conntrack_expect_related);
950EXPORT_SYMBOL(ip_conntrack_unexpect_related);
951EXPORT_SYMBOL(ip_conntrack_tuple_taken);
952EXPORT_SYMBOL(ip_ct_gather_frags);
953EXPORT_SYMBOL(ip_conntrack_htable_size);
954EXPORT_SYMBOL(ip_conntrack_lock);
955EXPORT_SYMBOL(ip_conntrack_hash);
956EXPORT_SYMBOL(ip_conntrack_untracked);
957EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
958EXPORT_SYMBOL_GPL(ip_conntrack_put);
959#ifdef CONFIG_IP_NF_NAT_NEEDED
960EXPORT_SYMBOL(ip_conntrack_tcp_update);
961#endif
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
new file mode 100644
index 000000000000..992fac3e36ee
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -0,0 +1,159 @@
1/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 as
5 * published by the Free Software Foundation.
6 *
7 * Version: 0.0.7
8 *
9 * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
10 * - port to newnat API
11 *
12 */
13
14#include <linux/module.h>
15#include <linux/ip.h>
16#include <linux/udp.h>
17
18#include <linux/netfilter.h>
19#include <linux/netfilter_ipv4/ip_tables.h>
20#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
21#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
22#include <linux/moduleparam.h>
23
24MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
25MODULE_DESCRIPTION("tftp connection tracking helper");
26MODULE_LICENSE("GPL");
27
28#define MAX_PORTS 8
29static int ports[MAX_PORTS];
30static int ports_c;
31module_param_array(ports, int, &ports_c, 0400);
32MODULE_PARM_DESC(ports, "port numbers of tftp servers");
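/* Example usage (illustrative, not from this patch): loading with
 * "modprobe ip_conntrack_tftp ports=69,1069" registers one helper per
 * listed server port in init() below; with no argument only TFTP_PORT
 * is tracked. */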
33
34#if 0
35#define DEBUGP(format, args...) printk("%s:%s:" format, \
36 __FILE__, __FUNCTION__ , ## args)
37#else
38#define DEBUGP(format, args...)
39#endif
40
41unsigned int (*ip_nat_tftp_hook)(struct sk_buff **pskb,
42 enum ip_conntrack_info ctinfo,
43 struct ip_conntrack_expect *exp);
44EXPORT_SYMBOL_GPL(ip_nat_tftp_hook);
45
46static int tftp_help(struct sk_buff **pskb,
47 struct ip_conntrack *ct,
48 enum ip_conntrack_info ctinfo)
49{
50 struct tftphdr _tftph, *tfh;
51 struct ip_conntrack_expect *exp;
52 unsigned int ret = NF_ACCEPT;
53
54 tfh = skb_header_pointer(*pskb,
55 (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr),
56 sizeof(_tftph), &_tftph);
57 if (tfh == NULL)
58 return NF_ACCEPT;
59
60 switch (ntohs(tfh->opcode)) {
 61	/* RRQ and WRQ work the same way */
62 case TFTP_OPCODE_READ:
63 case TFTP_OPCODE_WRITE:
64 DEBUGP("");
65 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
66 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
67
68 exp = ip_conntrack_expect_alloc();
69 if (exp == NULL)
70 return NF_DROP;
71
72 exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
73 exp->mask.src.ip = 0xffffffff;
74 exp->mask.dst.ip = 0xffffffff;
75 exp->mask.dst.u.udp.port = 0xffff;
76 exp->mask.dst.protonum = 0xff;
77 exp->expectfn = NULL;
78 exp->master = ct;
79
80 DEBUGP("expect: ");
81 DUMP_TUPLE(&exp->tuple);
82 DUMP_TUPLE(&exp->mask);
83 if (ip_nat_tftp_hook)
84 ret = ip_nat_tftp_hook(pskb, ctinfo, exp);
85 else if (ip_conntrack_expect_related(exp) != 0) {
86 ip_conntrack_expect_free(exp);
87 ret = NF_DROP;
88 }
89 break;
90 case TFTP_OPCODE_DATA:
91 case TFTP_OPCODE_ACK:
92 DEBUGP("Data/ACK opcode\n");
93 break;
94 case TFTP_OPCODE_ERROR:
95 DEBUGP("Error opcode\n");
96 break;
97 default:
98 DEBUGP("Unknown opcode\n");
99 }
100 return NF_ACCEPT;
101}
102
103static struct ip_conntrack_helper tftp[MAX_PORTS];
104static char tftp_names[MAX_PORTS][10];
105
106static void fini(void)
107{
108 int i;
109
110 for (i = 0 ; i < ports_c; i++) {
111 DEBUGP("unregistering helper for port %d\n",
112 ports[i]);
113 ip_conntrack_helper_unregister(&tftp[i]);
114 }
115}
116
117static int __init init(void)
118{
119 int i, ret;
120 char *tmpname;
121
122 if (ports_c == 0)
123 ports[ports_c++] = TFTP_PORT;
124
125 for (i = 0; i < ports_c; i++) {
126 /* Create helper structure */
127 memset(&tftp[i], 0, sizeof(struct ip_conntrack_helper));
128
129 tftp[i].tuple.dst.protonum = IPPROTO_UDP;
130 tftp[i].tuple.src.u.udp.port = htons(ports[i]);
131 tftp[i].mask.dst.protonum = 0xFF;
132 tftp[i].mask.src.u.udp.port = 0xFFFF;
133 tftp[i].max_expected = 1;
134 tftp[i].timeout = 5 * 60; /* 5 minutes */
135 tftp[i].me = THIS_MODULE;
136 tftp[i].help = tftp_help;
137
138 tmpname = &tftp_names[i][0];
139 if (ports[i] == TFTP_PORT)
140 sprintf(tmpname, "tftp");
141 else
142 sprintf(tmpname, "tftp-%d", i);
143 tftp[i].name = tmpname;
144
145 DEBUGP("port #%d: %d\n", i, ports[i]);
146
147 ret=ip_conntrack_helper_register(&tftp[i]);
148 if (ret) {
149 printk("ERROR registering helper for port %d\n",
150 ports[i]);
151 fini();
152 return(ret);
153 }
154 }
155 return(0);
156}
157
158module_init(init);
159module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
new file mode 100644
index 000000000000..da1f412583ed
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_amanda.c
@@ -0,0 +1,88 @@
1/* Amanda extension for TCP NAT alteration.
2 * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
3 * based on a copy of HW's ip_nat_irc.c as well as other modules
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 *
10 * Module load syntax:
11 * insmod ip_nat_amanda.o
12 */
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/netfilter.h>
17#include <linux/skbuff.h>
18#include <linux/ip.h>
19#include <linux/udp.h>
20#include <net/tcp.h>
21#include <net/udp.h>
22
23#include <linux/netfilter_ipv4.h>
24#include <linux/netfilter_ipv4/ip_nat.h>
25#include <linux/netfilter_ipv4/ip_nat_helper.h>
26#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
27#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
28
29
30MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
31MODULE_DESCRIPTION("Amanda NAT helper");
32MODULE_LICENSE("GPL");
33
34static unsigned int help(struct sk_buff **pskb,
35 enum ip_conntrack_info ctinfo,
36 unsigned int matchoff,
37 unsigned int matchlen,
38 struct ip_conntrack_expect *exp)
39{
40 char buffer[sizeof("65535")];
41 u_int16_t port;
42 unsigned int ret;
43
44 /* Connection comes from client. */
45 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
46 exp->dir = IP_CT_DIR_ORIGINAL;
47
 48	/* When we see the expected packet, we need to NAT it the same as
 49	 * this one (ie. same IP: it will be TCP and the master is UDP). */
50 exp->expectfn = ip_nat_follow_master;
51
52 /* Try to get same port: if not, try to change it. */
53 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
54 exp->tuple.dst.u.tcp.port = htons(port);
55 if (ip_conntrack_expect_related(exp) == 0)
56 break;
57 }
58
59 if (port == 0) {
60 ip_conntrack_expect_free(exp);
61 return NF_DROP;
62 }
63
64 sprintf(buffer, "%u", port);
65 ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
66 matchoff, matchlen,
67 buffer, strlen(buffer));
68 if (ret != NF_ACCEPT)
69 ip_conntrack_unexpect_related(exp);
70 return ret;
71}
72
73static void __exit fini(void)
74{
75 ip_nat_amanda_hook = NULL;
 76	/* Make sure no one calls it in the meantime. */
77 synchronize_net();
78}
79
80static int __init init(void)
81{
82 BUG_ON(ip_nat_amanda_hook);
83 ip_nat_amanda_hook = help;
84 return 0;
85}
86
87module_init(init);
88module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
new file mode 100644
index 000000000000..162ceacfc29a
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -0,0 +1,556 @@
1/* NAT for netfilter; shared with compatibility layer. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/types.h>
13#include <linux/timer.h>
14#include <linux/skbuff.h>
15#include <linux/netfilter_ipv4.h>
16#include <linux/vmalloc.h>
17#include <net/checksum.h>
18#include <net/icmp.h>
19#include <net/ip.h>
20#include <net/tcp.h> /* For tcp_prot in getorigdst */
21#include <linux/icmp.h>
22#include <linux/udp.h>
23#include <linux/jhash.h>
24
25#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
26#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
27
28#include <linux/netfilter_ipv4/ip_conntrack.h>
29#include <linux/netfilter_ipv4/ip_conntrack_core.h>
30#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
31#include <linux/netfilter_ipv4/ip_nat.h>
32#include <linux/netfilter_ipv4/ip_nat_protocol.h>
33#include <linux/netfilter_ipv4/ip_nat_core.h>
34#include <linux/netfilter_ipv4/ip_nat_helper.h>
35#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
36#include <linux/netfilter_ipv4/listhelp.h>
37
38#if 0
39#define DEBUGP printk
40#else
41#define DEBUGP(format, args...)
42#endif
43
44DECLARE_RWLOCK(ip_nat_lock);
45
46/* Calculated at init based on memory size */
47static unsigned int ip_nat_htable_size;
48
49static struct list_head *bysource;
50struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
51
52
53/* We keep an extra hash for each conntrack, for fast searching. */
54static inline unsigned int
55hash_by_src(const struct ip_conntrack_tuple *tuple)
56{
57 /* Original src, to ensure we map it consistently if poss. */
58 return jhash_3words(tuple->src.ip, tuple->src.u.all,
59 tuple->dst.protonum, 0) % ip_nat_htable_size;
60}
61
 62/* No one is using the conntrack by the time this is called. */
63static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
64{
65 if (!(conn->status & IPS_NAT_DONE_MASK))
66 return;
67
68 WRITE_LOCK(&ip_nat_lock);
69 list_del(&conn->nat.info.bysource);
70 WRITE_UNLOCK(&ip_nat_lock);
71}
72
73/* We do checksum mangling, so if they were wrong before they're still
74 * wrong. Also works for incomplete packets (eg. ICMP dest
75 * unreachables.) */
76u_int16_t
77ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
78{
79 u_int32_t diffs[] = { oldvalinv, newval };
80 return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
81 oldcheck^0xFFFF));
82}
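/* Editorial sketch of the caller pattern (an assumption based on this
 * helper's contract; the per-protocol NAT modules that actually do this are
 * not part of this hunk). Rewriting a UDP source port and source IP only
 * needs two nested incremental fixups of the existing checksum, instead of
 * re-summing the whole packet. All values are in network byte order, and
 * "example_nat_udp_source" is a hypothetical name. */
static void example_nat_udp_source(struct udphdr *hdr,
				   u_int32_t oldip, u_int32_t newip,
				   u_int16_t newport)
{
	if (hdr->check)		/* zero means "no UDP checksum present" */
		hdr->check = ip_nat_cheat_check(~oldip, newip,
				ip_nat_cheat_check(hdr->source ^ 0xFFFF,
						   newport, hdr->check));
	hdr->source = newport;
}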
83
84/* Is this tuple already taken? (not by us) */
85int
86ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
87 const struct ip_conntrack *ignored_conntrack)
88{
89 /* Conntrack tracking doesn't keep track of outgoing tuples; only
90 incoming ones. NAT means they don't have a fixed mapping,
91 so we invert the tuple and look for the incoming reply.
92
93 We could keep a separate hash if this proves too slow. */
94 struct ip_conntrack_tuple reply;
95
96 invert_tuplepr(&reply, tuple);
97 return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
98}
99
 100/* If we source map this tuple so the reply looks like reply_tuple, will
 101 * that meet the constraints of the given range? */
102static int
103in_range(const struct ip_conntrack_tuple *tuple,
104 const struct ip_nat_range *range)
105{
106 struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
107
108 /* If we are supposed to map IPs, then we must be in the
109 range specified, otherwise let this drag us onto a new src IP. */
110 if (range->flags & IP_NAT_RANGE_MAP_IPS) {
111 if (ntohl(tuple->src.ip) < ntohl(range->min_ip)
112 || ntohl(tuple->src.ip) > ntohl(range->max_ip))
113 return 0;
114 }
115
116 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
117 || proto->in_range(tuple, IP_NAT_MANIP_SRC,
118 &range->min, &range->max))
119 return 1;
120
121 return 0;
122}
123
124static inline int
125same_src(const struct ip_conntrack *ct,
126 const struct ip_conntrack_tuple *tuple)
127{
128 return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
129 == tuple->dst.protonum
130 && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
131 == tuple->src.ip
132 && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
133 == tuple->src.u.all);
134}
135
136/* Only called for SRC manip */
137static int
138find_appropriate_src(const struct ip_conntrack_tuple *tuple,
139 struct ip_conntrack_tuple *result,
140 const struct ip_nat_range *range)
141{
142 unsigned int h = hash_by_src(tuple);
143 struct ip_conntrack *ct;
144
145 READ_LOCK(&ip_nat_lock);
146 list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
147 if (same_src(ct, tuple)) {
148 /* Copy source part from reply tuple. */
149 invert_tuplepr(result,
150 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
151 result->dst = tuple->dst;
152
153 if (in_range(result, range)) {
154 READ_UNLOCK(&ip_nat_lock);
155 return 1;
156 }
157 }
158 }
159 READ_UNLOCK(&ip_nat_lock);
160 return 0;
161}
162
163/* For [FUTURE] fragmentation handling, we want the least-used
164 src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
165 if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
166 1-65535, we don't do pro-rata allocation based on ports; we choose
167 the ip with the lowest src-ip/dst-ip/proto usage.
168*/
169static void
170find_best_ips_proto(struct ip_conntrack_tuple *tuple,
171 const struct ip_nat_range *range,
172 const struct ip_conntrack *conntrack,
173 enum ip_nat_manip_type maniptype)
174{
175 u_int32_t *var_ipp;
176 /* Host order */
177 u_int32_t minip, maxip, j;
178
179 /* No IP mapping? Do nothing. */
180 if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
181 return;
182
183 if (maniptype == IP_NAT_MANIP_SRC)
184 var_ipp = &tuple->src.ip;
185 else
186 var_ipp = &tuple->dst.ip;
187
188 /* Fast path: only one choice. */
189 if (range->min_ip == range->max_ip) {
190 *var_ipp = range->min_ip;
191 return;
192 }
193
194 /* Hashing source and destination IPs gives a fairly even
195 * spread in practice (if there are a small number of IPs
196 * involved, there usually aren't that many connections
197 * anyway). The consistency means that servers see the same
198 * client coming from the same IP (some Internet Banking sites
199 * like this), even across reboots. */
200 minip = ntohl(range->min_ip);
201 maxip = ntohl(range->max_ip);
202 j = jhash_2words(tuple->src.ip, tuple->dst.ip, 0);
203 *var_ipp = htonl(minip + j % (maxip - minip + 1));
204}
205
206/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING,
207 * we change the source to map into the range. For NF_IP_PRE_ROUTING
208 * and NF_IP_LOCAL_OUT, we change the destination to map into the
209 * range. It might not be possible to get a unique tuple, but we try.
210 * At worst (or if we race), we will end up with a final duplicate in
211 * __ip_conntrack_confirm and drop the packet. */
212static void
213get_unique_tuple(struct ip_conntrack_tuple *tuple,
214 const struct ip_conntrack_tuple *orig_tuple,
215 const struct ip_nat_range *range,
216 struct ip_conntrack *conntrack,
217 enum ip_nat_manip_type maniptype)
218{
219 struct ip_nat_protocol *proto
220 = ip_nat_find_proto(orig_tuple->dst.protonum);
221
222 /* 1) If this srcip/proto/src-proto-part is currently mapped,
223 and that same mapping gives a unique tuple within the given
224 range, use that.
225
226 This is only required for source (ie. NAT/masq) mappings.
227 So far, we don't do local source mappings, so multiple
228 manips not an issue. */
229 if (maniptype == IP_NAT_MANIP_SRC) {
230 if (find_appropriate_src(orig_tuple, tuple, range)) {
231 DEBUGP("get_unique_tuple: Found current src map\n");
232 if (!ip_nat_used_tuple(tuple, conntrack))
233 return;
234 }
235 }
236
237 /* 2) Select the least-used IP/proto combination in the given
238 range. */
239 *tuple = *orig_tuple;
240 find_best_ips_proto(tuple, range, conntrack, maniptype);
241
242 /* 3) The per-protocol part of the manip is made to map into
243 the range to make a unique tuple. */
244
245 /* Only bother mapping if it's not already in range and unique */
246 if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
247 || proto->in_range(tuple, maniptype, &range->min, &range->max))
248 && !ip_nat_used_tuple(tuple, conntrack))
249 return;
250
 251	/* Last chance: get the protocol to try to obtain a unique tuple. */
252 proto->unique_tuple(tuple, range, maniptype, conntrack);
253}
254
255unsigned int
256ip_nat_setup_info(struct ip_conntrack *conntrack,
257 const struct ip_nat_range *range,
258 unsigned int hooknum)
259{
260 struct ip_conntrack_tuple curr_tuple, new_tuple;
261 struct ip_nat_info *info = &conntrack->nat.info;
262 int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
263 enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
264
265 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
266 || hooknum == NF_IP_POST_ROUTING
267 || hooknum == NF_IP_LOCAL_IN
268 || hooknum == NF_IP_LOCAL_OUT);
269 BUG_ON(ip_nat_initialized(conntrack, maniptype));
270
271 /* What we've got will look like inverse of reply. Normally
272 this is what is in the conntrack, except for prior
273 manipulations (future optimization: if num_manips == 0,
274 orig_tp =
275 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
276 invert_tuplepr(&curr_tuple,
277 &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
278
279 get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);
280
281 if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
282 struct ip_conntrack_tuple reply;
283
284 /* Alter conntrack table so will recognize replies. */
285 invert_tuplepr(&reply, &new_tuple);
286 ip_conntrack_alter_reply(conntrack, &reply);
287
288 /* Non-atomic: we own this at the moment. */
289 if (maniptype == IP_NAT_MANIP_SRC)
290 conntrack->status |= IPS_SRC_NAT;
291 else
292 conntrack->status |= IPS_DST_NAT;
293 }
294
295 /* Place in source hash if this is the first time. */
296 if (have_to_hash) {
297 unsigned int srchash
298 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
299 .tuple);
300 WRITE_LOCK(&ip_nat_lock);
301 list_add(&info->bysource, &bysource[srchash]);
302 WRITE_UNLOCK(&ip_nat_lock);
303 }
304
305 /* It's done. */
306 if (maniptype == IP_NAT_MANIP_DST)
307 set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status);
308 else
309 set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status);
310
311 return NF_ACCEPT;
312}
313
314/* Returns true if succeeded. */
315static int
316manip_pkt(u_int16_t proto,
317 struct sk_buff **pskb,
318 unsigned int iphdroff,
319 const struct ip_conntrack_tuple *target,
320 enum ip_nat_manip_type maniptype)
321{
322 struct iphdr *iph;
323
324 (*pskb)->nfcache |= NFC_ALTERED;
325 if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
326 return 0;
327
328 iph = (void *)(*pskb)->data + iphdroff;
329
 330	/* Manipulate protocol part. */
331 if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
332 target, maniptype))
333 return 0;
334
335 iph = (void *)(*pskb)->data + iphdroff;
336
337 if (maniptype == IP_NAT_MANIP_SRC) {
338 iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
339 iph->check);
340 iph->saddr = target->src.ip;
341 } else {
342 iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip,
343 iph->check);
344 iph->daddr = target->dst.ip;
345 }
346 return 1;
347}
348
349/* Do packet manipulations according to ip_nat_setup_info. */
350unsigned int nat_packet(struct ip_conntrack *ct,
351 enum ip_conntrack_info ctinfo,
352 unsigned int hooknum,
353 struct sk_buff **pskb)
354{
355 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
356 unsigned long statusbit;
357 enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);
358
359 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)
360 && (hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_IN)) {
361 DEBUGP("ip_nat_core: adjusting sequence number\n");
 362		/* future: put this in an l4-proto specific function,
363 * and call this function here. */
364 if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
365 return NF_DROP;
366 }
367
368 if (mtype == IP_NAT_MANIP_SRC)
369 statusbit = IPS_SRC_NAT;
370 else
371 statusbit = IPS_DST_NAT;
372
373 /* Invert if this is reply dir. */
374 if (dir == IP_CT_DIR_REPLY)
375 statusbit ^= IPS_NAT_MASK;
376
377 /* Non-atomic: these bits don't change. */
378 if (ct->status & statusbit) {
379 struct ip_conntrack_tuple target;
380
381 /* We are aiming to look like inverse of other direction. */
382 invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
383
384 if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
385 return NF_DROP;
386 }
387 return NF_ACCEPT;
388}
389
390/* Dir is direction ICMP is coming from (opposite to packet it contains) */
391int icmp_reply_translation(struct sk_buff **pskb,
392 struct ip_conntrack *ct,
393 enum ip_nat_manip_type manip,
394 enum ip_conntrack_dir dir)
395{
396 struct {
397 struct icmphdr icmp;
398 struct iphdr ip;
399 } *inside;
400 struct ip_conntrack_tuple inner, target;
401 int hdrlen = (*pskb)->nh.iph->ihl * 4;
402
403 if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
404 return 0;
405
406 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
407
408 /* We're actually going to mangle it beyond trivial checksum
409 adjustment, so make sure the current checksum is correct. */
410 if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
411 hdrlen = (*pskb)->nh.iph->ihl * 4;
412 if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
413 (*pskb)->len - hdrlen, 0)))
414 return 0;
415 }
416
417 /* Must be RELATED */
418 IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
419 (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
420
421 /* Redirects on non-null nats must be dropped, else they'll
422 start talking to each other without our translation, and be
423 confused... --RR */
424 if (inside->icmp.type == ICMP_REDIRECT) {
 425		/* If NAT isn't finished yet, assume it will be non-null and drop. */
426 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
427 return 0;
428
429 if (ct->status & IPS_NAT_MASK)
430 return 0;
431 }
432
433 DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
434 *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
435
436 if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
437 sizeof(struct icmphdr) + inside->ip.ihl*4,
438 &inner, ip_ct_find_proto(inside->ip.protocol)))
439 return 0;
440
441 /* Change inner back to look like incoming packet. We do the
442 opposite manip on this hook to normal, because it might not
443 pass all hooks (locally-generated ICMP). Consider incoming
444 packet: PREROUTING (DST manip), routing produces ICMP, goes
445 through POSTROUTING (which must correct the DST manip). */
446 if (!manip_pkt(inside->ip.protocol, pskb,
447 (*pskb)->nh.iph->ihl*4
448 + sizeof(inside->icmp),
449 &ct->tuplehash[!dir].tuple,
450 !manip))
451 return 0;
452
 453	/* Reload "inside": manip_pkt may have moved the packet data. */
454 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
455 inside->icmp.checksum = 0;
456 inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
457 (*pskb)->len - hdrlen,
458 0));
459
 460	/* Change outer to look like the reply to an incoming packet
461 * (proto 0 means don't invert per-proto part). */
462
463 /* Obviously, we need to NAT destination IP, but source IP
464 should be NAT'ed only if it is from a NAT'd host.
465
466 Explanation: some people use NAT for anonymizing. Also,
467 CERT recommends dropping all packets from private IP
468 addresses (although ICMP errors from internal links with
469 such addresses are not too uncommon, as Alan Cox points
470 out) */
471 if (manip != IP_NAT_MANIP_SRC
472 || ((*pskb)->nh.iph->saddr == ct->tuplehash[dir].tuple.src.ip)) {
473 invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
474 if (!manip_pkt(0, pskb, 0, &target, manip))
475 return 0;
476 }
477
478 return 1;
479}
480
481/* Protocol registration. */
482int ip_nat_protocol_register(struct ip_nat_protocol *proto)
483{
484 int ret = 0;
485
486 WRITE_LOCK(&ip_nat_lock);
487 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
488 ret = -EBUSY;
489 goto out;
490 }
491 ip_nat_protos[proto->protonum] = proto;
492 out:
493 WRITE_UNLOCK(&ip_nat_lock);
494 return ret;
495}
496
 497/* No one stores the protocol anywhere; simply delete it. */
498void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
499{
500 WRITE_LOCK(&ip_nat_lock);
501 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
502 WRITE_UNLOCK(&ip_nat_lock);
503
 504	/* Someone could still be looking at the proto in a bh. */
505 synchronize_net();
506}
507
508int __init ip_nat_init(void)
509{
510 size_t i;
511
512 /* Leave them the same for the moment. */
513 ip_nat_htable_size = ip_conntrack_htable_size;
514
 515	/* One vmalloc for the bysource hash table */
516 bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
517 if (!bysource)
518 return -ENOMEM;
519
520 /* Sew in builtin protocols. */
521 WRITE_LOCK(&ip_nat_lock);
522 for (i = 0; i < MAX_IP_NAT_PROTO; i++)
523 ip_nat_protos[i] = &ip_nat_unknown_protocol;
524 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
525 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
526 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
527 WRITE_UNLOCK(&ip_nat_lock);
528
529 for (i = 0; i < ip_nat_htable_size; i++) {
530 INIT_LIST_HEAD(&bysource[i]);
531 }
532
533 /* FIXME: Man, this is a hack. <SIGH> */
534 IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
535 ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
536
537 /* Initialize fake conntrack so that NAT will skip it */
538 ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
539 return 0;
540}
541
542/* Clear NAT section of all conntracks, in case we're loaded again. */
543static int clean_nat(struct ip_conntrack *i, void *data)
544{
545 memset(&i->nat, 0, sizeof(i->nat));
546 i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
547 return 0;
548}
549
550/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
551void ip_nat_cleanup(void)
552{
553 ip_ct_iterate_cleanup(&clean_nat, NULL);
554 ip_conntrack_destroyed = NULL;
555 vfree(bysource);
556}
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
new file mode 100644
index 000000000000..c6000e794ad6
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -0,0 +1,183 @@
1/* FTP extension for TCP NAT alteration. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/netfilter_ipv4.h>
13#include <linux/ip.h>
14#include <linux/tcp.h>
15#include <linux/moduleparam.h>
16#include <net/tcp.h>
17#include <linux/netfilter_ipv4/ip_nat.h>
18#include <linux/netfilter_ipv4/ip_nat_helper.h>
19#include <linux/netfilter_ipv4/ip_nat_rule.h>
20#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
21#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
22
23MODULE_LICENSE("GPL");
24MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
25MODULE_DESCRIPTION("ftp NAT helper");
26
27#if 0
28#define DEBUGP printk
29#else
30#define DEBUGP(format, args...)
31#endif
32
33/* FIXME: Time out? --RR */
34
35static int
36mangle_rfc959_packet(struct sk_buff **pskb,
37 u_int32_t newip,
38 u_int16_t port,
39 unsigned int matchoff,
40 unsigned int matchlen,
41 struct ip_conntrack *ct,
42 enum ip_conntrack_info ctinfo,
43 u32 *seq)
44{
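	/* RFC 959 PORT/PASV argument format is "h1,h2,h3,h4,p1,p2":
	 * four address octets, then the port split into high/low bytes. */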
45 char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")];
46
47 sprintf(buffer, "%u,%u,%u,%u,%u,%u",
48 NIPQUAD(newip), port>>8, port&0xFF);
49
50 DEBUGP("calling ip_nat_mangle_tcp_packet\n");
51
52 *seq += strlen(buffer) - matchlen;
53 return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
54 matchlen, buffer, strlen(buffer));
55}
56
57/* |1|132.235.1.2|6275| */
58static int
59mangle_eprt_packet(struct sk_buff **pskb,
60 u_int32_t newip,
61 u_int16_t port,
62 unsigned int matchoff,
63 unsigned int matchlen,
64 struct ip_conntrack *ct,
65 enum ip_conntrack_info ctinfo,
66 u32 *seq)
67{
68 char buffer[sizeof("|1|255.255.255.255|65535|")];
69
70 sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port);
71
72 DEBUGP("calling ip_nat_mangle_tcp_packet\n");
73
74 *seq += strlen(buffer) - matchlen;
75 return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
76 matchlen, buffer, strlen(buffer));
77}
78
 79/* |||6275| */
80static int
81mangle_epsv_packet(struct sk_buff **pskb,
82 u_int32_t newip,
83 u_int16_t port,
84 unsigned int matchoff,
85 unsigned int matchlen,
86 struct ip_conntrack *ct,
87 enum ip_conntrack_info ctinfo,
88 u32 *seq)
89{
90 char buffer[sizeof("|||65535|")];
91
92 sprintf(buffer, "|||%u|", port);
93
94 DEBUGP("calling ip_nat_mangle_tcp_packet\n");
95
96 *seq += strlen(buffer) - matchlen;
97 return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
98 matchlen, buffer, strlen(buffer));
99}
100
101static int (*mangle[])(struct sk_buff **, u_int32_t, u_int16_t,
102 unsigned int,
103 unsigned int,
104 struct ip_conntrack *,
105 enum ip_conntrack_info,
106 u32 *seq)
107= { [IP_CT_FTP_PORT] = mangle_rfc959_packet,
108 [IP_CT_FTP_PASV] = mangle_rfc959_packet,
109 [IP_CT_FTP_EPRT] = mangle_eprt_packet,
110 [IP_CT_FTP_EPSV] = mangle_epsv_packet
111};
112
113/* So, this packet has hit the connection tracking matching code.
114 Mangle it, and change the expectation to match the new version. */
115static unsigned int ip_nat_ftp(struct sk_buff **pskb,
116 enum ip_conntrack_info ctinfo,
117 enum ip_ct_ftp_type type,
118 unsigned int matchoff,
119 unsigned int matchlen,
120 struct ip_conntrack_expect *exp,
121 u32 *seq)
122{
123 u_int32_t newip;
124 u_int16_t port;
125 int dir = CTINFO2DIR(ctinfo);
126 struct ip_conntrack *ct = exp->master;
127
128 DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
129
130 /* Connection will come from wherever this packet goes, hence !dir */
131 newip = ct->tuplehash[!dir].tuple.dst.ip;
132 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
133 exp->dir = !dir;
134
 135	/* When you see the packet, we need to NAT it the same as
 136	 * this one. */
137 exp->expectfn = ip_nat_follow_master;
138
139 /* Try to get same port: if not, try to change it. */
140 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
141 exp->tuple.dst.u.tcp.port = htons(port);
142 if (ip_conntrack_expect_related(exp) == 0)
143 break;
144 }
145
146 if (port == 0) {
147 ip_conntrack_expect_free(exp);
148 return NF_DROP;
149 }
150
151 if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
152 seq)) {
153 ip_conntrack_unexpect_related(exp);
154 return NF_DROP;
155 }
156 return NF_ACCEPT;
157}
158
159static void __exit fini(void)
160{
161 ip_nat_ftp_hook = NULL;
162 /* Make sure noone calls it, meanwhile. */
163 synchronize_net();
164}
165
166static int __init init(void)
167{
168 BUG_ON(ip_nat_ftp_hook);
169 ip_nat_ftp_hook = ip_nat_ftp;
170 return 0;
171}
172
173/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
174static int warn_set(const char *val, struct kernel_param *kp)
175{
176 printk(KERN_INFO __stringify(KBUILD_MODNAME)
177 ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
178 return 0;
179}
180module_param_call(ports, warn_set, NULL, NULL, 0);
181
182module_init(init);
183module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
new file mode 100644
index 000000000000..1637b96d8c01
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -0,0 +1,430 @@
1/* ip_nat_helper.c - generic support functions for NAT helpers
2 *
3 * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
4 * (C) 2003-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * 14 Jan 2002 Harald Welte <laforge@gnumonks.org>:
11 * - add support for SACK adjustment
12 * 14 Mar 2002 Harald Welte <laforge@gnumonks.org>:
13 * - merge SACK support into newnat API
14 * 16 Aug 2002 Brian J. Murrell <netfilter@interlinx.bc.ca>:
15 * - make ip_nat_resize_packet more generic (TCP and UDP)
16 * - add ip_nat_mangle_udp_packet
17 */
18#include <linux/config.h>
19#include <linux/module.h>
20#include <linux/kmod.h>
21#include <linux/types.h>
22#include <linux/timer.h>
23#include <linux/skbuff.h>
24#include <linux/netfilter_ipv4.h>
25#include <net/checksum.h>
26#include <net/icmp.h>
27#include <net/ip.h>
28#include <net/tcp.h>
29#include <net/udp.h>
30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
33
34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
36#include <linux/netfilter_ipv4/ip_nat.h>
37#include <linux/netfilter_ipv4/ip_nat_protocol.h>
38#include <linux/netfilter_ipv4/ip_nat_core.h>
39#include <linux/netfilter_ipv4/ip_nat_helper.h>
40#include <linux/netfilter_ipv4/listhelp.h>
41
42#if 0
43#define DEBUGP printk
44#define DUMP_OFFSET(x) printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", x->offset_before, x->offset_after, x->correction_pos);
45#else
46#define DEBUGP(format, args...)
47#define DUMP_OFFSET(x)
48#endif
49
50static DECLARE_LOCK(ip_nat_seqofs_lock);
51
52/* Setup TCP sequence correction given this change at this sequence */
53static inline void
54adjust_tcp_sequence(u32 seq,
55 int sizediff,
56 struct ip_conntrack *ct,
57 enum ip_conntrack_info ctinfo)
58{
59 int dir;
60 struct ip_nat_seq *this_way, *other_way;
61
 62	DEBUGP("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
 63	       seq, sizediff);
64
65 dir = CTINFO2DIR(ctinfo);
66
67 this_way = &ct->nat.info.seq[dir];
68 other_way = &ct->nat.info.seq[!dir];
69
 70	DEBUGP("adjust_tcp_sequence: Seq_offset before: ");
71 DUMP_OFFSET(this_way);
72
73 LOCK_BH(&ip_nat_seqofs_lock);
74
75 /* SYN adjust. If it's uninitialized, or this is after last
76 * correction, record it: we don't handle more than one
77 * adjustment in the window, but do deal with common case of a
78 * retransmit */
79 if (this_way->offset_before == this_way->offset_after
80 || before(this_way->correction_pos, seq)) {
81 this_way->correction_pos = seq;
82 this_way->offset_before = this_way->offset_after;
83 this_way->offset_after += sizediff;
84 }
85 UNLOCK_BH(&ip_nat_seqofs_lock);
86
 87	DEBUGP("adjust_tcp_sequence: Seq_offset after: ");
88 DUMP_OFFSET(this_way);
89}
90
91/* Frobs data inside this packet, which is linear. */
92static void mangle_contents(struct sk_buff *skb,
93 unsigned int dataoff,
94 unsigned int match_offset,
95 unsigned int match_len,
96 const char *rep_buffer,
97 unsigned int rep_len)
98{
99 unsigned char *data;
100
101 BUG_ON(skb_is_nonlinear(skb));
102 data = (unsigned char *)skb->nh.iph + dataoff;
103
104 /* move post-replacement */
105 memmove(data + match_offset + rep_len,
106 data + match_offset + match_len,
107 skb->tail - (data + match_offset + match_len));
108
109 /* insert data from buffer */
110 memcpy(data + match_offset, rep_buffer, rep_len);
111
112 /* update skb info */
113 if (rep_len > match_len) {
114 DEBUGP("ip_nat_mangle_packet: Extending packet by "
115 "%u from %u bytes\n", rep_len - match_len,
116 skb->len);
117 skb_put(skb, rep_len - match_len);
118 } else {
 119		DEBUGP("ip_nat_mangle_packet: Shrinking packet by "
120 "%u from %u bytes\n", match_len - rep_len,
121 skb->len);
122 __skb_trim(skb, skb->len + rep_len - match_len);
123 }
124
125 /* fix IP hdr checksum information */
126 skb->nh.iph->tot_len = htons(skb->len);
127 ip_send_check(skb->nh.iph);
128}
129
130/* Unusual, but possible case. */
131static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
132{
133 struct sk_buff *nskb;
134
135 if ((*pskb)->len + extra > 65535)
136 return 0;
137
138 nskb = skb_copy_expand(*pskb, skb_headroom(*pskb), extra, GFP_ATOMIC);
139 if (!nskb)
140 return 0;
141
142 /* Transfer socket to new skb. */
143 if ((*pskb)->sk)
144 skb_set_owner_w(nskb, (*pskb)->sk);
145#ifdef CONFIG_NETFILTER_DEBUG
146 nskb->nf_debug = (*pskb)->nf_debug;
147#endif
148 kfree_skb(*pskb);
149 *pskb = nskb;
150 return 1;
151}
152
153/* Generic function for mangling variable-length address changes inside
154 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
155 * command in FTP).
156 *
 157 * Takes care of all the nasty sequence number changes, checksumming,
 158 * skb enlargement, ...
 159 *
 160 */
161int
162ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
163 struct ip_conntrack *ct,
164 enum ip_conntrack_info ctinfo,
165 unsigned int match_offset,
166 unsigned int match_len,
167 const char *rep_buffer,
168 unsigned int rep_len)
169{
170 struct iphdr *iph;
171 struct tcphdr *tcph;
172 int datalen;
173
174 if (!skb_ip_make_writable(pskb, (*pskb)->len))
175 return 0;
176
177 if (rep_len > match_len
178 && rep_len - match_len > skb_tailroom(*pskb)
179 && !enlarge_skb(pskb, rep_len - match_len))
180 return 0;
181
182 SKB_LINEAR_ASSERT(*pskb);
183
184 iph = (*pskb)->nh.iph;
185 tcph = (void *)iph + iph->ihl*4;
186
187 mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4,
188 match_offset, match_len, rep_buffer, rep_len);
189
190 datalen = (*pskb)->len - iph->ihl*4;
191 tcph->check = 0;
192 tcph->check = tcp_v4_check(tcph, datalen, iph->saddr, iph->daddr,
193 csum_partial((char *)tcph, datalen, 0));
194
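	/* If the replacement changed the payload length, all later
	 * sequence numbers in this direction are shifted; record the
	 * offset so ip_nat_seq_adjust() can rewrite subsequent packets. */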
195 if (rep_len != match_len) {
196 set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
197 adjust_tcp_sequence(ntohl(tcph->seq),
198 (int)rep_len - (int)match_len,
199 ct, ctinfo);
200 /* Tell TCP window tracking about seq change */
201 ip_conntrack_tcp_update(*pskb, ct, CTINFO2DIR(ctinfo));
202 }
203 return 1;
204}
205
206/* Generic function for mangling variable-length address changes inside
207 * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
208 * command in the Amanda protocol)
209 *
 210 * Takes care of all the nasty checksumming and skb enlargement
 211 * details; UDP has no sequence numbers to adjust.
212 *
213 * XXX - This function could be merged with ip_nat_mangle_tcp_packet which
214 * should be fairly easy to do.
215 */
216int
217ip_nat_mangle_udp_packet(struct sk_buff **pskb,
218 struct ip_conntrack *ct,
219 enum ip_conntrack_info ctinfo,
220 unsigned int match_offset,
221 unsigned int match_len,
222 const char *rep_buffer,
223 unsigned int rep_len)
224{
225 struct iphdr *iph;
226 struct udphdr *udph;
227
228 /* UDP helpers might accidentally mangle the wrong packet */
229 iph = (*pskb)->nh.iph;
230 if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) +
231 match_offset + match_len)
232 return 0;
233
234 if (!skb_ip_make_writable(pskb, (*pskb)->len))
235 return 0;
236
237 if (rep_len > match_len
238 && rep_len - match_len > skb_tailroom(*pskb)
239 && !enlarge_skb(pskb, rep_len - match_len))
240 return 0;
241
242 iph = (*pskb)->nh.iph;
243 udph = (void *)iph + iph->ihl*4;
244 mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph),
245 match_offset, match_len, rep_buffer, rep_len);
246
247 /* update the length of the UDP packet */
248 udph->len = htons((*pskb)->len - iph->ihl*4);
249
250 /* fix udp checksum if udp checksum was previously calculated */
251 if (udph->check) {
252 int datalen = (*pskb)->len - iph->ihl * 4;
253 udph->check = 0;
254 udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
255 datalen, IPPROTO_UDP,
256 csum_partial((char *)udph,
257 datalen, 0));
258 }
259
260 return 1;
261}
262
263/* Adjust one found SACK option including checksum correction */
264static void
265sack_adjust(struct sk_buff *skb,
266 struct tcphdr *tcph,
267 unsigned int sackoff,
268 unsigned int sackend,
269 struct ip_nat_seq *natseq)
270{
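	/* SACK blocks carry sequence numbers from the other direction's
	 * space; shift each edge by the old or new offset depending on
	 * whether it falls before or after the correction position. */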
271 while (sackoff < sackend) {
272 struct tcp_sack_block *sack;
273 u_int32_t new_start_seq, new_end_seq;
274
275 sack = (void *)skb->data + sackoff;
276 if (after(ntohl(sack->start_seq) - natseq->offset_before,
277 natseq->correction_pos))
278 new_start_seq = ntohl(sack->start_seq)
279 - natseq->offset_after;
280 else
281 new_start_seq = ntohl(sack->start_seq)
282 - natseq->offset_before;
283 new_start_seq = htonl(new_start_seq);
284
285 if (after(ntohl(sack->end_seq) - natseq->offset_before,
286 natseq->correction_pos))
287 new_end_seq = ntohl(sack->end_seq)
288 - natseq->offset_after;
289 else
290 new_end_seq = ntohl(sack->end_seq)
291 - natseq->offset_before;
292 new_end_seq = htonl(new_end_seq);
293
294 DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
295 ntohl(sack->start_seq), new_start_seq,
296 ntohl(sack->end_seq), new_end_seq);
297
298 tcph->check =
299 ip_nat_cheat_check(~sack->start_seq, new_start_seq,
300 ip_nat_cheat_check(~sack->end_seq,
301 new_end_seq,
302 tcph->check));
303 sack->start_seq = new_start_seq;
304 sack->end_seq = new_end_seq;
305 sackoff += sizeof(*sack);
306 }
307}
308
309/* TCP SACK sequence number adjustment */
310static inline unsigned int
311ip_nat_sack_adjust(struct sk_buff **pskb,
312 struct tcphdr *tcph,
313 struct ip_conntrack *ct,
314 enum ip_conntrack_info ctinfo)
315{
316 unsigned int dir, optoff, optend;
317
318 optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
319 optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
320
321 if (!skb_ip_make_writable(pskb, optend))
322 return 0;
323
324 dir = CTINFO2DIR(ctinfo);
325
326 while (optoff < optend) {
327 /* Usually: option, length. */
328 unsigned char *op = (*pskb)->data + optoff;
329
330 switch (op[0]) {
331 case TCPOPT_EOL:
332 return 1;
333 case TCPOPT_NOP:
334 optoff++;
335 continue;
336 default:
337 /* no partial options */
338 if (optoff + 1 == optend
339 || optoff + op[1] > optend
340 || op[1] < 2)
341 return 0;
342 if (op[0] == TCPOPT_SACK
343 && op[1] >= 2+TCPOLEN_SACK_PERBLOCK
344 && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
345 sack_adjust(*pskb, tcph, optoff+2,
346 optoff+op[1],
347 &ct->nat.info.seq[!dir]);
348 optoff += op[1];
349 }
350 }
351 return 1;
352}
353
354/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */
355int
356ip_nat_seq_adjust(struct sk_buff **pskb,
357 struct ip_conntrack *ct,
358 enum ip_conntrack_info ctinfo)
359{
360 struct tcphdr *tcph;
361 int dir, newseq, newack;
362 struct ip_nat_seq *this_way, *other_way;
363
364 dir = CTINFO2DIR(ctinfo);
365
366 this_way = &ct->nat.info.seq[dir];
367 other_way = &ct->nat.info.seq[!dir];
368
369 if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
370 return 0;
371
372 tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
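	/* Sequence numbers up to the correction position get the old
	 * offset, later ones the new offset; the ack field tracks the
	 * peer's data, so the reverse direction's offset is subtracted. */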
373 if (after(ntohl(tcph->seq), this_way->correction_pos))
374 newseq = ntohl(tcph->seq) + this_way->offset_after;
375 else
376 newseq = ntohl(tcph->seq) + this_way->offset_before;
377 newseq = htonl(newseq);
378
379 if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
380 other_way->correction_pos))
381 newack = ntohl(tcph->ack_seq) - other_way->offset_after;
382 else
383 newack = ntohl(tcph->ack_seq) - other_way->offset_before;
384 newack = htonl(newack);
385
386 tcph->check = ip_nat_cheat_check(~tcph->seq, newseq,
387 ip_nat_cheat_check(~tcph->ack_seq,
388 newack,
389 tcph->check));
390
391 DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n",
392 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
393 ntohl(newack));
394
395 tcph->seq = newseq;
396 tcph->ack_seq = newack;
397
398 if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo))
399 return 0;
400
401 ip_conntrack_tcp_update(*pskb, ct, dir);
402
403 return 1;
404}
405
406/* Setup NAT on this expected conntrack so it follows master. */
407/* If we fail to get a free NAT slot, we'll get dropped on confirm */
408void ip_nat_follow_master(struct ip_conntrack *ct,
409 struct ip_conntrack_expect *exp)
410{
411 struct ip_nat_range range;
412
413 /* This must be a fresh one. */
414 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
415
416 /* Change src to where master sends to */
417 range.flags = IP_NAT_RANGE_MAP_IPS;
418 range.min_ip = range.max_ip
419 = ct->master->tuplehash[!exp->dir].tuple.dst.ip;
420 /* hook doesn't matter, but it has to do source manip */
421 ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
422
423 /* For DST manip, map port here to where it's expected. */
424 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
425 range.min = range.max = exp->saved_proto;
426 range.min_ip = range.max_ip
427 = ct->master->tuplehash[!exp->dir].tuple.src.ip;
428 /* hook doesn't matter, but it has to do destination manip */
429 ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
430}
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
new file mode 100644
index 000000000000..9c1ca3381d56
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_irc.c
@@ -0,0 +1,125 @@
1/* IRC extension for TCP NAT alteration.
2 * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
3 * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
4 * based on a copy of RR's ip_nat_ftp.c
5 *
6 * ip_nat_irc.c,v 1.16 2001/12/06 07:42:10 laforge Exp
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/module.h>
15#include <linux/netfilter_ipv4.h>
16#include <linux/ip.h>
17#include <linux/tcp.h>
18#include <linux/kernel.h>
19#include <net/tcp.h>
20#include <linux/netfilter_ipv4/ip_nat.h>
21#include <linux/netfilter_ipv4/ip_nat_helper.h>
22#include <linux/netfilter_ipv4/ip_nat_rule.h>
23#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
24#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
25#include <linux/moduleparam.h>
26
27#if 0
28#define DEBUGP printk
29#else
30#define DEBUGP(format, args...)
31#endif
32
33MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
34MODULE_DESCRIPTION("IRC (DCC) NAT helper");
35MODULE_LICENSE("GPL");
36
37static unsigned int help(struct sk_buff **pskb,
38 enum ip_conntrack_info ctinfo,
39 unsigned int matchoff,
40 unsigned int matchlen,
41 struct ip_conntrack_expect *exp)
42{
43 u_int16_t port;
44 unsigned int ret;
45
 46	/* "4294967295 65535 " */
47 char buffer[18];
48
 49	DEBUGP("IRC_NAT: info: match at offset %u, "
 50	       "len %u\n",
 51	       matchoff, matchlen);
52
53 /* Reply comes from server. */
54 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
55 exp->dir = IP_CT_DIR_REPLY;
56
 57	/* When you see the packet, we need to NAT it the same as
 58	 * this one. */
59 exp->expectfn = ip_nat_follow_master;
60
61 /* Try to get same port: if not, try to change it. */
62 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
63 exp->tuple.dst.u.tcp.port = htons(port);
64 if (ip_conntrack_expect_related(exp) == 0)
65 break;
66 }
67
68 if (port == 0) {
69 ip_conntrack_expect_free(exp);
70 return NF_DROP;
71 }
72
73 /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
74 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
75 * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26
76 * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26
77 * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27
78 * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits,
 79	 * 255.255.255.255==4294967295, 10 digits)
 80	 * P: bound port (min 1 d, max 5 d (65535))
81 * F: filename (min 1 d )
82 * S: size (min 1 d )
83 * 0x01, \n: terminators
84 */
85
86 /* AAA = "us", ie. where server normally talks to. */
87 sprintf(buffer, "%u %u",
88 ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip),
89 port);
90 DEBUGP("ip_nat_irc: Inserting '%s' == %u.%u.%u.%u, port %u\n",
91 buffer, NIPQUAD(exp->tuple.src.ip), port);
92
93 ret = ip_nat_mangle_tcp_packet(pskb, exp->master, ctinfo,
94 matchoff, matchlen, buffer,
95 strlen(buffer));
96 if (ret != NF_ACCEPT)
97 ip_conntrack_unexpect_related(exp);
98 return ret;
99}
100
101static void __exit fini(void)
102{
103 ip_nat_irc_hook = NULL;
104 /* Make sure noone calls it, meanwhile. */
105 synchronize_net();
106}
107
108static int __init init(void)
109{
110 BUG_ON(ip_nat_irc_hook);
111 ip_nat_irc_hook = help;
112 return 0;
113}
114
115/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
116static int warn_set(const char *val, struct kernel_param *kp)
117{
118 printk(KERN_INFO __stringify(KBUILD_MODNAME)
119 ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
120 return 0;
121}
122module_param_call(ports, warn_set, NULL, NULL, 0);
123
124module_init(init);
125module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
new file mode 100644
index 000000000000..a558cf0eee8a
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -0,0 +1,115 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/netfilter.h>
12#include <linux/ip.h>
13#include <linux/icmp.h>
14#include <linux/if.h>
15
16#include <linux/netfilter_ipv4/ip_nat.h>
17#include <linux/netfilter_ipv4/ip_nat_core.h>
18#include <linux/netfilter_ipv4/ip_nat_rule.h>
19#include <linux/netfilter_ipv4/ip_nat_protocol.h>
20
21static int
22icmp_in_range(const struct ip_conntrack_tuple *tuple,
23 enum ip_nat_manip_type maniptype,
24 const union ip_conntrack_manip_proto *min,
25 const union ip_conntrack_manip_proto *max)
26{
27 return (tuple->src.u.icmp.id >= min->icmp.id
28 && tuple->src.u.icmp.id <= max->icmp.id);
29}
30
31static int
32icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
33 const struct ip_nat_range *range,
34 enum ip_nat_manip_type maniptype,
35 const struct ip_conntrack *conntrack)
36{
37 static u_int16_t id;
38 unsigned int range_size
39 = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1;
40 unsigned int i;
41
42 /* If no range specified... */
43 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
44 range_size = 0xFFFF;
45
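	/* Walk the id space from where the last search left off and take
	 * the first id that doesn't collide with an existing tuple. */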
46 for (i = 0; i < range_size; i++, id++) {
47 tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size);
48 if (!ip_nat_used_tuple(tuple, conntrack))
49 return 1;
50 }
51 return 0;
52}
53
54static int
55icmp_manip_pkt(struct sk_buff **pskb,
56 unsigned int iphdroff,
57 const struct ip_conntrack_tuple *tuple,
58 enum ip_nat_manip_type maniptype)
59{
60 struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
61 struct icmphdr *hdr;
62 unsigned int hdroff = iphdroff + iph->ihl*4;
63
64 if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
65 return 0;
66
67 hdr = (struct icmphdr *)((*pskb)->data + hdroff);
68
69 hdr->checksum = ip_nat_cheat_check(hdr->un.echo.id ^ 0xFFFF,
70 tuple->src.u.icmp.id,
71 hdr->checksum);
72 hdr->un.echo.id = tuple->src.u.icmp.id;
73 return 1;
74}
75
76static unsigned int
77icmp_print(char *buffer,
78 const struct ip_conntrack_tuple *match,
79 const struct ip_conntrack_tuple *mask)
80{
81 unsigned int len = 0;
82
83 if (mask->src.u.icmp.id)
84 len += sprintf(buffer + len, "id=%u ",
85 ntohs(match->src.u.icmp.id));
86
87 if (mask->dst.u.icmp.type)
88 len += sprintf(buffer + len, "type=%u ",
 89			       match->dst.u.icmp.type);
90
91 if (mask->dst.u.icmp.code)
92 len += sprintf(buffer + len, "code=%u ",
 93			       match->dst.u.icmp.code);
94
95 return len;
96}
97
98static unsigned int
99icmp_print_range(char *buffer, const struct ip_nat_range *range)
100{
101 if (range->min.icmp.id != 0 || range->max.icmp.id != 0xFFFF)
102 return sprintf(buffer, "id %u-%u ",
103 ntohs(range->min.icmp.id),
104 ntohs(range->max.icmp.id));
105 else return 0;
106}
107
108struct ip_nat_protocol ip_nat_protocol_icmp
109= { "ICMP", IPPROTO_ICMP,
110 icmp_manip_pkt,
111 icmp_in_range,
112 icmp_unique_tuple,
113 icmp_print,
114 icmp_print_range
115};
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
new file mode 100644
index 000000000000..a91cfceff272
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -0,0 +1,178 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/netfilter.h>
12#include <linux/ip.h>
13#include <linux/tcp.h>
14#include <linux/if.h>
15#include <linux/netfilter_ipv4/ip_nat.h>
16#include <linux/netfilter_ipv4/ip_nat_rule.h>
17#include <linux/netfilter_ipv4/ip_nat_protocol.h>
18#include <linux/netfilter_ipv4/ip_nat_core.h>
19
20static int
21tcp_in_range(const struct ip_conntrack_tuple *tuple,
22 enum ip_nat_manip_type maniptype,
23 const union ip_conntrack_manip_proto *min,
24 const union ip_conntrack_manip_proto *max)
25{
26 u_int16_t port;
27
28 if (maniptype == IP_NAT_MANIP_SRC)
29 port = tuple->src.u.tcp.port;
30 else
31 port = tuple->dst.u.tcp.port;
32
33 return ntohs(port) >= ntohs(min->tcp.port)
34 && ntohs(port) <= ntohs(max->tcp.port);
35}
36
37static int
38tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
39 const struct ip_nat_range *range,
40 enum ip_nat_manip_type maniptype,
41 const struct ip_conntrack *conntrack)
42{
43 static u_int16_t port, *portptr;
44 unsigned int range_size, min, i;
45
46 if (maniptype == IP_NAT_MANIP_SRC)
47 portptr = &tuple->src.u.tcp.port;
48 else
49 portptr = &tuple->dst.u.tcp.port;
50
51 /* If no range specified... */
52 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
53 /* If it's dst rewrite, can't change port */
54 if (maniptype == IP_NAT_MANIP_DST)
55 return 0;
56
57 /* Map privileged onto privileged. */
58 if (ntohs(*portptr) < 1024) {
 59			/* Loose convention: ports 512-1023 are used for credential passing */
60 if (ntohs(*portptr)<512) {
61 min = 1;
62 range_size = 511 - min + 1;
63 } else {
64 min = 600;
65 range_size = 1023 - min + 1;
66 }
67 } else {
68 min = 1024;
69 range_size = 65535 - 1024 + 1;
70 }
71 } else {
72 min = ntohs(range->min.tcp.port);
73 range_size = ntohs(range->max.tcp.port) - min + 1;
74 }
75
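	/* Probe ports starting from where the last search left off until
	 * one produces a tuple that isn't already in use. */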
76 for (i = 0; i < range_size; i++, port++) {
77 *portptr = htons(min + port % range_size);
78 if (!ip_nat_used_tuple(tuple, conntrack)) {
79 return 1;
80 }
81 }
82 return 0;
83}
84
85static int
86tcp_manip_pkt(struct sk_buff **pskb,
87 unsigned int iphdroff,
88 const struct ip_conntrack_tuple *tuple,
89 enum ip_nat_manip_type maniptype)
90{
91 struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
92 struct tcphdr *hdr;
93 unsigned int hdroff = iphdroff + iph->ihl*4;
94 u32 oldip, newip;
95 u16 *portptr, newport, oldport;
96 int hdrsize = 8; /* TCP connection tracking guarantees this much */
97
 98	/* this could be an inner header returned in icmp packet; in such
99 cases we cannot update the checksum field since it is outside of
100 the 8 bytes of transport layer headers we are guaranteed */
101 if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
102 hdrsize = sizeof(struct tcphdr);
103
104 if (!skb_ip_make_writable(pskb, hdroff + hdrsize))
105 return 0;
106
107 iph = (struct iphdr *)((*pskb)->data + iphdroff);
108 hdr = (struct tcphdr *)((*pskb)->data + hdroff);
109
110 if (maniptype == IP_NAT_MANIP_SRC) {
111 /* Get rid of src ip and src pt */
112 oldip = iph->saddr;
113 newip = tuple->src.ip;
114 newport = tuple->src.u.tcp.port;
115 portptr = &hdr->source;
116 } else {
117 /* Get rid of dst ip and dst pt */
118 oldip = iph->daddr;
119 newip = tuple->dst.ip;
120 newport = tuple->dst.u.tcp.port;
121 portptr = &hdr->dest;
122 }
123
124 oldport = *portptr;
125 *portptr = newport;
126
127 if (hdrsize < sizeof(*hdr))
128 return 1;
129
130 hdr->check = ip_nat_cheat_check(~oldip, newip,
131 ip_nat_cheat_check(oldport ^ 0xFFFF,
132 newport,
133 hdr->check));
134 return 1;
135}
136
137static unsigned int
138tcp_print(char *buffer,
139 const struct ip_conntrack_tuple *match,
140 const struct ip_conntrack_tuple *mask)
141{
142 unsigned int len = 0;
143
144 if (mask->src.u.tcp.port)
145 len += sprintf(buffer + len, "srcpt=%u ",
146 ntohs(match->src.u.tcp.port));
147
148
149 if (mask->dst.u.tcp.port)
150 len += sprintf(buffer + len, "dstpt=%u ",
151 ntohs(match->dst.u.tcp.port));
152
153 return len;
154}
155
156static unsigned int
157tcp_print_range(char *buffer, const struct ip_nat_range *range)
158{
159 if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) {
160 if (range->min.tcp.port == range->max.tcp.port)
161 return sprintf(buffer, "port %u ",
162 ntohs(range->min.tcp.port));
163 else
164 return sprintf(buffer, "ports %u-%u ",
165 ntohs(range->min.tcp.port),
166 ntohs(range->max.tcp.port));
167 }
168 else return 0;
169}
170
171struct ip_nat_protocol ip_nat_protocol_tcp
172= { "TCP", IPPROTO_TCP,
173 tcp_manip_pkt,
174 tcp_in_range,
175 tcp_unique_tuple,
176 tcp_print,
177 tcp_print_range
178};
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
new file mode 100644
index 000000000000..c669e3b5f5d0
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -0,0 +1,165 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/netfilter.h>
12#include <linux/ip.h>
13#include <linux/udp.h>
14#include <linux/if.h>
15
16#include <linux/netfilter_ipv4/ip_nat.h>
17#include <linux/netfilter_ipv4/ip_nat_core.h>
18#include <linux/netfilter_ipv4/ip_nat_rule.h>
19#include <linux/netfilter_ipv4/ip_nat_protocol.h>
20
21static int
22udp_in_range(const struct ip_conntrack_tuple *tuple,
23 enum ip_nat_manip_type maniptype,
24 const union ip_conntrack_manip_proto *min,
25 const union ip_conntrack_manip_proto *max)
26{
27 u_int16_t port;
28
29 if (maniptype == IP_NAT_MANIP_SRC)
30 port = tuple->src.u.udp.port;
31 else
32 port = tuple->dst.u.udp.port;
33
34 return ntohs(port) >= ntohs(min->udp.port)
35 && ntohs(port) <= ntohs(max->udp.port);
36}
37
38static int
39udp_unique_tuple(struct ip_conntrack_tuple *tuple,
40 const struct ip_nat_range *range,
41 enum ip_nat_manip_type maniptype,
42 const struct ip_conntrack *conntrack)
43{
44 static u_int16_t port, *portptr;
45 unsigned int range_size, min, i;
46
47 if (maniptype == IP_NAT_MANIP_SRC)
48 portptr = &tuple->src.u.udp.port;
49 else
50 portptr = &tuple->dst.u.udp.port;
51
52 /* If no range specified... */
53 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
54 /* If it's dst rewrite, can't change port */
55 if (maniptype == IP_NAT_MANIP_DST)
56 return 0;
57
58 if (ntohs(*portptr) < 1024) {
 59			/* Loose convention: ports 512-1023 are used for credential passing */
60 if (ntohs(*portptr)<512) {
61 min = 1;
62 range_size = 511 - min + 1;
63 } else {
64 min = 600;
65 range_size = 1023 - min + 1;
66 }
67 } else {
68 min = 1024;
69 range_size = 65535 - 1024 + 1;
70 }
71 } else {
72 min = ntohs(range->min.udp.port);
73 range_size = ntohs(range->max.udp.port) - min + 1;
74 }
75
76 for (i = 0; i < range_size; i++, port++) {
77 *portptr = htons(min + port % range_size);
78 if (!ip_nat_used_tuple(tuple, conntrack))
79 return 1;
80 }
81 return 0;
82}
83
84static int
85udp_manip_pkt(struct sk_buff **pskb,
86 unsigned int iphdroff,
87 const struct ip_conntrack_tuple *tuple,
88 enum ip_nat_manip_type maniptype)
89{
90 struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
91 struct udphdr *hdr;
92 unsigned int hdroff = iphdroff + iph->ihl*4;
93 u32 oldip, newip;
94 u16 *portptr, newport;
95
96 if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
97 return 0;
98
99 iph = (struct iphdr *)((*pskb)->data + iphdroff);
100 hdr = (struct udphdr *)((*pskb)->data + hdroff);
101
102 if (maniptype == IP_NAT_MANIP_SRC) {
103 /* Get rid of src ip and src pt */
104 oldip = iph->saddr;
105 newip = tuple->src.ip;
106 newport = tuple->src.u.udp.port;
107 portptr = &hdr->source;
108 } else {
109 /* Get rid of dst ip and dst pt */
110 oldip = iph->daddr;
111 newip = tuple->dst.ip;
112 newport = tuple->dst.u.udp.port;
113 portptr = &hdr->dest;
114 }
115 if (hdr->check) /* 0 is a special case meaning no checksum */
116 hdr->check = ip_nat_cheat_check(~oldip, newip,
117 ip_nat_cheat_check(*portptr ^ 0xFFFF,
118 newport,
119 hdr->check));
120 *portptr = newport;
121 return 1;
122}
123
124static unsigned int
125udp_print(char *buffer,
126 const struct ip_conntrack_tuple *match,
127 const struct ip_conntrack_tuple *mask)
128{
129 unsigned int len = 0;
130
131 if (mask->src.u.udp.port)
132 len += sprintf(buffer + len, "srcpt=%u ",
133 ntohs(match->src.u.udp.port));
134
135
136 if (mask->dst.u.udp.port)
137 len += sprintf(buffer + len, "dstpt=%u ",
138 ntohs(match->dst.u.udp.port));
139
140 return len;
141}
142
143static unsigned int
144udp_print_range(char *buffer, const struct ip_nat_range *range)
145{
146 if (range->min.udp.port != 0 || range->max.udp.port != 0xFFFF) {
147 if (range->min.udp.port == range->max.udp.port)
148 return sprintf(buffer, "port %u ",
149 ntohs(range->min.udp.port));
150 else
151 return sprintf(buffer, "ports %u-%u ",
152 ntohs(range->min.udp.port),
153 ntohs(range->max.udp.port));
154 }
155 else return 0;
156}
157
158struct ip_nat_protocol ip_nat_protocol_udp
159= { "UDP", IPPROTO_UDP,
160 udp_manip_pkt,
161 udp_in_range,
162 udp_unique_tuple,
163 udp_print,
164 udp_print_range
165};
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
new file mode 100644
index 000000000000..f5525bd58d16
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -0,0 +1,70 @@
1/* The "unknown" protocol. This is what is used for protocols we
 2 * don't understand. It's returned by ip_nat_find_proto().
3 */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/types.h>
14#include <linux/init.h>
15#include <linux/netfilter.h>
16#include <linux/if.h>
17
18#include <linux/netfilter_ipv4/ip_nat.h>
19#include <linux/netfilter_ipv4/ip_nat_rule.h>
20#include <linux/netfilter_ipv4/ip_nat_protocol.h>
21
22static int unknown_in_range(const struct ip_conntrack_tuple *tuple,
23 enum ip_nat_manip_type manip_type,
24 const union ip_conntrack_manip_proto *min,
25 const union ip_conntrack_manip_proto *max)
26{
27 return 1;
28}
29
30static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple,
31 const struct ip_nat_range *range,
32 enum ip_nat_manip_type maniptype,
33 const struct ip_conntrack *conntrack)
34{
35 /* Sorry: we can't help you; if it's not unique, we can't frob
36 anything. */
37 return 0;
38}
39
40static int
41unknown_manip_pkt(struct sk_buff **pskb,
42 unsigned int iphdroff,
43 const struct ip_conntrack_tuple *tuple,
44 enum ip_nat_manip_type maniptype)
45{
46 return 1;
47}
48
49static unsigned int
50unknown_print(char *buffer,
51 const struct ip_conntrack_tuple *match,
52 const struct ip_conntrack_tuple *mask)
53{
54 return 0;
55}
56
57static unsigned int
58unknown_print_range(char *buffer, const struct ip_nat_range *range)
59{
60 return 0;
61}
62
63struct ip_nat_protocol ip_nat_unknown_protocol = {
64 "unknown", 0,
65 unknown_manip_pkt,
66 unknown_in_range,
67 unknown_unique_tuple,
68 unknown_print,
69 unknown_print_range
70};
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
new file mode 100644
index 000000000000..581f097f5a24
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -0,0 +1,319 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9/* Everything about the rules for NAT. */
10#include <linux/types.h>
11#include <linux/ip.h>
12#include <linux/netfilter.h>
13#include <linux/netfilter_ipv4.h>
14#include <linux/module.h>
15#include <linux/kmod.h>
16#include <linux/skbuff.h>
17#include <linux/proc_fs.h>
18#include <net/checksum.h>
19#include <net/route.h>
20#include <linux/bitops.h>
21
22#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
23#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
24
25#include <linux/netfilter_ipv4/ip_tables.h>
26#include <linux/netfilter_ipv4/ip_nat.h>
27#include <linux/netfilter_ipv4/ip_nat_core.h>
28#include <linux/netfilter_ipv4/ip_nat_rule.h>
29#include <linux/netfilter_ipv4/listhelp.h>
30
31#if 0
32#define DEBUGP printk
33#else
34#define DEBUGP(format, args...)
35#endif
36
37#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT))
38
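/* Bootstrap contents of the "nat" table: one ACCEPT policy entry for each
 * built-in chain (PREROUTING, POSTROUTING, OUTPUT), followed by the ERROR
 * entry that terminates the table. */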
39static struct
40{
41 struct ipt_replace repl;
42 struct ipt_standard entries[3];
43 struct ipt_error term;
44} nat_initial_table __initdata
45= { { "nat", NAT_VALID_HOOKS, 4,
46 sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
47 { [NF_IP_PRE_ROUTING] = 0,
48 [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
49 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
50 { [NF_IP_PRE_ROUTING] = 0,
51 [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
52 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
53 0, NULL, { } },
54 {
55 /* PRE_ROUTING */
56 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
57 0,
58 sizeof(struct ipt_entry),
59 sizeof(struct ipt_standard),
60 0, { 0, 0 }, { } },
61 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
62 -NF_ACCEPT - 1 } },
63 /* POST_ROUTING */
64 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
65 0,
66 sizeof(struct ipt_entry),
67 sizeof(struct ipt_standard),
68 0, { 0, 0 }, { } },
69 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
70 -NF_ACCEPT - 1 } },
71 /* LOCAL_OUT */
72 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
73 0,
74 sizeof(struct ipt_entry),
75 sizeof(struct ipt_standard),
76 0, { 0, 0 }, { } },
77 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
78 -NF_ACCEPT - 1 } }
79 },
80 /* ERROR */
81 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
82 0,
83 sizeof(struct ipt_entry),
84 sizeof(struct ipt_error),
85 0, { 0, 0 }, { } },
86 { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
87 { } },
88 "ERROR"
89 }
90 }
91};
92
93static struct ipt_table nat_table = {
94 .name = "nat",
95 .valid_hooks = NAT_VALID_HOOKS,
96 .lock = RW_LOCK_UNLOCKED,
97 .me = THIS_MODULE,
98};
99
100/* Source NAT */
101static unsigned int ipt_snat_target(struct sk_buff **pskb,
102 const struct net_device *in,
103 const struct net_device *out,
104 unsigned int hooknum,
105 const void *targinfo,
106 void *userinfo)
107{
108 struct ip_conntrack *ct;
109 enum ip_conntrack_info ctinfo;
110 const struct ip_nat_multi_range_compat *mr = targinfo;
111
112 IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
113
114 ct = ip_conntrack_get(*pskb, &ctinfo);
115
116 /* Connection must be valid and new. */
117 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
118 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
119 IP_NF_ASSERT(out);
120
121 return ip_nat_setup_info(ct, &mr->range[0], hooknum);
122}
123
124/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */
125static void warn_if_extra_mangle(u32 dstip, u32 srcip)
126{
127 static int warned = 0;
128 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } };
129 struct rtable *rt;
130
131 if (ip_route_output_key(&rt, &fl) != 0)
132 return;
133
134 if (rt->rt_src != srcip && !warned) {
 135		printk("NAT: implicit source local NAT no longer supported\n");
136 printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n",
137 NIPQUAD(srcip), NIPQUAD(dstip));
138 warned = 1;
139 }
140 ip_rt_put(rt);
141}
142
143static unsigned int ipt_dnat_target(struct sk_buff **pskb,
144 const struct net_device *in,
145 const struct net_device *out,
146 unsigned int hooknum,
147 const void *targinfo,
148 void *userinfo)
149{
150 struct ip_conntrack *ct;
151 enum ip_conntrack_info ctinfo;
152 const struct ip_nat_multi_range_compat *mr = targinfo;
153
154 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
155 || hooknum == NF_IP_LOCAL_OUT);
156
157 ct = ip_conntrack_get(*pskb, &ctinfo);
158
159 /* Connection must be valid and new. */
160 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
161
162 if (hooknum == NF_IP_LOCAL_OUT
163 && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
164 warn_if_extra_mangle((*pskb)->nh.iph->daddr,
165 mr->range[0].min_ip);
166
167 return ip_nat_setup_info(ct, &mr->range[0], hooknum);
168}
169
170static int ipt_snat_checkentry(const char *tablename,
171 const struct ipt_entry *e,
172 void *targinfo,
173 unsigned int targinfosize,
174 unsigned int hook_mask)
175{
176 struct ip_nat_multi_range_compat *mr = targinfo;
177
178 /* Must be a valid range */
179 if (mr->rangesize != 1) {
180 printk("SNAT: multiple ranges no longer supported\n");
181 return 0;
182 }
183
184 if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) {
185 DEBUGP("SNAT: Target size %u wrong for %u ranges\n",
186 targinfosize, mr->rangesize);
187 return 0;
188 }
189
190 /* Only allow these for NAT. */
191 if (strcmp(tablename, "nat") != 0) {
192 DEBUGP("SNAT: wrong table %s\n", tablename);
193 return 0;
194 }
195
196 if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) {
197 DEBUGP("SNAT: hook mask 0x%x bad\n", hook_mask);
198 return 0;
199 }
200 return 1;
201}
202
203static int ipt_dnat_checkentry(const char *tablename,
204 const struct ipt_entry *e,
205 void *targinfo,
206 unsigned int targinfosize,
207 unsigned int hook_mask)
208{
209 struct ip_nat_multi_range_compat *mr = targinfo;
210
211 /* Must be a valid range */
212 if (mr->rangesize != 1) {
213 printk("DNAT: multiple ranges no longer supported\n");
214 return 0;
215 }
216
217 if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) {
218 DEBUGP("DNAT: Target size %u wrong for %u ranges\n",
219 targinfosize, mr->rangesize);
220 return 0;
221 }
222
223 /* Only allow these for NAT. */
224 if (strcmp(tablename, "nat") != 0) {
225 DEBUGP("DNAT: wrong table %s\n", tablename);
226 return 0;
227 }
228
229 if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) {
230 DEBUGP("DNAT: hook mask 0x%x bad\n", hook_mask);
231 return 0;
232 }
233
234 return 1;
235}
236
237inline unsigned int
238alloc_null_binding(struct ip_conntrack *conntrack,
239 struct ip_nat_info *info,
240 unsigned int hooknum)
241{
242 /* Force range to this IP; let proto decide mapping for
243 per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
244 Use reply in case it's already been mangled (eg local packet).
245 */
246 u_int32_t ip
247 = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
248 ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip
249 : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
250 struct ip_nat_range range
251 = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
252
253 DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack,
254 NIPQUAD(ip));
255 return ip_nat_setup_info(conntrack, &range, hooknum);
256}
257
258int ip_nat_rule_find(struct sk_buff **pskb,
259 unsigned int hooknum,
260 const struct net_device *in,
261 const struct net_device *out,
262 struct ip_conntrack *ct,
263 struct ip_nat_info *info)
264{
265 int ret;
266
267 ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL);
268
269 if (ret == NF_ACCEPT) {
270 if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
 271			/* NULL mapping */
272 ret = alloc_null_binding(ct, info, hooknum);
273 }
274 return ret;
275}
276
277static struct ipt_target ipt_snat_reg = {
278 .name = "SNAT",
279 .target = ipt_snat_target,
280 .checkentry = ipt_snat_checkentry,
281};
282
283static struct ipt_target ipt_dnat_reg = {
284 .name = "DNAT",
285 .target = ipt_dnat_target,
286 .checkentry = ipt_dnat_checkentry,
287};
288
289int __init ip_nat_rule_init(void)
290{
291 int ret;
292
293 ret = ipt_register_table(&nat_table, &nat_initial_table.repl);
294 if (ret != 0)
295 return ret;
296 ret = ipt_register_target(&ipt_snat_reg);
297 if (ret != 0)
298 goto unregister_table;
299
300 ret = ipt_register_target(&ipt_dnat_reg);
301 if (ret != 0)
302 goto unregister_snat;
303
304 return ret;
305
306 unregister_snat:
307 ipt_unregister_target(&ipt_snat_reg);
308 unregister_table:
309 ipt_unregister_table(&nat_table);
310
311 return ret;
312}
313
314void ip_nat_rule_cleanup(void)
315{
316 ipt_unregister_target(&ipt_dnat_reg);
317 ipt_unregister_target(&ipt_snat_reg);
318 ipt_unregister_table(&nat_table);
319}
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
new file mode 100644
index 000000000000..2a48b6e635ae
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -0,0 +1,1347 @@
1/*
2 * ip_nat_snmp_basic.c
3 *
4 * Basic SNMP Application Layer Gateway
5 *
6 * This IP NAT module is intended for use with SNMP network
7 * discovery and monitoring applications where target networks use
8 * conflicting private address realms.
9 *
10 * Static NAT is used to remap the networks from the view of the network
11 * management system at the IP layer, and this module remaps some application
12 * layer addresses to match.
13 *
14 * The simplest form of ALG is performed, where only tagged IP addresses
15 * are modified. The module does not need to be MIB aware and only scans
16 * messages at the ASN.1/BER level.
17 *
18 * Currently, only SNMPv1 and SNMPv2 are supported.
19 *
20 * More information on ALG and associated issues can be found in
21 * RFC 2962
22 *
 23 * The ASN.1/BER parsing code is derived from the gxsnmp package by Gregory
24 * McLean & Jochen Friedrich, stripped down for use in the kernel.
25 *
26 * Copyright (c) 2000 RP Internet (www.rpi.net.au).
27 *
28 * This program is free software; you can redistribute it and/or modify
29 * it under the terms of the GNU General Public License as published by
30 * the Free Software Foundation; either version 2 of the License, or
31 * (at your option) any later version.
32 * This program is distributed in the hope that it will be useful,
33 * but WITHOUT ANY WARRANTY; without even the implied warranty of
34 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35 * GNU General Public License for more details.
36 * You should have received a copy of the GNU General Public License
37 * along with this program; if not, write to the Free Software
38 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39 *
40 * Author: James Morris <jmorris@intercode.com.au>
41 *
42 * Updates:
43 * 2000-08-06: Convert to new helper API (Harald Welte).
44 *
45 */
46#include <linux/config.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/moduleparam.h>
51#include <linux/netfilter_ipv4.h>
52#include <linux/netfilter_ipv4/ip_nat.h>
53#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
54#include <linux/netfilter_ipv4/ip_nat_helper.h>
55#include <linux/ip.h>
56#include <net/checksum.h>
57#include <net/udp.h>
58#include <asm/uaccess.h>
59
60MODULE_LICENSE("GPL");
61MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
62MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
63
64#define SNMP_PORT 161
65#define SNMP_TRAP_PORT 162
66#define NOCT1(n) (u_int8_t )((n) & 0xff)
67
68static int debug;
69static DEFINE_SPINLOCK(snmp_lock);
70
71/*
72 * Application layer address mapping mimics the NAT mapping, but
73 * only for the first octet in this case (a more flexible system
74 * can be implemented if needed).
75 */
76struct oct1_map
77{
78 u_int8_t from;
79 u_int8_t to;
80};
81
82
83/*****************************************************************************
84 *
85 * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
86 *
87 *****************************************************************************/
88
89/* Class */
90#define ASN1_UNI 0 /* Universal */
91#define ASN1_APL 1 /* Application */
92#define ASN1_CTX 2 /* Context */
93#define ASN1_PRV 3 /* Private */
94
95/* Tag */
96#define ASN1_EOC 0 /* End Of Contents */
97#define ASN1_BOL 1 /* Boolean */
98#define ASN1_INT 2 /* Integer */
99#define ASN1_BTS 3 /* Bit String */
100#define ASN1_OTS 4 /* Octet String */
101#define ASN1_NUL 5 /* Null */
102#define ASN1_OJI 6 /* Object Identifier */
103#define ASN1_OJD 7 /* Object Description */
104#define ASN1_EXT 8 /* External */
105#define ASN1_SEQ 16 /* Sequence */
106#define ASN1_SET 17 /* Set */
107#define ASN1_NUMSTR 18 /* Numerical String */
108#define ASN1_PRNSTR 19 /* Printable String */
109#define ASN1_TEXSTR 20 /* Teletext String */
110#define ASN1_VIDSTR 21 /* Video String */
111#define ASN1_IA5STR 22 /* IA5 String */
112#define ASN1_UNITIM 23 /* Universal Time */
113#define ASN1_GENTIM 24 /* General Time */
114#define ASN1_GRASTR 25 /* Graphical String */
115#define ASN1_VISSTR 26 /* Visible String */
116#define ASN1_GENSTR 27 /* General String */
117
118/* Primitive / Constructed methods */
119#define ASN1_PRI 0 /* Primitive */
120#define ASN1_CON 1 /* Constructed */
121
122/*
123 * Error codes.
124 */
125#define ASN1_ERR_NOERROR 0
126#define ASN1_ERR_DEC_EMPTY 2
127#define ASN1_ERR_DEC_EOC_MISMATCH 3
128#define ASN1_ERR_DEC_LENGTH_MISMATCH 4
129#define ASN1_ERR_DEC_BADVALUE 5
130
131/*
132 * ASN.1 context.
133 */
134struct asn1_ctx
135{
136 int error; /* Error condition */
137 unsigned char *pointer; /* Octet just to be decoded */
138 unsigned char *begin; /* First octet */
139 unsigned char *end; /* Octet after last octet */
140};
141
142/*
143 * Octet string (not null terminated)
144 */
145struct asn1_octstr
146{
147 unsigned char *data;
148 unsigned int len;
149};
150
151static void asn1_open(struct asn1_ctx *ctx,
152 unsigned char *buf,
153 unsigned int len)
154{
155 ctx->begin = buf;
156 ctx->end = buf + len;
157 ctx->pointer = buf;
158 ctx->error = ASN1_ERR_NOERROR;
159}
160
161static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
162{
163 if (ctx->pointer >= ctx->end) {
164 ctx->error = ASN1_ERR_DEC_EMPTY;
165 return 0;
166 }
167 *ch = *(ctx->pointer)++;
168 return 1;
169}
170
171static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
172{
173 unsigned char ch;
174
175 *tag = 0;
176
177 do
178 {
179 if (!asn1_octet_decode(ctx, &ch))
180 return 0;
181 *tag <<= 7;
182 *tag |= ch & 0x7F;
183 } while ((ch & 0x80) == 0x80);
184 return 1;
185}
186
187static unsigned char asn1_id_decode(struct asn1_ctx *ctx,
188 unsigned int *cls,
189 unsigned int *con,
190 unsigned int *tag)
191{
192 unsigned char ch;
193
194 if (!asn1_octet_decode(ctx, &ch))
195 return 0;
196
197 *cls = (ch & 0xC0) >> 6;
198 *con = (ch & 0x20) >> 5;
199 *tag = (ch & 0x1F);
200
201 if (*tag == 0x1F) {
202 if (!asn1_tag_decode(ctx, tag))
203 return 0;
204 }
205 return 1;
206}
207
208static unsigned char asn1_length_decode(struct asn1_ctx *ctx,
209 unsigned int *def,
210 unsigned int *len)
211{
212 unsigned char ch, cnt;
213
214 if (!asn1_octet_decode(ctx, &ch))
215 return 0;
216
217 if (ch == 0x80)
218 *def = 0;
219 else {
220 *def = 1;
221
222 if (ch < 0x80)
223 *len = ch;
224 else {
225 cnt = (unsigned char) (ch & 0x7F);
226 *len = 0;
227
228 while (cnt > 0) {
229 if (!asn1_octet_decode(ctx, &ch))
230 return 0;
231 *len <<= 8;
232 *len |= ch;
233 cnt--;
234 }
235 }
236 }
237 return 1;
238}
239
240static unsigned char asn1_header_decode(struct asn1_ctx *ctx,
241 unsigned char **eoc,
242 unsigned int *cls,
243 unsigned int *con,
244 unsigned int *tag)
245{
246 unsigned int def, len;
247
248 if (!asn1_id_decode(ctx, cls, con, tag))
249 return 0;
250
251 if (!asn1_length_decode(ctx, &def, &len))
252 return 0;
253
254 if (def)
255 *eoc = ctx->pointer + len;
256 else
257 *eoc = NULL;
258 return 1;
259}
260
261static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
262{
263 unsigned char ch;
264
265 if (eoc == 0) {
266 if (!asn1_octet_decode(ctx, &ch))
267 return 0;
268
269 if (ch != 0x00) {
270 ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
271 return 0;
272 }
273
274 if (!asn1_octet_decode(ctx, &ch))
275 return 0;
276
277 if (ch != 0x00) {
278 ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
279 return 0;
280 }
281 return 1;
282 } else {
283 if (ctx->pointer != eoc) {
284 ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
285 return 0;
286 }
287 return 1;
288 }
289}
290
291static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc)
292{
293 ctx->pointer = eoc;
294 return 1;
295}
296
297static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
298 unsigned char *eoc,
299 long *integer)
300{
301 unsigned char ch;
302 unsigned int len;
303
304 if (!asn1_octet_decode(ctx, &ch))
305 return 0;
306
307 *integer = (signed char) ch;
308 len = 1;
309
310 while (ctx->pointer < eoc) {
311 if (++len > sizeof (long)) {
312 ctx->error = ASN1_ERR_DEC_BADVALUE;
313 return 0;
314 }
315
316 if (!asn1_octet_decode(ctx, &ch))
317 return 0;
318
319 *integer <<= 8;
320 *integer |= ch;
321 }
322 return 1;
323}
324
325static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
326 unsigned char *eoc,
327 unsigned int *integer)
328{
329 unsigned char ch;
330 unsigned int len;
331
332 if (!asn1_octet_decode(ctx, &ch))
333 return 0;
334
335 *integer = ch;
336 if (ch == 0) len = 0;
337 else len = 1;
338
339 while (ctx->pointer < eoc) {
340 if (++len > sizeof (unsigned int)) {
341 ctx->error = ASN1_ERR_DEC_BADVALUE;
342 return 0;
343 }
344
345 if (!asn1_octet_decode(ctx, &ch))
346 return 0;
347
348 *integer <<= 8;
349 *integer |= ch;
350 }
351 return 1;
352}
353
354static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
355 unsigned char *eoc,
356 unsigned long *integer)
357{
358 unsigned char ch;
359 unsigned int len;
360
361 if (!asn1_octet_decode(ctx, &ch))
362 return 0;
363
364 *integer = ch;
365 if (ch == 0) len = 0;
366 else len = 1;
367
368 while (ctx->pointer < eoc) {
369 if (++len > sizeof (unsigned long)) {
370 ctx->error = ASN1_ERR_DEC_BADVALUE;
371 return 0;
372 }
373
374 if (!asn1_octet_decode(ctx, &ch))
375 return 0;
376
377 *integer <<= 8;
378 *integer |= ch;
379 }
380 return 1;
381}
382
383static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
384 unsigned char *eoc,
385 unsigned char **octets,
386 unsigned int *len)
387{
388 unsigned char *ptr;
389
390 *len = 0;
391
392 *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
393 if (*octets == NULL) {
394 if (net_ratelimit())
395 printk("OOM in bsalg (%d)\n", __LINE__);
396 return 0;
397 }
398
399 ptr = *octets;
400 while (ctx->pointer < eoc) {
401 if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) {
402 kfree(*octets);
403 *octets = NULL;
404 return 0;
405 }
406 (*len)++;
407 }
408 return 1;
409}
410
411static unsigned char asn1_subid_decode(struct asn1_ctx *ctx,
412 unsigned long *subid)
413{
414 unsigned char ch;
415
416 *subid = 0;
417
418 do {
419 if (!asn1_octet_decode(ctx, &ch))
420 return 0;
421
422 *subid <<= 7;
423 *subid |= ch & 0x7F;
424 } while ((ch & 0x80) == 0x80);
425 return 1;
426}
427
428static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
429 unsigned char *eoc,
430 unsigned long **oid,
431 unsigned int *len)
432{
433 unsigned long subid;
434 unsigned int size;
435 unsigned long *optr;
436
437 size = eoc - ctx->pointer + 1;
438 *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
439 if (*oid == NULL) {
440 if (net_ratelimit())
441 printk("OOM in bsalg (%d)\n", __LINE__);
442 return 0;
443 }
444
445 optr = *oid;
446
447 if (!asn1_subid_decode(ctx, &subid)) {
448 kfree(*oid);
449 *oid = NULL;
450 return 0;
451 }
452
453 if (subid < 40) {
454 optr [0] = 0;
455 optr [1] = subid;
456 } else if (subid < 80) {
457 optr [0] = 1;
458 optr [1] = subid - 40;
459 } else {
460 optr [0] = 2;
461 optr [1] = subid - 80;
462 }
463
464 *len = 2;
465 optr += 2;
466
467 while (ctx->pointer < eoc) {
468 if (++(*len) > size) {
469 ctx->error = ASN1_ERR_DEC_BADVALUE;
470 kfree(*oid);
471 *oid = NULL;
472 return 0;
473 }
474
475 if (!asn1_subid_decode(ctx, optr++)) {
476 kfree(*oid);
477 *oid = NULL;
478 return 0;
479 }
480 }
481 return 1;
482}
483
484/*****************************************************************************
485 *
486 * SNMP decoding routines (gxsnmp author Dirk Wisse)
487 *
488 *****************************************************************************/
489
490/* SNMP Versions */
491#define SNMP_V1 0
492#define SNMP_V2C 1
493#define SNMP_V2 2
494#define SNMP_V3 3
495
496/* Default Sizes */
497#define SNMP_SIZE_COMM 256
498#define SNMP_SIZE_OBJECTID 128
499#define SNMP_SIZE_BUFCHR 256
500#define SNMP_SIZE_BUFINT 128
501#define SNMP_SIZE_SMALLOBJECTID 16
502
503/* Requests */
504#define SNMP_PDU_GET 0
505#define SNMP_PDU_NEXT 1
506#define SNMP_PDU_RESPONSE 2
507#define SNMP_PDU_SET 3
508#define SNMP_PDU_TRAP1 4
509#define SNMP_PDU_BULK 5
510#define SNMP_PDU_INFORM 6
511#define SNMP_PDU_TRAP2 7
512
513/* Errors */
514#define SNMP_NOERROR 0
515#define SNMP_TOOBIG 1
516#define SNMP_NOSUCHNAME 2
517#define SNMP_BADVALUE 3
518#define SNMP_READONLY 4
519#define SNMP_GENERROR 5
520#define SNMP_NOACCESS 6
521#define SNMP_WRONGTYPE 7
522#define SNMP_WRONGLENGTH 8
523#define SNMP_WRONGENCODING 9
524#define SNMP_WRONGVALUE 10
525#define SNMP_NOCREATION 11
526#define SNMP_INCONSISTENTVALUE 12
527#define SNMP_RESOURCEUNAVAILABLE 13
528#define SNMP_COMMITFAILED 14
529#define SNMP_UNDOFAILED 15
530#define SNMP_AUTHORIZATIONERROR 16
531#define SNMP_NOTWRITABLE 17
532#define SNMP_INCONSISTENTNAME 18
533
534/* General SNMP V1 Traps */
535#define SNMP_TRAP_COLDSTART 0
536#define SNMP_TRAP_WARMSTART 1
537#define SNMP_TRAP_LINKDOWN 2
538#define SNMP_TRAP_LINKUP 3
539#define SNMP_TRAP_AUTFAILURE 4
540#define SNMP_TRAP_EQPNEIGHBORLOSS 5
541#define SNMP_TRAP_ENTSPECIFIC 6
542
543/* SNMPv1 Types */
544#define SNMP_NULL 0
545#define SNMP_INTEGER 1 /* l */
546#define SNMP_OCTETSTR 2 /* c */
547#define SNMP_DISPLAYSTR 2 /* c */
548#define SNMP_OBJECTID 3 /* ul */
549#define SNMP_IPADDR 4 /* uc */
550#define SNMP_COUNTER 5 /* ul */
551#define SNMP_GAUGE 6 /* ul */
552#define SNMP_TIMETICKS 7 /* ul */
553#define SNMP_OPAQUE 8 /* c */
554
555/* Additional SNMPv2 Types */
556#define SNMP_UINTEGER 5 /* ul */
557#define SNMP_BITSTR 9 /* uc */
558#define SNMP_NSAP 10 /* uc */
559#define SNMP_COUNTER64 11 /* ul */
560#define SNMP_NOSUCHOBJECT 12
561#define SNMP_NOSUCHINSTANCE 13
562#define SNMP_ENDOFMIBVIEW 14
563
564union snmp_syntax
565{
566 unsigned char uc[0]; /* 8 bit unsigned */
567 char c[0]; /* 8 bit signed */
568 unsigned long ul[0]; /* 32 bit unsigned */
569 long l[0]; /* 32 bit signed */
570};
571
572struct snmp_object
573{
574 unsigned long *id;
575 unsigned int id_len;
576 unsigned short type;
577 unsigned int syntax_len;
578 union snmp_syntax syntax;
579};
580
581struct snmp_request
582{
583 unsigned long id;
584 unsigned int error_status;
585 unsigned int error_index;
586};
587
588struct snmp_v1_trap
589{
590 unsigned long *id;
591 unsigned int id_len;
592 unsigned long ip_address; /* pointer */
593 unsigned int general;
594 unsigned int specific;
595 unsigned long time;
596};
597
598/* SNMP types */
599#define SNMP_IPA 0
600#define SNMP_CNT 1
601#define SNMP_GGE 2
602#define SNMP_TIT 3
603#define SNMP_OPQ 4
604#define SNMP_C64 6
605
606/* SNMP errors */
607#define SERR_NSO 0
608#define SERR_NSI 1
609#define SERR_EOM 2
610
611static inline void mangle_address(unsigned char *begin,
612 unsigned char *addr,
613 const struct oct1_map *map,
614 u_int16_t *check);
615struct snmp_cnv
616{
617 unsigned int class;
618 unsigned int tag;
619 int syntax;
620};
621
622static struct snmp_cnv snmp_conv [] =
623{
624 {ASN1_UNI, ASN1_NUL, SNMP_NULL},
625 {ASN1_UNI, ASN1_INT, SNMP_INTEGER},
626 {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR},
627 {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR},
628 {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID},
629 {ASN1_APL, SNMP_IPA, SNMP_IPADDR},
630 {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */
631 {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */
632 {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS},
633 {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE},
634
635 /* SNMPv2 data types and errors */
636 {ASN1_UNI, ASN1_BTS, SNMP_BITSTR},
637 {ASN1_APL, SNMP_C64, SNMP_COUNTER64},
638 {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT},
639 {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE},
640 {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW},
641 {0, 0, -1}
642};
643
644static unsigned char snmp_tag_cls2syntax(unsigned int tag,
645 unsigned int cls,
646 unsigned short *syntax)
647{
648 struct snmp_cnv *cnv;
649
650 cnv = snmp_conv;
651
652 while (cnv->syntax != -1) {
653 if (cnv->tag == tag && cnv->class == cls) {
654 *syntax = cnv->syntax;
655 return 1;
656 }
657 cnv++;
658 }
659 return 0;
660}
661
662static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
663 struct snmp_object **obj)
664{
665 unsigned int cls, con, tag, len, idlen;
666 unsigned short type;
667 unsigned char *eoc, *end, *p;
668 unsigned long *lp, *id;
669 unsigned long ul;
670 long l;
671
672 *obj = NULL;
673 id = NULL;
674
675 if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag))
676 return 0;
677
678 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
679 return 0;
680
681 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
682 return 0;
683
684 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
685 return 0;
686
687 if (!asn1_oid_decode(ctx, end, &id, &idlen))
688 return 0;
689
690 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) {
691 kfree(id);
692 return 0;
693 }
694
695 if (con != ASN1_PRI) {
696 kfree(id);
697 return 0;
698 }
699
700 if (!snmp_tag_cls2syntax(tag, cls, &type)) {
701 kfree(id);
702 return 0;
703 }
704
705 switch (type) {
706 case SNMP_INTEGER:
707 len = sizeof(long);
708 if (!asn1_long_decode(ctx, end, &l)) {
709 kfree(id);
710 return 0;
711 }
712 *obj = kmalloc(sizeof(struct snmp_object) + len,
713 GFP_ATOMIC);
714 if (*obj == NULL) {
715 kfree(id);
716 if (net_ratelimit())
717 printk("OOM in bsalg (%d)\n", __LINE__);
718 return 0;
719 }
720 (*obj)->syntax.l[0] = l;
721 break;
722 case SNMP_OCTETSTR:
723 case SNMP_OPAQUE:
724 if (!asn1_octets_decode(ctx, end, &p, &len)) {
725 kfree(id);
726 return 0;
727 }
728 *obj = kmalloc(sizeof(struct snmp_object) + len,
729 GFP_ATOMIC);
730 if (*obj == NULL) {
731 kfree(id);
732 if (net_ratelimit())
733 printk("OOM in bsalg (%d)\n", __LINE__);
734 return 0;
735 }
736 memcpy((*obj)->syntax.c, p, len);
737 kfree(p);
738 break;
739 case SNMP_NULL:
740 case SNMP_NOSUCHOBJECT:
741 case SNMP_NOSUCHINSTANCE:
742 case SNMP_ENDOFMIBVIEW:
743 len = 0;
744 *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
745 if (*obj == NULL) {
746 kfree(id);
747 if (net_ratelimit())
748 printk("OOM in bsalg (%d)\n", __LINE__);
749 return 0;
750 }
751 if (!asn1_null_decode(ctx, end)) {
752 kfree(id);
753 kfree(*obj);
754 *obj = NULL;
755 return 0;
756 }
757 break;
758 case SNMP_OBJECTID:
759 if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
760 kfree(id);
761 return 0;
762 }
763 len *= sizeof(unsigned long);
764 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
765 if (*obj == NULL) {
766 kfree(id);
767 if (net_ratelimit())
768 printk("OOM in bsalg (%d)\n", __LINE__);
769 return 0;
770 }
771 memcpy((*obj)->syntax.ul, lp, len);
772 kfree(lp);
773 break;
774 case SNMP_IPADDR:
775 if (!asn1_octets_decode(ctx, end, &p, &len)) {
776 kfree(id);
777 return 0;
778 }
779 if (len != 4) {
780 kfree(p);
781 kfree(id);
782 return 0;
783 }
784 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
785 if (*obj == NULL) {
786 kfree(p);
787 kfree(id);
788 if (net_ratelimit())
789 printk("OOM in bsalg (%d)\n", __LINE__);
790 return 0;
791 }
792 memcpy((*obj)->syntax.uc, p, len);
793 kfree(p);
794 break;
795 case SNMP_COUNTER:
796 case SNMP_GAUGE:
797 case SNMP_TIMETICKS:
798 len = sizeof(unsigned long);
799 if (!asn1_ulong_decode(ctx, end, &ul)) {
800 kfree(id);
801 return 0;
802 }
803 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
804 if (*obj == NULL) {
805 kfree(id);
806 if (net_ratelimit())
807 printk("OOM in bsalg (%d)\n", __LINE__);
808 return 0;
809 }
810 (*obj)->syntax.ul[0] = ul;
811 break;
812 default:
813 kfree(id);
814 return 0;
815 }
816
817 (*obj)->syntax_len = len;
818 (*obj)->type = type;
819 (*obj)->id = id;
820 (*obj)->id_len = idlen;
821
822 if (!asn1_eoc_decode(ctx, eoc)) {
823 kfree(id);
824 kfree(*obj);
825 *obj = NULL;
826 return 0;
827 }
828 return 1;
829}
830
831static unsigned char snmp_request_decode(struct asn1_ctx *ctx,
832 struct snmp_request *request)
833{
834 unsigned int cls, con, tag;
835 unsigned char *end;
836
837 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
838 return 0;
839
840 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
841 return 0;
842
843 if (!asn1_ulong_decode(ctx, end, &request->id))
844 return 0;
845
846 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
847 return 0;
848
849 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
850 return 0;
851
852 if (!asn1_uint_decode(ctx, end, &request->error_status))
853 return 0;
854
855 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
856 return 0;
857
858 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
859 return 0;
860
861 if (!asn1_uint_decode(ctx, end, &request->error_index))
862 return 0;
863
864 return 1;
865}
866
867/*
868 * Fast checksum update for possibly oddly-aligned UDP byte, from the
869 * code example in the draft.
870 */
871static void fast_csum(unsigned char *csum,
872 const unsigned char *optr,
873 const unsigned char *nptr,
874 int odd)
875{
876 long x, old, new;
877
878 x = csum[0] * 256 + csum[1];
879
880 x = ~x & 0xFFFF;
881
882 if (odd) old = optr[0] * 256;
883 else old = optr[0];
884
885 x -= old & 0xFFFF;
886 if (x <= 0) {
887 x--;
888 x &= 0xFFFF;
889 }
890
891 if (odd) new = nptr[0] * 256;
892 else new = nptr[0];
893
894 x += new & 0xFFFF;
895 if (x & 0x10000) {
896 x++;
897 x &= 0xFFFF;
898 }
899
900 x = ~x & 0xFFFF;
901 csum[0] = x / 256;
902 csum[1] = x & 0xFF;
903}
904
905/*
906 * Mangle IP address.
907 * - begin points to the start of the snmp message
908 * - addr points to the start of the address
909 */
910static inline void mangle_address(unsigned char *begin,
911 unsigned char *addr,
912 const struct oct1_map *map,
913 u_int16_t *check)
914{
915 if (map->from == NOCT1(*addr)) {
916 u_int32_t old;
917
918 if (debug)
919 memcpy(&old, (unsigned char *)addr, sizeof(old));
920
921 *addr = map->to;
922
923 /* Update UDP checksum if being used */
924 if (*check) {
925 unsigned char odd = !((addr - begin) % 2);
926
927 fast_csum((unsigned char *)check,
928 &map->from, &map->to, odd);
929
930 }
931
932 if (debug)
933 printk(KERN_DEBUG "bsalg: mapped %u.%u.%u.%u to "
934 "%u.%u.%u.%u\n", NIPQUAD(old), NIPQUAD(*addr));
935 }
936}
937
938static unsigned char snmp_trap_decode(struct asn1_ctx *ctx,
939 struct snmp_v1_trap *trap,
940 const struct oct1_map *map,
941 u_int16_t *check)
942{
943 unsigned int cls, con, tag, len;
944 unsigned char *end;
945
946 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
947 return 0;
948
949 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
950 return 0;
951
952 if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len))
953 return 0;
954
955 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
956 goto err_id_free;
957
958 if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) ||
959 (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS)))
960 goto err_id_free;
961
962 if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len))
963 goto err_id_free;
964
965 /* IPv4 only */
966 if (len != 4)
967 goto err_addr_free;
968
969 mangle_address(ctx->begin, ctx->pointer - 4, map, check);
970
971 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
972 goto err_addr_free;
973
974 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
975 goto err_addr_free;
976
977 if (!asn1_uint_decode(ctx, end, &trap->general))
978 goto err_addr_free;
979
980 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
981 goto err_addr_free;
982
983 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
984 goto err_addr_free;
985
986 if (!asn1_uint_decode(ctx, end, &trap->specific))
987 goto err_addr_free;
988
989 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
990 goto err_addr_free;
991
992 if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) ||
993 (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT)))
994 goto err_addr_free;
995
996 if (!asn1_ulong_decode(ctx, end, &trap->time))
997 goto err_addr_free;
998
999 return 1;
1000
1001err_id_free:
1002 kfree(trap->id);
1003
1004err_addr_free:
1005 kfree((unsigned long *)trap->ip_address);
1006
1007 return 0;
1008}
1009
1010/*****************************************************************************
1011 *
1012 * Misc. routines
1013 *
1014 *****************************************************************************/
1015
1016static void hex_dump(unsigned char *buf, size_t len)
1017{
1018 size_t i;
1019
1020 for (i = 0; i < len; i++) {
1021 if (i && !(i % 16))
1022 printk("\n");
1023 printk("%02x ", *(buf + i));
1024 }
1025 printk("\n");
1026}
1027
1028/*
1029 * Parse and mangle SNMP message according to mapping.
1030 * (And this is the 'basic' method).
1031 */
1032static int snmp_parse_mangle(unsigned char *msg,
1033 u_int16_t len,
1034 const struct oct1_map *map,
1035 u_int16_t *check)
1036{
1037 unsigned char *eoc, *end;
1038 unsigned int cls, con, tag, vers, pdutype;
1039 struct asn1_ctx ctx;
1040 struct asn1_octstr comm;
1041 struct snmp_object **obj;
1042
1043 if (debug > 1)
1044 hex_dump(msg, len);
1045
1046 asn1_open(&ctx, msg, len);
1047
1048 /*
1049 * Start of SNMP message.
1050 */
1051 if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
1052 return 0;
1053 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
1054 return 0;
1055
1056 /*
1057 * Version 1 or 2 handled.
1058 */
1059 if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag))
1060 return 0;
1061 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
1062 return 0;
1063 if (!asn1_uint_decode (&ctx, end, &vers))
1064 return 0;
1065 if (debug > 1)
1066 printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1);
1067 if (vers > 1)
1068 return 1;
1069
1070 /*
1071 * Community.
1072 */
1073 if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag))
1074 return 0;
1075 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS)
1076 return 0;
1077 if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len))
1078 return 0;
1079 if (debug > 1) {
1080 unsigned int i;
1081
1082 printk(KERN_DEBUG "bsalg: community: ");
1083 for (i = 0; i < comm.len; i++)
1084 printk("%c", comm.data[i]);
1085 printk("\n");
1086 }
1087 kfree(comm.data);
1088
1089 /*
1090 * PDU type
1091 */
1092 if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype))
1093 return 0;
1094 if (cls != ASN1_CTX || con != ASN1_CON)
1095 return 0;
1096 if (debug > 1) {
1097 unsigned char *pdus[] = {
1098 [SNMP_PDU_GET] = "get",
1099 [SNMP_PDU_NEXT] = "get-next",
1100 [SNMP_PDU_RESPONSE] = "response",
1101 [SNMP_PDU_SET] = "set",
1102 [SNMP_PDU_TRAP1] = "trapv1",
1103 [SNMP_PDU_BULK] = "bulk",
1104 [SNMP_PDU_INFORM] = "inform",
1105 [SNMP_PDU_TRAP2] = "trapv2"
1106 };
1107
1108 if (pdutype > SNMP_PDU_TRAP2)
1109 printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype);
1110 else
1111 printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]);
1112 }
1113 if (pdutype != SNMP_PDU_RESPONSE &&
1114 pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
1115 return 1;
1116
1117 /*
1118 * Request header or v1 trap
1119 */
1120 if (pdutype == SNMP_PDU_TRAP1) {
1121 struct snmp_v1_trap trap;
1122 unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check);
1123
1124 /* Discard trap allocations regardless */
1125 kfree(trap.id);
1126 kfree((unsigned long *)trap.ip_address);
1127
1128 if (!ret)
1129 return ret;
1130
1131 } else {
1132 struct snmp_request req;
1133
1134 if (!snmp_request_decode(&ctx, &req))
1135 return 0;
1136
1137 if (debug > 1)
1138 printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u "
1139 "error_index=%u\n", req.id, req.error_status,
1140 req.error_index);
1141 }
1142
1143 /*
1144 * Loop through objects, look for IP addresses to mangle.
1145 */
1146 if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
1147 return 0;
1148
1149 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
1150 return 0;
1151
1152 obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
1153 if (obj == NULL) {
1154 if (net_ratelimit())
1155 printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__);
1156 return 0;
1157 }
1158
1159 while (!asn1_eoc_decode(&ctx, eoc)) {
1160 unsigned int i;
1161
1162 if (!snmp_object_decode(&ctx, obj)) {
1163 if (*obj) {
1164 if ((*obj)->id)
1165 kfree((*obj)->id);
1166 kfree(*obj);
1167 }
1168 kfree(obj);
1169 return 0;
1170 }
1171
1172 if (debug > 1) {
1173 printk(KERN_DEBUG "bsalg: object: ");
1174 for (i = 0; i < (*obj)->id_len; i++) {
1175 if (i > 0)
1176 printk(".");
1177 printk("%lu", (*obj)->id[i]);
1178 }
1179 printk(": type=%u\n", (*obj)->type);
1180
1181 }
1182
1183 if ((*obj)->type == SNMP_IPADDR)
1184 mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
1185
1186 kfree((*obj)->id);
1187 kfree(*obj);
1188 }
1189 kfree(obj);
1190
1191 if (!asn1_eoc_decode(&ctx, eoc))
1192 return 0;
1193
1194 return 1;
1195}
1196
1197/*****************************************************************************
1198 *
1199 * NAT routines.
1200 *
1201 *****************************************************************************/
1202
1203/*
1204 * SNMP translation routine.
1205 */
1206static int snmp_translate(struct ip_conntrack *ct,
1207 enum ip_conntrack_info ctinfo,
1208 struct sk_buff **pskb)
1209{
1210 struct iphdr *iph = (*pskb)->nh.iph;
1211 struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl);
1212 u_int16_t udplen = ntohs(udph->len);
1213 u_int16_t paylen = udplen - sizeof(struct udphdr);
1214 int dir = CTINFO2DIR(ctinfo);
1215 struct oct1_map map;
1216
1217 /*
1218 * Determine mapping for application layer addresses based
1219 * on NAT manipulations for the packet.
1220 */
1221 if (dir == IP_CT_DIR_ORIGINAL) {
1222 /* SNAT traps */
1223 map.from = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip);
1224 map.to = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip);
1225 } else {
1226 /* DNAT replies */
1227 map.from = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
1228 map.to = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip);
1229 }
1230
1231 if (map.from == map.to)
1232 return NF_ACCEPT;
1233
1234 if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
1235 paylen, &map, &udph->check)) {
1236 if (net_ratelimit())
1237 printk(KERN_WARNING "bsalg: parser failed\n");
1238 return NF_DROP;
1239 }
1240 return NF_ACCEPT;
1241}
1242
1243/* We don't actually set up expectations, just adjust internal IP
1244 * addresses if this is being NATted */
1245static int help(struct sk_buff **pskb,
1246 struct ip_conntrack *ct,
1247 enum ip_conntrack_info ctinfo)
1248{
1249 int dir = CTINFO2DIR(ctinfo);
1250 unsigned int ret;
1251 struct iphdr *iph = (*pskb)->nh.iph;
1252 struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl);
1253
1254 /* SNMP replies and originating SNMP traps get mangled */
1255 if (udph->source == ntohs(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
1256 return NF_ACCEPT;
1257 if (udph->dest == ntohs(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
1258 return NF_ACCEPT;
1259
1260 /* No NAT? */
1261 if (!(ct->status & IPS_NAT_MASK))
1262 return NF_ACCEPT;
1263
1264 /*
1265 * Make sure the packet length is ok. So far, we were only guaranteed
1266 * to have a valid length IP header plus 8 bytes, which means we have
1267 * enough room for a UDP header. Just verify the UDP length field so we
1268 * can mess around with the payload.
1269 */
1270 if (ntohs(udph->len) != (*pskb)->len - (iph->ihl << 2)) {
1271 if (net_ratelimit())
1272 printk(KERN_WARNING "SNMP: dropping malformed packet "
1273 "src=%u.%u.%u.%u dst=%u.%u.%u.%u\n",
1274 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
1275 return NF_DROP;
1276 }
1277
1278 if (!skb_ip_make_writable(pskb, (*pskb)->len))
1279 return NF_DROP;
1280
1281 spin_lock_bh(&snmp_lock);
1282 ret = snmp_translate(ct, ctinfo, pskb);
1283 spin_unlock_bh(&snmp_lock);
1284 return ret;
1285}
1286
1287static struct ip_conntrack_helper snmp_helper = {
1288 .max_expected = 0,
1289 .timeout = 180,
1290 .me = THIS_MODULE,
1291 .help = help,
1292 .name = "snmp",
1293
1294 .tuple = { .src = { .u = { __constant_htons(SNMP_PORT) } },
1295 .dst = { .protonum = IPPROTO_UDP },
1296 },
1297 .mask = { .src = { .u = { 0xFFFF } },
1298 .dst = { .protonum = 0xFF },
1299 },
1300};
1301
1302static struct ip_conntrack_helper snmp_trap_helper = {
1303 .max_expected = 0,
1304 .timeout = 180,
1305 .me = THIS_MODULE,
1306 .help = help,
1307 .name = "snmp_trap",
1308
1309 .tuple = { .src = { .u = { __constant_htons(SNMP_TRAP_PORT) } },
1310 .dst = { .protonum = IPPROTO_UDP },
1311 },
1312 .mask = { .src = { .u = { 0xFFFF } },
1313 .dst = { .protonum = 0xFF },
1314 },
1315};
1316
1317/*****************************************************************************
1318 *
1319 * Module stuff.
1320 *
1321 *****************************************************************************/
1322
1323static int __init init(void)
1324{
1325 int ret = 0;
1326
1327 ret = ip_conntrack_helper_register(&snmp_helper);
1328 if (ret < 0)
1329 return ret;
1330 ret = ip_conntrack_helper_register(&snmp_trap_helper);
1331 if (ret < 0) {
1332 ip_conntrack_helper_unregister(&snmp_helper);
1333 return ret;
1334 }
1335 return ret;
1336}
1337
1338static void __exit fini(void)
1339{
1340 ip_conntrack_helper_unregister(&snmp_helper);
1341 ip_conntrack_helper_unregister(&snmp_trap_helper);
1342}
1343
1344module_init(init);
1345module_exit(fini);
1346
1347module_param(debug, bool, 0600);
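The helper above rewrites a single payload octet and then patches the UDP checksum incrementally via fast_csum() rather than recomputing it over the whole datagram. The stand-alone user-space sketch below is an illustration added for clarity (it is not part of the kernel file): it performs the same incremental one's-complement update in the RFC 1624 form HC' = ~(~HC + ~m + m') and cross-checks the result against a full recomputation.

#include <stdio.h>
#include <stdint.h>

/* Full 16-bit one's-complement checksum of an even-length buffer. */
static uint16_t csum16(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < len; i += 2)
		sum += (uint32_t)((buf[i] << 8) | buf[i + 1]);
	while (sum >> 16)
		sum = (sum & 0xFFFF) + (sum >> 16);
	return (uint16_t)~sum;
}

/* Incremental update when one 16-bit word changes (RFC 1624, eqn. 3):
 * HC' = ~(~HC + ~m + m'). */
static uint16_t csum16_update(uint16_t old_csum, uint16_t old_word,
			      uint16_t new_word)
{
	uint32_t sum = (uint16_t)~old_csum;

	sum += (uint16_t)~old_word;
	sum += new_word;
	while (sum >> 16)
		sum = (sum & 0xFFFF) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* Toy "payload": pretend byte 0 is the first octet of an embedded
	 * IP address that NAT remaps from 10.x.x.x to 192.x.x.x. */
	uint8_t msg[8] = { 0x0a, 0x01, 0x02, 0x03, 0x30, 0x82, 0x00, 0x04 };
	uint16_t before = csum16(msg, sizeof(msg));
	uint16_t old_word = (uint16_t)((msg[0] << 8) | msg[1]);

	msg[0] = 0xc0;	/* 10 -> 192, as an oct1_map would do */

	printf("incremental=%04x full=%04x\n",
	       csum16_update(before, old_word, (uint16_t)((msg[0] << 8) | msg[1])),
	       csum16(msg, sizeof(msg)));
	return 0;
}

Both printed values should match. In the kernel, fast_csum() additionally has to cope with the changed byte landing in either half of its 16-bit checksum word, which is what its odd parameter selects.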
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
new file mode 100644
index 000000000000..dec4a74212cd
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -0,0 +1,349 @@
1/* This file contains all the functions required for the standalone
2 ip_nat module.
3
4 These are not required by the compatibility layer.
5*/
6
7/* (C) 1999-2001 Paul `Rusty' Russell
8 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 */
14
15/*
16 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
17 * - new API and handling of conntrack/nat helpers
18 * - now capable of multiple expectations for one master
19 * */
20
21#include <linux/config.h>
22#include <linux/types.h>
23#include <linux/icmp.h>
24#include <linux/ip.h>
25#include <linux/netfilter.h>
26#include <linux/netfilter_ipv4.h>
27#include <linux/module.h>
28#include <linux/skbuff.h>
29#include <linux/proc_fs.h>
30#include <net/ip.h>
31#include <net/checksum.h>
32#include <linux/spinlock.h>
33
34#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
35#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
36
37#include <linux/netfilter_ipv4/ip_nat.h>
38#include <linux/netfilter_ipv4/ip_nat_rule.h>
39#include <linux/netfilter_ipv4/ip_nat_protocol.h>
40#include <linux/netfilter_ipv4/ip_nat_core.h>
41#include <linux/netfilter_ipv4/ip_nat_helper.h>
42#include <linux/netfilter_ipv4/ip_tables.h>
43#include <linux/netfilter_ipv4/ip_conntrack_core.h>
44#include <linux/netfilter_ipv4/listhelp.h>
45
46#if 0
47#define DEBUGP printk
48#else
49#define DEBUGP(format, args...)
50#endif
51
52#define HOOKNAME(hooknum) ((hooknum) == NF_IP_POST_ROUTING ? "POST_ROUTING" \
53 : ((hooknum) == NF_IP_PRE_ROUTING ? "PRE_ROUTING" \
54 : ((hooknum) == NF_IP_LOCAL_OUT ? "LOCAL_OUT" \
55 : ((hooknum) == NF_IP_LOCAL_IN ? "LOCAL_IN" \
56 : "*ERROR*")))
57
58static unsigned int
59ip_nat_fn(unsigned int hooknum,
60 struct sk_buff **pskb,
61 const struct net_device *in,
62 const struct net_device *out,
63 int (*okfn)(struct sk_buff *))
64{
65 struct ip_conntrack *ct;
66 enum ip_conntrack_info ctinfo;
67 struct ip_nat_info *info;
68 /* maniptype == SRC for postrouting. */
69 enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
70
71 /* We never see fragments: conntrack defrags on pre-routing
72 and local-out, and ip_nat_out protects post-routing. */
73 IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
74 & htons(IP_MF|IP_OFFSET)));
75
76 (*pskb)->nfcache |= NFC_UNKNOWN;
77
78 /* If we had a hardware checksum before, it's now invalid */
79 if ((*pskb)->ip_summed == CHECKSUM_HW)
80 if (skb_checksum_help(*pskb, (out == NULL)))
81 return NF_DROP;
82
83 ct = ip_conntrack_get(*pskb, &ctinfo);
84 /* Can't track? It's not due to stress, or conntrack would
85 have dropped it. Hence it's the user's responsibility to
86 packet filter it out, or implement conntrack/NAT for that
87 protocol. 8) --RR */
88 if (!ct) {
89 /* Exception: ICMP redirect to new connection (not in
90 hash table yet). We must not let this through, in
91 case we're doing NAT to the same network. */
92 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
93 struct icmphdr _hdr, *hp;
94
95 hp = skb_header_pointer(*pskb,
96 (*pskb)->nh.iph->ihl*4,
97 sizeof(_hdr), &_hdr);
98 if (hp != NULL &&
99 hp->type == ICMP_REDIRECT)
100 return NF_DROP;
101 }
102 return NF_ACCEPT;
103 }
104
105 switch (ctinfo) {
106 case IP_CT_RELATED:
107 case IP_CT_RELATED+IP_CT_IS_REPLY:
108 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
109 if (!icmp_reply_translation(pskb, ct, maniptype,
110 CTINFO2DIR(ctinfo)))
111 return NF_DROP;
112 else
113 return NF_ACCEPT;
114 }
115 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
116 case IP_CT_NEW:
117 info = &ct->nat.info;
118
119 /* Seen it before? This can happen for loopback, retrans,
120 or local packets.. */
121 if (!ip_nat_initialized(ct, maniptype)) {
122 unsigned int ret;
123
124 /* LOCAL_IN hook doesn't have a chain! */
125 if (hooknum == NF_IP_LOCAL_IN)
126 ret = alloc_null_binding(ct, info, hooknum);
127 else
128 ret = ip_nat_rule_find(pskb, hooknum,
129 in, out, ct,
130 info);
131
132 if (ret != NF_ACCEPT) {
133 return ret;
134 }
135 } else
136 DEBUGP("Already setup manip %s for ct %p\n",
137 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
138 ct);
139 break;
140
141 default:
142 /* ESTABLISHED */
143 IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED
144 || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY));
145 info = &ct->nat.info;
146 }
147
148 IP_NF_ASSERT(info);
149 return nat_packet(ct, ctinfo, hooknum, pskb);
150}
151
152static unsigned int
153ip_nat_in(unsigned int hooknum,
154 struct sk_buff **pskb,
155 const struct net_device *in,
156 const struct net_device *out,
157 int (*okfn)(struct sk_buff *))
158{
159 u_int32_t saddr, daddr;
160 unsigned int ret;
161
162 saddr = (*pskb)->nh.iph->saddr;
163 daddr = (*pskb)->nh.iph->daddr;
164
165 ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
166 if (ret != NF_DROP && ret != NF_STOLEN
167 && ((*pskb)->nh.iph->saddr != saddr
168 || (*pskb)->nh.iph->daddr != daddr)) {
169 dst_release((*pskb)->dst);
170 (*pskb)->dst = NULL;
171 }
172 return ret;
173}
174
175static unsigned int
176ip_nat_out(unsigned int hooknum,
177 struct sk_buff **pskb,
178 const struct net_device *in,
179 const struct net_device *out,
180 int (*okfn)(struct sk_buff *))
181{
182 /* root is playing with raw sockets. */
183 if ((*pskb)->len < sizeof(struct iphdr)
184 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
185 return NF_ACCEPT;
186
187 /* We can hit fragment here; forwarded packets get
188 defragmented by connection tracking coming in, then
189 fragmented (grr) by the forward code.
190
191 In future: If we have nfct != NULL, AND we have NAT
192 initialized, AND there is no helper, then we can do full
193 NAPT on the head, and IP-address-only NAT on the rest.
194
195 I'm starting to have nightmares about fragments. */
196
197 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
198 *pskb = ip_ct_gather_frags(*pskb, IP_DEFRAG_NAT_OUT);
199
200 if (!*pskb)
201 return NF_STOLEN;
202 }
203
204 return ip_nat_fn(hooknum, pskb, in, out, okfn);
205}
206
207static unsigned int
208ip_nat_local_fn(unsigned int hooknum,
209 struct sk_buff **pskb,
210 const struct net_device *in,
211 const struct net_device *out,
212 int (*okfn)(struct sk_buff *))
213{
214 u_int32_t saddr, daddr;
215 unsigned int ret;
216
217 /* root is playing with raw sockets. */
218 if ((*pskb)->len < sizeof(struct iphdr)
219 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
220 return NF_ACCEPT;
221
222 saddr = (*pskb)->nh.iph->saddr;
223 daddr = (*pskb)->nh.iph->daddr;
224
225 ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
226 if (ret != NF_DROP && ret != NF_STOLEN
227 && ((*pskb)->nh.iph->saddr != saddr
228 || (*pskb)->nh.iph->daddr != daddr))
229 return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP;
230 return ret;
231}
232
233/* We must be after connection tracking and before packet filtering. */
234
235/* Before packet filtering, change destination */
236static struct nf_hook_ops ip_nat_in_ops = {
237 .hook = ip_nat_in,
238 .owner = THIS_MODULE,
239 .pf = PF_INET,
240 .hooknum = NF_IP_PRE_ROUTING,
241 .priority = NF_IP_PRI_NAT_DST,
242};
243
244/* After packet filtering, change source */
245static struct nf_hook_ops ip_nat_out_ops = {
246 .hook = ip_nat_out,
247 .owner = THIS_MODULE,
248 .pf = PF_INET,
249 .hooknum = NF_IP_POST_ROUTING,
250 .priority = NF_IP_PRI_NAT_SRC,
251};
252
253/* Before packet filtering, change destination */
254static struct nf_hook_ops ip_nat_local_out_ops = {
255 .hook = ip_nat_local_fn,
256 .owner = THIS_MODULE,
257 .pf = PF_INET,
258 .hooknum = NF_IP_LOCAL_OUT,
259 .priority = NF_IP_PRI_NAT_DST,
260};
261
262/* After packet filtering, change source for reply packets of LOCAL_OUT DNAT */
263static struct nf_hook_ops ip_nat_local_in_ops = {
264 .hook = ip_nat_fn,
265 .owner = THIS_MODULE,
266 .pf = PF_INET,
267 .hooknum = NF_IP_LOCAL_IN,
268 .priority = NF_IP_PRI_NAT_SRC,
269};
270
271static int init_or_cleanup(int init)
272{
273 int ret = 0;
274
275 need_ip_conntrack();
276
277 if (!init) goto cleanup;
278
279 ret = ip_nat_rule_init();
280 if (ret < 0) {
281 printk("ip_nat_init: can't setup rules.\n");
282 goto cleanup_nothing;
283 }
284 ret = ip_nat_init();
285 if (ret < 0) {
286 printk("ip_nat_init: can't setup nat.\n");
287 goto cleanup_rule_init;
288 }
289 ret = nf_register_hook(&ip_nat_in_ops);
290 if (ret < 0) {
291 printk("ip_nat_init: can't register in hook.\n");
292 goto cleanup_nat;
293 }
294 ret = nf_register_hook(&ip_nat_out_ops);
295 if (ret < 0) {
296 printk("ip_nat_init: can't register out hook.\n");
297 goto cleanup_inops;
298 }
299 ret = nf_register_hook(&ip_nat_local_out_ops);
300 if (ret < 0) {
301 printk("ip_nat_init: can't register local out hook.\n");
302 goto cleanup_outops;
303 }
304 ret = nf_register_hook(&ip_nat_local_in_ops);
305 if (ret < 0) {
306 printk("ip_nat_init: can't register local in hook.\n");
307 goto cleanup_localoutops;
308 }
309 return ret;
310
311 cleanup:
312 nf_unregister_hook(&ip_nat_local_in_ops);
313 cleanup_localoutops:
314 nf_unregister_hook(&ip_nat_local_out_ops);
315 cleanup_outops:
316 nf_unregister_hook(&ip_nat_out_ops);
317 cleanup_inops:
318 nf_unregister_hook(&ip_nat_in_ops);
319 cleanup_nat:
320 ip_nat_cleanup();
321 cleanup_rule_init:
322 ip_nat_rule_cleanup();
323 cleanup_nothing:
324 MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock);
325 return ret;
326}
327
328static int __init init(void)
329{
330 return init_or_cleanup(1);
331}
332
333static void __exit fini(void)
334{
335 init_or_cleanup(0);
336}
337
338module_init(init);
339module_exit(fini);
340
341EXPORT_SYMBOL(ip_nat_setup_info);
342EXPORT_SYMBOL(ip_nat_protocol_register);
343EXPORT_SYMBOL(ip_nat_protocol_unregister);
344EXPORT_SYMBOL(ip_nat_cheat_check);
345EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
346EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
347EXPORT_SYMBOL(ip_nat_used_tuple);
348EXPORT_SYMBOL(ip_nat_follow_master);
349MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
new file mode 100644
index 000000000000..0343e0d64674
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_tftp.c
@@ -0,0 +1,70 @@
1/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 as
5 * published by the Free Software Foundation.
6 *
7 * Version: 0.0.7
8 *
9 * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
10 * - Port to newnat API
11 *
12 * This module currently supports DNAT:
13 * iptables -t nat -A PREROUTING -d x.x.x.x -j DNAT --to-dest x.x.x.y
14 *
15 * and SNAT:
16 * iptables -t nat -A POSTROUTING { -j MASQUERADE , -j SNAT --to-source x.x.x.x }
17 *
18 * It has not been tested with
19 * -j SNAT --to-source x.x.x.x-x.x.x.y since I only have one external ip
20 * If you do test this please let me know if it works or not.
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/netfilter_ipv4.h>
26#include <linux/ip.h>
27#include <linux/udp.h>
28
29#include <linux/netfilter.h>
30#include <linux/netfilter_ipv4/ip_tables.h>
31#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
32#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
33#include <linux/netfilter_ipv4/ip_nat_helper.h>
34#include <linux/netfilter_ipv4/ip_nat_rule.h>
35#include <linux/moduleparam.h>
36
37MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
38MODULE_DESCRIPTION("tftp NAT helper");
39MODULE_LICENSE("GPL");
40
41static unsigned int help(struct sk_buff **pskb,
42 enum ip_conntrack_info ctinfo,
43 struct ip_conntrack_expect *exp)
44{
45 exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port;
46 exp->dir = IP_CT_DIR_REPLY;
47 exp->expectfn = ip_nat_follow_master;
48 if (ip_conntrack_expect_related(exp) != 0) {
49 ip_conntrack_expect_free(exp);
50 return NF_DROP;
51 }
52 return NF_ACCEPT;
53}
54
55static void __exit fini(void)
56{
57 ip_nat_tftp_hook = NULL;
58 /* Make sure no one calls it meanwhile. */
59 synchronize_net();
60}
61
62static int __init init(void)
63{
64 BUG_ON(ip_nat_tftp_hook);
65 ip_nat_tftp_hook = help;
66 return 0;
67}
68
69module_init(init);
70module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
new file mode 100644
index 000000000000..9e40dffc204f
--- /dev/null
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -0,0 +1,741 @@
1/*
2 * This is a module which is used for queueing IPv4 packets and
3 * communicating with userspace via netlink.
4 *
5 * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * 2000-03-27: Simplified code (thanks to Andi Kleen for clues).
12 * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report).
13 * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian
14 * Zander).
15 * 2000-08-01: Added Nick Williams' MAC support.
16 * 2002-06-25: Code cleanup.
17 * 2005-01-10: Added /proc counter for dropped packets; fixed so
18 * packets aren't delivered to user space if they're going
19 * to be dropped.
20 *
21 */
22#include <linux/module.h>
23#include <linux/skbuff.h>
24#include <linux/init.h>
25#include <linux/ip.h>
26#include <linux/notifier.h>
27#include <linux/netdevice.h>
28#include <linux/netfilter.h>
29#include <linux/netfilter_ipv4/ip_queue.h>
30#include <linux/netfilter_ipv4/ip_tables.h>
31#include <linux/netlink.h>
32#include <linux/spinlock.h>
33#include <linux/sysctl.h>
34#include <linux/proc_fs.h>
35#include <linux/security.h>
36#include <net/sock.h>
37#include <net/route.h>
38
39#define IPQ_QMAX_DEFAULT 1024
40#define IPQ_PROC_FS_NAME "ip_queue"
41#define NET_IPQ_QMAX 2088
42#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
43
44struct ipq_rt_info {
45 __u8 tos;
46 __u32 daddr;
47 __u32 saddr;
48};
49
50struct ipq_queue_entry {
51 struct list_head list;
52 struct nf_info *info;
53 struct sk_buff *skb;
54 struct ipq_rt_info rt_info;
55};
56
57typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
58
59static unsigned char copy_mode = IPQ_COPY_NONE;
60static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT;
61static DEFINE_RWLOCK(queue_lock);
62static int peer_pid;
63static unsigned int copy_range;
64static unsigned int queue_total;
65static unsigned int queue_dropped = 0;
66static unsigned int queue_user_dropped = 0;
67static struct sock *ipqnl;
68static LIST_HEAD(queue_list);
69static DECLARE_MUTEX(ipqnl_sem);
70
71static void
72ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
73{
74 nf_reinject(entry->skb, entry->info, verdict);
75 kfree(entry);
76}
77
78static inline void
79__ipq_enqueue_entry(struct ipq_queue_entry *entry)
80{
81 list_add(&entry->list, &queue_list);
82 queue_total++;
83}
84
85/*
86 * Find and return a queued entry matched by cmpfn, or return the last
87 * entry if cmpfn is NULL.
88 */
89static inline struct ipq_queue_entry *
90__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data)
91{
92 struct list_head *p;
93
94 list_for_each_prev(p, &queue_list) {
95 struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p;
96
97 if (!cmpfn || cmpfn(entry, data))
98 return entry;
99 }
100 return NULL;
101}
102
103static inline void
104__ipq_dequeue_entry(struct ipq_queue_entry *entry)
105{
106 list_del(&entry->list);
107 queue_total--;
108}
109
110static inline struct ipq_queue_entry *
111__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
112{
113 struct ipq_queue_entry *entry;
114
115 entry = __ipq_find_entry(cmpfn, data);
116 if (entry == NULL)
117 return NULL;
118
119 __ipq_dequeue_entry(entry);
120 return entry;
121}
122
123
124static inline void
125__ipq_flush(int verdict)
126{
127 struct ipq_queue_entry *entry;
128
129 while ((entry = __ipq_find_dequeue_entry(NULL, 0)))
130 ipq_issue_verdict(entry, verdict);
131}
132
133static inline int
134__ipq_set_mode(unsigned char mode, unsigned int range)
135{
136 int status = 0;
137
138 switch(mode) {
139 case IPQ_COPY_NONE:
140 case IPQ_COPY_META:
141 copy_mode = mode;
142 copy_range = 0;
143 break;
144
145 case IPQ_COPY_PACKET:
146 copy_mode = mode;
147 copy_range = range;
148 if (copy_range > 0xFFFF)
149 copy_range = 0xFFFF;
150 break;
151
152 default:
153 status = -EINVAL;
154
155 }
156 return status;
157}
158
159static inline void
160__ipq_reset(void)
161{
162 peer_pid = 0;
163 net_disable_timestamp();
164 __ipq_set_mode(IPQ_COPY_NONE, 0);
165 __ipq_flush(NF_DROP);
166}
167
168static struct ipq_queue_entry *
169ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
170{
171 struct ipq_queue_entry *entry;
172
173 write_lock_bh(&queue_lock);
174 entry = __ipq_find_dequeue_entry(cmpfn, data);
175 write_unlock_bh(&queue_lock);
176 return entry;
177}
178
179static void
180ipq_flush(int verdict)
181{
182 write_lock_bh(&queue_lock);
183 __ipq_flush(verdict);
184 write_unlock_bh(&queue_lock);
185}
186
187static struct sk_buff *
188ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
189{
190 unsigned char *old_tail;
191 size_t size = 0;
192 size_t data_len = 0;
193 struct sk_buff *skb;
194 struct ipq_packet_msg *pmsg;
195 struct nlmsghdr *nlh;
196
197 read_lock_bh(&queue_lock);
198
199 switch (copy_mode) {
200 case IPQ_COPY_META:
201 case IPQ_COPY_NONE:
202 size = NLMSG_SPACE(sizeof(*pmsg));
203 data_len = 0;
204 break;
205
206 case IPQ_COPY_PACKET:
207 if (copy_range == 0 || copy_range > entry->skb->len)
208 data_len = entry->skb->len;
209 else
210 data_len = copy_range;
211
212 size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
213 break;
214
215 default:
216 *errp = -EINVAL;
217 read_unlock_bh(&queue_lock);
218 return NULL;
219 }
220
221 read_unlock_bh(&queue_lock);
222
223 skb = alloc_skb(size, GFP_ATOMIC);
224 if (!skb)
225 goto nlmsg_failure;
226
227 old_tail= skb->tail;
228 nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
229 pmsg = NLMSG_DATA(nlh);
230 memset(pmsg, 0, sizeof(*pmsg));
231
232 pmsg->packet_id = (unsigned long )entry;
233 pmsg->data_len = data_len;
234 pmsg->timestamp_sec = entry->skb->stamp.tv_sec;
235 pmsg->timestamp_usec = entry->skb->stamp.tv_usec;
236 pmsg->mark = entry->skb->nfmark;
237 pmsg->hook = entry->info->hook;
238 pmsg->hw_protocol = entry->skb->protocol;
239
240 if (entry->info->indev)
241 strcpy(pmsg->indev_name, entry->info->indev->name);
242 else
243 pmsg->indev_name[0] = '\0';
244
245 if (entry->info->outdev)
246 strcpy(pmsg->outdev_name, entry->info->outdev->name);
247 else
248 pmsg->outdev_name[0] = '\0';
249
250 if (entry->info->indev && entry->skb->dev) {
251 pmsg->hw_type = entry->skb->dev->type;
252 if (entry->skb->dev->hard_header_parse)
253 pmsg->hw_addrlen =
254 entry->skb->dev->hard_header_parse(entry->skb,
255 pmsg->hw_addr);
256 }
257
258 if (data_len)
259 if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
260 BUG();
261
262 nlh->nlmsg_len = skb->tail - old_tail;
263 return skb;
264
265nlmsg_failure:
266 if (skb)
267 kfree_skb(skb);
268 *errp = -EINVAL;
269 printk(KERN_ERR "ip_queue: error creating packet message\n");
270 return NULL;
271}
272
273static int
274ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
275{
276 int status = -EINVAL;
277 struct sk_buff *nskb;
278 struct ipq_queue_entry *entry;
279
280 if (copy_mode == IPQ_COPY_NONE)
281 return -EAGAIN;
282
283 entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
284 if (entry == NULL) {
285 printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n");
286 return -ENOMEM;
287 }
288
289 entry->info = info;
290 entry->skb = skb;
291
292 if (entry->info->hook == NF_IP_LOCAL_OUT) {
293 struct iphdr *iph = skb->nh.iph;
294
295 entry->rt_info.tos = iph->tos;
296 entry->rt_info.daddr = iph->daddr;
297 entry->rt_info.saddr = iph->saddr;
298 }
299
300 nskb = ipq_build_packet_message(entry, &status);
301 if (nskb == NULL)
302 goto err_out_free;
303
304 write_lock_bh(&queue_lock);
305
306 if (!peer_pid)
307 goto err_out_free_nskb;
308
309 if (queue_total >= queue_maxlen) {
310 queue_dropped++;
311 status = -ENOSPC;
312 if (net_ratelimit())
313 printk (KERN_WARNING "ip_queue: full at %d entries, "
314 "dropping packet(s). Dropped: %d\n", queue_total,
315 queue_dropped);
316 goto err_out_free_nskb;
317 }
318
319 /* netlink_unicast will either free the nskb or attach it to a socket */
320 status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
321 if (status < 0) {
322 queue_user_dropped++;
323 goto err_out_unlock;
324 }
325
326 __ipq_enqueue_entry(entry);
327
328 write_unlock_bh(&queue_lock);
329 return status;
330
331err_out_free_nskb:
332 kfree_skb(nskb);
333
334err_out_unlock:
335 write_unlock_bh(&queue_lock);
336
337err_out_free:
338 kfree(entry);
339 return status;
340}
341
342static int
343ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
344{
345 int diff;
346 struct iphdr *user_iph = (struct iphdr *)v->payload;
347
348 if (v->data_len < sizeof(*user_iph))
349 return 0;
350 diff = v->data_len - e->skb->len;
351 if (diff < 0)
352 skb_trim(e->skb, v->data_len);
353 else if (diff > 0) {
354 if (v->data_len > 0xFFFF)
355 return -EINVAL;
356 if (diff > skb_tailroom(e->skb)) {
357 struct sk_buff *newskb;
358
359 newskb = skb_copy_expand(e->skb,
360 skb_headroom(e->skb),
361 diff,
362 GFP_ATOMIC);
363 if (newskb == NULL) {
364 printk(KERN_WARNING "ip_queue: OOM "
365 "in mangle, dropping packet\n");
366 return -ENOMEM;
367 }
368 if (e->skb->sk)
369 skb_set_owner_w(newskb, e->skb->sk);
370 kfree_skb(e->skb);
371 e->skb = newskb;
372 }
373 skb_put(e->skb, diff);
374 }
375 if (!skb_ip_make_writable(&e->skb, v->data_len))
376 return -ENOMEM;
377 memcpy(e->skb->data, v->payload, v->data_len);
378 e->skb->nfcache |= NFC_ALTERED;
379
380 /*
381 * Extra routing may be needed on local out, as the QUEUE target never
382 * returns control to the table.
383 */
384 if (e->info->hook == NF_IP_LOCAL_OUT) {
385 struct iphdr *iph = e->skb->nh.iph;
386
387 if (!(iph->tos == e->rt_info.tos
388 && iph->daddr == e->rt_info.daddr
389 && iph->saddr == e->rt_info.saddr))
390 return ip_route_me_harder(&e->skb);
391 }
392 return 0;
393}
394
395static inline int
396id_cmp(struct ipq_queue_entry *e, unsigned long id)
397{
398 return (id == (unsigned long )e);
399}
400
401static int
402ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
403{
404 struct ipq_queue_entry *entry;
405
406 if (vmsg->value > NF_MAX_VERDICT)
407 return -EINVAL;
408
409 entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
410 if (entry == NULL)
411 return -ENOENT;
412 else {
413 int verdict = vmsg->value;
414
415 if (vmsg->data_len && vmsg->data_len == len)
416 if (ipq_mangle_ipv4(vmsg, entry) < 0)
417 verdict = NF_DROP;
418
419 ipq_issue_verdict(entry, verdict);
420 return 0;
421 }
422}
423
424static int
425ipq_set_mode(unsigned char mode, unsigned int range)
426{
427 int status;
428
429 write_lock_bh(&queue_lock);
430 status = __ipq_set_mode(mode, range);
431 write_unlock_bh(&queue_lock);
432 return status;
433}
434
435static int
436ipq_receive_peer(struct ipq_peer_msg *pmsg,
437 unsigned char type, unsigned int len)
438{
439 int status = 0;
440
441 if (len < sizeof(*pmsg))
442 return -EINVAL;
443
444 switch (type) {
445 case IPQM_MODE:
446 status = ipq_set_mode(pmsg->msg.mode.value,
447 pmsg->msg.mode.range);
448 break;
449
450 case IPQM_VERDICT:
451 if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
452 status = -EINVAL;
453 else
454 status = ipq_set_verdict(&pmsg->msg.verdict,
455 len - sizeof(*pmsg));
456 break;
457 default:
458 status = -EINVAL;
459 }
460 return status;
461}
462
463static int
464dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex)
465{
466 if (entry->info->indev)
467 if (entry->info->indev->ifindex == ifindex)
468 return 1;
469
470 if (entry->info->outdev)
471 if (entry->info->outdev->ifindex == ifindex)
472 return 1;
473
474 return 0;
475}
476
477static void
478ipq_dev_drop(int ifindex)
479{
480 struct ipq_queue_entry *entry;
481
482 while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL)
483 ipq_issue_verdict(entry, NF_DROP);
484}
485
486#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
487
488static inline void
489ipq_rcv_skb(struct sk_buff *skb)
490{
491 int status, type, pid, flags, nlmsglen, skblen;
492 struct nlmsghdr *nlh;
493
494 skblen = skb->len;
495 if (skblen < sizeof(*nlh))
496 return;
497
498 nlh = (struct nlmsghdr *)skb->data;
499 nlmsglen = nlh->nlmsg_len;
500 if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
501 return;
502
503 pid = nlh->nlmsg_pid;
504 flags = nlh->nlmsg_flags;
505
506 if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
507 RCV_SKB_FAIL(-EINVAL);
508
509 if (flags & MSG_TRUNC)
510 RCV_SKB_FAIL(-ECOMM);
511
512 type = nlh->nlmsg_type;
513 if (type < NLMSG_NOOP || type >= IPQM_MAX)
514 RCV_SKB_FAIL(-EINVAL);
515
516 if (type <= IPQM_BASE)
517 return;
518
519 if (security_netlink_recv(skb))
520 RCV_SKB_FAIL(-EPERM);
521
522 write_lock_bh(&queue_lock);
523
524 if (peer_pid) {
525 if (peer_pid != pid) {
526 write_unlock_bh(&queue_lock);
527 RCV_SKB_FAIL(-EBUSY);
528 }
529 } else {
530 net_enable_timestamp();
531 peer_pid = pid;
532 }
533
534 write_unlock_bh(&queue_lock);
535
536 status = ipq_receive_peer(NLMSG_DATA(nlh), type,
537 skblen - NLMSG_LENGTH(0));
538 if (status < 0)
539 RCV_SKB_FAIL(status);
540
541 if (flags & NLM_F_ACK)
542 netlink_ack(skb, nlh, 0);
543 return;
544}
545
546static void
547ipq_rcv_sk(struct sock *sk, int len)
548{
549 do {
550 struct sk_buff *skb;
551
552 if (down_trylock(&ipqnl_sem))
553 return;
554
555 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
556 ipq_rcv_skb(skb);
557 kfree_skb(skb);
558 }
559
560 up(&ipqnl_sem);
561
562 } while (ipqnl && ipqnl->sk_receive_queue.qlen);
563}
564
565static int
566ipq_rcv_dev_event(struct notifier_block *this,
567 unsigned long event, void *ptr)
568{
569 struct net_device *dev = ptr;
570
571 /* Drop any packets associated with the downed device */
572 if (event == NETDEV_DOWN)
573 ipq_dev_drop(dev->ifindex);
574 return NOTIFY_DONE;
575}
576
577static struct notifier_block ipq_dev_notifier = {
578 .notifier_call = ipq_rcv_dev_event,
579};
580
581static int
582ipq_rcv_nl_event(struct notifier_block *this,
583 unsigned long event, void *ptr)
584{
585 struct netlink_notify *n = ptr;
586
587 if (event == NETLINK_URELEASE &&
588 n->protocol == NETLINK_FIREWALL && n->pid) {
589 write_lock_bh(&queue_lock);
590 if (n->pid == peer_pid)
591 __ipq_reset();
592 write_unlock_bh(&queue_lock);
593 }
594 return NOTIFY_DONE;
595}
596
597static struct notifier_block ipq_nl_notifier = {
598 .notifier_call = ipq_rcv_nl_event,
599};
600
601static struct ctl_table_header *ipq_sysctl_header;
602
603static ctl_table ipq_table[] = {
604 {
605 .ctl_name = NET_IPQ_QMAX,
606 .procname = NET_IPQ_QMAX_NAME,
607 .data = &queue_maxlen,
608 .maxlen = sizeof(queue_maxlen),
609 .mode = 0644,
610 .proc_handler = proc_dointvec
611 },
612 { .ctl_name = 0 }
613};
614
615static ctl_table ipq_dir_table[] = {
616 {
617 .ctl_name = NET_IPV4,
618 .procname = "ipv4",
619 .mode = 0555,
620 .child = ipq_table
621 },
622 { .ctl_name = 0 }
623};
624
625static ctl_table ipq_root_table[] = {
626 {
627 .ctl_name = CTL_NET,
628 .procname = "net",
629 .mode = 0555,
630 .child = ipq_dir_table
631 },
632 { .ctl_name = 0 }
633};
634
635#ifdef CONFIG_PROC_FS
636static int
637ipq_get_info(char *buffer, char **start, off_t offset, int length)
638{
639 int len;
640
641 read_lock_bh(&queue_lock);
642
643 len = sprintf(buffer,
644 "Peer PID : %d\n"
645 "Copy mode : %hu\n"
646 "Copy range : %u\n"
647 "Queue length : %u\n"
648 "Queue max. length : %u\n"
649 "Queue dropped : %u\n"
650 "Netlink dropped : %u\n",
651 peer_pid,
652 copy_mode,
653 copy_range,
654 queue_total,
655 queue_maxlen,
656 queue_dropped,
657 queue_user_dropped);
658
659 read_unlock_bh(&queue_lock);
660
661 *start = buffer + offset;
662 len -= offset;
663 if (len > length)
664 len = length;
665 else if (len < 0)
666 len = 0;
667 return len;
668}
669#endif /* CONFIG_PROC_FS */
670
671static int
672init_or_cleanup(int init)
673{
674 int status = -ENOMEM;
675 struct proc_dir_entry *proc;
676
677 if (!init)
678 goto cleanup;
679
680 netlink_register_notifier(&ipq_nl_notifier);
681 ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);
682 if (ipqnl == NULL) {
683 printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
684 goto cleanup_netlink_notifier;
685 }
686
687 proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
688 if (proc)
689 proc->owner = THIS_MODULE;
690 else {
691 printk(KERN_ERR "ip_queue: failed to create proc entry\n");
692 goto cleanup_ipqnl;
693 }
694
695 register_netdevice_notifier(&ipq_dev_notifier);
696 ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
697
698 status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
699 if (status < 0) {
700 printk(KERN_ERR "ip_queue: failed to register queue handler\n");
701 goto cleanup_sysctl;
702 }
703 return status;
704
705cleanup:
706 nf_unregister_queue_handler(PF_INET);
707 synchronize_net();
708 ipq_flush(NF_DROP);
709
710cleanup_sysctl:
711 unregister_sysctl_table(ipq_sysctl_header);
712 unregister_netdevice_notifier(&ipq_dev_notifier);
713 proc_net_remove(IPQ_PROC_FS_NAME);
714
715cleanup_ipqnl:
716 sock_release(ipqnl->sk_socket);
717 down(&ipqnl_sem);
718 up(&ipqnl_sem);
719
720cleanup_netlink_notifier:
721 netlink_unregister_notifier(&ipq_nl_notifier);
722 return status;
723}
724
725static int __init init(void)
726{
727
728 return init_or_cleanup(1);
729}
730
731static void __exit fini(void)
732{
733 init_or_cleanup(0);
734}
735
736MODULE_DESCRIPTION("IPv4 packet queue handler");
737MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
738MODULE_LICENSE("GPL");
739
740module_init(init);
741module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
new file mode 100644
index 000000000000..8a54f92b8496
--- /dev/null
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -0,0 +1,1964 @@
1/*
2 * Packet matching code.
3 *
4 * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
5 * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * 19 Jan 2002 Harald Welte <laforge@gnumonks.org>
12 * - increase module usage count as soon as we have rules inside
13 * a table
14 */
15#include <linux/config.h>
16#include <linux/cache.h>
17#include <linux/skbuff.h>
18#include <linux/kmod.h>
19#include <linux/vmalloc.h>
20#include <linux/netdevice.h>
21#include <linux/module.h>
22#include <linux/tcp.h>
23#include <linux/udp.h>
24#include <linux/icmp.h>
25#include <net/ip.h>
26#include <asm/uaccess.h>
27#include <asm/semaphore.h>
28#include <linux/proc_fs.h>
29#include <linux/err.h>
30
31#include <linux/netfilter_ipv4/ip_tables.h>
32
33MODULE_LICENSE("GPL");
34MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
35MODULE_DESCRIPTION("IPv4 packet filter");
36
37/*#define DEBUG_IP_FIREWALL*/
38/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
39/*#define DEBUG_IP_FIREWALL_USER*/
40
41#ifdef DEBUG_IP_FIREWALL
42#define dprintf(format, args...) printk(format , ## args)
43#else
44#define dprintf(format, args...)
45#endif
46
47#ifdef DEBUG_IP_FIREWALL_USER
48#define duprintf(format, args...) printk(format , ## args)
49#else
50#define duprintf(format, args...)
51#endif
52
53#ifdef CONFIG_NETFILTER_DEBUG
54#define IP_NF_ASSERT(x) \
55do { \
56 if (!(x)) \
57 printk("IP_NF_ASSERT: %s:%s:%u\n", \
58 __FUNCTION__, __FILE__, __LINE__); \
59} while(0)
60#else
61#define IP_NF_ASSERT(x)
62#endif
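/* Round a size up to the SMP cache-line size (assumes SMP_CACHE_BYTES is a
   power of two). */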
63#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
64
65static DECLARE_MUTEX(ipt_mutex);
66
67/* Must have mutex */
68#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
69#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
70#include <linux/netfilter_ipv4/lockhelp.h>
71#include <linux/netfilter_ipv4/listhelp.h>
72
73#if 0
74/* All the better to debug you with... */
75#define static
76#define inline
77#endif
78
79/*
80 We keep a set of rules for each CPU, so we can avoid write-locking
81 them in the softirq when updating the counters and therefore
82 only need to read-lock in the softirq; doing a write_lock_bh() in user
83 context stops packets coming through and allows user context to read
84 the counters or update the rules.
85
86 To be cache friendly on SMP, we arrange them like so:
87 [ n-entries ]
88 ... cache-align padding ...
89 [ n-entries ]
90
91 Hence the start of any table is given by get_table() below. */
92
93/* The table itself */
94struct ipt_table_info
95{
96 /* Size per table */
97 unsigned int size;
98 /* Number of entries: FIXME. --RR */
99 unsigned int number;
100 /* Initial number of entries. Needed for module usage count */
101 unsigned int initial_entries;
102
103 /* Entry points and underflows */
104 unsigned int hook_entry[NF_IP_NUMHOOKS];
105 unsigned int underflow[NF_IP_NUMHOOKS];
106
107 /* ipt_entry tables: one per CPU */
108 char entries[0] ____cacheline_aligned;
109};
110
111static LIST_HEAD(ipt_target);
112static LIST_HEAD(ipt_match);
113static LIST_HEAD(ipt_tables);
114#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
115
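/* Byte offset of CPU p's private copy of the rules within entries[]; on
   uniprocessor builds there is only one copy, so the offset is zero. */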
116#ifdef CONFIG_SMP
117#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
118#else
119#define TABLE_OFFSET(t,p) 0
120#endif
121
122#if 0
123#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
124#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
125#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0)
126#endif
127
128/* Returns whether the packet matches the rule or not. */
129static inline int
130ip_packet_match(const struct iphdr *ip,
131 const char *indev,
132 const char *outdev,
133 const struct ipt_ip *ipinfo,
134 int isfrag)
135{
136 size_t i;
137 unsigned long ret;
138
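/* XOR the test result with the corresponding IPT_INV_* flag bit so that
   "inverted" rules flip the outcome of the comparison. */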
139#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg))
140
141 if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
142 IPT_INV_SRCIP)
143 || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
144 IPT_INV_DSTIP)) {
145 dprintf("Source or dest mismatch.\n");
146
147 dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
148 NIPQUAD(ip->saddr),
149 NIPQUAD(ipinfo->smsk.s_addr),
150 NIPQUAD(ipinfo->src.s_addr),
151 ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
152 dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
153 NIPQUAD(ip->daddr),
154 NIPQUAD(ipinfo->dmsk.s_addr),
155 NIPQUAD(ipinfo->dst.s_addr),
156 ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
157 return 0;
158 }
159
160 /* Look for ifname matches; this should unroll nicely. */
161 for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
162 ret |= (((const unsigned long *)indev)[i]
163 ^ ((const unsigned long *)ipinfo->iniface)[i])
164 & ((const unsigned long *)ipinfo->iniface_mask)[i];
165 }
166
167 if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
168 dprintf("VIA in mismatch (%s vs %s).%s\n",
169 indev, ipinfo->iniface,
170 ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
171 return 0;
172 }
173
174 for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
175 ret |= (((const unsigned long *)outdev)[i]
176 ^ ((const unsigned long *)ipinfo->outiface)[i])
177 & ((const unsigned long *)ipinfo->outiface_mask)[i];
178 }
179
180 if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
181 dprintf("VIA out mismatch (%s vs %s).%s\n",
182 outdev, ipinfo->outiface,
183 ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
184 return 0;
185 }
186
187 /* Check specific protocol */
188 if (ipinfo->proto
189 && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
190 dprintf("Packet protocol %hi does not match %hi.%s\n",
191 ip->protocol, ipinfo->proto,
192 ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
193 return 0;
194 }
195
196 /* If we have a fragment rule but the packet is not a fragment
197 * then we return zero */
198 if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
199 dprintf("Fragment rule but not fragment.%s\n",
200 ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
201 return 0;
202 }
203
204 return 1;
205}
206
207static inline int
208ip_checkentry(const struct ipt_ip *ip)
209{
210 if (ip->flags & ~IPT_F_MASK) {
211 duprintf("Unknown flag bits set: %08X\n",
212 ip->flags & ~IPT_F_MASK);
213 return 0;
214 }
215 if (ip->invflags & ~IPT_INV_MASK) {
216 duprintf("Unknown invflag bits set: %08X\n",
217 ip->invflags & ~IPT_INV_MASK);
218 return 0;
219 }
220 return 1;
221}
222
223static unsigned int
224ipt_error(struct sk_buff **pskb,
225 const struct net_device *in,
226 const struct net_device *out,
227 unsigned int hooknum,
228 const void *targinfo,
229 void *userinfo)
230{
231 if (net_ratelimit())
232 printk("ip_tables: error: `%s'\n", (char *)targinfo);
233
234 return NF_DROP;
235}
236
237static inline
238int do_match(struct ipt_entry_match *m,
239 const struct sk_buff *skb,
240 const struct net_device *in,
241 const struct net_device *out,
242 int offset,
243 int *hotdrop)
244{
245 /* Stop iteration if it doesn't match */
246 if (!m->u.kernel.match->match(skb, in, out, m->data, offset, hotdrop))
247 return 1;
248 else
249 return 0;
250}
251
252static inline struct ipt_entry *
253get_entry(void *base, unsigned int offset)
254{
255 return (struct ipt_entry *)(base + offset);
256}
257
258/* Returns one of the generic firewall policies, like NF_ACCEPT. */
259unsigned int
260ipt_do_table(struct sk_buff **pskb,
261 unsigned int hook,
262 const struct net_device *in,
263 const struct net_device *out,
264 struct ipt_table *table,
265 void *userdata)
266{
267 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
268 u_int16_t offset;
269 struct iphdr *ip;
270 u_int16_t datalen;
271 int hotdrop = 0;
272 /* Initializing verdict to NF_DROP keeps gcc happy. */
273 unsigned int verdict = NF_DROP;
274 const char *indev, *outdev;
275 void *table_base;
276 struct ipt_entry *e, *back;
277
278 /* Initialization */
279 ip = (*pskb)->nh.iph;
280 datalen = (*pskb)->len - ip->ihl * 4;
281 indev = in ? in->name : nulldevname;
282 outdev = out ? out->name : nulldevname;
283 /* We handle fragments by dealing with the first fragment as
284 * if it was a normal packet. All other fragments are treated
285	 * normally, except that they will NEVER match rules that ask for
286	 * things we don't know (i.e., tcp syn flag or ports). If the
287 * rule is also a fragment-specific rule, non-fragments won't
288 * match it. */
289 offset = ntohs(ip->frag_off) & IP_OFFSET;
290
291 read_lock_bh(&table->lock);
292 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
293 table_base = (void *)table->private->entries
294 + TABLE_OFFSET(table->private, smp_processor_id());
295 e = get_entry(table_base, table->private->hook_entry[hook]);
296
297#ifdef CONFIG_NETFILTER_DEBUG
298	/* Check that no one else is using our table */
299 if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac
300 && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) {
301 printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n",
302 smp_processor_id(),
303 table->name,
304 &((struct ipt_entry *)table_base)->comefrom,
305 ((struct ipt_entry *)table_base)->comefrom);
306 }
307 ((struct ipt_entry *)table_base)->comefrom = 0x57acc001;
308#endif
309
310 /* For return from builtin chain */
311 back = get_entry(table_base, table->private->underflow[hook]);
312
313 do {
314 IP_NF_ASSERT(e);
315 IP_NF_ASSERT(back);
316 (*pskb)->nfcache |= e->nfcache;
317 if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
318 struct ipt_entry_target *t;
319
320 if (IPT_MATCH_ITERATE(e, do_match,
321 *pskb, in, out,
322 offset, &hotdrop) != 0)
323 goto no_match;
324
325 ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
326
327 t = ipt_get_target(e);
328 IP_NF_ASSERT(t->u.kernel.target);
329 /* Standard target? */
330 if (!t->u.kernel.target->target) {
331 int v;
332
333 v = ((struct ipt_standard_target *)t)->verdict;
334 if (v < 0) {
335 /* Pop from stack? */
336 if (v != IPT_RETURN) {
337 verdict = (unsigned)(-v) - 1;
338 break;
339 }
340 e = back;
341 back = get_entry(table_base,
342 back->comefrom);
343 continue;
344 }
345 if (table_base + v
346 != (void *)e + e->next_offset) {
347 /* Save old back ptr in next entry */
348 struct ipt_entry *next
349 = (void *)e + e->next_offset;
350 next->comefrom
351 = (void *)back - table_base;
352 /* set back pointer to next entry */
353 back = next;
354 }
355
356 e = get_entry(table_base, v);
357 } else {
358 /* Targets which reenter must return
359 abs. verdicts */
360#ifdef CONFIG_NETFILTER_DEBUG
361 ((struct ipt_entry *)table_base)->comefrom
362 = 0xeeeeeeec;
363#endif
364 verdict = t->u.kernel.target->target(pskb,
365 in, out,
366 hook,
367 t->data,
368 userdata);
369
370#ifdef CONFIG_NETFILTER_DEBUG
371 if (((struct ipt_entry *)table_base)->comefrom
372 != 0xeeeeeeec
373 && verdict == IPT_CONTINUE) {
374 printk("Target %s reentered!\n",
375 t->u.kernel.target->name);
376 verdict = NF_DROP;
377 }
378 ((struct ipt_entry *)table_base)->comefrom
379 = 0x57acc001;
380#endif
381 /* Target might have changed stuff. */
382 ip = (*pskb)->nh.iph;
383 datalen = (*pskb)->len - ip->ihl * 4;
384
385 if (verdict == IPT_CONTINUE)
386 e = (void *)e + e->next_offset;
387 else
388 /* Verdict */
389 break;
390 }
391 } else {
392
393 no_match:
394 e = (void *)e + e->next_offset;
395 }
396 } while (!hotdrop);
397
398#ifdef CONFIG_NETFILTER_DEBUG
399 ((struct ipt_entry *)table_base)->comefrom = 0xdead57ac;
400#endif
401 read_unlock_bh(&table->lock);
402
403#ifdef DEBUG_ALLOW_ALL
404 return NF_ACCEPT;
405#else
406 if (hotdrop)
407 return NF_DROP;
408 else return verdict;
409#endif
410}
411
412/*
413 * These are weird, but module loading must not be done with the mutex
414 * held (since the modules will register themselves), and we need a single
415 * function to use try_then_request_module().
416 */
417
418/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
419static inline struct ipt_table *find_table_lock(const char *name)
420{
421 struct ipt_table *t;
422
423 if (down_interruptible(&ipt_mutex) != 0)
424 return ERR_PTR(-EINTR);
425
426 list_for_each_entry(t, &ipt_tables, list)
427 if (strcmp(t->name, name) == 0 && try_module_get(t->me))
428 return t;
429 up(&ipt_mutex);
430 return NULL;
431}
432
433/* Find match, grabs ref. Returns ERR_PTR() on error. */
434static inline struct ipt_match *find_match(const char *name, u8 revision)
435{
436 struct ipt_match *m;
437 int err = 0;
438
439 if (down_interruptible(&ipt_mutex) != 0)
440 return ERR_PTR(-EINTR);
441
442 list_for_each_entry(m, &ipt_match, list) {
443 if (strcmp(m->name, name) == 0) {
444 if (m->revision == revision) {
445 if (try_module_get(m->me)) {
446 up(&ipt_mutex);
447 return m;
448 }
449 } else
450 err = -EPROTOTYPE; /* Found something. */
451 }
452 }
453 up(&ipt_mutex);
454 return ERR_PTR(err);
455}
456
457/* Find target, grabs ref. Returns ERR_PTR() on error. */
458static inline struct ipt_target *find_target(const char *name, u8 revision)
459{
460 struct ipt_target *t;
461 int err = 0;
462
463 if (down_interruptible(&ipt_mutex) != 0)
464 return ERR_PTR(-EINTR);
465
466 list_for_each_entry(t, &ipt_target, list) {
467 if (strcmp(t->name, name) == 0) {
468 if (t->revision == revision) {
469 if (try_module_get(t->me)) {
470 up(&ipt_mutex);
471 return t;
472 }
473 } else
474 err = -EPROTOTYPE; /* Found something. */
475 }
476 }
477 up(&ipt_mutex);
478 return ERR_PTR(err);
479}
480
481struct ipt_target *ipt_find_target(const char *name, u8 revision)
482{
483 struct ipt_target *target;
484
485 target = try_then_request_module(find_target(name, revision),
486 "ipt_%s", name);
487 if (IS_ERR(target) || !target)
488 return NULL;
489 return target;
490}
491
492static int match_revfn(const char *name, u8 revision, int *bestp)
493{
494 struct ipt_match *m;
495 int have_rev = 0;
496
497 list_for_each_entry(m, &ipt_match, list) {
498 if (strcmp(m->name, name) == 0) {
499 if (m->revision > *bestp)
500 *bestp = m->revision;
501 if (m->revision == revision)
502 have_rev = 1;
503 }
504 }
505 return have_rev;
506}
507
508static int target_revfn(const char *name, u8 revision, int *bestp)
509{
510 struct ipt_target *t;
511 int have_rev = 0;
512
513 list_for_each_entry(t, &ipt_target, list) {
514 if (strcmp(t->name, name) == 0) {
515 if (t->revision > *bestp)
516 *bestp = t->revision;
517 if (t->revision == revision)
518 have_rev = 1;
519 }
520 }
521 return have_rev;
522}
523
524/* Returns true, or false if there is no such extension at all */
525static inline int find_revision(const char *name, u8 revision,
526 int (*revfn)(const char *, u8, int *),
527 int *err)
528{
529 int have_rev, best = -1;
530
531 if (down_interruptible(&ipt_mutex) != 0) {
532 *err = -EINTR;
533 return 1;
534 }
535 have_rev = revfn(name, revision, &best);
536 up(&ipt_mutex);
537
538 /* Nothing at all? Return 0 to try loading module. */
539 if (best == -1) {
540 *err = -ENOENT;
541 return 0;
542 }
543
544 *err = best;
545 if (!have_rev)
546 *err = -EPROTONOSUPPORT;
547 return 1;
548}
549
550
551/* All zeroes == unconditional rule. */
552static inline int
553unconditional(const struct ipt_ip *ip)
554{
555 unsigned int i;
556
557 for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++)
558 if (((__u32 *)ip)[i])
559 return 0;
560
561 return 1;
562}
563
564/* Figures out from what hook each rule can be called: returns 0 if
565 there are loops. Puts hook bitmask in comefrom. */
566static int
567mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
568{
569 unsigned int hook;
570
571 /* No recursion; use packet counter to save back ptrs (reset
572 to 0 as we leave), and comefrom to save source hook bitmask */
573 for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
574 unsigned int pos = newinfo->hook_entry[hook];
575 struct ipt_entry *e
576 = (struct ipt_entry *)(newinfo->entries + pos);
577
578 if (!(valid_hooks & (1 << hook)))
579 continue;
580
581 /* Set initial back pointer. */
582 e->counters.pcnt = pos;
583
584 for (;;) {
585 struct ipt_standard_target *t
586 = (void *)ipt_get_target(e);
587
588 if (e->comefrom & (1 << NF_IP_NUMHOOKS)) {
589 printk("iptables: loop hook %u pos %u %08X.\n",
590 hook, pos, e->comefrom);
591 return 0;
592 }
593 e->comefrom
594 |= ((1 << hook) | (1 << NF_IP_NUMHOOKS));
595
596 /* Unconditional return/END. */
597 if (e->target_offset == sizeof(struct ipt_entry)
598 && (strcmp(t->target.u.user.name,
599 IPT_STANDARD_TARGET) == 0)
600 && t->verdict < 0
601 && unconditional(&e->ip)) {
602 unsigned int oldpos, size;
603
604 /* Return: backtrack through the last
605 big jump. */
606 do {
607 e->comefrom ^= (1<<NF_IP_NUMHOOKS);
608#ifdef DEBUG_IP_FIREWALL_USER
609 if (e->comefrom
610 & (1 << NF_IP_NUMHOOKS)) {
611 duprintf("Back unset "
612 "on hook %u "
613 "rule %u\n",
614 hook, pos);
615 }
616#endif
617 oldpos = pos;
618 pos = e->counters.pcnt;
619 e->counters.pcnt = 0;
620
621 /* We're at the start. */
622 if (pos == oldpos)
623 goto next;
624
625 e = (struct ipt_entry *)
626 (newinfo->entries + pos);
627 } while (oldpos == pos + e->next_offset);
628
629 /* Move along one */
630 size = e->next_offset;
631 e = (struct ipt_entry *)
632 (newinfo->entries + pos + size);
633 e->counters.pcnt = pos;
634 pos += size;
635 } else {
636 int newpos = t->verdict;
637
638 if (strcmp(t->target.u.user.name,
639 IPT_STANDARD_TARGET) == 0
640 && newpos >= 0) {
641					/* This is a jump; chase it. */
642 duprintf("Jump rule %u -> %u\n",
643 pos, newpos);
644 } else {
645 /* ... this is a fallthru */
646 newpos = pos + e->next_offset;
647 }
648 e = (struct ipt_entry *)
649 (newinfo->entries + newpos);
650 e->counters.pcnt = pos;
651 pos = newpos;
652 }
653 }
654 next:
655 duprintf("Finished chain %u\n", hook);
656 }
657 return 1;
658}
659
660static inline int
661cleanup_match(struct ipt_entry_match *m, unsigned int *i)
662{
663 if (i && (*i)-- == 0)
664 return 1;
665
666 if (m->u.kernel.match->destroy)
667 m->u.kernel.match->destroy(m->data,
668 m->u.match_size - sizeof(*m));
669 module_put(m->u.kernel.match->me);
670 return 0;
671}
672
673static inline int
674standard_check(const struct ipt_entry_target *t,
675 unsigned int max_offset)
676{
677 struct ipt_standard_target *targ = (void *)t;
678
679 /* Check standard info. */
680 if (t->u.target_size
681 != IPT_ALIGN(sizeof(struct ipt_standard_target))) {
682 duprintf("standard_check: target size %u != %u\n",
683 t->u.target_size,
684 IPT_ALIGN(sizeof(struct ipt_standard_target)));
685 return 0;
686 }
687
688 if (targ->verdict >= 0
689 && targ->verdict > max_offset - sizeof(struct ipt_entry)) {
690 duprintf("ipt_standard_check: bad verdict (%i)\n",
691 targ->verdict);
692 return 0;
693 }
694
695 if (targ->verdict < -NF_MAX_VERDICT - 1) {
696 duprintf("ipt_standard_check: bad negative verdict (%i)\n",
697 targ->verdict);
698 return 0;
699 }
700 return 1;
701}
702
703static inline int
704check_match(struct ipt_entry_match *m,
705 const char *name,
706 const struct ipt_ip *ip,
707 unsigned int hookmask,
708 unsigned int *i)
709{
710 struct ipt_match *match;
711
712 match = try_then_request_module(find_match(m->u.user.name,
713 m->u.user.revision),
714 "ipt_%s", m->u.user.name);
715 if (IS_ERR(match) || !match) {
716 duprintf("check_match: `%s' not found\n", m->u.user.name);
717 return match ? PTR_ERR(match) : -ENOENT;
718 }
719 m->u.kernel.match = match;
720
721 if (m->u.kernel.match->checkentry
722 && !m->u.kernel.match->checkentry(name, ip, m->data,
723 m->u.match_size - sizeof(*m),
724 hookmask)) {
725 module_put(m->u.kernel.match->me);
726 duprintf("ip_tables: check failed for `%s'.\n",
727 m->u.kernel.match->name);
728 return -EINVAL;
729 }
730
731 (*i)++;
732 return 0;
733}
734
735static struct ipt_target ipt_standard_target;
736
737static inline int
738check_entry(struct ipt_entry *e, const char *name, unsigned int size,
739 unsigned int *i)
740{
741 struct ipt_entry_target *t;
742 struct ipt_target *target;
743 int ret;
744 unsigned int j;
745
746 if (!ip_checkentry(&e->ip)) {
747 duprintf("ip_tables: ip check failed %p %s.\n", e, name);
748 return -EINVAL;
749 }
750
751 j = 0;
752 ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j);
753 if (ret != 0)
754 goto cleanup_matches;
755
756 t = ipt_get_target(e);
757 target = try_then_request_module(find_target(t->u.user.name,
758 t->u.user.revision),
759 "ipt_%s", t->u.user.name);
760 if (IS_ERR(target) || !target) {
761 duprintf("check_entry: `%s' not found\n", t->u.user.name);
762 ret = target ? PTR_ERR(target) : -ENOENT;
763 goto cleanup_matches;
764 }
765 t->u.kernel.target = target;
766
767 if (t->u.kernel.target == &ipt_standard_target) {
768 if (!standard_check(t, size)) {
769 ret = -EINVAL;
770 goto cleanup_matches;
771 }
772 } else if (t->u.kernel.target->checkentry
773 && !t->u.kernel.target->checkentry(name, e, t->data,
774 t->u.target_size
775 - sizeof(*t),
776 e->comefrom)) {
777 module_put(t->u.kernel.target->me);
778 duprintf("ip_tables: check failed for `%s'.\n",
779 t->u.kernel.target->name);
780 ret = -EINVAL;
781 goto cleanup_matches;
782 }
783
784 (*i)++;
785 return 0;
786
787 cleanup_matches:
788 IPT_MATCH_ITERATE(e, cleanup_match, &j);
789 return ret;
790}
791
792static inline int
793check_entry_size_and_hooks(struct ipt_entry *e,
794 struct ipt_table_info *newinfo,
795 unsigned char *base,
796 unsigned char *limit,
797 const unsigned int *hook_entries,
798 const unsigned int *underflows,
799 unsigned int *i)
800{
801 unsigned int h;
802
803 if ((unsigned long)e % __alignof__(struct ipt_entry) != 0
804 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
805 duprintf("Bad offset %p\n", e);
806 return -EINVAL;
807 }
808
809 if (e->next_offset
810 < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
811 duprintf("checking: element %p size %u\n",
812 e, e->next_offset);
813 return -EINVAL;
814 }
815
816 /* Check hooks & underflows */
817 for (h = 0; h < NF_IP_NUMHOOKS; h++) {
818 if ((unsigned char *)e - base == hook_entries[h])
819 newinfo->hook_entry[h] = hook_entries[h];
820 if ((unsigned char *)e - base == underflows[h])
821 newinfo->underflow[h] = underflows[h];
822 }
823
824 /* FIXME: underflows must be unconditional, standard verdicts
825 < 0 (not IPT_RETURN). --RR */
826
827 /* Clear counters and comefrom */
828 e->counters = ((struct ipt_counters) { 0, 0 });
829 e->comefrom = 0;
830
831 (*i)++;
832 return 0;
833}
834
835static inline int
836cleanup_entry(struct ipt_entry *e, unsigned int *i)
837{
838 struct ipt_entry_target *t;
839
840 if (i && (*i)-- == 0)
841 return 1;
842
843 /* Cleanup all matches */
844 IPT_MATCH_ITERATE(e, cleanup_match, NULL);
845 t = ipt_get_target(e);
846 if (t->u.kernel.target->destroy)
847 t->u.kernel.target->destroy(t->data,
848 t->u.target_size - sizeof(*t));
849 module_put(t->u.kernel.target->me);
850 return 0;
851}
852
853/* Checks and translates the user-supplied table segment (held in
854 newinfo) */
855static int
856translate_table(const char *name,
857 unsigned int valid_hooks,
858 struct ipt_table_info *newinfo,
859 unsigned int size,
860 unsigned int number,
861 const unsigned int *hook_entries,
862 const unsigned int *underflows)
863{
864 unsigned int i;
865 int ret;
866
867 newinfo->size = size;
868 newinfo->number = number;
869
870 /* Init all hooks to impossible value. */
871 for (i = 0; i < NF_IP_NUMHOOKS; i++) {
872 newinfo->hook_entry[i] = 0xFFFFFFFF;
873 newinfo->underflow[i] = 0xFFFFFFFF;
874 }
875
876 duprintf("translate_table: size %u\n", newinfo->size);
877 i = 0;
878 /* Walk through entries, checking offsets. */
879 ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
880 check_entry_size_and_hooks,
881 newinfo,
882 newinfo->entries,
883 newinfo->entries + size,
884 hook_entries, underflows, &i);
885 if (ret != 0)
886 return ret;
887
888 if (i != number) {
889 duprintf("translate_table: %u not %u entries\n",
890 i, number);
891 return -EINVAL;
892 }
893
894 /* Check hooks all assigned */
895 for (i = 0; i < NF_IP_NUMHOOKS; i++) {
896 /* Only hooks which are valid */
897 if (!(valid_hooks & (1 << i)))
898 continue;
899 if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
900 duprintf("Invalid hook entry %u %u\n",
901 i, hook_entries[i]);
902 return -EINVAL;
903 }
904 if (newinfo->underflow[i] == 0xFFFFFFFF) {
905 duprintf("Invalid underflow %u %u\n",
906 i, underflows[i]);
907 return -EINVAL;
908 }
909 }
910
911 if (!mark_source_chains(newinfo, valid_hooks))
912 return -ELOOP;
913
914 /* Finally, each sanity check must pass */
915 i = 0;
916 ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
917 check_entry, name, size, &i);
918
919 if (ret != 0) {
920 IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
921 cleanup_entry, &i);
922 return ret;
923 }
924
925 /* And one copy for every other CPU */
926 for (i = 1; i < num_possible_cpus(); i++) {
927 memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
928 newinfo->entries,
929 SMP_ALIGN(newinfo->size));
930 }
931
932 return ret;
933}
934
935static struct ipt_table_info *
936replace_table(struct ipt_table *table,
937 unsigned int num_counters,
938 struct ipt_table_info *newinfo,
939 int *error)
940{
941 struct ipt_table_info *oldinfo;
942
943#ifdef CONFIG_NETFILTER_DEBUG
944 {
945 struct ipt_entry *table_base;
946 unsigned int i;
947
948 for (i = 0; i < num_possible_cpus(); i++) {
949 table_base =
950 (void *)newinfo->entries
951 + TABLE_OFFSET(newinfo, i);
952
953 table_base->comefrom = 0xdead57ac;
954 }
955 }
956#endif
957
958 /* Do the substitution. */
959 write_lock_bh(&table->lock);
960 /* Check inside lock: is the old number correct? */
961 if (num_counters != table->private->number) {
962 duprintf("num_counters != table->private->number (%u/%u)\n",
963 num_counters, table->private->number);
964 write_unlock_bh(&table->lock);
965 *error = -EAGAIN;
966 return NULL;
967 }
968 oldinfo = table->private;
969 table->private = newinfo;
970 newinfo->initial_entries = oldinfo->initial_entries;
971 write_unlock_bh(&table->lock);
972
973 return oldinfo;
974}
975
976/* Gets counters. */
977static inline int
978add_entry_to_counter(const struct ipt_entry *e,
979 struct ipt_counters total[],
980 unsigned int *i)
981{
982 ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
983
984 (*i)++;
985 return 0;
986}
987
988static void
989get_counters(const struct ipt_table_info *t,
990 struct ipt_counters counters[])
991{
992 unsigned int cpu;
993 unsigned int i;
994
995 for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
996 i = 0;
997 IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
998 t->size,
999 add_entry_to_counter,
1000 counters,
1001 &i);
1002 }
1003}
1004
1005static int
1006copy_entries_to_user(unsigned int total_size,
1007 struct ipt_table *table,
1008 void __user *userptr)
1009{
1010 unsigned int off, num, countersize;
1011 struct ipt_entry *e;
1012 struct ipt_counters *counters;
1013 int ret = 0;
1014
1015 /* We need atomic snapshot of counters: rest doesn't change
1016 (other than comefrom, which userspace doesn't care
1017 about). */
1018 countersize = sizeof(struct ipt_counters) * table->private->number;
1019 counters = vmalloc(countersize);
1020
1021 if (counters == NULL)
1022 return -ENOMEM;
1023
1024 /* First, sum counters... */
1025 memset(counters, 0, countersize);
1026 write_lock_bh(&table->lock);
1027 get_counters(table->private, counters);
1028 write_unlock_bh(&table->lock);
1029
1030 /* ... then copy entire thing from CPU 0... */
1031 if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
1032 ret = -EFAULT;
1033 goto free_counters;
1034 }
1035
1036 /* FIXME: use iterator macros --RR */
1037 /* ... then go back and fix counters and names */
1038 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
1039 unsigned int i;
1040 struct ipt_entry_match *m;
1041 struct ipt_entry_target *t;
1042
1043 e = (struct ipt_entry *)(table->private->entries + off);
1044 if (copy_to_user(userptr + off
1045 + offsetof(struct ipt_entry, counters),
1046 &counters[num],
1047 sizeof(counters[num])) != 0) {
1048 ret = -EFAULT;
1049 goto free_counters;
1050 }
1051
1052 for (i = sizeof(struct ipt_entry);
1053 i < e->target_offset;
1054 i += m->u.match_size) {
1055 m = (void *)e + i;
1056
1057 if (copy_to_user(userptr + off + i
1058 + offsetof(struct ipt_entry_match,
1059 u.user.name),
1060 m->u.kernel.match->name,
1061 strlen(m->u.kernel.match->name)+1)
1062 != 0) {
1063 ret = -EFAULT;
1064 goto free_counters;
1065 }
1066 }
1067
1068 t = ipt_get_target(e);
1069 if (copy_to_user(userptr + off + e->target_offset
1070 + offsetof(struct ipt_entry_target,
1071 u.user.name),
1072 t->u.kernel.target->name,
1073 strlen(t->u.kernel.target->name)+1) != 0) {
1074 ret = -EFAULT;
1075 goto free_counters;
1076 }
1077 }
1078
1079 free_counters:
1080 vfree(counters);
1081 return ret;
1082}
1083
1084static int
1085get_entries(const struct ipt_get_entries *entries,
1086 struct ipt_get_entries __user *uptr)
1087{
1088 int ret;
1089 struct ipt_table *t;
1090
1091 t = find_table_lock(entries->name);
1092 if (t && !IS_ERR(t)) {
1093 duprintf("t->private->number = %u\n",
1094 t->private->number);
1095 if (entries->size == t->private->size)
1096 ret = copy_entries_to_user(t->private->size,
1097 t, uptr->entrytable);
1098 else {
1099 duprintf("get_entries: I've got %u not %u!\n",
1100 t->private->size,
1101 entries->size);
1102 ret = -EINVAL;
1103 }
1104 module_put(t->me);
1105 up(&ipt_mutex);
1106 } else
1107 ret = t ? PTR_ERR(t) : -ENOENT;
1108
1109 return ret;
1110}
1111
1112static int
1113do_replace(void __user *user, unsigned int len)
1114{
1115 int ret;
1116 struct ipt_replace tmp;
1117 struct ipt_table *t;
1118 struct ipt_table_info *newinfo, *oldinfo;
1119 struct ipt_counters *counters;
1120
1121 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1122 return -EFAULT;
1123
1124 /* Hack: Causes ipchains to give correct error msg --RR */
1125 if (len != sizeof(tmp) + tmp.size)
1126 return -ENOPROTOOPT;
1127
1128 /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
1129 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
1130 return -ENOMEM;
1131
1132 newinfo = vmalloc(sizeof(struct ipt_table_info)
1133 + SMP_ALIGN(tmp.size) * num_possible_cpus());
1134 if (!newinfo)
1135 return -ENOMEM;
1136
1137 if (copy_from_user(newinfo->entries, user + sizeof(tmp),
1138 tmp.size) != 0) {
1139 ret = -EFAULT;
1140 goto free_newinfo;
1141 }
1142
1143 counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters));
1144 if (!counters) {
1145 ret = -ENOMEM;
1146 goto free_newinfo;
1147 }
1148 memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
1149
1150 ret = translate_table(tmp.name, tmp.valid_hooks,
1151 newinfo, tmp.size, tmp.num_entries,
1152 tmp.hook_entry, tmp.underflow);
1153 if (ret != 0)
1154 goto free_newinfo_counters;
1155
1156 duprintf("ip_tables: Translated table\n");
1157
1158 t = try_then_request_module(find_table_lock(tmp.name),
1159 "iptable_%s", tmp.name);
1160 if (!t || IS_ERR(t)) {
1161 ret = t ? PTR_ERR(t) : -ENOENT;
1162 goto free_newinfo_counters_untrans;
1163 }
1164
1165 /* You lied! */
1166 if (tmp.valid_hooks != t->valid_hooks) {
1167 duprintf("Valid hook crap: %08X vs %08X\n",
1168 tmp.valid_hooks, t->valid_hooks);
1169 ret = -EINVAL;
1170 goto put_module;
1171 }
1172
1173 oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
1174 if (!oldinfo)
1175 goto put_module;
1176
1177 /* Update module usage count based on number of rules */
1178 duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
1179 oldinfo->number, oldinfo->initial_entries, newinfo->number);
1180 if ((oldinfo->number > oldinfo->initial_entries) ||
1181 (newinfo->number <= oldinfo->initial_entries))
1182 module_put(t->me);
1183 if ((oldinfo->number > oldinfo->initial_entries) &&
1184 (newinfo->number <= oldinfo->initial_entries))
1185 module_put(t->me);
1186
1187 /* Get the old counters. */
1188 get_counters(oldinfo, counters);
1189 /* Decrease module usage counts and free resource */
1190 IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
1191 vfree(oldinfo);
1192 if (copy_to_user(tmp.counters, counters,
1193 sizeof(struct ipt_counters) * tmp.num_counters) != 0)
1194 ret = -EFAULT;
1195 vfree(counters);
1196 up(&ipt_mutex);
1197 return ret;
1198
1199 put_module:
1200 module_put(t->me);
1201 up(&ipt_mutex);
1202 free_newinfo_counters_untrans:
1203 IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
1204 free_newinfo_counters:
1205 vfree(counters);
1206 free_newinfo:
1207 vfree(newinfo);
1208 return ret;
1209}
1210
1211/* We're lazy, and add to the first CPU; overflow works its fey magic
1212 * and everything is OK. */
1213static inline int
1214add_counter_to_entry(struct ipt_entry *e,
1215 const struct ipt_counters addme[],
1216 unsigned int *i)
1217{
1218#if 0
1219 duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
1220 *i,
1221 (long unsigned int)e->counters.pcnt,
1222 (long unsigned int)e->counters.bcnt,
1223 (long unsigned int)addme[*i].pcnt,
1224 (long unsigned int)addme[*i].bcnt);
1225#endif
1226
1227 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1228
1229 (*i)++;
1230 return 0;
1231}
1232
1233static int
1234do_add_counters(void __user *user, unsigned int len)
1235{
1236 unsigned int i;
1237 struct ipt_counters_info tmp, *paddc;
1238 struct ipt_table *t;
1239 int ret = 0;
1240
1241 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1242 return -EFAULT;
1243
1244 if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
1245 return -EINVAL;
1246
1247 paddc = vmalloc(len);
1248 if (!paddc)
1249 return -ENOMEM;
1250
1251 if (copy_from_user(paddc, user, len) != 0) {
1252 ret = -EFAULT;
1253 goto free;
1254 }
1255
1256 t = find_table_lock(tmp.name);
1257 if (!t || IS_ERR(t)) {
1258 ret = t ? PTR_ERR(t) : -ENOENT;
1259 goto free;
1260 }
1261
1262 write_lock_bh(&t->lock);
1263 if (t->private->number != paddc->num_counters) {
1264 ret = -EINVAL;
1265 goto unlock_up_free;
1266 }
1267
1268 i = 0;
1269 IPT_ENTRY_ITERATE(t->private->entries,
1270 t->private->size,
1271 add_counter_to_entry,
1272 paddc->counters,
1273 &i);
1274 unlock_up_free:
1275 write_unlock_bh(&t->lock);
1276 up(&ipt_mutex);
1277 module_put(t->me);
1278 free:
1279 vfree(paddc);
1280
1281 return ret;
1282}
1283
1284static int
1285do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1286{
1287 int ret;
1288
1289 if (!capable(CAP_NET_ADMIN))
1290 return -EPERM;
1291
1292 switch (cmd) {
1293 case IPT_SO_SET_REPLACE:
1294 ret = do_replace(user, len);
1295 break;
1296
1297 case IPT_SO_SET_ADD_COUNTERS:
1298 ret = do_add_counters(user, len);
1299 break;
1300
1301 default:
1302 duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
1303 ret = -EINVAL;
1304 }
1305
1306 return ret;
1307}
1308
1309static int
1310do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
1311{
1312 int ret;
1313
1314 if (!capable(CAP_NET_ADMIN))
1315 return -EPERM;
1316
1317 switch (cmd) {
1318 case IPT_SO_GET_INFO: {
1319 char name[IPT_TABLE_MAXNAMELEN];
1320 struct ipt_table *t;
1321
1322 if (*len != sizeof(struct ipt_getinfo)) {
1323 duprintf("length %u != %u\n", *len,
1324 sizeof(struct ipt_getinfo));
1325 ret = -EINVAL;
1326 break;
1327 }
1328
1329 if (copy_from_user(name, user, sizeof(name)) != 0) {
1330 ret = -EFAULT;
1331 break;
1332 }
1333 name[IPT_TABLE_MAXNAMELEN-1] = '\0';
1334
1335 t = try_then_request_module(find_table_lock(name),
1336 "iptable_%s", name);
1337 if (t && !IS_ERR(t)) {
1338 struct ipt_getinfo info;
1339
1340 info.valid_hooks = t->valid_hooks;
1341 memcpy(info.hook_entry, t->private->hook_entry,
1342 sizeof(info.hook_entry));
1343 memcpy(info.underflow, t->private->underflow,
1344 sizeof(info.underflow));
1345 info.num_entries = t->private->number;
1346 info.size = t->private->size;
1347 memcpy(info.name, name, sizeof(info.name));
1348
1349 if (copy_to_user(user, &info, *len) != 0)
1350 ret = -EFAULT;
1351 else
1352 ret = 0;
1353 up(&ipt_mutex);
1354 module_put(t->me);
1355 } else
1356 ret = t ? PTR_ERR(t) : -ENOENT;
1357 }
1358 break;
1359
1360 case IPT_SO_GET_ENTRIES: {
1361 struct ipt_get_entries get;
1362
1363 if (*len < sizeof(get)) {
1364 duprintf("get_entries: %u < %u\n", *len, sizeof(get));
1365 ret = -EINVAL;
1366 } else if (copy_from_user(&get, user, sizeof(get)) != 0) {
1367 ret = -EFAULT;
1368 } else if (*len != sizeof(struct ipt_get_entries) + get.size) {
1369 duprintf("get_entries: %u != %u\n", *len,
1370 sizeof(struct ipt_get_entries) + get.size);
1371 ret = -EINVAL;
1372 } else
1373 ret = get_entries(&get, user);
1374 break;
1375 }
1376
1377 case IPT_SO_GET_REVISION_MATCH:
1378 case IPT_SO_GET_REVISION_TARGET: {
1379 struct ipt_get_revision rev;
1380 int (*revfn)(const char *, u8, int *);
1381
1382 if (*len != sizeof(rev)) {
1383 ret = -EINVAL;
1384 break;
1385 }
1386 if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
1387 ret = -EFAULT;
1388 break;
1389 }
1390
1391 if (cmd == IPT_SO_GET_REVISION_TARGET)
1392 revfn = target_revfn;
1393 else
1394 revfn = match_revfn;
1395
1396 try_then_request_module(find_revision(rev.name, rev.revision,
1397 revfn, &ret),
1398 "ipt_%s", rev.name);
1399 break;
1400 }
1401
1402 default:
1403 duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
1404 ret = -EINVAL;
1405 }
1406
1407 return ret;
1408}
1409
1410/* Registration hooks for targets. */
1411int
1412ipt_register_target(struct ipt_target *target)
1413{
1414 int ret;
1415
1416 ret = down_interruptible(&ipt_mutex);
1417 if (ret != 0)
1418 return ret;
1419 list_add(&target->list, &ipt_target);
1420 up(&ipt_mutex);
1421 return ret;
1422}
1423
1424void
1425ipt_unregister_target(struct ipt_target *target)
1426{
1427 down(&ipt_mutex);
1428 LIST_DELETE(&ipt_target, target);
1429 up(&ipt_mutex);
1430}
1431
1432int
1433ipt_register_match(struct ipt_match *match)
1434{
1435 int ret;
1436
1437 ret = down_interruptible(&ipt_mutex);
1438 if (ret != 0)
1439 return ret;
1440
1441 list_add(&match->list, &ipt_match);
1442 up(&ipt_mutex);
1443
1444 return ret;
1445}
1446
1447void
1448ipt_unregister_match(struct ipt_match *match)
1449{
1450 down(&ipt_mutex);
1451 LIST_DELETE(&ipt_match, match);
1452 up(&ipt_mutex);
1453}
1454
1455int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
1456{
1457 int ret;
1458 struct ipt_table_info *newinfo;
1459 static struct ipt_table_info bootstrap
1460 = { 0, 0, 0, { 0 }, { 0 }, { } };
1461
1462 newinfo = vmalloc(sizeof(struct ipt_table_info)
1463 + SMP_ALIGN(repl->size) * num_possible_cpus());
1464 if (!newinfo)
1465 return -ENOMEM;
1466
1467 memcpy(newinfo->entries, repl->entries, repl->size);
1468
1469 ret = translate_table(table->name, table->valid_hooks,
1470 newinfo, repl->size,
1471 repl->num_entries,
1472 repl->hook_entry,
1473 repl->underflow);
1474 if (ret != 0) {
1475 vfree(newinfo);
1476 return ret;
1477 }
1478
1479 ret = down_interruptible(&ipt_mutex);
1480 if (ret != 0) {
1481 vfree(newinfo);
1482 return ret;
1483 }
1484
1485 /* Don't autoload: we'd eat our tail... */
1486 if (list_named_find(&ipt_tables, table->name)) {
1487 ret = -EEXIST;
1488 goto free_unlock;
1489 }
1490
1491 /* Simplifies replace_table code. */
1492 table->private = &bootstrap;
1493 if (!replace_table(table, 0, newinfo, &ret))
1494 goto free_unlock;
1495
1496 duprintf("table->private->number = %u\n",
1497 table->private->number);
1498
1499 /* save number of initial entries */
1500 table->private->initial_entries = table->private->number;
1501
1502 rwlock_init(&table->lock);
1503 list_prepend(&ipt_tables, table);
1504
1505 unlock:
1506 up(&ipt_mutex);
1507 return ret;
1508
1509 free_unlock:
1510 vfree(newinfo);
1511 goto unlock;
1512}
1513
1514void ipt_unregister_table(struct ipt_table *table)
1515{
1516 down(&ipt_mutex);
1517 LIST_DELETE(&ipt_tables, table);
1518 up(&ipt_mutex);
1519
1520 /* Decrease module usage counts and free resources */
1521 IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
1522 cleanup_entry, NULL);
1523 vfree(table->private);
1524}
1525
1526/* Returns 1 if the port is matched by the range, 0 otherwise */
1527static inline int
1528port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert)
1529{
1530 int ret;
1531
1532 ret = (port >= min && port <= max) ^ invert;
1533 return ret;
1534}
1535
1536static int
1537tcp_find_option(u_int8_t option,
1538 const struct sk_buff *skb,
1539 unsigned int optlen,
1540 int invert,
1541 int *hotdrop)
1542{
1543 /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
1544 u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
1545 unsigned int i;
1546
1547 duprintf("tcp_match: finding option\n");
1548
1549 if (!optlen)
1550 return invert;
1551
1552 /* If we don't have the whole header, drop packet. */
1553 op = skb_header_pointer(skb,
1554 skb->nh.iph->ihl*4 + sizeof(struct tcphdr),
1555 optlen, _opt);
1556 if (op == NULL) {
1557 *hotdrop = 1;
1558 return 0;
1559 }
1560
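	/* Option kinds 0 (EOL) and 1 (NOP) are a single byte; every other
	 * option is followed by a length byte. Treat a zero length as 1 so a
	 * malformed packet cannot stall the loop. */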
1561 for (i = 0; i < optlen; ) {
1562 if (op[i] == option) return !invert;
1563 if (op[i] < 2) i++;
1564 else i += op[i+1]?:1;
1565 }
1566
1567 return invert;
1568}
1569
1570static int
1571tcp_match(const struct sk_buff *skb,
1572 const struct net_device *in,
1573 const struct net_device *out,
1574 const void *matchinfo,
1575 int offset,
1576 int *hotdrop)
1577{
1578 struct tcphdr _tcph, *th;
1579 const struct ipt_tcp *tcpinfo = matchinfo;
1580
1581 if (offset) {
1582 /* To quote Alan:
1583
1584 Don't allow a fragment of TCP 8 bytes in. Nobody normal
1585	   causes this. It's a cracker trying to break in by doing a
1586 flag overwrite to pass the direction checks.
1587 */
1588 if (offset == 1) {
1589 duprintf("Dropping evil TCP offset=1 frag.\n");
1590 *hotdrop = 1;
1591 }
1592 /* Must not be a fragment. */
1593 return 0;
1594 }
1595
1596#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg))
1597
1598 th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
1599 sizeof(_tcph), &_tcph);
1600 if (th == NULL) {
1601 /* We've been asked to examine this packet, and we
1602 can't. Hence, no choice but to drop. */
1603 duprintf("Dropping evil TCP offset=0 tinygram.\n");
1604 *hotdrop = 1;
1605 return 0;
1606 }
1607
1608 if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
1609 ntohs(th->source),
1610 !!(tcpinfo->invflags & IPT_TCP_INV_SRCPT)))
1611 return 0;
1612 if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
1613 ntohs(th->dest),
1614 !!(tcpinfo->invflags & IPT_TCP_INV_DSTPT)))
1615 return 0;
1616 if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask)
1617 == tcpinfo->flg_cmp,
1618 IPT_TCP_INV_FLAGS))
1619 return 0;
1620 if (tcpinfo->option) {
1621 if (th->doff * 4 < sizeof(_tcph)) {
1622 *hotdrop = 1;
1623 return 0;
1624 }
1625 if (!tcp_find_option(tcpinfo->option, skb,
1626 th->doff*4 - sizeof(_tcph),
1627 tcpinfo->invflags & IPT_TCP_INV_OPTION,
1628 hotdrop))
1629 return 0;
1630 }
1631 return 1;
1632}
1633
1634/* Called when user tries to insert an entry of this type. */
1635static int
1636tcp_checkentry(const char *tablename,
1637 const struct ipt_ip *ip,
1638 void *matchinfo,
1639 unsigned int matchsize,
1640 unsigned int hook_mask)
1641{
1642 const struct ipt_tcp *tcpinfo = matchinfo;
1643
1644 /* Must specify proto == TCP, and no unknown invflags */
1645 return ip->proto == IPPROTO_TCP
1646 && !(ip->invflags & IPT_INV_PROTO)
1647 && matchsize == IPT_ALIGN(sizeof(struct ipt_tcp))
1648 && !(tcpinfo->invflags & ~IPT_TCP_INV_MASK);
1649}
1650
1651static int
1652udp_match(const struct sk_buff *skb,
1653 const struct net_device *in,
1654 const struct net_device *out,
1655 const void *matchinfo,
1656 int offset,
1657 int *hotdrop)
1658{
1659 struct udphdr _udph, *uh;
1660 const struct ipt_udp *udpinfo = matchinfo;
1661
1662 /* Must not be a fragment. */
1663 if (offset)
1664 return 0;
1665
1666 uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
1667 sizeof(_udph), &_udph);
1668 if (uh == NULL) {
1669 /* We've been asked to examine this packet, and we
1670 can't. Hence, no choice but to drop. */
1671 duprintf("Dropping evil UDP tinygram.\n");
1672 *hotdrop = 1;
1673 return 0;
1674 }
1675
1676 return port_match(udpinfo->spts[0], udpinfo->spts[1],
1677 ntohs(uh->source),
1678 !!(udpinfo->invflags & IPT_UDP_INV_SRCPT))
1679 && port_match(udpinfo->dpts[0], udpinfo->dpts[1],
1680 ntohs(uh->dest),
1681 !!(udpinfo->invflags & IPT_UDP_INV_DSTPT));
1682}
1683
1684/* Called when user tries to insert an entry of this type. */
1685static int
1686udp_checkentry(const char *tablename,
1687 const struct ipt_ip *ip,
1688 void *matchinfo,
1689 unsigned int matchinfosize,
1690 unsigned int hook_mask)
1691{
1692 const struct ipt_udp *udpinfo = matchinfo;
1693
1694 /* Must specify proto == UDP, and no unknown invflags */
1695 if (ip->proto != IPPROTO_UDP || (ip->invflags & IPT_INV_PROTO)) {
1696 duprintf("ipt_udp: Protocol %u != %u\n", ip->proto,
1697 IPPROTO_UDP);
1698 return 0;
1699 }
1700 if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_udp))) {
1701 duprintf("ipt_udp: matchsize %u != %u\n",
1702 matchinfosize, IPT_ALIGN(sizeof(struct ipt_udp)));
1703 return 0;
1704 }
1705 if (udpinfo->invflags & ~IPT_UDP_INV_MASK) {
1706 duprintf("ipt_udp: unknown flags %X\n",
1707 udpinfo->invflags);
1708 return 0;
1709 }
1710
1711 return 1;
1712}
1713
1714/* Returns 1 if the type and code are matched by the range, 0 otherwise */
1715static inline int
1716icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
1717 u_int8_t type, u_int8_t code,
1718 int invert)
1719{
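	/* A test_type of 0xFF is a wildcard that matches any ICMP type. */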
1720 return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code))
1721 ^ invert;
1722}
1723
1724static int
1725icmp_match(const struct sk_buff *skb,
1726 const struct net_device *in,
1727 const struct net_device *out,
1728 const void *matchinfo,
1729 int offset,
1730 int *hotdrop)
1731{
1732 struct icmphdr _icmph, *ic;
1733 const struct ipt_icmp *icmpinfo = matchinfo;
1734
1735 /* Must not be a fragment. */
1736 if (offset)
1737 return 0;
1738
1739 ic = skb_header_pointer(skb, skb->nh.iph->ihl*4,
1740 sizeof(_icmph), &_icmph);
1741 if (ic == NULL) {
1742 /* We've been asked to examine this packet, and we
1743 * can't. Hence, no choice but to drop.
1744 */
1745 duprintf("Dropping evil ICMP tinygram.\n");
1746 *hotdrop = 1;
1747 return 0;
1748 }
1749
1750 return icmp_type_code_match(icmpinfo->type,
1751 icmpinfo->code[0],
1752 icmpinfo->code[1],
1753 ic->type, ic->code,
1754 !!(icmpinfo->invflags&IPT_ICMP_INV));
1755}
1756
1757/* Called when user tries to insert an entry of this type. */
1758static int
1759icmp_checkentry(const char *tablename,
1760 const struct ipt_ip *ip,
1761 void *matchinfo,
1762 unsigned int matchsize,
1763 unsigned int hook_mask)
1764{
1765 const struct ipt_icmp *icmpinfo = matchinfo;
1766
1767 /* Must specify proto == ICMP, and no unknown invflags */
1768 return ip->proto == IPPROTO_ICMP
1769 && !(ip->invflags & IPT_INV_PROTO)
1770 && matchsize == IPT_ALIGN(sizeof(struct ipt_icmp))
1771 && !(icmpinfo->invflags & ~IPT_ICMP_INV);
1772}
1773
1774/* The built-in targets: standard (NULL) and error. */
1775static struct ipt_target ipt_standard_target = {
1776 .name = IPT_STANDARD_TARGET,
1777};
1778
1779static struct ipt_target ipt_error_target = {
1780 .name = IPT_ERROR_TARGET,
1781 .target = ipt_error,
1782};
1783
1784static struct nf_sockopt_ops ipt_sockopts = {
1785 .pf = PF_INET,
1786 .set_optmin = IPT_BASE_CTL,
1787 .set_optmax = IPT_SO_SET_MAX+1,
1788 .set = do_ipt_set_ctl,
1789 .get_optmin = IPT_BASE_CTL,
1790 .get_optmax = IPT_SO_GET_MAX+1,
1791 .get = do_ipt_get_ctl,
1792};
1793
1794static struct ipt_match tcp_matchstruct = {
1795 .name = "tcp",
1796 .match = &tcp_match,
1797 .checkentry = &tcp_checkentry,
1798};
1799
1800static struct ipt_match udp_matchstruct = {
1801 .name = "udp",
1802 .match = &udp_match,
1803 .checkentry = &udp_checkentry,
1804};
1805
1806static struct ipt_match icmp_matchstruct = {
1807 .name = "icmp",
1808 .match = &icmp_match,
1809 .checkentry = &icmp_checkentry,
1810};
1811
1812#ifdef CONFIG_PROC_FS
1813static inline int print_name(const char *i,
1814 off_t start_offset, char *buffer, int length,
1815 off_t *pos, unsigned int *count)
1816{
1817 if ((*count)++ >= start_offset) {
1818 unsigned int namelen;
1819
1820 namelen = sprintf(buffer + *pos, "%s\n",
1821 i + sizeof(struct list_head));
1822 if (*pos + namelen > length) {
1823 /* Stop iterating */
1824 return 1;
1825 }
1826 *pos += namelen;
1827 }
1828 return 0;
1829}
1830
1831static inline int print_target(const struct ipt_target *t,
1832 off_t start_offset, char *buffer, int length,
1833 off_t *pos, unsigned int *count)
1834{
1835 if (t == &ipt_standard_target || t == &ipt_error_target)
1836 return 0;
1837 return print_name((char *)t, start_offset, buffer, length, pos, count);
1838}
1839
1840static int ipt_get_tables(char *buffer, char **start, off_t offset, int length)
1841{
1842 off_t pos = 0;
1843 unsigned int count = 0;
1844
1845 if (down_interruptible(&ipt_mutex) != 0)
1846 return 0;
1847
1848 LIST_FIND(&ipt_tables, print_name, void *,
1849 offset, buffer, length, &pos, &count);
1850
1851 up(&ipt_mutex);
1852
1853 /* `start' hack - see fs/proc/generic.c line ~105 */
1854 *start=(char *)((unsigned long)count-offset);
1855 return pos;
1856}
1857
1858static int ipt_get_targets(char *buffer, char **start, off_t offset, int length)
1859{
1860 off_t pos = 0;
1861 unsigned int count = 0;
1862
1863 if (down_interruptible(&ipt_mutex) != 0)
1864 return 0;
1865
1866 LIST_FIND(&ipt_target, print_target, struct ipt_target *,
1867 offset, buffer, length, &pos, &count);
1868
1869 up(&ipt_mutex);
1870
1871 *start = (char *)((unsigned long)count - offset);
1872 return pos;
1873}
1874
1875static int ipt_get_matches(char *buffer, char **start, off_t offset, int length)
1876{
1877 off_t pos = 0;
1878 unsigned int count = 0;
1879
1880 if (down_interruptible(&ipt_mutex) != 0)
1881 return 0;
1882
1883 LIST_FIND(&ipt_match, print_name, void *,
1884 offset, buffer, length, &pos, &count);
1885
1886 up(&ipt_mutex);
1887
1888 *start = (char *)((unsigned long)count - offset);
1889 return pos;
1890}
1891
1892static struct { char *name; get_info_t *get_info; } ipt_proc_entry[] =
1893{ { "ip_tables_names", ipt_get_tables },
1894 { "ip_tables_targets", ipt_get_targets },
1895 { "ip_tables_matches", ipt_get_matches },
1896 { NULL, NULL} };
1897#endif /*CONFIG_PROC_FS*/
1898
1899static int __init init(void)
1900{
1901 int ret;
1902
1903	/* No one else will be downing the sem now, so we won't sleep */
1904 down(&ipt_mutex);
1905 list_append(&ipt_target, &ipt_standard_target);
1906 list_append(&ipt_target, &ipt_error_target);
1907 list_append(&ipt_match, &tcp_matchstruct);
1908 list_append(&ipt_match, &udp_matchstruct);
1909 list_append(&ipt_match, &icmp_matchstruct);
1910 up(&ipt_mutex);
1911
1912 /* Register setsockopt */
1913 ret = nf_register_sockopt(&ipt_sockopts);
1914 if (ret < 0) {
1915 duprintf("Unable to register sockopts.\n");
1916 return ret;
1917 }
1918
1919#ifdef CONFIG_PROC_FS
1920 {
1921 struct proc_dir_entry *proc;
1922 int i;
1923
1924 for (i = 0; ipt_proc_entry[i].name; i++) {
1925 proc = proc_net_create(ipt_proc_entry[i].name, 0,
1926 ipt_proc_entry[i].get_info);
1927 if (!proc) {
1928 while (--i >= 0)
1929 proc_net_remove(ipt_proc_entry[i].name);
1930 nf_unregister_sockopt(&ipt_sockopts);
1931 return -ENOMEM;
1932 }
1933 proc->owner = THIS_MODULE;
1934 }
1935 }
1936#endif
1937
1938 printk("ip_tables: (C) 2000-2002 Netfilter core team\n");
1939 return 0;
1940}
1941
1942static void __exit fini(void)
1943{
1944 nf_unregister_sockopt(&ipt_sockopts);
1945#ifdef CONFIG_PROC_FS
1946 {
1947 int i;
1948 for (i = 0; ipt_proc_entry[i].name; i++)
1949 proc_net_remove(ipt_proc_entry[i].name);
1950 }
1951#endif
1952}
1953
1954EXPORT_SYMBOL(ipt_register_table);
1955EXPORT_SYMBOL(ipt_unregister_table);
1956EXPORT_SYMBOL(ipt_register_match);
1957EXPORT_SYMBOL(ipt_unregister_match);
1958EXPORT_SYMBOL(ipt_do_table);
1959EXPORT_SYMBOL(ipt_register_target);
1960EXPORT_SYMBOL(ipt_unregister_target);
1961EXPORT_SYMBOL(ipt_find_target);
1962
1963module_init(init);
1964module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c
new file mode 100644
index 000000000000..9842e6e23184
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CLASSIFY.c
@@ -0,0 +1,92 @@
1/*
2 * This is a module which is used for setting the skb->priority field
3 * of an skb for qdisc classification.
4 */
5
6/* (C) 2001-2002 Patrick McHardy <kaber@trash.net>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/ip.h>
16#include <net/checksum.h>
17
18#include <linux/netfilter_ipv4/ip_tables.h>
19#include <linux/netfilter_ipv4/ipt_CLASSIFY.h>
20
21MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
22MODULE_LICENSE("GPL");
23MODULE_DESCRIPTION("iptables qdisc classification target module");
24
25static unsigned int
26target(struct sk_buff **pskb,
27 const struct net_device *in,
28 const struct net_device *out,
29 unsigned int hooknum,
30 const void *targinfo,
31 void *userinfo)
32{
33 const struct ipt_classify_target_info *clinfo = targinfo;
34
35 if((*pskb)->priority != clinfo->priority) {
36 (*pskb)->priority = clinfo->priority;
37 (*pskb)->nfcache |= NFC_ALTERED;
38 }
39
40 return IPT_CONTINUE;
41}
42
43static int
44checkentry(const char *tablename,
45 const struct ipt_entry *e,
46 void *targinfo,
47 unsigned int targinfosize,
48 unsigned int hook_mask)
49{
50 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_classify_target_info))){
51 printk(KERN_ERR "CLASSIFY: invalid size (%u != %Zu).\n",
52 targinfosize,
53 IPT_ALIGN(sizeof(struct ipt_classify_target_info)));
54 return 0;
55 }
56
57 if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) |
58 (1 << NF_IP_POST_ROUTING))) {
59 printk(KERN_ERR "CLASSIFY: only valid in LOCAL_OUT, FORWARD "
60 "and POST_ROUTING.\n");
61 return 0;
62 }
63
64 if (strcmp(tablename, "mangle") != 0) {
65 printk(KERN_ERR "CLASSIFY: can only be called from "
66 "\"mangle\" table, not \"%s\".\n",
67 tablename);
68 return 0;
69 }
70
71 return 1;
72}
73
74static struct ipt_target ipt_classify_reg = {
75 .name = "CLASSIFY",
76 .target = target,
77 .checkentry = checkentry,
78 .me = THIS_MODULE,
79};
80
81static int __init init(void)
82{
83 return ipt_register_target(&ipt_classify_reg);
84}
85
86static void __exit fini(void)
87{
88 ipt_unregister_target(&ipt_classify_reg);
89}
90
91module_init(init);
92module_exit(fini);
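
The CLASSIFY target above copies a qdisc class handle from the rule into skb->priority. Assuming the conventional tc encoding of a class handle ("major:minor" packed into 32 bits, major in the upper half), a small sketch of the value such a rule would carry:

#include <stdint.h>
#include <stdio.h>

/* Pack a tc "major:minor" class handle the way tc conventionally encodes it:
 * major in the upper 16 bits, minor in the lower 16 bits. */
static uint32_t tc_classid(uint16_t major, uint16_t minor)
{
	return ((uint32_t)major << 16) | minor;
}

int main(void)
{
	/* class 1:10 -> 0x0001000a; this is the value a CLASSIFY rule
	 * would end up storing into skb->priority. */
	printf("0x%08x\n", tc_classid(1, 10));
	return 0;
}
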
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
new file mode 100644
index 000000000000..0f12e3a3dc73
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -0,0 +1,761 @@
1/* Cluster IP hashmark target
2 * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
3 * based on ideas of Fabio Olive Leite <olive@unixforge.org>
4 *
5 * Development of this code funded by SuSE Linux AG, http://www.suse.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 */
12#include <linux/module.h>
13#include <linux/config.h>
14#include <linux/proc_fs.h>
15#include <linux/jhash.h>
16#include <linux/skbuff.h>
17#include <linux/ip.h>
18#include <linux/tcp.h>
19#include <linux/udp.h>
20#include <linux/icmp.h>
21#include <linux/if_arp.h>
22#include <linux/proc_fs.h>
23#include <linux/seq_file.h>
24
25#include <net/checksum.h>
26
27#include <linux/netfilter_arp.h>
28
29#include <linux/netfilter_ipv4/ip_tables.h>
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h>
32#include <linux/netfilter_ipv4/lockhelp.h>
33
34#define CLUSTERIP_VERSION "0.6"
35
36#define DEBUG_CLUSTERIP
37
38#ifdef DEBUG_CLUSTERIP
39#define DEBUGP printk
40#else
41#define DEBUGP
42#endif
43
44MODULE_LICENSE("GPL");
45MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
46MODULE_DESCRIPTION("iptables target for CLUSTERIP");
47
48struct clusterip_config {
49 struct list_head list; /* list of all configs */
50 atomic_t refcount; /* reference count */
51
52 u_int32_t clusterip; /* the IP address */
53 u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
54 struct net_device *dev; /* device */
55 u_int16_t num_total_nodes; /* total number of nodes */
56 u_int16_t num_local_nodes; /* number of local nodes */
57 u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; /* node number array */
58
59#ifdef CONFIG_PROC_FS
60 struct proc_dir_entry *pde; /* proc dir entry */
61#endif
62 enum clusterip_hashmode hash_mode; /* which hashing mode */
63 u_int32_t hash_initval; /* hash initialization */
64};
65
66static LIST_HEAD(clusterip_configs);
67
68/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
69 * data within all structures (num_local_nodes, local_nodes[]) */
70static DECLARE_RWLOCK(clusterip_lock);
71
72#ifdef CONFIG_PROC_FS
73static struct file_operations clusterip_proc_fops;
74static struct proc_dir_entry *clusterip_procdir;
75#endif
76
77static inline void
78clusterip_config_get(struct clusterip_config *c) {
79 atomic_inc(&c->refcount);
80}
81
82static inline void
83clusterip_config_put(struct clusterip_config *c) {
84 if (atomic_dec_and_test(&c->refcount)) {
85 WRITE_LOCK(&clusterip_lock);
86 list_del(&c->list);
87 WRITE_UNLOCK(&clusterip_lock);
88 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
89 dev_put(c->dev);
90 kfree(c);
91 }
92}
93
94
95static struct clusterip_config *
96__clusterip_config_find(u_int32_t clusterip)
97{
98 struct list_head *pos;
99
100 MUST_BE_READ_LOCKED(&clusterip_lock);
101 list_for_each(pos, &clusterip_configs) {
102 struct clusterip_config *c = list_entry(pos,
103 struct clusterip_config, list);
104 if (c->clusterip == clusterip) {
105 return c;
106 }
107 }
108
109 return NULL;
110}
111
112static inline struct clusterip_config *
113clusterip_config_find_get(u_int32_t clusterip)
114{
115 struct clusterip_config *c;
116
117 READ_LOCK(&clusterip_lock);
118 c = __clusterip_config_find(clusterip);
119 if (!c) {
120 READ_UNLOCK(&clusterip_lock);
121 return NULL;
122 }
123 atomic_inc(&c->refcount);
124 READ_UNLOCK(&clusterip_lock);
125
126 return c;
127}
128
129static struct clusterip_config *
130clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
131 struct net_device *dev)
132{
133 struct clusterip_config *c;
134 char buffer[16];
135
136 c = kmalloc(sizeof(*c), GFP_ATOMIC);
137 if (!c)
138 return NULL;
139
140 memset(c, 0, sizeof(*c));
141 c->dev = dev;
142 c->clusterip = ip;
143 memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
144 c->num_total_nodes = i->num_total_nodes;
145 c->num_local_nodes = i->num_local_nodes;
146	memcpy(&c->local_nodes, &i->local_nodes, sizeof(c->local_nodes));
147 c->hash_mode = i->hash_mode;
148 c->hash_initval = i->hash_initval;
149 atomic_set(&c->refcount, 1);
150
151#ifdef CONFIG_PROC_FS
152 /* create proc dir entry */
153 sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip));
154 c->pde = create_proc_entry(buffer, S_IWUSR|S_IRUSR, clusterip_procdir);
155 if (!c->pde) {
156 kfree(c);
157 return NULL;
158 }
159 c->pde->proc_fops = &clusterip_proc_fops;
160 c->pde->data = c;
161#endif
162
163 WRITE_LOCK(&clusterip_lock);
164 list_add(&c->list, &clusterip_configs);
165 WRITE_UNLOCK(&clusterip_lock);
166
167 return c;
168}
169
170static int
171clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
172{
173 int i;
174
175 WRITE_LOCK(&clusterip_lock);
176
177 if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
178 || nodenum > CLUSTERIP_MAX_NODES) {
179 WRITE_UNLOCK(&clusterip_lock);
180 return 1;
181 }
182
183	/* check if we already have this number in our array */
184 for (i = 0; i < c->num_local_nodes; i++) {
185 if (c->local_nodes[i] == nodenum) {
186 WRITE_UNLOCK(&clusterip_lock);
187 return 1;
188 }
189 }
190
191 c->local_nodes[c->num_local_nodes++] = nodenum;
192
193 WRITE_UNLOCK(&clusterip_lock);
194 return 0;
195}
196
197static int
198clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
199{
200 int i;
201
202 WRITE_LOCK(&clusterip_lock);
203
204 if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
205 WRITE_UNLOCK(&clusterip_lock);
206 return 1;
207 }
208
209 for (i = 0; i < c->num_local_nodes; i++) {
210 if (c->local_nodes[i] == nodenum) {
211 int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
212 memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
213 c->num_local_nodes--;
214 WRITE_UNLOCK(&clusterip_lock);
215 return 0;
216 }
217 }
218
219 WRITE_UNLOCK(&clusterip_lock);
220 return 1;
221}
222
223static inline u_int32_t
224clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
225{
226 struct iphdr *iph = skb->nh.iph;
227 unsigned long hashval;
228 u_int16_t sport, dport;
229 struct tcphdr *th;
230 struct udphdr *uh;
231 struct icmphdr *ih;
232
233 switch (iph->protocol) {
234 case IPPROTO_TCP:
235 th = (void *)iph+iph->ihl*4;
236 sport = ntohs(th->source);
237 dport = ntohs(th->dest);
238 break;
239 case IPPROTO_UDP:
240 uh = (void *)iph+iph->ihl*4;
241 sport = ntohs(uh->source);
242 dport = ntohs(uh->dest);
243 break;
244 case IPPROTO_ICMP:
245 ih = (void *)iph+iph->ihl*4;
246 sport = ntohs(ih->un.echo.id);
247 dport = (ih->type<<8)|ih->code;
248 break;
249 default:
250 if (net_ratelimit()) {
251 printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n",
252 iph->protocol);
253 }
254 sport = dport = 0;
255 }
256
257 switch (config->hash_mode) {
258 case CLUSTERIP_HASHMODE_SIP:
259 hashval = jhash_1word(ntohl(iph->saddr),
260 config->hash_initval);
261 break;
262 case CLUSTERIP_HASHMODE_SIP_SPT:
263 hashval = jhash_2words(ntohl(iph->saddr), sport,
264 config->hash_initval);
265 break;
266 case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
267 hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
268 config->hash_initval);
269 break;
270 default:
271 /* to make gcc happy */
272 hashval = 0;
273 /* This cannot happen, unless the check function wasn't called
274 * at rule load time */
275 printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode);
276 BUG();
277 break;
278 }
279
280 /* node numbers are 1..n, not 0..n */
281 return ((hashval % config->num_total_nodes)+1);
282}
283
284static inline int
285clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
286{
287 int i;
288
289 READ_LOCK(&clusterip_lock);
290
291 if (config->num_local_nodes == 0) {
292 READ_UNLOCK(&clusterip_lock);
293 return 0;
294 }
295
296 for (i = 0; i < config->num_local_nodes; i++) {
297 if (config->local_nodes[i] == hash) {
298 READ_UNLOCK(&clusterip_lock);
299 return 1;
300 }
301 }
302
303 READ_UNLOCK(&clusterip_lock);
304
305 return 0;
306}
307
308/***********************************************************************
309 * IPTABLES TARGET
310 ***********************************************************************/
311
312static unsigned int
313target(struct sk_buff **pskb,
314 const struct net_device *in,
315 const struct net_device *out,
316 unsigned int hooknum,
317 const void *targinfo,
318 void *userinfo)
319{
320 const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
321 enum ip_conntrack_info ctinfo;
322 struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
323 u_int32_t hash;
324
325 /* don't need to clusterip_config_get() here, since refcount
326 * is only decremented by destroy() - and ip_tables guarantees
327 * that the ->target() function isn't called after ->destroy() */
328
329 if (!ct) {
330 printk(KERN_ERR "CLUSTERIP: no conntrack!\n");
331 /* FIXME: need to drop invalid ones, since replies
332 * to outgoing connections of other nodes will be
333 * marked as INVALID */
334 return NF_DROP;
335 }
336
337 /* special case: ICMP error handling. conntrack distinguishes between
338 * error messages (RELATED) and information requests (see below) */
339 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
340 && (ctinfo == IP_CT_RELATED
341	    || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
342 return IPT_CONTINUE;
343
344 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
345 * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
346 * on, which all have an ID field [relevant for hashing]. */
347
348 hash = clusterip_hashfn(*pskb, cipinfo->config);
349
350 switch (ctinfo) {
351 case IP_CT_NEW:
352 ct->mark = hash;
353 break;
354 case IP_CT_RELATED:
355 case IP_CT_RELATED+IP_CT_IS_REPLY:
356 /* FIXME: we don't handle expectations at the
357 * moment. they can arrive on a different node than
358 * the master connection (e.g. FTP passive mode) */
359 case IP_CT_ESTABLISHED:
360 case IP_CT_ESTABLISHED+IP_CT_IS_REPLY:
361 break;
362 default:
363 break;
364 }
365
366#ifdef DEBUG_CLUSTERIP
367 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
368#endif
369 DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark);
370 if (!clusterip_responsible(cipinfo->config, hash)) {
371 DEBUGP("not responsible\n");
372 return NF_DROP;
373 }
374 DEBUGP("responsible\n");
375
376 /* despite being received via linklayer multicast, this is
377 * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
378 (*pskb)->pkt_type = PACKET_HOST;
379
380 return IPT_CONTINUE;
381}
382
383static int
384checkentry(const char *tablename,
385 const struct ipt_entry *e,
386 void *targinfo,
387 unsigned int targinfosize,
388 unsigned int hook_mask)
389{
390 struct ipt_clusterip_tgt_info *cipinfo = targinfo;
391
392 struct clusterip_config *config;
393
394 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info))) {
395 printk(KERN_WARNING "CLUSTERIP: targinfosize %u != %Zu\n",
396 targinfosize,
397 IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info)));
398 return 0;
399 }
400
401 if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
402 cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
403 cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
404 printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n",
405 cipinfo->hash_mode);
406 return 0;
407
408 }
409 if (e->ip.dmsk.s_addr != 0xffffffff
410 || e->ip.dst.s_addr == 0) {
411 printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n");
412 return 0;
413 }
414
415 /* FIXME: further sanity checks */
416
417 config = clusterip_config_find_get(e->ip.dst.s_addr);
418 if (!config) {
419 if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
420 printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr));
421 return 0;
422 } else {
423 struct net_device *dev;
424
425 if (e->ip.iniface[0] == '\0') {
426 printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n");
427 return 0;
428 }
429
430 dev = dev_get_by_name(e->ip.iniface);
431 if (!dev) {
432 printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
433 return 0;
434 }
435
436 config = clusterip_config_init(cipinfo,
437 e->ip.dst.s_addr, dev);
438 if (!config) {
439 printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n");
440 dev_put(dev);
441 return 0;
442 }
443 dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0);
444 }
445 }
446
447 cipinfo->config = config;
448
449 return 1;
450}
451
452/* drop reference count of cluster config when rule is deleted */
453static void destroy(void *matchinfo, unsigned int matchinfosize)
454{
455 struct ipt_clusterip_tgt_info *cipinfo = matchinfo;
456
457 /* we first remove the proc entry and then drop the reference
458 * count. In case anyone still accesses the file, the open/close
459 * functions are also incrementing the refcount on their own */
460#ifdef CONFIG_PROC_FS
461 remove_proc_entry(cipinfo->config->pde->name,
462 cipinfo->config->pde->parent);
463#endif
464 clusterip_config_put(cipinfo->config);
465}
466
467static struct ipt_target clusterip_tgt = {
468 .name = "CLUSTERIP",
469 .target = &target,
470 .checkentry = &checkentry,
471 .destroy = &destroy,
472 .me = THIS_MODULE
473};
474
475
476/***********************************************************************
477 * ARP MANGLING CODE
478 ***********************************************************************/
479
480/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
481struct arp_payload {
482 u_int8_t src_hw[ETH_ALEN];
483 u_int32_t src_ip;
484 u_int8_t dst_hw[ETH_ALEN];
485 u_int32_t dst_ip;
486} __attribute__ ((packed));
487
488#ifdef CLUSTERIP_DEBUG
489static void arp_print(struct arp_payload *payload)
490{
491#define HBUFFERLEN 30
492 char hbuffer[HBUFFERLEN];
493 int j,k;
494 const char hexbuf[]= "0123456789abcdef";
495
496 for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
497 hbuffer[k++]=hexbuf[(payload->src_hw[j]>>4)&15];
498 hbuffer[k++]=hexbuf[payload->src_hw[j]&15];
499 hbuffer[k++]=':';
500 }
501 hbuffer[--k]='\0';
502
503 printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n",
504 NIPQUAD(payload->src_ip), hbuffer,
505 NIPQUAD(payload->dst_ip));
506}
507#endif
508
509static unsigned int
510arp_mangle(unsigned int hook,
511 struct sk_buff **pskb,
512 const struct net_device *in,
513 const struct net_device *out,
514 int (*okfn)(struct sk_buff *))
515{
516 struct arphdr *arp = (*pskb)->nh.arph;
517 struct arp_payload *payload;
518 struct clusterip_config *c;
519
520 /* we don't care about non-ethernet and non-ipv4 ARP */
521 if (arp->ar_hrd != htons(ARPHRD_ETHER)
522 || arp->ar_pro != htons(ETH_P_IP)
523 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
524 return NF_ACCEPT;
525
526 /* we only want to mangle arp replies */
527 if (arp->ar_op != htons(ARPOP_REPLY))
528 return NF_ACCEPT;
529
530 payload = (void *)(arp+1);
531
532 /* if there is no clusterip configuration for the arp reply's
533 * source ip, we don't want to mangle it */
534 c = clusterip_config_find_get(payload->src_ip);
535 if (!c)
536 return NF_ACCEPT;
537
538 /* normally the linux kernel always replies to arp queries of
539 * addresses on different interfaces. However, in the CLUSTERIP case
540 * this wouldn't work, since we didn't subscribe the mcast group on
541 * other interfaces */
542 if (c->dev != out) {
543 DEBUGP("CLUSTERIP: not mangling arp reply on different "
544 "interface: cip'%s'-skb'%s'\n", c->dev->name, out->name);
545 clusterip_config_put(c);
546 return NF_ACCEPT;
547 }
548
549 /* mangle reply hardware address */
550 memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
551
552#ifdef CLUSTERIP_DEBUG
553 DEBUGP(KERN_DEBUG "CLUSTERIP mangled arp reply: ");
554 arp_print(payload);
555#endif
556
557 clusterip_config_put(c);
558
559 return NF_ACCEPT;
560}
561
562static struct nf_hook_ops cip_arp_ops = {
563 .hook = arp_mangle,
564 .pf = NF_ARP,
565 .hooknum = NF_ARP_OUT,
566 .priority = -1
567};
568
569/***********************************************************************
570 * PROC DIR HANDLING
571 ***********************************************************************/
572
573#ifdef CONFIG_PROC_FS
574
575static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
576{
577 struct proc_dir_entry *pde = s->private;
578 struct clusterip_config *c = pde->data;
579 unsigned int *nodeidx;
580
581 READ_LOCK(&clusterip_lock);
582 if (*pos >= c->num_local_nodes)
583 return NULL;
584
585 nodeidx = kmalloc(sizeof(unsigned int), GFP_KERNEL);
586 if (!nodeidx)
587 return ERR_PTR(-ENOMEM);
588
589 *nodeidx = *pos;
590 return nodeidx;
591}
592
593static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
594{
595 struct proc_dir_entry *pde = s->private;
596 struct clusterip_config *c = pde->data;
597 unsigned int *nodeidx = (unsigned int *)v;
598
599 *pos = ++(*nodeidx);
600 if (*pos >= c->num_local_nodes) {
601 kfree(v);
602 return NULL;
603 }
604 return nodeidx;
605}
606
607static void clusterip_seq_stop(struct seq_file *s, void *v)
608{
609 kfree(v);
610
611 READ_UNLOCK(&clusterip_lock);
612}
613
614static int clusterip_seq_show(struct seq_file *s, void *v)
615{
616 struct proc_dir_entry *pde = s->private;
617 struct clusterip_config *c = pde->data;
618 unsigned int *nodeidx = (unsigned int *)v;
619
620 if (*nodeidx != 0)
621 seq_putc(s, ',');
622 seq_printf(s, "%u", c->local_nodes[*nodeidx]);
623
624 if (*nodeidx == c->num_local_nodes-1)
625 seq_putc(s, '\n');
626
627 return 0;
628}
629
630static struct seq_operations clusterip_seq_ops = {
631 .start = clusterip_seq_start,
632 .next = clusterip_seq_next,
633 .stop = clusterip_seq_stop,
634 .show = clusterip_seq_show,
635};
636
637static int clusterip_proc_open(struct inode *inode, struct file *file)
638{
639 int ret = seq_open(file, &clusterip_seq_ops);
640
641 if (!ret) {
642 struct seq_file *sf = file->private_data;
643 struct proc_dir_entry *pde = PDE(inode);
644 struct clusterip_config *c = pde->data;
645
646 sf->private = pde;
647
648 clusterip_config_get(c);
649 }
650
651 return ret;
652}
653
654static int clusterip_proc_release(struct inode *inode, struct file *file)
655{
656 struct proc_dir_entry *pde = PDE(inode);
657 struct clusterip_config *c = pde->data;
658 int ret;
659
660 ret = seq_release(inode, file);
661
662 if (!ret)
663 clusterip_config_put(c);
664
665 return ret;
666}
667
668static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
669 size_t size, loff_t *ofs)
670{
671#define PROC_WRITELEN 10
672 char buffer[PROC_WRITELEN+1];
673 struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode);
674 struct clusterip_config *c = pde->data;
675 unsigned long nodenum;
676
677 if (copy_from_user(buffer, input, PROC_WRITELEN))
678 return -EFAULT;
679
680 if (*buffer == '+') {
681 nodenum = simple_strtoul(buffer+1, NULL, 10);
682 if (clusterip_add_node(c, nodenum))
683 return -ENOMEM;
684 } else if (*buffer == '-') {
685 nodenum = simple_strtoul(buffer+1, NULL,10);
686 if (clusterip_del_node(c, nodenum))
687 return -ENOENT;
688 } else
689 return -EIO;
690
691 return size;
692}
693
694static struct file_operations clusterip_proc_fops = {
695 .owner = THIS_MODULE,
696 .open = clusterip_proc_open,
697 .read = seq_read,
698 .write = clusterip_proc_write,
699 .llseek = seq_lseek,
700 .release = clusterip_proc_release,
701};
702
703#endif /* CONFIG_PROC_FS */
704
705static int init_or_cleanup(int fini)
706{
707 int ret;
708
709 if (fini)
710 goto cleanup;
711
712 if (ipt_register_target(&clusterip_tgt)) {
713 ret = -EINVAL;
714 goto cleanup_none;
715 }
716
717 if (nf_register_hook(&cip_arp_ops) < 0) {
718 ret = -EINVAL;
719 goto cleanup_target;
720 }
721
722#ifdef CONFIG_PROC_FS
723 clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net);
724 if (!clusterip_procdir) {
725		printk(KERN_ERR "CLUSTERIP: Unable to create proc dir entry\n");
726 ret = -ENOMEM;
727 goto cleanup_hook;
728 }
729#endif /* CONFIG_PROC_FS */
730
731 printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n",
732 CLUSTERIP_VERSION);
733
734 return 0;
735
736cleanup:
737 printk(KERN_NOTICE "ClusterIP Version %s unloading\n",
738 CLUSTERIP_VERSION);
739#ifdef CONFIG_PROC_FS
740 remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
741#endif
742cleanup_hook:
743 nf_unregister_hook(&cip_arp_ops);
744cleanup_target:
745 ipt_unregister_target(&clusterip_tgt);
746cleanup_none:
747 return -EINVAL;
748}
749
750static int __init init(void)
751{
752 return init_or_cleanup(0);
753}
754
755static void __exit fini(void)
756{
757 init_or_cleanup(1);
758}
759
760module_init(init);
761module_exit(fini);
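
clusterip_hashfn() above reduces a jhash of the source address (and optionally the ports) modulo the total node count and shifts the result into the 1..n range; clusterip_responsible() then accepts the packet only if that node number appears in the local node list. A self-contained sketch of the same mapping follows, with a toy mixing function standing in for jhash: only the modulo-plus-one mapping and the membership test mirror the module, the hash itself is illustrative.

#include <stdint.h>
#include <stdio.h>

/* Placeholder mixer standing in for the kernel's jhash. */
static uint32_t toy_hash(uint32_t saddr, uint16_t sport, uint32_t initval)
{
	uint32_t h = saddr ^ initval;

	h ^= (uint32_t)sport * 0x9e3779b1u;
	h ^= h >> 16;
	return h;
}

/* Node numbers are 1..num_total_nodes, matching clusterip_hashfn(). */
static uint32_t pick_node(uint32_t saddr, uint16_t sport,
			  uint32_t initval, uint16_t num_total_nodes)
{
	return toy_hash(saddr, sport, initval) % num_total_nodes + 1;
}

/* Same idea as clusterip_responsible(): linear scan of the local node list. */
static int is_responsible(uint32_t node, const uint16_t *local, int n_local)
{
	int i;

	for (i = 0; i < n_local; i++)
		if (local[i] == node)
			return 1;
	return 0;
}

int main(void)
{
	uint16_t local_nodes[] = { 1, 3 };
	uint32_t node = pick_node(0xc0a80007 /* 192.168.0.7 */, 40000,
				  0x12345678, 4);

	printf("node %u, responsible=%d\n", node,
	       is_responsible(node, local_nodes, 2));
	return 0;
}
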
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
new file mode 100644
index 000000000000..30ddd3e18eb7
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -0,0 +1,118 @@
1/* This kernel module is used to modify the connection mark values, or
2 * to optionally restore the skb nfmark from the connection mark
3 *
4 * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
5 * by Henrik Nordstrom <hno@marasystems.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21#include <linux/module.h>
22#include <linux/skbuff.h>
23#include <linux/ip.h>
24#include <net/checksum.h>
25
26MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>");
27MODULE_DESCRIPTION("IP tables CONNMARK matching module");
28MODULE_LICENSE("GPL");
29
30#include <linux/netfilter_ipv4/ip_tables.h>
31#include <linux/netfilter_ipv4/ipt_CONNMARK.h>
32#include <linux/netfilter_ipv4/ip_conntrack.h>
33
34static unsigned int
35target(struct sk_buff **pskb,
36 const struct net_device *in,
37 const struct net_device *out,
38 unsigned int hooknum,
39 const void *targinfo,
40 void *userinfo)
41{
42 const struct ipt_connmark_target_info *markinfo = targinfo;
43 unsigned long diff;
44 unsigned long nfmark;
45 unsigned long newmark;
46
47 enum ip_conntrack_info ctinfo;
48 struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
49 if (ct) {
50 switch(markinfo->mode) {
51 case IPT_CONNMARK_SET:
52 newmark = (ct->mark & ~markinfo->mask) | markinfo->mark;
53 if (newmark != ct->mark)
54 ct->mark = newmark;
55 break;
56 case IPT_CONNMARK_SAVE:
57 newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask);
58 if (ct->mark != newmark)
59 ct->mark = newmark;
60 break;
61 case IPT_CONNMARK_RESTORE:
62 nfmark = (*pskb)->nfmark;
63 diff = (ct->mark ^ nfmark) & markinfo->mask;
64 if (diff != 0) {
65 (*pskb)->nfmark = nfmark ^ diff;
66 (*pskb)->nfcache |= NFC_ALTERED;
67 }
68 break;
69 }
70 }
71
72 return IPT_CONTINUE;
73}
74
75static int
76checkentry(const char *tablename,
77 const struct ipt_entry *e,
78 void *targinfo,
79 unsigned int targinfosize,
80 unsigned int hook_mask)
81{
82 struct ipt_connmark_target_info *matchinfo = targinfo;
83 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_connmark_target_info))) {
84 printk(KERN_WARNING "CONNMARK: targinfosize %u != %Zu\n",
85 targinfosize,
86 IPT_ALIGN(sizeof(struct ipt_connmark_target_info)));
87 return 0;
88 }
89
90 if (matchinfo->mode == IPT_CONNMARK_RESTORE) {
91 if (strcmp(tablename, "mangle") != 0) {
92 printk(KERN_WARNING "CONNMARK: restore can only be called from \"mangle\" table, not \"%s\"\n", tablename);
93 return 0;
94 }
95 }
96
97 return 1;
98}
99
100static struct ipt_target ipt_connmark_reg = {
101 .name = "CONNMARK",
102 .target = &target,
103 .checkentry = &checkentry,
104 .me = THIS_MODULE
105};
106
107static int __init init(void)
108{
109 return ipt_register_target(&ipt_connmark_reg);
110}
111
112static void __exit fini(void)
113{
114 ipt_unregister_target(&ipt_connmark_reg);
115}
116
117module_init(init);
118module_exit(fini);
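
The three CONNMARK modes above are plain mask arithmetic on the connection mark and the packet mark. A short worked example, using made-up mark and mask values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ctmark = 0x000000aa;	/* current connection mark */
	uint32_t nfmark = 0x00000050;	/* current packet mark */
	uint32_t mark   = 0x00000001;	/* --set-mark value */
	uint32_t mask   = 0x000000ff;	/* --mask value */

	/* SET: replace the masked bits of the conntrack mark. */
	uint32_t set = (ctmark & ~mask) | mark;

	/* SAVE: copy the masked bits of the packet mark into the conntrack mark. */
	uint32_t save = (ctmark & ~mask) | (nfmark & mask);

	/* RESTORE: copy the masked bits of the conntrack mark back to the packet,
	 * exactly as the xor-of-differences in the module does. */
	uint32_t diff    = (ctmark ^ nfmark) & mask;
	uint32_t restore = nfmark ^ diff;

	printf("set=0x%08x save=0x%08x restore=0x%08x\n", set, save, restore);
	return 0;
}
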
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
new file mode 100644
index 000000000000..3ea4509099f9
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_DSCP.c
@@ -0,0 +1,106 @@
1/* iptables module for setting the IPv4 DSCP field, Version 1.8
2 *
3 * (C) 2002 by Harald Welte <laforge@netfilter.org>
4 * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * See RFC2474 for a description of the DSCP field within the IP Header.
11 *
12 * ipt_DSCP.c,v 1.8 2002/08/06 18:41:57 laforge Exp
13*/
14
15#include <linux/module.h>
16#include <linux/skbuff.h>
17#include <linux/ip.h>
18#include <net/checksum.h>
19
20#include <linux/netfilter_ipv4/ip_tables.h>
21#include <linux/netfilter_ipv4/ipt_DSCP.h>
22
23MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
24MODULE_DESCRIPTION("iptables DSCP modification module");
25MODULE_LICENSE("GPL");
26
27static unsigned int
28target(struct sk_buff **pskb,
29 const struct net_device *in,
30 const struct net_device *out,
31 unsigned int hooknum,
32 const void *targinfo,
33 void *userinfo)
34{
35 const struct ipt_DSCP_info *dinfo = targinfo;
36 u_int8_t sh_dscp = ((dinfo->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK);
37
38
39 if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) {
40 u_int16_t diffs[2];
41
42 if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
43 return NF_DROP;
44
45 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
46 (*pskb)->nh.iph->tos = ((*pskb)->nh.iph->tos & ~IPT_DSCP_MASK)
47 | sh_dscp;
48 diffs[1] = htons((*pskb)->nh.iph->tos);
49 (*pskb)->nh.iph->check
50 = csum_fold(csum_partial((char *)diffs,
51 sizeof(diffs),
52 (*pskb)->nh.iph->check
53 ^ 0xFFFF));
54 (*pskb)->nfcache |= NFC_ALTERED;
55 }
56 return IPT_CONTINUE;
57}
58
59static int
60checkentry(const char *tablename,
61 const struct ipt_entry *e,
62 void *targinfo,
63 unsigned int targinfosize,
64 unsigned int hook_mask)
65{
66 const u_int8_t dscp = ((struct ipt_DSCP_info *)targinfo)->dscp;
67
68 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_DSCP_info))) {
69 printk(KERN_WARNING "DSCP: targinfosize %u != %Zu\n",
70 targinfosize,
71 IPT_ALIGN(sizeof(struct ipt_DSCP_info)));
72 return 0;
73 }
74
75 if (strcmp(tablename, "mangle") != 0) {
76 printk(KERN_WARNING "DSCP: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
77 return 0;
78 }
79
80 if ((dscp > IPT_DSCP_MAX)) {
81 printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp);
82 return 0;
83 }
84
85 return 1;
86}
87
88static struct ipt_target ipt_dscp_reg = {
89 .name = "DSCP",
90 .target = target,
91 .checkentry = checkentry,
92 .me = THIS_MODULE,
93};
94
95static int __init init(void)
96{
97 return ipt_register_target(&ipt_dscp_reg);
98}
99
100static void __exit fini(void)
101{
102 ipt_unregister_target(&ipt_dscp_reg);
103}
104
105module_init(init);
106module_exit(fini);
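
Rather than recomputing the IP checksum from scratch, the DSCP target above folds the difference between the old and new TOS word into the existing checksum (the diffs[] array fed to csum_partial). The same incremental ones'-complement update in the RFC 1624 formulation, as a standalone sketch; the field and checksum values are illustrative and kept in host order for simplicity:

#include <stdint.h>
#include <stdio.h>

/* Incrementally update a 16-bit ones'-complement checksum after one
 * 16-bit field changes from old_val to new_val (RFC 1624, eqn. 3):
 *     HC' = ~(~HC + ~m + m')
 */
static uint16_t csum16_update(uint16_t check, uint16_t old_val, uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old_val;
	sum += new_val;
	sum = (sum & 0xffff) + (sum >> 16);	/* fold carries back in */
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint16_t check     = 0xb1e6;	/* illustrative original checksum */
	uint16_t old_field = 0x0000;	/* old TOS word */
	uint16_t new_field = 0x0028;	/* same word with new DSCP bits */

	printf("new checksum: 0x%04x\n",
	       csum16_update(check, old_field, new_field));
	return 0;
}
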
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
new file mode 100644
index 000000000000..ada9911118e9
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -0,0 +1,175 @@
1/* iptables module for the IPv4 and TCP ECN bits, Version 1.5
2 *
3 * (C) 2002 by Harald Welte <laforge@netfilter.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * ipt_ECN.c,v 1.5 2002/08/18 19:36:51 laforge Exp
10*/
11
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/ip.h>
15#include <linux/tcp.h>
16#include <net/checksum.h>
17
18#include <linux/netfilter_ipv4/ip_tables.h>
19#include <linux/netfilter_ipv4/ipt_ECN.h>
20
21MODULE_LICENSE("GPL");
22MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
23MODULE_DESCRIPTION("iptables ECN modification module");
24
25/* set ECT codepoint from IP header.
26 * return 0 if there was an error. */
27static inline int
28set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
29{
30 if (((*pskb)->nh.iph->tos & IPT_ECN_IP_MASK)
31 != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
32 u_int16_t diffs[2];
33
34 if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
35 return 0;
36
37 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
38 (*pskb)->nh.iph->tos &= ~IPT_ECN_IP_MASK;
39 (*pskb)->nh.iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK);
40 diffs[1] = htons((*pskb)->nh.iph->tos);
41 (*pskb)->nh.iph->check
42 = csum_fold(csum_partial((char *)diffs,
43 sizeof(diffs),
44 (*pskb)->nh.iph->check
45 ^0xFFFF));
46 (*pskb)->nfcache |= NFC_ALTERED;
47 }
48 return 1;
49}
50
51/* Return 0 if there was an error. */
52static inline int
53set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
54{
55 struct tcphdr _tcph, *tcph;
56 u_int16_t diffs[2];
57
58	/* Not enough header? */
59 tcph = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
60 sizeof(_tcph), &_tcph);
61 if (!tcph)
62 return 0;
63
64	if ((!(einfo->operation & IPT_ECN_OP_SET_ECE)
65	     || tcph->ece == einfo->proto.tcp.ece)
66	    && (!(einfo->operation & IPT_ECN_OP_SET_CWR)
67	        || tcph->cwr == einfo->proto.tcp.cwr))
68 return 1;
69
70 if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
71 return 0;
72 tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
73
74 diffs[0] = ((u_int16_t *)tcph)[6];
75 if (einfo->operation & IPT_ECN_OP_SET_ECE)
76 tcph->ece = einfo->proto.tcp.ece;
77 if (einfo->operation & IPT_ECN_OP_SET_CWR)
78 tcph->cwr = einfo->proto.tcp.cwr;
79 diffs[1] = ((u_int16_t *)tcph)[6];
80 diffs[0] = diffs[0] ^ 0xFFFF;
81
82 if ((*pskb)->ip_summed != CHECKSUM_HW)
83 tcph->check = csum_fold(csum_partial((char *)diffs,
84 sizeof(diffs),
85 tcph->check^0xFFFF));
86 else
87 if (skb_checksum_help(*pskb, inward))
88 return 0;
89 (*pskb)->nfcache |= NFC_ALTERED;
90 return 1;
91}
92
93static unsigned int
94target(struct sk_buff **pskb,
95 const struct net_device *in,
96 const struct net_device *out,
97 unsigned int hooknum,
98 const void *targinfo,
99 void *userinfo)
100{
101 const struct ipt_ECN_info *einfo = targinfo;
102
103 if (einfo->operation & IPT_ECN_OP_SET_IP)
104 if (!set_ect_ip(pskb, einfo))
105 return NF_DROP;
106
107 if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR)
108 && (*pskb)->nh.iph->protocol == IPPROTO_TCP)
109 if (!set_ect_tcp(pskb, einfo, (out == NULL)))
110 return NF_DROP;
111
112 return IPT_CONTINUE;
113}
114
115static int
116checkentry(const char *tablename,
117 const struct ipt_entry *e,
118 void *targinfo,
119 unsigned int targinfosize,
120 unsigned int hook_mask)
121{
122 const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo;
123
124 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ECN_info))) {
125 printk(KERN_WARNING "ECN: targinfosize %u != %Zu\n",
126 targinfosize,
127 IPT_ALIGN(sizeof(struct ipt_ECN_info)));
128 return 0;
129 }
130
131 if (strcmp(tablename, "mangle") != 0) {
132 printk(KERN_WARNING "ECN: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
133 return 0;
134 }
135
136 if (einfo->operation & IPT_ECN_OP_MASK) {
137 printk(KERN_WARNING "ECN: unsupported ECN operation %x\n",
138 einfo->operation);
139 return 0;
140 }
141 if (einfo->ip_ect & ~IPT_ECN_IP_MASK) {
142 printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n",
143 einfo->ip_ect);
144 return 0;
145 }
146
147 if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR))
148 && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & IPT_INV_PROTO))) {
149 printk(KERN_WARNING "ECN: cannot use TCP operations on a "
150 "non-tcp rule\n");
151 return 0;
152 }
153
154 return 1;
155}
156
157static struct ipt_target ipt_ecn_reg = {
158 .name = "ECN",
159 .target = target,
160 .checkentry = checkentry,
161 .me = THIS_MODULE,
162};
163
164static int __init init(void)
165{
166 return ipt_register_target(&ipt_ecn_reg);
167}
168
169static void __exit fini(void)
170{
171 ipt_unregister_target(&ipt_ecn_reg);
172}
173
174module_init(init);
175module_exit(fini);
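
set_ect_ip() above rewrites only the ECN codepoint, i.e. the two low-order bits of the TOS byte (IPT_ECN_IP_MASK), and leaves the DSCP bits alone. A minimal sketch of that masking, assuming the standard RFC 3168 codepoint values:

#include <stdint.h>
#include <stdio.h>

#define ECN_MASK 0x03	/* low two bits of the TOS byte carry the ECN field */

enum { ECN_NOT_ECT = 0, ECN_ECT_1 = 1, ECN_ECT_0 = 2, ECN_CE = 3 };

/* Rewrite only the ECN codepoint, leaving the DSCP bits untouched,
 * mirroring what set_ect_ip() does with IPT_ECN_IP_MASK. */
static uint8_t set_ecn(uint8_t tos, uint8_t ect)
{
	return (tos & ~ECN_MASK) | (ect & ECN_MASK);
}

int main(void)
{
	uint8_t tos = 0xb8;	/* DSCP EF, ECN Not-ECT */

	printf("0x%02x -> 0x%02x\n", tos, set_ecn(tos, ECN_ECT_0));
	return 0;
}
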
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
new file mode 100644
index 000000000000..ef08733d26da
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -0,0 +1,485 @@
1/*
2 * This is a module which is used for logging packets.
3 */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/module.h>
14#include <linux/spinlock.h>
15#include <linux/skbuff.h>
16#include <linux/ip.h>
17#include <net/icmp.h>
18#include <net/udp.h>
19#include <net/tcp.h>
20#include <net/route.h>
21
22#include <linux/netfilter.h>
23#include <linux/netfilter_ipv4/ip_tables.h>
24#include <linux/netfilter_ipv4/ipt_LOG.h>
25
26MODULE_LICENSE("GPL");
27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
28MODULE_DESCRIPTION("iptables syslog logging module");
29
30static unsigned int nflog = 1;
31module_param(nflog, int, 0400);
32MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
33
34#if 0
35#define DEBUGP printk
36#else
37#define DEBUGP(format, args...)
38#endif
39
40/* Use lock to serialize, so printks don't overlap */
41static DEFINE_SPINLOCK(log_lock);
42
43/* One level of recursion won't kill us */
44static void dump_packet(const struct ipt_log_info *info,
45 const struct sk_buff *skb,
46 unsigned int iphoff)
47{
48 struct iphdr _iph, *ih;
49
50 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
51 if (ih == NULL) {
52 printk("TRUNCATED");
53 return;
54 }
55
56 /* Important fields:
57 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
58 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
59 printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ",
60 NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
61
62 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
63 printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
64 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
65 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
66
67 /* Max length: 6 "CE DF MF " */
68 if (ntohs(ih->frag_off) & IP_CE)
69 printk("CE ");
70 if (ntohs(ih->frag_off) & IP_DF)
71 printk("DF ");
72 if (ntohs(ih->frag_off) & IP_MF)
73 printk("MF ");
74
75 /* Max length: 11 "FRAG:65535 " */
76 if (ntohs(ih->frag_off) & IP_OFFSET)
77 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
78
79 if ((info->logflags & IPT_LOG_IPOPT)
80 && ih->ihl * 4 > sizeof(struct iphdr)) {
81 unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op;
82 unsigned int i, optsize;
83
84 optsize = ih->ihl * 4 - sizeof(struct iphdr);
85 op = skb_header_pointer(skb, iphoff+sizeof(_iph),
86 optsize, _opt);
87 if (op == NULL) {
88 printk("TRUNCATED");
89 return;
90 }
91
92 /* Max length: 127 "OPT (" 15*4*2chars ") " */
93 printk("OPT (");
94 for (i = 0; i < optsize; i++)
95 printk("%02X", op[i]);
96 printk(") ");
97 }
98
99 switch (ih->protocol) {
100 case IPPROTO_TCP: {
101 struct tcphdr _tcph, *th;
102
103 /* Max length: 10 "PROTO=TCP " */
104 printk("PROTO=TCP ");
105
106 if (ntohs(ih->frag_off) & IP_OFFSET)
107 break;
108
109 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
110 th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
111 sizeof(_tcph), &_tcph);
112 if (th == NULL) {
113 printk("INCOMPLETE [%u bytes] ",
114 skb->len - iphoff - ih->ihl*4);
115 break;
116 }
117
118 /* Max length: 20 "SPT=65535 DPT=65535 " */
119 printk("SPT=%u DPT=%u ",
120 ntohs(th->source), ntohs(th->dest));
121 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
122 if (info->logflags & IPT_LOG_TCPSEQ)
123 printk("SEQ=%u ACK=%u ",
124 ntohl(th->seq), ntohl(th->ack_seq));
125 /* Max length: 13 "WINDOW=65535 " */
126 printk("WINDOW=%u ", ntohs(th->window));
127 /* Max length: 9 "RES=0x3F " */
128 printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
129 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
130 if (th->cwr)
131 printk("CWR ");
132 if (th->ece)
133 printk("ECE ");
134 if (th->urg)
135 printk("URG ");
136 if (th->ack)
137 printk("ACK ");
138 if (th->psh)
139 printk("PSH ");
140 if (th->rst)
141 printk("RST ");
142 if (th->syn)
143 printk("SYN ");
144 if (th->fin)
145 printk("FIN ");
146 /* Max length: 11 "URGP=65535 " */
147 printk("URGP=%u ", ntohs(th->urg_ptr));
148
149 if ((info->logflags & IPT_LOG_TCPOPT)
150 && th->doff * 4 > sizeof(struct tcphdr)) {
151 unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
152 unsigned char *op;
153 unsigned int i, optsize;
154
155 optsize = th->doff * 4 - sizeof(struct tcphdr);
156 op = skb_header_pointer(skb,
157 iphoff+ih->ihl*4+sizeof(_tcph),
158 optsize, _opt);
159 if (op == NULL) {
160 printk("TRUNCATED");
161 return;
162 }
163
164 /* Max length: 127 "OPT (" 15*4*2chars ") " */
165 printk("OPT (");
166 for (i = 0; i < optsize; i++)
167 printk("%02X", op[i]);
168 printk(") ");
169 }
170 break;
171 }
172 case IPPROTO_UDP: {
173 struct udphdr _udph, *uh;
174
175 /* Max length: 10 "PROTO=UDP " */
176 printk("PROTO=UDP ");
177
178 if (ntohs(ih->frag_off) & IP_OFFSET)
179 break;
180
181 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
182 uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
183 sizeof(_udph), &_udph);
184 if (uh == NULL) {
185 printk("INCOMPLETE [%u bytes] ",
186 skb->len - iphoff - ih->ihl*4);
187 break;
188 }
189
190 /* Max length: 20 "SPT=65535 DPT=65535 " */
191 printk("SPT=%u DPT=%u LEN=%u ",
192 ntohs(uh->source), ntohs(uh->dest),
193 ntohs(uh->len));
194 break;
195 }
196 case IPPROTO_ICMP: {
197 struct icmphdr _icmph, *ich;
198 static size_t required_len[NR_ICMP_TYPES+1]
199 = { [ICMP_ECHOREPLY] = 4,
200 [ICMP_DEST_UNREACH]
201 = 8 + sizeof(struct iphdr),
202 [ICMP_SOURCE_QUENCH]
203 = 8 + sizeof(struct iphdr),
204 [ICMP_REDIRECT]
205 = 8 + sizeof(struct iphdr),
206 [ICMP_ECHO] = 4,
207 [ICMP_TIME_EXCEEDED]
208 = 8 + sizeof(struct iphdr),
209 [ICMP_PARAMETERPROB]
210 = 8 + sizeof(struct iphdr),
211 [ICMP_TIMESTAMP] = 20,
212 [ICMP_TIMESTAMPREPLY] = 20,
213 [ICMP_ADDRESS] = 12,
214 [ICMP_ADDRESSREPLY] = 12 };
215
216 /* Max length: 11 "PROTO=ICMP " */
217 printk("PROTO=ICMP ");
218
219 if (ntohs(ih->frag_off) & IP_OFFSET)
220 break;
221
222 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
223 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
224 sizeof(_icmph), &_icmph);
225 if (ich == NULL) {
226 printk("INCOMPLETE [%u bytes] ",
227 skb->len - iphoff - ih->ihl*4);
228 break;
229 }
230
231 /* Max length: 18 "TYPE=255 CODE=255 " */
232 printk("TYPE=%u CODE=%u ", ich->type, ich->code);
233
234 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
235 if (ich->type <= NR_ICMP_TYPES
236 && required_len[ich->type]
237 && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
238 printk("INCOMPLETE [%u bytes] ",
239 skb->len - iphoff - ih->ihl*4);
240 break;
241 }
242
243 switch (ich->type) {
244 case ICMP_ECHOREPLY:
245 case ICMP_ECHO:
246 /* Max length: 19 "ID=65535 SEQ=65535 " */
247 printk("ID=%u SEQ=%u ",
248 ntohs(ich->un.echo.id),
249 ntohs(ich->un.echo.sequence));
250 break;
251
252 case ICMP_PARAMETERPROB:
253 /* Max length: 14 "PARAMETER=255 " */
254 printk("PARAMETER=%u ",
255 ntohl(ich->un.gateway) >> 24);
256 break;
257 case ICMP_REDIRECT:
258 /* Max length: 24 "GATEWAY=255.255.255.255 " */
259 printk("GATEWAY=%u.%u.%u.%u ",
260 NIPQUAD(ich->un.gateway));
261 /* Fall through */
262 case ICMP_DEST_UNREACH:
263 case ICMP_SOURCE_QUENCH:
264 case ICMP_TIME_EXCEEDED:
265 /* Max length: 3+maxlen */
266 if (!iphoff) { /* Only recurse once. */
267 printk("[");
268 dump_packet(info, skb,
269 iphoff + ih->ihl*4+sizeof(_icmph));
270 printk("] ");
271 }
272
273 /* Max length: 10 "MTU=65535 " */
274 if (ich->type == ICMP_DEST_UNREACH
275 && ich->code == ICMP_FRAG_NEEDED)
276 printk("MTU=%u ", ntohs(ich->un.frag.mtu));
277 }
278 break;
279 }
280 /* Max Length */
281 case IPPROTO_AH: {
282 struct ip_auth_hdr _ahdr, *ah;
283
284 if (ntohs(ih->frag_off) & IP_OFFSET)
285 break;
286
287 /* Max length: 9 "PROTO=AH " */
288 printk("PROTO=AH ");
289
290 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
291 ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
292 sizeof(_ahdr), &_ahdr);
293 if (ah == NULL) {
294 printk("INCOMPLETE [%u bytes] ",
295 skb->len - iphoff - ih->ihl*4);
296 break;
297 }
298
299 /* Length: 15 "SPI=0xF1234567 " */
300 printk("SPI=0x%x ", ntohl(ah->spi));
301 break;
302 }
303 case IPPROTO_ESP: {
304 struct ip_esp_hdr _esph, *eh;
305
306 /* Max length: 10 "PROTO=ESP " */
307 printk("PROTO=ESP ");
308
309 if (ntohs(ih->frag_off) & IP_OFFSET)
310 break;
311
312 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
313 eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
314 sizeof(_esph), &_esph);
315 if (eh == NULL) {
316 printk("INCOMPLETE [%u bytes] ",
317 skb->len - iphoff - ih->ihl*4);
318 break;
319 }
320
321 /* Length: 15 "SPI=0xF1234567 " */
322 printk("SPI=0x%x ", ntohl(eh->spi));
323 break;
324 }
325 /* Max length: 10 "PROTO 255 " */
326 default:
327 printk("PROTO=%u ", ih->protocol);
328 }
329
330 /* Max length: 15 "UID=4294967295 " */
331 if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
332 read_lock_bh(&skb->sk->sk_callback_lock);
333 if (skb->sk->sk_socket && skb->sk->sk_socket->file)
334 printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
335 read_unlock_bh(&skb->sk->sk_callback_lock);
336 }
337
338 /* Proto Max log string length */
339 /* IP: 40+46+6+11+127 = 230 */
340 /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
341 /* UDP: 10+max(25,20) = 35 */
342 /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
343 /* ESP: 10+max(25)+15 = 50 */
344 /* AH: 9+max(25)+15 = 49 */
345 /* unknown: 10 */
346
347 /* (ICMP allows recursion one level deep) */
348 /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
349 /* maxlen = 230+ 91 + 230 + 252 = 803 */
350}
351
352static void
353ipt_log_packet(unsigned int hooknum,
354 const struct sk_buff *skb,
355 const struct net_device *in,
356 const struct net_device *out,
357 const struct ipt_log_info *loginfo,
358 const char *level_string,
359 const char *prefix)
360{
361 spin_lock_bh(&log_lock);
362 printk(level_string);
363 printk("%sIN=%s OUT=%s ",
364 prefix == NULL ? loginfo->prefix : prefix,
365 in ? in->name : "",
366 out ? out->name : "");
367#ifdef CONFIG_BRIDGE_NETFILTER
368 if (skb->nf_bridge) {
369 struct net_device *physindev = skb->nf_bridge->physindev;
370 struct net_device *physoutdev = skb->nf_bridge->physoutdev;
371
372 if (physindev && in != physindev)
373 printk("PHYSIN=%s ", physindev->name);
374 if (physoutdev && out != physoutdev)
375 printk("PHYSOUT=%s ", physoutdev->name);
376 }
377#endif
378
379 if (in && !out) {
380 /* MAC logging for input chain only. */
381 printk("MAC=");
382 if (skb->dev && skb->dev->hard_header_len
383 && skb->mac.raw != (void*)skb->nh.iph) {
384 int i;
385 unsigned char *p = skb->mac.raw;
386 for (i = 0; i < skb->dev->hard_header_len; i++,p++)
387 printk("%02x%c", *p,
388 i==skb->dev->hard_header_len - 1
389 ? ' ':':');
390 } else
391 printk(" ");
392 }
393
394 dump_packet(loginfo, skb, 0);
395 printk("\n");
396 spin_unlock_bh(&log_lock);
397}
398
399static unsigned int
400ipt_log_target(struct sk_buff **pskb,
401 const struct net_device *in,
402 const struct net_device *out,
403 unsigned int hooknum,
404 const void *targinfo,
405 void *userinfo)
406{
407 const struct ipt_log_info *loginfo = targinfo;
408 char level_string[4] = "< >";
409
410 level_string[1] = '0' + (loginfo->level % 8);
411 ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
412
413 return IPT_CONTINUE;
414}
415
416static void
417ipt_logfn(unsigned int hooknum,
418 const struct sk_buff *skb,
419 const struct net_device *in,
420 const struct net_device *out,
421 const char *prefix)
422{
423 struct ipt_log_info loginfo = {
424 .level = 0,
425 .logflags = IPT_LOG_MASK,
426 .prefix = ""
427 };
428
429 ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
430}
431
432static int ipt_log_checkentry(const char *tablename,
433 const struct ipt_entry *e,
434 void *targinfo,
435 unsigned int targinfosize,
436 unsigned int hook_mask)
437{
438 const struct ipt_log_info *loginfo = targinfo;
439
440 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_log_info))) {
441 DEBUGP("LOG: targinfosize %u != %u\n",
442 targinfosize, IPT_ALIGN(sizeof(struct ipt_log_info)));
443 return 0;
444 }
445
446 if (loginfo->level >= 8) {
447 DEBUGP("LOG: level %u >= 8\n", loginfo->level);
448 return 0;
449 }
450
451 if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
452 DEBUGP("LOG: prefix term %i\n",
453 loginfo->prefix[sizeof(loginfo->prefix)-1]);
454 return 0;
455 }
456
457 return 1;
458}
459
460static struct ipt_target ipt_log_reg = {
461 .name = "LOG",
462 .target = ipt_log_target,
463 .checkentry = ipt_log_checkentry,
464 .me = THIS_MODULE,
465};
466
467static int __init init(void)
468{
469 if (ipt_register_target(&ipt_log_reg))
470 return -EINVAL;
471 if (nflog)
472 nf_log_register(PF_INET, &ipt_logfn);
473
474 return 0;
475}
476
477static void __exit fini(void)
478{
479 if (nflog)
480 nf_log_unregister(PF_INET, &ipt_logfn);
481 ipt_unregister_target(&ipt_log_reg);
482}
483
484module_init(init);
485module_exit(fini);
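
ipt_log_target() above builds the printk level prefix at runtime: printk interprets a leading "<N>" in the format string as the message log level, so writing a digit into the middle of "< >" is equivalent to prefixing the line with KERN_WARNING ("<4>"), KERN_NOTICE, and so on. A tiny sketch of the same construction:

#include <stdio.h>

int main(void)
{
	unsigned int level = 4;		/* e.g. --log-level warning */
	char level_string[4] = "< >";

	/* printk() treats a leading "<N>" as the log level, so "<4>" here
	 * is equivalent to prefixing the message with KERN_WARNING. */
	level_string[1] = '0' + (level % 8);
	printf("%s prefix selects log level %u\n", level_string, level % 8);
	return 0;
}
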
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
new file mode 100644
index 000000000000..33c6f9b63b8d
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MARK.c
@@ -0,0 +1,162 @@
1/* This is a module which is used for setting the NFMARK field of an skb. */
2
3/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/ip.h>
13#include <net/checksum.h>
14
15#include <linux/netfilter_ipv4/ip_tables.h>
16#include <linux/netfilter_ipv4/ipt_MARK.h>
17
18MODULE_LICENSE("GPL");
19MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
20MODULE_DESCRIPTION("iptables MARK modification module");
21
22static unsigned int
23target_v0(struct sk_buff **pskb,
24 const struct net_device *in,
25 const struct net_device *out,
26 unsigned int hooknum,
27 const void *targinfo,
28 void *userinfo)
29{
30 const struct ipt_mark_target_info *markinfo = targinfo;
31
32 if((*pskb)->nfmark != markinfo->mark) {
33 (*pskb)->nfmark = markinfo->mark;
34 (*pskb)->nfcache |= NFC_ALTERED;
35 }
36 return IPT_CONTINUE;
37}
38
39static unsigned int
40target_v1(struct sk_buff **pskb,
41 const struct net_device *in,
42 const struct net_device *out,
43 unsigned int hooknum,
44 const void *targinfo,
45 void *userinfo)
46{
47 const struct ipt_mark_target_info_v1 *markinfo = targinfo;
48 int mark = 0;
49
50 switch (markinfo->mode) {
51 case IPT_MARK_SET:
52 mark = markinfo->mark;
53 break;
54
55 case IPT_MARK_AND:
56 mark = (*pskb)->nfmark & markinfo->mark;
57 break;
58
59 case IPT_MARK_OR:
60 mark = (*pskb)->nfmark | markinfo->mark;
61 break;
62 }
63
64 if((*pskb)->nfmark != mark) {
65 (*pskb)->nfmark = mark;
66 (*pskb)->nfcache |= NFC_ALTERED;
67 }
68 return IPT_CONTINUE;
69}
70
71
72static int
73checkentry_v0(const char *tablename,
74 const struct ipt_entry *e,
75 void *targinfo,
76 unsigned int targinfosize,
77 unsigned int hook_mask)
78{
79 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
80 printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
81 targinfosize,
82 IPT_ALIGN(sizeof(struct ipt_mark_target_info)));
83 return 0;
84 }
85
86 if (strcmp(tablename, "mangle") != 0) {
87 printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
88 return 0;
89 }
90
91 return 1;
92}
93
94static int
95checkentry_v1(const char *tablename,
96 const struct ipt_entry *e,
97 void *targinfo,
98 unsigned int targinfosize,
99 unsigned int hook_mask)
100{
101 struct ipt_mark_target_info_v1 *markinfo = targinfo;
102
103 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))){
104 printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
105 targinfosize,
106 IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1)));
107 return 0;
108 }
109
110 if (strcmp(tablename, "mangle") != 0) {
111 printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
112 return 0;
113 }
114
115 if (markinfo->mode != IPT_MARK_SET
116 && markinfo->mode != IPT_MARK_AND
117 && markinfo->mode != IPT_MARK_OR) {
118 printk(KERN_WARNING "MARK: unknown mode %u\n",
119 markinfo->mode);
120 return 0;
121 }
122
123 return 1;
124}
125
126static struct ipt_target ipt_mark_reg_v0 = {
127 .name = "MARK",
128 .target = target_v0,
129 .checkentry = checkentry_v0,
130 .me = THIS_MODULE,
131 .revision = 0,
132};
133
134static struct ipt_target ipt_mark_reg_v1 = {
135 .name = "MARK",
136 .target = target_v1,
137 .checkentry = checkentry_v1,
138 .me = THIS_MODULE,
139 .revision = 1,
140};
141
142static int __init init(void)
143{
144 int err;
145
146 err = ipt_register_target(&ipt_mark_reg_v0);
147 if (!err) {
148 err = ipt_register_target(&ipt_mark_reg_v1);
149 if (err)
150 ipt_unregister_target(&ipt_mark_reg_v0);
151 }
152 return err;
153}
154
155static void __exit fini(void)
156{
157 ipt_unregister_target(&ipt_mark_reg_v0);
158 ipt_unregister_target(&ipt_mark_reg_v1);
159}
160
161module_init(init);
162module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
new file mode 100644
index 000000000000..57e9f6cf1c36
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -0,0 +1,207 @@
1/* Masquerade. Simple mapping which alters range to a local IP address
2 (depending on route). */
3
4/* (C) 1999-2001 Paul `Rusty' Russell
5 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/config.h>
13#include <linux/types.h>
14#include <linux/ip.h>
15#include <linux/timer.h>
16#include <linux/module.h>
17#include <linux/netfilter.h>
18#include <net/protocol.h>
19#include <net/ip.h>
20#include <net/checksum.h>
21#include <linux/netfilter_ipv4.h>
22#include <linux/netfilter_ipv4/ip_nat_rule.h>
23#include <linux/netfilter_ipv4/ip_tables.h>
24
25MODULE_LICENSE("GPL");
26MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
27MODULE_DESCRIPTION("iptables MASQUERADE target module");
28
29#if 0
30#define DEBUGP printk
31#else
32#define DEBUGP(format, args...)
33#endif
34
35/* Lock protects masq region inside conntrack */
36static DECLARE_RWLOCK(masq_lock);
37
38/* FIXME: Multiple targets. --RR */
39static int
40masquerade_check(const char *tablename,
41 const struct ipt_entry *e,
42 void *targinfo,
43 unsigned int targinfosize,
44 unsigned int hook_mask)
45{
46 const struct ip_nat_multi_range_compat *mr = targinfo;
47
48 if (strcmp(tablename, "nat") != 0) {
49 DEBUGP("masquerade_check: bad table `%s'.\n", tablename);
50 return 0;
51 }
52 if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
53 DEBUGP("masquerade_check: size %u != %u.\n",
54 targinfosize, sizeof(*mr));
55 return 0;
56 }
57 if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) {
58 DEBUGP("masquerade_check: bad hooks %x.\n", hook_mask);
59 return 0;
60 }
61 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
62 DEBUGP("masquerade_check: bad MAP_IPS.\n");
63 return 0;
64 }
65 if (mr->rangesize != 1) {
66 DEBUGP("masquerade_check: bad rangesize %u.\n", mr->rangesize);
67 return 0;
68 }
69 return 1;
70}
71
72static unsigned int
73masquerade_target(struct sk_buff **pskb,
74 const struct net_device *in,
75 const struct net_device *out,
76 unsigned int hooknum,
77 const void *targinfo,
78 void *userinfo)
79{
80 struct ip_conntrack *ct;
81 enum ip_conntrack_info ctinfo;
82 const struct ip_nat_multi_range_compat *mr;
83 struct ip_nat_range newrange;
84 struct rtable *rt;
85 u_int32_t newsrc;
86
87 IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
88
89 /* FIXME: For the moment, don't do local packets, breaks
90 testsuite for 2.3.49 --RR */
91 if ((*pskb)->sk)
92 return NF_ACCEPT;
93
94 ct = ip_conntrack_get(*pskb, &ctinfo);
95 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
96 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
97
98 mr = targinfo;
99 rt = (struct rtable *)(*pskb)->dst;
100 newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
101 if (!newsrc) {
102 printk("MASQUERADE: %s ate my IP address\n", out->name);
103 return NF_DROP;
104 }
105
106 WRITE_LOCK(&masq_lock);
107 ct->nat.masq_index = out->ifindex;
108 WRITE_UNLOCK(&masq_lock);
109
110 /* Transfer from original range. */
111 newrange = ((struct ip_nat_range)
112 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
113 newsrc, newsrc,
114 mr->range[0].min, mr->range[0].max });
115
116 /* Hand modified range to generic setup. */
117 return ip_nat_setup_info(ct, &newrange, hooknum);
118}
119
120static inline int
121device_cmp(struct ip_conntrack *i, void *ifindex)
122{
123 int ret;
124
125 READ_LOCK(&masq_lock);
126 ret = (i->nat.masq_index == (int)(long)ifindex);
127 READ_UNLOCK(&masq_lock);
128
129 return ret;
130}
131
132static int masq_device_event(struct notifier_block *this,
133 unsigned long event,
134 void *ptr)
135{
136 struct net_device *dev = ptr;
137
138 if (event == NETDEV_DOWN) {
139 /* Device was downed. Search entire table for
140 conntracks which were associated with that device,
141 and forget them. */
142 IP_NF_ASSERT(dev->ifindex != 0);
143
144 ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
145 }
146
147 return NOTIFY_DONE;
148}
149
150static int masq_inet_event(struct notifier_block *this,
151 unsigned long event,
152 void *ptr)
153{
154 struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
155
156 if (event == NETDEV_DOWN) {
157 /* IP address was deleted. Search entire table for
158 conntracks which were associated with that device,
159 and forget them. */
160 IP_NF_ASSERT(dev->ifindex != 0);
161
162 ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
163 }
164
165 return NOTIFY_DONE;
166}
167
168static struct notifier_block masq_dev_notifier = {
169 .notifier_call = masq_device_event,
170};
171
172static struct notifier_block masq_inet_notifier = {
173 .notifier_call = masq_inet_event,
174};
175
176static struct ipt_target masquerade = {
177 .name = "MASQUERADE",
178 .target = masquerade_target,
179 .checkentry = masquerade_check,
180 .me = THIS_MODULE,
181};
182
183static int __init init(void)
184{
185 int ret;
186
187 ret = ipt_register_target(&masquerade);
188
189 if (ret == 0) {
190 /* Register for device down reports */
191 register_netdevice_notifier(&masq_dev_notifier);
192 /* Register IP address change reports */
193 register_inetaddr_notifier(&masq_inet_notifier);
194 }
195
196 return ret;
197}
198
199static void __exit fini(void)
200{
201 ipt_unregister_target(&masquerade);
202 unregister_netdevice_notifier(&masq_dev_notifier);
203 unregister_inetaddr_notifier(&masq_inet_notifier);
204}
205
206module_init(init);
207module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
new file mode 100644
index 000000000000..06254b29d034
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -0,0 +1,117 @@
1/* NETMAP - static NAT mapping of IP network addresses (1:1).
2 * The mapping can be applied to source (POSTROUTING),
3 * destination (PREROUTING), or both (with separate rules).
4 */
5
6/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/config.h>
14#include <linux/ip.h>
15#include <linux/module.h>
16#include <linux/netdevice.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19#include <linux/netfilter_ipv4/ip_nat_rule.h>
20
21#define MODULENAME "NETMAP"
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
24MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target");
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(format, args...)
30#endif
31
32static int
33check(const char *tablename,
34 const struct ipt_entry *e,
35 void *targinfo,
36 unsigned int targinfosize,
37 unsigned int hook_mask)
38{
39 const struct ip_nat_multi_range_compat *mr = targinfo;
40
41 if (strcmp(tablename, "nat") != 0) {
42 DEBUGP(MODULENAME":check: bad table `%s'.\n", tablename);
43 return 0;
44 }
45 if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
46 DEBUGP(MODULENAME":check: size %u.\n", targinfosize);
47 return 0;
48 }
49 if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) {
50 DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask);
51 return 0;
52 }
53 if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
54 DEBUGP(MODULENAME":check: bad MAP_IPS.\n");
55 return 0;
56 }
57 if (mr->rangesize != 1) {
58 DEBUGP(MODULENAME":check: bad rangesize %u.\n", mr->rangesize);
59 return 0;
60 }
61 return 1;
62}
63
64static unsigned int
65target(struct sk_buff **pskb,
66 const struct net_device *in,
67 const struct net_device *out,
68 unsigned int hooknum,
69 const void *targinfo,
70 void *userinfo)
71{
72 struct ip_conntrack *ct;
73 enum ip_conntrack_info ctinfo;
74 u_int32_t new_ip, netmask;
75 const struct ip_nat_multi_range_compat *mr = targinfo;
76 struct ip_nat_range newrange;
77
78 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
79 || hooknum == NF_IP_POST_ROUTING);
80 ct = ip_conntrack_get(*pskb, &ctinfo);
81
82 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
83
84 if (hooknum == NF_IP_PRE_ROUTING)
85 new_ip = (*pskb)->nh.iph->daddr & ~netmask;
86 else
87 new_ip = (*pskb)->nh.iph->saddr & ~netmask;
88 new_ip |= mr->range[0].min_ip & netmask;
89
90 newrange = ((struct ip_nat_range)
91 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
92 new_ip, new_ip,
93 mr->range[0].min, mr->range[0].max });
94
95 /* Hand modified range to generic setup. */
96 return ip_nat_setup_info(ct, &newrange, hooknum);
97}
98
99static struct ipt_target target_module = {
100 .name = MODULENAME,
101 .target = target,
102 .checkentry = check,
103 .me = THIS_MODULE
104};
105
106static int __init init(void)
107{
108 return ipt_register_target(&target_module);
109}
110
111static void __exit fini(void)
112{
113 ipt_unregister_target(&target_module);
114}
115
116module_init(init);
117module_exit(fini);
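
A minimal user-space sketch (an editorial illustration, not part of the module) of the address arithmetic in target() above: the netmask is derived from the configured min/max addresses, the host bits of the original address are kept, and the network bits come from the mapping range. The 10.5.5.0/24 example addresses are assumptions chosen only for the demo.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* Same computation as ipt_NETMAP's target(): keep host bits, swap network bits. */
static uint32_t netmap(uint32_t orig, uint32_t range_min, uint32_t range_max)
{
	uint32_t netmask = ~(range_min ^ range_max);	/* bits common to min and max */

	return (orig & ~netmask) | (range_min & netmask);
}

int main(void)
{
	/* Illustrative only: map 192.168.1.42 into the 10.5.5.0/24 range. */
	uint32_t min = ntohl(inet_addr("10.5.5.0"));
	uint32_t max = ntohl(inet_addr("10.5.5.255"));
	uint32_t mapped = netmap(ntohl(inet_addr("192.168.1.42")), min, max);

	printf("%u.%u.%u.%u\n", (mapped >> 24) & 0xff, (mapped >> 16) & 0xff,
	       (mapped >> 8) & 0xff, mapped & 0xff);	/* prints 10.5.5.42 */
	return 0;
}

The kernel code works on network-byte-order values directly; since only bitwise AND/XOR are involved, byte order does not change the result, and the sketch converts to host order merely to print the address.
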
diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c
new file mode 100644
index 000000000000..a4bb9b3bc292
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NOTRACK.c
@@ -0,0 +1,76 @@
1/* This is a module which is used for setting up fake conntracks
2 * on packets so that they are not seen by the conntrack/NAT code.
3 */
4#include <linux/module.h>
5#include <linux/skbuff.h>
6
7#include <linux/netfilter_ipv4/ip_tables.h>
8#include <linux/netfilter_ipv4/ip_conntrack.h>
9
10static unsigned int
11target(struct sk_buff **pskb,
12 const struct net_device *in,
13 const struct net_device *out,
14 unsigned int hooknum,
15 const void *targinfo,
16 void *userinfo)
17{
18 /* Previously seen (loopback)? Ignore. */
19 if ((*pskb)->nfct != NULL)
20 return IPT_CONTINUE;
21
22 /* Attach fake conntrack entry.
23	   If there is a real ct entry corresponding to this packet,
24	   it'll hang around until it times out. We don't deal with it
25 for performance reasons. JK */
26 (*pskb)->nfct = &ip_conntrack_untracked.ct_general;
27 (*pskb)->nfctinfo = IP_CT_NEW;
28 nf_conntrack_get((*pskb)->nfct);
29
30 return IPT_CONTINUE;
31}
32
33static int
34checkentry(const char *tablename,
35 const struct ipt_entry *e,
36 void *targinfo,
37 unsigned int targinfosize,
38 unsigned int hook_mask)
39{
40 if (targinfosize != 0) {
41 printk(KERN_WARNING "NOTRACK: targinfosize %u != 0\n",
42 targinfosize);
43 return 0;
44 }
45
46 if (strcmp(tablename, "raw") != 0) {
47 printk(KERN_WARNING "NOTRACK: can only be called from \"raw\" table, not \"%s\"\n", tablename);
48 return 0;
49 }
50
51 return 1;
52}
53
54static struct ipt_target ipt_notrack_reg = {
55 .name = "NOTRACK",
56 .target = target,
57 .checkentry = checkentry,
58 .me = THIS_MODULE
59};
60
61static int __init init(void)
62{
63 if (ipt_register_target(&ipt_notrack_reg))
64 return -EINVAL;
65
66 return 0;
67}
68
69static void __exit fini(void)
70{
71 ipt_unregister_target(&ipt_notrack_reg);
72}
73
74module_init(init);
75module_exit(fini);
76MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
new file mode 100644
index 000000000000..d2e13447678e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -0,0 +1,129 @@
1/* Redirect. Simple mapping which alters dst to a local IP address. */
2/* (C) 1999-2001 Paul `Rusty' Russell
3 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/ip.h>
12#include <linux/timer.h>
13#include <linux/module.h>
14#include <linux/netfilter.h>
15#include <linux/netdevice.h>
16#include <linux/if.h>
17#include <linux/inetdevice.h>
18#include <net/protocol.h>
19#include <net/checksum.h>
20#include <linux/netfilter_ipv4.h>
21#include <linux/netfilter_ipv4/ip_nat_rule.h>
22
23MODULE_LICENSE("GPL");
24MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
25MODULE_DESCRIPTION("iptables REDIRECT target module");
26
27#if 0
28#define DEBUGP printk
29#else
30#define DEBUGP(format, args...)
31#endif
32
33/* FIXME: Take multiple ranges --RR */
34static int
35redirect_check(const char *tablename,
36 const struct ipt_entry *e,
37 void *targinfo,
38 unsigned int targinfosize,
39 unsigned int hook_mask)
40{
41 const struct ip_nat_multi_range_compat *mr = targinfo;
42
43 if (strcmp(tablename, "nat") != 0) {
44		DEBUGP("redirect_check: bad table `%s'.\n", tablename);
45 return 0;
46 }
47 if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
48 DEBUGP("redirect_check: size %u.\n", targinfosize);
49 return 0;
50 }
51 if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) {
52 DEBUGP("redirect_check: bad hooks %x.\n", hook_mask);
53 return 0;
54 }
55 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
56 DEBUGP("redirect_check: bad MAP_IPS.\n");
57 return 0;
58 }
59 if (mr->rangesize != 1) {
60 DEBUGP("redirect_check: bad rangesize %u.\n", mr->rangesize);
61 return 0;
62 }
63 return 1;
64}
65
66static unsigned int
67redirect_target(struct sk_buff **pskb,
68 const struct net_device *in,
69 const struct net_device *out,
70 unsigned int hooknum,
71 const void *targinfo,
72 void *userinfo)
73{
74 struct ip_conntrack *ct;
75 enum ip_conntrack_info ctinfo;
76 u_int32_t newdst;
77 const struct ip_nat_multi_range_compat *mr = targinfo;
78 struct ip_nat_range newrange;
79
80 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
81 || hooknum == NF_IP_LOCAL_OUT);
82
83 ct = ip_conntrack_get(*pskb, &ctinfo);
84 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
85
86 /* Local packets: make them go to loopback */
87 if (hooknum == NF_IP_LOCAL_OUT)
88 newdst = htonl(0x7F000001);
89 else {
90 struct in_device *indev;
91
92 /* Device might not have an associated in_device. */
93 indev = (struct in_device *)(*pskb)->dev->ip_ptr;
94 if (indev == NULL || indev->ifa_list == NULL)
95 return NF_DROP;
96
97 /* Grab first address on interface. */
98 newdst = indev->ifa_list->ifa_local;
99 }
100
101 /* Transfer from original range. */
102 newrange = ((struct ip_nat_range)
103 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
104 newdst, newdst,
105 mr->range[0].min, mr->range[0].max });
106
107 /* Hand modified range to generic setup. */
108 return ip_nat_setup_info(ct, &newrange, hooknum);
109}
110
111static struct ipt_target redirect_reg = {
112 .name = "REDIRECT",
113 .target = redirect_target,
114 .checkentry = redirect_check,
115 .me = THIS_MODULE,
116};
117
118static int __init init(void)
119{
120 return ipt_register_target(&redirect_reg);
121}
122
123static void __exit fini(void)
124{
125 ipt_unregister_target(&redirect_reg);
126}
127
128module_init(init);
129module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
new file mode 100644
index 000000000000..266d64979286
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -0,0 +1,335 @@
1/*
2 * This is a module which is used for rejecting packets.
3 * Added support for customized reject packets (Jozsef Kadlecsik).
4 * Added support for ICMP type-3-code-13 (Maciej Soltysiak). [RFC 1812]
5 */
6
7/* (C) 1999-2001 Paul `Rusty' Russell
8 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 */
14
15#include <linux/config.h>
16#include <linux/module.h>
17#include <linux/skbuff.h>
18#include <linux/ip.h>
19#include <linux/udp.h>
20#include <linux/icmp.h>
21#include <net/icmp.h>
22#include <net/ip.h>
23#include <net/tcp.h>
24#include <net/route.h>
25#include <net/dst.h>
26#include <linux/netfilter_ipv4/ip_tables.h>
27#include <linux/netfilter_ipv4/ipt_REJECT.h>
28#ifdef CONFIG_BRIDGE_NETFILTER
29#include <linux/netfilter_bridge.h>
30#endif
31
32MODULE_LICENSE("GPL");
33MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
34MODULE_DESCRIPTION("iptables REJECT target module");
35
36#if 0
37#define DEBUGP printk
38#else
39#define DEBUGP(format, args...)
40#endif
41
42static inline struct rtable *route_reverse(struct sk_buff *skb,
43 struct tcphdr *tcph, int hook)
44{
45 struct iphdr *iph = skb->nh.iph;
46 struct dst_entry *odst;
47 struct flowi fl = {};
48 struct rtable *rt;
49
50 /* We don't require ip forwarding to be enabled to be able to
51 * send a RST reply for bridged traffic. */
52 if (hook != NF_IP_FORWARD
53#ifdef CONFIG_BRIDGE_NETFILTER
54 || (skb->nf_bridge && skb->nf_bridge->mask & BRNF_BRIDGED)
55#endif
56 ) {
57 fl.nl_u.ip4_u.daddr = iph->saddr;
58 if (hook == NF_IP_LOCAL_IN)
59 fl.nl_u.ip4_u.saddr = iph->daddr;
60 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
61
62 if (ip_route_output_key(&rt, &fl) != 0)
63 return NULL;
64 } else {
65 /* non-local src, find valid iif to satisfy
66 * rp-filter when calling ip_route_input. */
67 fl.nl_u.ip4_u.daddr = iph->daddr;
68 if (ip_route_output_key(&rt, &fl) != 0)
69 return NULL;
70
71 odst = skb->dst;
72 if (ip_route_input(skb, iph->saddr, iph->daddr,
73 RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
74 dst_release(&rt->u.dst);
75 return NULL;
76 }
77 dst_release(&rt->u.dst);
78 rt = (struct rtable *)skb->dst;
79 skb->dst = odst;
80
81 fl.nl_u.ip4_u.daddr = iph->saddr;
82 fl.nl_u.ip4_u.saddr = iph->daddr;
83 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
84 }
85
86 if (rt->u.dst.error) {
87 dst_release(&rt->u.dst);
88 return NULL;
89 }
90
91 fl.proto = IPPROTO_TCP;
92 fl.fl_ip_sport = tcph->dest;
93 fl.fl_ip_dport = tcph->source;
94
95 if (xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0)) {
96 dst_release(&rt->u.dst);
97 rt = NULL;
98 }
99
100 return rt;
101}
102
103/* Send RST reply */
104static void send_reset(struct sk_buff *oldskb, int hook)
105{
106 struct sk_buff *nskb;
107 struct tcphdr _otcph, *oth, *tcph;
108 struct rtable *rt;
109 u_int16_t tmp_port;
110 u_int32_t tmp_addr;
111 int needs_ack;
112 int hh_len;
113
114 /* IP header checks: fragment. */
115 if (oldskb->nh.iph->frag_off & htons(IP_OFFSET))
116 return;
117
118 oth = skb_header_pointer(oldskb, oldskb->nh.iph->ihl * 4,
119 sizeof(_otcph), &_otcph);
120 if (oth == NULL)
121 return;
122
123 /* No RST for RST. */
124 if (oth->rst)
125 return;
126
127 /* FIXME: Check checksum --RR */
128 if ((rt = route_reverse(oldskb, oth, hook)) == NULL)
129 return;
130
131 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
132
133 /* We need a linear, writeable skb. We also need to expand
134 headroom in case hh_len of incoming interface < hh_len of
135 outgoing interface */
136 nskb = skb_copy_expand(oldskb, hh_len, skb_tailroom(oldskb),
137 GFP_ATOMIC);
138 if (!nskb) {
139 dst_release(&rt->u.dst);
140 return;
141 }
142
143 dst_release(nskb->dst);
144 nskb->dst = &rt->u.dst;
145
146 /* This packet will not be the same as the other: clear nf fields */
147 nf_reset(nskb);
148 nskb->nfcache = 0;
149 nskb->nfmark = 0;
150#ifdef CONFIG_BRIDGE_NETFILTER
151 nf_bridge_put(nskb->nf_bridge);
152 nskb->nf_bridge = NULL;
153#endif
154
155 tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
156
157 /* Swap source and dest */
158 tmp_addr = nskb->nh.iph->saddr;
159 nskb->nh.iph->saddr = nskb->nh.iph->daddr;
160 nskb->nh.iph->daddr = tmp_addr;
161 tmp_port = tcph->source;
162 tcph->source = tcph->dest;
163 tcph->dest = tmp_port;
164
165 /* Truncate to length (no data) */
166 tcph->doff = sizeof(struct tcphdr)/4;
167 skb_trim(nskb, nskb->nh.iph->ihl*4 + sizeof(struct tcphdr));
168 nskb->nh.iph->tot_len = htons(nskb->len);
169
170 if (tcph->ack) {
171 needs_ack = 0;
172 tcph->seq = oth->ack_seq;
173 tcph->ack_seq = 0;
174 } else {
175 needs_ack = 1;
176 tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin
177 + oldskb->len - oldskb->nh.iph->ihl*4
178 - (oth->doff<<2));
179 tcph->seq = 0;
180 }
181
182 /* Reset flags */
183 ((u_int8_t *)tcph)[13] = 0;
184 tcph->rst = 1;
185 tcph->ack = needs_ack;
186
187 tcph->window = 0;
188 tcph->urg_ptr = 0;
189
190 /* Adjust TCP checksum */
191 tcph->check = 0;
192 tcph->check = tcp_v4_check(tcph, sizeof(struct tcphdr),
193 nskb->nh.iph->saddr,
194 nskb->nh.iph->daddr,
195 csum_partial((char *)tcph,
196 sizeof(struct tcphdr), 0));
197
198 /* Adjust IP TTL, DF */
199 nskb->nh.iph->ttl = MAXTTL;
200 /* Set DF, id = 0 */
201 nskb->nh.iph->frag_off = htons(IP_DF);
202 nskb->nh.iph->id = 0;
203
204 /* Adjust IP checksum */
205 nskb->nh.iph->check = 0;
206 nskb->nh.iph->check = ip_fast_csum((unsigned char *)nskb->nh.iph,
207 nskb->nh.iph->ihl);
208
209 /* "Never happens" */
210 if (nskb->len > dst_mtu(nskb->dst))
211 goto free_nskb;
212
213 nf_ct_attach(nskb, oldskb);
214
215 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
216 dst_output);
217 return;
218
219 free_nskb:
220 kfree_skb(nskb);
221}
222
223static inline void send_unreach(struct sk_buff *skb_in, int code)
224{
225 icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
226}
227
228static unsigned int reject(struct sk_buff **pskb,
229 const struct net_device *in,
230 const struct net_device *out,
231 unsigned int hooknum,
232 const void *targinfo,
233 void *userinfo)
234{
235 const struct ipt_reject_info *reject = targinfo;
236
237 /* Our naive response construction doesn't deal with IP
238 options, and probably shouldn't try. */
239 if ((*pskb)->nh.iph->ihl<<2 != sizeof(struct iphdr))
240 return NF_DROP;
241
242 /* WARNING: This code causes reentry within iptables.
243 This means that the iptables jump stack is now crap. We
244 must return an absolute verdict. --RR */
245 switch (reject->with) {
246 case IPT_ICMP_NET_UNREACHABLE:
247 send_unreach(*pskb, ICMP_NET_UNREACH);
248 break;
249 case IPT_ICMP_HOST_UNREACHABLE:
250 send_unreach(*pskb, ICMP_HOST_UNREACH);
251 break;
252 case IPT_ICMP_PROT_UNREACHABLE:
253 send_unreach(*pskb, ICMP_PROT_UNREACH);
254 break;
255 case IPT_ICMP_PORT_UNREACHABLE:
256 send_unreach(*pskb, ICMP_PORT_UNREACH);
257 break;
258 case IPT_ICMP_NET_PROHIBITED:
259 send_unreach(*pskb, ICMP_NET_ANO);
260 break;
261 case IPT_ICMP_HOST_PROHIBITED:
262 send_unreach(*pskb, ICMP_HOST_ANO);
263 break;
264 case IPT_ICMP_ADMIN_PROHIBITED:
265 send_unreach(*pskb, ICMP_PKT_FILTERED);
266 break;
267 case IPT_TCP_RESET:
268 send_reset(*pskb, hooknum);
269 case IPT_ICMP_ECHOREPLY:
270 /* Doesn't happen. */
271 break;
272 }
273
274 return NF_DROP;
275}
276
277static int check(const char *tablename,
278 const struct ipt_entry *e,
279 void *targinfo,
280 unsigned int targinfosize,
281 unsigned int hook_mask)
282{
283 const struct ipt_reject_info *rejinfo = targinfo;
284
285 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_reject_info))) {
286		DEBUGP("REJECT: bad targinfosize %u\n", targinfosize);
287 return 0;
288 }
289
290 /* Only allow these for packet filtering. */
291 if (strcmp(tablename, "filter") != 0) {
292 DEBUGP("REJECT: bad table `%s'.\n", tablename);
293 return 0;
294 }
295 if ((hook_mask & ~((1 << NF_IP_LOCAL_IN)
296 | (1 << NF_IP_FORWARD)
297 | (1 << NF_IP_LOCAL_OUT))) != 0) {
298 DEBUGP("REJECT: bad hook mask %X\n", hook_mask);
299 return 0;
300 }
301
302 if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
303 printk("REJECT: ECHOREPLY no longer supported.\n");
304 return 0;
305 } else if (rejinfo->with == IPT_TCP_RESET) {
306 /* Must specify that it's a TCP packet */
307 if (e->ip.proto != IPPROTO_TCP
308 || (e->ip.invflags & IPT_INV_PROTO)) {
309 DEBUGP("REJECT: TCP_RESET invalid for non-tcp\n");
310 return 0;
311 }
312 }
313
314 return 1;
315}
316
317static struct ipt_target ipt_reject_reg = {
318 .name = "REJECT",
319 .target = reject,
320 .checkentry = check,
321 .me = THIS_MODULE,
322};
323
324static int __init init(void)
325{
326 return ipt_register_target(&ipt_reject_reg);
327}
328
329static void __exit fini(void)
330{
331 ipt_unregister_target(&ipt_reject_reg);
332}
333
334module_init(init);
335module_exit(fini);
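
The ack_seq computed in send_reset() when the offending segment carried no ACK is that segment's sequence number advanced by the SYN and FIN flags plus the payload length (total IP length minus IP and TCP header lengths). A small user-space sketch of that arithmetic, with made-up header sizes and sequence numbers, purely as an illustration:

#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as send_reset(): the RST must acknowledge everything
 * the peer sent, i.e. seq advanced by SYN, FIN and data length. */
static uint32_t rst_ack_seq(uint32_t seq, unsigned int syn, unsigned int fin,
			    unsigned int ip_tot_len, unsigned int ip_hdr_len,
			    unsigned int tcp_hdr_len)
{
	unsigned int payload = ip_tot_len - ip_hdr_len - tcp_hdr_len;

	return seq + syn + fin + payload;
}

int main(void)
{
	/* Hypothetical SYN with no data: seq 1000, 20-byte IP and TCP headers. */
	printf("%u\n", rst_ack_seq(1000, 1, 0, 40, 20, 20));	/* prints 1001 */
	return 0;
}
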
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
new file mode 100644
index 000000000000..7a0536d864ac
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SAME.c
@@ -0,0 +1,211 @@
1/* Same. Just like SNAT, only try to make the connections
2 * between client A and server B always have the same source ip.
3 *
4 * (C) 2000 Paul `Rusty' Russell
5 * (C) 2001 Martin Josefsson
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * 010320 Martin Josefsson <gandalf@wlug.westbo.se>
12 * * copied ipt_BALANCE.c to ipt_SAME.c and changed a few things.
13 * 010728 Martin Josefsson <gandalf@wlug.westbo.se>
14 * * added --nodst to not include destination-ip in new source
15 * calculations.
16 * * added some more sanity-checks.
17 * 010729 Martin Josefsson <gandalf@wlug.westbo.se>
18 * * fixed a buggy if-statement in same_check(), should have
19 * used ntohl() but didn't.
20 * * added support for multiple ranges. IPT_SAME_MAX_RANGE is
21 * defined in linux/include/linux/netfilter_ipv4/ipt_SAME.h
22 * and is currently set to 10.
23 * * added support for 1-address range, nice to have now that
24 * we have multiple ranges.
25 */
26#include <linux/types.h>
27#include <linux/ip.h>
28#include <linux/timer.h>
29#include <linux/module.h>
30#include <linux/netfilter.h>
31#include <linux/netdevice.h>
32#include <linux/if.h>
33#include <linux/inetdevice.h>
34#include <net/protocol.h>
35#include <net/checksum.h>
36#include <linux/netfilter_ipv4.h>
37#include <linux/netfilter_ipv4/ip_nat_rule.h>
38#include <linux/netfilter_ipv4/ipt_SAME.h>
39
40MODULE_LICENSE("GPL");
41MODULE_AUTHOR("Martin Josefsson <gandalf@wlug.westbo.se>");
42MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip");
43
44#if 0
45#define DEBUGP printk
46#else
47#define DEBUGP(format, args...)
48#endif
49
50static int
51same_check(const char *tablename,
52 const struct ipt_entry *e,
53 void *targinfo,
54 unsigned int targinfosize,
55 unsigned int hook_mask)
56{
57 unsigned int count, countess, rangeip, index = 0;
58 struct ipt_same_info *mr = targinfo;
59
60 mr->ipnum = 0;
61
62 if (strcmp(tablename, "nat") != 0) {
63 DEBUGP("same_check: bad table `%s'.\n", tablename);
64 return 0;
65 }
66 if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
67 DEBUGP("same_check: size %u.\n", targinfosize);
68 return 0;
69 }
70 if (hook_mask & ~(1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING)) {
71 DEBUGP("same_check: bad hooks %x.\n", hook_mask);
72 return 0;
73 }
74 if (mr->rangesize < 1) {
75 DEBUGP("same_check: need at least one dest range.\n");
76 return 0;
77 }
78 if (mr->rangesize > IPT_SAME_MAX_RANGE) {
79 DEBUGP("same_check: too many ranges specified, maximum "
80 "is %u ranges\n",
81 IPT_SAME_MAX_RANGE);
82 return 0;
83 }
84 for (count = 0; count < mr->rangesize; count++) {
85 if (ntohl(mr->range[count].min_ip) >
86 ntohl(mr->range[count].max_ip)) {
87 DEBUGP("same_check: min_ip is larger than max_ip in "
88 "range `%u.%u.%u.%u-%u.%u.%u.%u'.\n",
89 NIPQUAD(mr->range[count].min_ip),
90 NIPQUAD(mr->range[count].max_ip));
91 return 0;
92 }
93 if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) {
94 DEBUGP("same_check: bad MAP_IPS.\n");
95 return 0;
96 }
97 rangeip = (ntohl(mr->range[count].max_ip) -
98 ntohl(mr->range[count].min_ip) + 1);
99 mr->ipnum += rangeip;
100
101 DEBUGP("same_check: range %u, ipnum = %u\n", count, rangeip);
102 }
103 DEBUGP("same_check: total ipaddresses = %u\n", mr->ipnum);
104
105 mr->iparray = kmalloc((sizeof(u_int32_t) * mr->ipnum), GFP_KERNEL);
106 if (!mr->iparray) {
107 DEBUGP("same_check: Couldn't allocate %u bytes "
108 "for %u ipaddresses!\n",
109 (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
110 return 0;
111 }
112 DEBUGP("same_check: Allocated %u bytes for %u ipaddresses.\n",
113 (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
114
115 for (count = 0; count < mr->rangesize; count++) {
116 for (countess = ntohl(mr->range[count].min_ip);
117 countess <= ntohl(mr->range[count].max_ip);
118 countess++) {
119 mr->iparray[index] = countess;
120 DEBUGP("same_check: Added ipaddress `%u.%u.%u.%u' "
121 "in index %u.\n",
122 HIPQUAD(countess), index);
123 index++;
124 }
125 }
126 return 1;
127}
128
129static void
130same_destroy(void *targinfo,
131 unsigned int targinfosize)
132{
133 struct ipt_same_info *mr = targinfo;
134
135 kfree(mr->iparray);
136
137 DEBUGP("same_destroy: Deallocated %u bytes for %u ipaddresses.\n",
138 (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
139}
140
141static unsigned int
142same_target(struct sk_buff **pskb,
143 const struct net_device *in,
144 const struct net_device *out,
145 unsigned int hooknum,
146 const void *targinfo,
147 void *userinfo)
148{
149 struct ip_conntrack *ct;
150 enum ip_conntrack_info ctinfo;
151 u_int32_t tmpip, aindex, new_ip;
152 const struct ipt_same_info *same = targinfo;
153 struct ip_nat_range newrange;
154 const struct ip_conntrack_tuple *t;
155
156 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING ||
157 hooknum == NF_IP_POST_ROUTING);
158 ct = ip_conntrack_get(*pskb, &ctinfo);
159
160 t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
161
162 /* Base new source on real src ip and optionally dst ip,
163 giving some hope for consistency across reboots.
164 Here we calculate the index in same->iparray which
165 holds the ipaddress we should use */
166
167 tmpip = ntohl(t->src.ip);
168
169 if (!(same->info & IPT_SAME_NODST))
170 tmpip += ntohl(t->dst.ip);
171
172 aindex = tmpip % same->ipnum;
173
174 new_ip = htonl(same->iparray[aindex]);
175
176 DEBUGP("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, "
177 "new src=%u.%u.%u.%u\n",
178 NIPQUAD(t->src.ip), NIPQUAD(t->dst.ip),
179 NIPQUAD(new_ip));
180
181 /* Transfer from original range. */
182 newrange = ((struct ip_nat_range)
183 { same->range[0].flags, new_ip, new_ip,
184 /* FIXME: Use ports from correct range! */
185 same->range[0].min, same->range[0].max });
186
187 /* Hand modified range to generic setup. */
188 return ip_nat_setup_info(ct, &newrange, hooknum);
189}
190
191static struct ipt_target same_reg = {
192 .name = "SAME",
193 .target = same_target,
194 .checkentry = same_check,
195 .destroy = same_destroy,
196 .me = THIS_MODULE,
197};
198
199static int __init init(void)
200{
201 return ipt_register_target(&same_reg);
202}
203
204static void __exit fini(void)
205{
206 ipt_unregister_target(&same_reg);
207}
208
209module_init(init);
210module_exit(fini);
211
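
same_check() above flattens every configured range into iparray, and same_target() picks an entry by taking the host-order source address (plus, unless --nodst was given, the destination address) modulo the number of pool addresses, so a given client/server pair keeps getting the same SNAT address. A user-space sketch of that selection; the pool contents and addresses are invented for the example:

#include <stdio.h>
#include <stdint.h>

/* Same index computation as same_target(): (src [+ dst]) % ipnum. */
static uint32_t same_pick(uint32_t src, uint32_t dst, int nodst,
			  const uint32_t *iparray, unsigned int ipnum)
{
	uint32_t key = src;

	if (!nodst)
		key += dst;

	return iparray[key % ipnum];
}

int main(void)
{
	/* Hypothetical pool of four SNAT addresses, stored in host order. */
	uint32_t pool[] = { 0x0A000001, 0x0A000002, 0x0A000003, 0x0A000004 };
	uint32_t src = 0xC0A80105;	/* 192.168.1.5 */
	uint32_t dst = 0x08080808;	/* 8.8.8.8 */

	/* The same pair always maps to the same pool entry. */
	printf("0x%08x\n", same_pick(src, dst, 0, pool, 4));
	printf("0x%08x\n", same_pick(src, dst, 0, pool, 4));
	return 0;
}
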
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
new file mode 100644
index 000000000000..1049050b2bfb
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -0,0 +1,262 @@
1/*
2 * This is a module which is used for setting the MSS option in TCP packets.
3 *
4 * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13
14#include <linux/ip.h>
15#include <net/tcp.h>
16
17#include <linux/netfilter_ipv4/ip_tables.h>
18#include <linux/netfilter_ipv4/ipt_TCPMSS.h>
19
20MODULE_LICENSE("GPL");
21MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
22MODULE_DESCRIPTION("iptables TCP MSS modification module");
23
24#if 0
25#define DEBUGP printk
26#else
27#define DEBUGP(format, args...)
28#endif
29
30static u_int16_t
31cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
32{
33 u_int32_t diffs[] = { oldvalinv, newval };
34 return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
35 oldcheck^0xFFFF));
36}
37
38static inline unsigned int
39optlen(const u_int8_t *opt, unsigned int offset)
40{
41 /* Beware zero-length options: make finite progress */
42 if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) return 1;
43 else return opt[offset+1];
44}
45
46static unsigned int
47ipt_tcpmss_target(struct sk_buff **pskb,
48 const struct net_device *in,
49 const struct net_device *out,
50 unsigned int hooknum,
51 const void *targinfo,
52 void *userinfo)
53{
54 const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
55 struct tcphdr *tcph;
56 struct iphdr *iph;
57 u_int16_t tcplen, newtotlen, oldval, newmss;
58 unsigned int i;
59 u_int8_t *opt;
60
61 if (!skb_ip_make_writable(pskb, (*pskb)->len))
62 return NF_DROP;
63
64 iph = (*pskb)->nh.iph;
65 tcplen = (*pskb)->len - iph->ihl*4;
66
67 tcph = (void *)iph + iph->ihl*4;
68
69	/* Since it passed flags test in tcp match, we know it is
70 not a fragment, and has data >= tcp header length. SYN
71 packets should not contain data: if they did, then we risk
72 running over MTU, sending Frag Needed and breaking things
73 badly. --RR */
74 if (tcplen != tcph->doff*4) {
75 if (net_ratelimit())
76 printk(KERN_ERR
77 "ipt_tcpmss_target: bad length (%d bytes)\n",
78 (*pskb)->len);
79 return NF_DROP;
80 }
81
82 if(tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) {
83 if(!(*pskb)->dst) {
84 if (net_ratelimit())
85 printk(KERN_ERR
86 "ipt_tcpmss_target: no dst?! can't determine path-MTU\n");
87 return NF_DROP; /* or IPT_CONTINUE ?? */
88 }
89
90 if(dst_mtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) {
91 if (net_ratelimit())
92 printk(KERN_ERR
93 "ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", dst_mtu((*pskb)->dst));
94 return NF_DROP; /* or IPT_CONTINUE ?? */
95 }
96
97 newmss = dst_mtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr);
98 } else
99 newmss = tcpmssinfo->mss;
100
101 opt = (u_int8_t *)tcph;
102 for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)){
103 if ((opt[i] == TCPOPT_MSS) &&
104 ((tcph->doff*4 - i) >= TCPOLEN_MSS) &&
105 (opt[i+1] == TCPOLEN_MSS)) {
106 u_int16_t oldmss;
107
108 oldmss = (opt[i+2] << 8) | opt[i+3];
109
110 if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) &&
111 (oldmss <= newmss))
112 return IPT_CONTINUE;
113
114 opt[i+2] = (newmss & 0xff00) >> 8;
115 opt[i+3] = (newmss & 0x00ff);
116
117 tcph->check = cheat_check(htons(oldmss)^0xFFFF,
118 htons(newmss),
119 tcph->check);
120
121 DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu"
122 "->%u.%u.%u.%u:%hu changed TCP MSS option"
123 " (from %u to %u)\n",
124 NIPQUAD((*pskb)->nh.iph->saddr),
125 ntohs(tcph->source),
126 NIPQUAD((*pskb)->nh.iph->daddr),
127 ntohs(tcph->dest),
128 oldmss, newmss);
129 goto retmodified;
130 }
131 }
132
133 /*
134	 * MSS option not found?! Add it.
135 */
136 if (skb_tailroom((*pskb)) < TCPOLEN_MSS) {
137 struct sk_buff *newskb;
138
139 newskb = skb_copy_expand(*pskb, skb_headroom(*pskb),
140 TCPOLEN_MSS, GFP_ATOMIC);
141 if (!newskb) {
142 if (net_ratelimit())
143 printk(KERN_ERR "ipt_tcpmss_target:"
144 " unable to allocate larger skb\n");
145 return NF_DROP;
146 }
147
148 kfree_skb(*pskb);
149 *pskb = newskb;
150 iph = (*pskb)->nh.iph;
151 tcph = (void *)iph + iph->ihl*4;
152 }
153
154 skb_put((*pskb), TCPOLEN_MSS);
155
156 opt = (u_int8_t *)tcph + sizeof(struct tcphdr);
157 memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr));
158
159 tcph->check = cheat_check(htons(tcplen) ^ 0xFFFF,
160 htons(tcplen + TCPOLEN_MSS), tcph->check);
161 tcplen += TCPOLEN_MSS;
162
163 opt[0] = TCPOPT_MSS;
164 opt[1] = TCPOLEN_MSS;
165 opt[2] = (newmss & 0xff00) >> 8;
166 opt[3] = (newmss & 0x00ff);
167
168 tcph->check = cheat_check(~0, *((u_int32_t *)opt), tcph->check);
169
170 oldval = ((u_int16_t *)tcph)[6];
171 tcph->doff += TCPOLEN_MSS/4;
172 tcph->check = cheat_check(oldval ^ 0xFFFF,
173 ((u_int16_t *)tcph)[6], tcph->check);
174
175 newtotlen = htons(ntohs(iph->tot_len) + TCPOLEN_MSS);
176 iph->check = cheat_check(iph->tot_len ^ 0xFFFF,
177 newtotlen, iph->check);
178 iph->tot_len = newtotlen;
179
180 DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu"
181 "->%u.%u.%u.%u:%hu added TCP MSS option (%u)\n",
182 NIPQUAD((*pskb)->nh.iph->saddr),
183 ntohs(tcph->source),
184 NIPQUAD((*pskb)->nh.iph->daddr),
185 ntohs(tcph->dest),
186 newmss);
187
188 retmodified:
189 /* We never hw checksum SYN packets. */
190 BUG_ON((*pskb)->ip_summed == CHECKSUM_HW);
191
192 (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
193 return IPT_CONTINUE;
194}
195
196#define TH_SYN 0x02
197
198static inline int find_syn_match(const struct ipt_entry_match *m)
199{
200 const struct ipt_tcp *tcpinfo = (const struct ipt_tcp *)m->data;
201
202 if (strcmp(m->u.kernel.match->name, "tcp") == 0
203 && (tcpinfo->flg_cmp & TH_SYN)
204 && !(tcpinfo->invflags & IPT_TCP_INV_FLAGS))
205 return 1;
206
207 return 0;
208}
209
210/* Must specify -p tcp --syn/--tcp-flags SYN */
211static int
212ipt_tcpmss_checkentry(const char *tablename,
213 const struct ipt_entry *e,
214 void *targinfo,
215 unsigned int targinfosize,
216 unsigned int hook_mask)
217{
218 const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
219
220 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tcpmss_info))) {
221 DEBUGP("ipt_tcpmss_checkentry: targinfosize %u != %u\n",
222 targinfosize, IPT_ALIGN(sizeof(struct ipt_tcpmss_info)));
223 return 0;
224 }
225
226
227 if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) &&
228 ((hook_mask & ~((1 << NF_IP_FORWARD)
229 | (1 << NF_IP_LOCAL_OUT)
230 | (1 << NF_IP_POST_ROUTING))) != 0)) {
231 printk("TCPMSS: path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
232 return 0;
233 }
234
235 if (e->ip.proto == IPPROTO_TCP
236 && !(e->ip.invflags & IPT_INV_PROTO)
237 && IPT_MATCH_ITERATE(e, find_syn_match))
238 return 1;
239
240 printk("TCPMSS: Only works on TCP SYN packets\n");
241 return 0;
242}
243
244static struct ipt_target ipt_tcpmss_reg = {
245 .name = "TCPMSS",
246 .target = ipt_tcpmss_target,
247 .checkentry = ipt_tcpmss_checkentry,
248 .me = THIS_MODULE,
249};
250
251static int __init init(void)
252{
253 return ipt_register_target(&ipt_tcpmss_reg);
254}
255
256static void __exit fini(void)
257{
258 ipt_unregister_target(&ipt_tcpmss_reg);
259}
260
261module_init(init);
262module_exit(fini);
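
When IPT_TCPMSS_CLAMP_PMTU is set, ipt_tcpmss_target() above rewrites the MSS option to the path MTU minus the minimal IP and TCP header sizes, and only if the advertised MSS is larger. A user-space sketch of that clamping and of the option-byte rewrite; the incremental checksum fixups done by cheat_check() are deliberately omitted, and the 1492-byte MTU is just an example:

#include <stdio.h>
#include <stdint.h>

#define IP_HDR_MIN	20	/* sizeof(struct iphdr) without options */
#define TCP_HDR_MIN	20	/* sizeof(struct tcphdr) without options */

/* Clamp the two MSS option value bytes to what the path MTU allows. */
static int clamp_mss(uint8_t *opt_val, unsigned int path_mtu)
{
	uint16_t oldmss = (opt_val[0] << 8) | opt_val[1];
	uint16_t newmss = path_mtu - IP_HDR_MIN - TCP_HDR_MIN;

	if (oldmss <= newmss)
		return 0;			/* already small enough */

	opt_val[0] = newmss >> 8;
	opt_val[1] = newmss & 0xff;
	return 1;
}

int main(void)
{
	uint8_t mss_bytes[2] = { 0x05, 0xb4 };	/* 1460, the usual Ethernet MSS */

	/* Hypothetical PPPoE-sized path MTU of 1492 -> MSS becomes 1452. */
	clamp_mss(mss_bytes, 1492);
	printf("%u\n", (mss_bytes[0] << 8) | mss_bytes[1]);
	return 0;
}
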
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
new file mode 100644
index 000000000000..85c70d240f8b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -0,0 +1,105 @@
1/* This is a module which is used for setting the TOS field of a packet. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/ip.h>
14#include <net/checksum.h>
15
16#include <linux/netfilter_ipv4/ip_tables.h>
17#include <linux/netfilter_ipv4/ipt_TOS.h>
18
19MODULE_LICENSE("GPL");
20MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
21MODULE_DESCRIPTION("iptables TOS mangling module");
22
23static unsigned int
24target(struct sk_buff **pskb,
25 const struct net_device *in,
26 const struct net_device *out,
27 unsigned int hooknum,
28 const void *targinfo,
29 void *userinfo)
30{
31 const struct ipt_tos_target_info *tosinfo = targinfo;
32
33 if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
34 u_int16_t diffs[2];
35
36 if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
37 return NF_DROP;
38
39 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
40 (*pskb)->nh.iph->tos
41 = ((*pskb)->nh.iph->tos & IPTOS_PREC_MASK)
42 | tosinfo->tos;
43 diffs[1] = htons((*pskb)->nh.iph->tos);
44 (*pskb)->nh.iph->check
45 = csum_fold(csum_partial((char *)diffs,
46 sizeof(diffs),
47 (*pskb)->nh.iph->check
48 ^0xFFFF));
49 (*pskb)->nfcache |= NFC_ALTERED;
50 }
51 return IPT_CONTINUE;
52}
53
54static int
55checkentry(const char *tablename,
56 const struct ipt_entry *e,
57 void *targinfo,
58 unsigned int targinfosize,
59 unsigned int hook_mask)
60{
61 const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos;
62
63 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tos_target_info))) {
64 printk(KERN_WARNING "TOS: targinfosize %u != %Zu\n",
65 targinfosize,
66 IPT_ALIGN(sizeof(struct ipt_tos_target_info)));
67 return 0;
68 }
69
70 if (strcmp(tablename, "mangle") != 0) {
71 printk(KERN_WARNING "TOS: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
72 return 0;
73 }
74
75 if (tos != IPTOS_LOWDELAY
76 && tos != IPTOS_THROUGHPUT
77 && tos != IPTOS_RELIABILITY
78 && tos != IPTOS_MINCOST
79 && tos != IPTOS_NORMALSVC) {
80 printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
81 return 0;
82 }
83
84 return 1;
85}
86
87static struct ipt_target ipt_tos_reg = {
88 .name = "TOS",
89 .target = target,
90 .checkentry = checkentry,
91 .me = THIS_MODULE,
92};
93
94static int __init init(void)
95{
96 return ipt_register_target(&ipt_tos_reg);
97}
98
99static void __exit fini(void)
100{
101 ipt_unregister_target(&ipt_tos_reg);
102}
103
104module_init(init);
105module_exit(fini);
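
target() above replaces only the TOS bits and keeps the precedence bits, then patches the header checksum incrementally. A user-space sketch of the bit manipulation alone (the checksum update is left out; the mask values mirror the standard IPTOS_* definitions, and the sample byte values are invented):

#include <stdio.h>
#include <stdint.h>

#define IPTOS_TOS_MASK	0x1E	/* the four TOS bits */
#define IPTOS_PREC_MASK	0xE0	/* the three precedence bits */

/* Keep the precedence bits, replace the TOS bits. */
static uint8_t set_tos(uint8_t old_tos, uint8_t new_tos)
{
	if ((old_tos & IPTOS_TOS_MASK) == new_tos)
		return old_tos;			/* already set, nothing to change */

	return (old_tos & IPTOS_PREC_MASK) | new_tos;
}

int main(void)
{
	/* Hypothetical packet: precedence 0xA0 with "throughput" (0x08),
	 * rewritten to "low delay" (0x10). */
	printf("0x%02x\n", set_tos(0xA8, 0x10));	/* prints 0xb0 */
	return 0;
}
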
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
new file mode 100644
index 000000000000..6f2cefbe16cd
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -0,0 +1,419 @@
1/*
2 * netfilter module for userspace packet logging daemons
3 *
4 * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
5 *
6 * 2000/09/22 ulog-cprange feature added
7 * 2001/01/04 in-kernel queue as proposed by Sebastian Zander
8 * <zander@fokus.gmd.de>
9 * 2001/01/30 per-rule nlgroup conflicts with global queue.
10 * nlgroup now global (sysctl)
11 * 2001/04/19 ulog-queue reworked, now fixed buffer size specified at
12 * module loadtime -HW
13 * 2002/07/07 remove broken nflog_rcv() function -HW
14 * 2002/08/29 fix shifted/unshifted nlgroup bug -HW
15 * 2002/10/30 fix uninitialized mac_len field - <Anders K. Pedersen>
16 * 2004/10/25 fix erroneous calculation of 'len' parameter to NLMSG_PUT
17 * resulting in bogus 'error during NLMSG_PUT' messages.
18 *
19 * (C) 1999-2001 Paul `Rusty' Russell
20 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
21 *
22 * This program is free software; you can redistribute it and/or modify
23 * it under the terms of the GNU General Public License version 2 as
24 * published by the Free Software Foundation.
25 *
26 * This module accepts two parameters:
27 *
28 * nlbufsiz:
29 * The parameter specifies how big the buffer for each netlink multicast
30 * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
31 * get accumulated in the kernel until they are sent to userspace. It is
32 * NOT possible to allocate more than 128kB, and it is strongly discouraged,
33 * because atomically allocating 128kB inside the network rx softirq is not
34 * reliable. Please also keep in mind that this buffer size is allocated for
35 * each nlgroup you are using, so the total kernel memory usage increases
36 * by that factor.
37 *
38 * flushtimeout:
39 * Specifies after how many hundredths of a second the queue should be
40 * flushed even if it is not full yet.
41 *
42 * ipt_ULOG.c,v 1.22 2002/10/30 09:07:31 laforge Exp
43 */
44
45#include <linux/module.h>
46#include <linux/config.h>
47#include <linux/spinlock.h>
48#include <linux/socket.h>
49#include <linux/skbuff.h>
50#include <linux/kernel.h>
51#include <linux/timer.h>
52#include <linux/netlink.h>
53#include <linux/netdevice.h>
54#include <linux/mm.h>
55#include <linux/moduleparam.h>
56#include <linux/netfilter.h>
57#include <linux/netfilter_ipv4/ip_tables.h>
58#include <linux/netfilter_ipv4/ipt_ULOG.h>
59#include <linux/netfilter_ipv4/lockhelp.h>
60#include <net/sock.h>
61#include <linux/bitops.h>
62
63MODULE_LICENSE("GPL");
64MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
65MODULE_DESCRIPTION("iptables userspace logging module");
66
67#define ULOG_NL_EVENT 111 /* Harald's favorite number */
68#define ULOG_MAXNLGROUPS	32		/* number of nlgroups */
69
70#if 0
71#define DEBUGP(format, args...) printk("%s:%s:" format, \
72 __FILE__, __FUNCTION__ , ## args)
73#else
74#define DEBUGP(format, args...)
75#endif
76
77#define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0)
78
79static unsigned int nlbufsiz = 4096;
80module_param(nlbufsiz, uint, 0600); /* FIXME: Check size < 128k --RR */
81MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
82
83static unsigned int flushtimeout = 10;
84module_param(flushtimeout, int, 0600);
85MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
86
87static unsigned int nflog = 1;
88module_param(nflog, int, 0400);
89MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
90
91/* global data structures */
92
93typedef struct {
94 unsigned int qlen; /* number of nlmsgs' in the skb */
95 struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
96 struct sk_buff *skb; /* the pre-allocated skb */
97 struct timer_list timer; /* the timer function */
98} ulog_buff_t;
99
100static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
101
102static struct sock *nflognl; /* our socket */
103static DECLARE_LOCK(ulog_lock); /* spinlock */
104
105/* send one ulog_buff_t to userspace */
106static void ulog_send(unsigned int nlgroupnum)
107{
108 ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
109
110 if (timer_pending(&ub->timer)) {
111 DEBUGP("ipt_ULOG: ulog_send: timer was pending, deleting\n");
112 del_timer(&ub->timer);
113 }
114
115 /* last nlmsg needs NLMSG_DONE */
116 if (ub->qlen > 1)
117 ub->lastnlh->nlmsg_type = NLMSG_DONE;
118
119 NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum);
120 DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n",
121 ub->qlen, nlgroupnum);
122 netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC);
123
124 ub->qlen = 0;
125 ub->skb = NULL;
126 ub->lastnlh = NULL;
127
128}
129
130
131/* timer function to flush queue in flushtimeout time */
132static void ulog_timer(unsigned long data)
133{
134 DEBUGP("ipt_ULOG: timer function called, calling ulog_send\n");
135
136 /* lock to protect against somebody modifying our structure
137 * from ipt_ulog_target at the same time */
138 LOCK_BH(&ulog_lock);
139 ulog_send(data);
140 UNLOCK_BH(&ulog_lock);
141}
142
143static struct sk_buff *ulog_alloc_skb(unsigned int size)
144{
145 struct sk_buff *skb;
146
147 /* alloc skb which should be big enough for a whole
148 * multipart message. WARNING: has to be <= 131000
149 * due to slab allocator restrictions */
150
151 skb = alloc_skb(nlbufsiz, GFP_ATOMIC);
152 if (!skb) {
153 PRINTR("ipt_ULOG: can't alloc whole buffer %ub!\n",
154 nlbufsiz);
155
156 /* try to allocate only as much as we need for
157 * current packet */
158
159 skb = alloc_skb(size, GFP_ATOMIC);
160 if (!skb)
161 PRINTR("ipt_ULOG: can't even allocate %ub\n", size);
162 }
163
164 return skb;
165}
166
167static void ipt_ulog_packet(unsigned int hooknum,
168 const struct sk_buff *skb,
169 const struct net_device *in,
170 const struct net_device *out,
171 const struct ipt_ulog_info *loginfo,
172 const char *prefix)
173{
174 ulog_buff_t *ub;
175 ulog_packet_msg_t *pm;
176 size_t size, copy_len;
177 struct nlmsghdr *nlh;
178
179 /* ffs == find first bit set, necessary because userspace
180 * is already shifting groupnumber, but we need unshifted.
181 * ffs() returns [1..32], we need [0..31] */
182 unsigned int groupnum = ffs(loginfo->nl_group) - 1;
183
184 /* calculate the size of the skb needed */
185 if ((loginfo->copy_range == 0) ||
186 (loginfo->copy_range > skb->len)) {
187 copy_len = skb->len;
188 } else {
189 copy_len = loginfo->copy_range;
190 }
191
192 size = NLMSG_SPACE(sizeof(*pm) + copy_len);
193
194 ub = &ulog_buffers[groupnum];
195
196 LOCK_BH(&ulog_lock);
197
198 if (!ub->skb) {
199 if (!(ub->skb = ulog_alloc_skb(size)))
200 goto alloc_failure;
201 } else if (ub->qlen >= loginfo->qthreshold ||
202 size > skb_tailroom(ub->skb)) {
203 /* either the queue len is too high or we don't have
204 * enough room in nlskb left. send it to userspace. */
205
206 ulog_send(groupnum);
207
208 if (!(ub->skb = ulog_alloc_skb(size)))
209 goto alloc_failure;
210 }
211
212 DEBUGP("ipt_ULOG: qlen %d, qthreshold %d\n", ub->qlen,
213 loginfo->qthreshold);
214
215 /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
216 nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
217 sizeof(*pm)+copy_len);
218 ub->qlen++;
219
220 pm = NLMSG_DATA(nlh);
221
222 /* We might not have a timestamp, get one */
223 if (skb->stamp.tv_sec == 0)
224 do_gettimeofday((struct timeval *)&skb->stamp);
225
226 /* copy hook, prefix, timestamp, payload, etc. */
227 pm->data_len = copy_len;
228 pm->timestamp_sec = skb->stamp.tv_sec;
229 pm->timestamp_usec = skb->stamp.tv_usec;
230 pm->mark = skb->nfmark;
231 pm->hook = hooknum;
232 if (prefix != NULL)
233 strncpy(pm->prefix, prefix, sizeof(pm->prefix));
234 else if (loginfo->prefix[0] != '\0')
235 strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
236 else
237 *(pm->prefix) = '\0';
238
239 if (in && in->hard_header_len > 0
240 && skb->mac.raw != (void *) skb->nh.iph
241 && in->hard_header_len <= ULOG_MAC_LEN) {
242 memcpy(pm->mac, skb->mac.raw, in->hard_header_len);
243 pm->mac_len = in->hard_header_len;
244 } else
245 pm->mac_len = 0;
246
247 if (in)
248 strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
249 else
250 pm->indev_name[0] = '\0';
251
252 if (out)
253 strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
254 else
255 pm->outdev_name[0] = '\0';
256
257 /* copy_len <= skb->len, so can't fail. */
258 if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
259 BUG();
260
261 /* check if we are building multi-part messages */
262 if (ub->qlen > 1) {
263 ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
264 }
265
266 ub->lastnlh = nlh;
267
268 /* if timer isn't already running, start it */
269 if (!timer_pending(&ub->timer)) {
270 ub->timer.expires = jiffies + flushtimeout * HZ / 100;
271 add_timer(&ub->timer);
272 }
273
274 /* if threshold is reached, send message to userspace */
275 if (ub->qlen >= loginfo->qthreshold) {
276 if (loginfo->qthreshold > 1)
277 nlh->nlmsg_type = NLMSG_DONE;
278 ulog_send(groupnum);
279 }
280
281 UNLOCK_BH(&ulog_lock);
282
283 return;
284
285nlmsg_failure:
286 PRINTR("ipt_ULOG: error during NLMSG_PUT\n");
287
288alloc_failure:
289 PRINTR("ipt_ULOG: Error building netlink message\n");
290
291 UNLOCK_BH(&ulog_lock);
292}
293
294static unsigned int ipt_ulog_target(struct sk_buff **pskb,
295 const struct net_device *in,
296 const struct net_device *out,
297 unsigned int hooknum,
298 const void *targinfo, void *userinfo)
299{
300 struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
301
302 ipt_ulog_packet(hooknum, *pskb, in, out, loginfo, NULL);
303
304 return IPT_CONTINUE;
305}
306
307static void ipt_logfn(unsigned int hooknum,
308 const struct sk_buff *skb,
309 const struct net_device *in,
310 const struct net_device *out,
311 const char *prefix)
312{
313 struct ipt_ulog_info loginfo = {
314 .nl_group = ULOG_DEFAULT_NLGROUP,
315 .copy_range = 0,
316 .qthreshold = ULOG_DEFAULT_QTHRESHOLD,
317 .prefix = ""
318 };
319
320 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
321}
322
323static int ipt_ulog_checkentry(const char *tablename,
324 const struct ipt_entry *e,
325 void *targinfo,
326 unsigned int targinfosize,
327 unsigned int hookmask)
328{
329 struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
330
331 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ulog_info))) {
332		DEBUGP("ipt_ULOG: bad targinfosize %u\n", targinfosize);
333 return 0;
334 }
335
336 if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
337 DEBUGP("ipt_ULOG: prefix term %i\n",
338 loginfo->prefix[sizeof(loginfo->prefix) - 1]);
339 return 0;
340 }
341
342 if (loginfo->qthreshold > ULOG_MAX_QLEN) {
343 DEBUGP("ipt_ULOG: queue threshold %i > MAX_QLEN\n",
344 loginfo->qthreshold);
345 return 0;
346 }
347
348 return 1;
349}
350
351static struct ipt_target ipt_ulog_reg = {
352 .name = "ULOG",
353 .target = ipt_ulog_target,
354 .checkentry = ipt_ulog_checkentry,
355 .me = THIS_MODULE,
356};
357
358static int __init init(void)
359{
360 int i;
361
362 DEBUGP("ipt_ULOG: init module\n");
363
364 if (nlbufsiz >= 128*1024) {
365 printk("Netlink buffer has to be <= 128kB\n");
366 return -EINVAL;
367 }
368
369 /* initialize ulog_buffers */
370 for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
371 init_timer(&ulog_buffers[i].timer);
372 ulog_buffers[i].timer.function = ulog_timer;
373 ulog_buffers[i].timer.data = i;
374 }
375
376 nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
377 if (!nflognl)
378 return -ENOMEM;
379
380 if (ipt_register_target(&ipt_ulog_reg) != 0) {
381 sock_release(nflognl->sk_socket);
382 return -EINVAL;
383 }
384 if (nflog)
385 nf_log_register(PF_INET, &ipt_logfn);
386
387 return 0;
388}
389
390static void __exit fini(void)
391{
392 ulog_buff_t *ub;
393 int i;
394
395 DEBUGP("ipt_ULOG: cleanup_module\n");
396
397 if (nflog)
398 nf_log_unregister(PF_INET, &ipt_logfn);
399 ipt_unregister_target(&ipt_ulog_reg);
400 sock_release(nflognl->sk_socket);
401
402 /* remove pending timers and free allocated skb's */
403 for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
404 ub = &ulog_buffers[i];
405 if (timer_pending(&ub->timer)) {
406 DEBUGP("timer was pending, deleting\n");
407 del_timer(&ub->timer);
408 }
409
410 if (ub->skb) {
411 kfree_skb(ub->skb);
412 ub->skb = NULL;
413 }
414 }
415
416}
417
418module_init(init);
419module_exit(fini);
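
Two small pieces of arithmetic above are easy to miss: the nlgroup coming from user space is a bitmask (group n is bit n), while the kernel buffers are indexed by the plain group number, hence the ffs() - 1 conversion in ipt_ulog_packet(); and the flushtimeout module parameter, given in hundredths of a second, is converted to jiffies for the flush timer. A user-space sketch of both conversions; the HZ value and the sample group bitmask are assumptions for the demo:

#include <stdio.h>
#include <strings.h>		/* ffs() */

#define HZ 1000			/* illustrative; the real value depends on kernel config */

int main(void)
{
	unsigned int nl_group = 0x20;		/* bitmask with bit 5 set */
	unsigned int flushtimeout = 10;		/* hundredths of a second */

	/* ffs() returns 1..32 for the lowest set bit; buffers are indexed 0..31. */
	unsigned int groupnum = ffs(nl_group) - 1;

	/* Same arithmetic as ub->timer.expires = jiffies + flushtimeout * HZ / 100. */
	unsigned long delay = (unsigned long)flushtimeout * HZ / 100;

	printf("group index %u, flush after %lu jiffies\n", groupnum, delay);
	return 0;
}
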
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
new file mode 100644
index 000000000000..f5909a4c3fc7
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -0,0 +1,77 @@
1/*
2 * iptables module to match inet_addr_type() of an ip.
3 *
4 * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/netdevice.h>
15#include <linux/ip.h>
16#include <net/route.h>
17
18#include <linux/netfilter_ipv4/ipt_addrtype.h>
19#include <linux/netfilter_ipv4/ip_tables.h>
20
21MODULE_LICENSE("GPL");
22MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
23MODULE_DESCRIPTION("iptables addrtype match");
24
25static inline int match_type(u_int32_t addr, u_int16_t mask)
26{
27 return !!(mask & (1 << inet_addr_type(addr)));
28}
29
30static int match(const struct sk_buff *skb, const struct net_device *in,
31 const struct net_device *out, const void *matchinfo,
32 int offset, int *hotdrop)
33{
34 const struct ipt_addrtype_info *info = matchinfo;
35 const struct iphdr *iph = skb->nh.iph;
36 int ret = 1;
37
38 if (info->source)
39 ret &= match_type(iph->saddr, info->source)^info->invert_source;
40 if (info->dest)
41 ret &= match_type(iph->daddr, info->dest)^info->invert_dest;
42
43 return ret;
44}
45
46static int checkentry(const char *tablename, const struct ipt_ip *ip,
47 void *matchinfo, unsigned int matchsize,
48 unsigned int hook_mask)
49{
50 if (matchsize != IPT_ALIGN(sizeof(struct ipt_addrtype_info))) {
51		printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu).\n",
52 matchsize, IPT_ALIGN(sizeof(struct ipt_addrtype_info)));
53 return 0;
54 }
55
56 return 1;
57}
58
59static struct ipt_match addrtype_match = {
60 .name = "addrtype",
61 .match = match,
62 .checkentry = checkentry,
63 .me = THIS_MODULE
64};
65
66static int __init init(void)
67{
68 return ipt_register_match(&addrtype_match);
69}
70
71static void __exit fini(void)
72{
73 ipt_unregister_match(&addrtype_match);
74}
75
76module_init(init);
77module_exit(fini);
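
match_type() above treats the rule's address-type selection as a bitmask indexed by the kernel's route type code, so membership is a single shift and AND. A user-space sketch of that test; the two type codes are illustrative stand-ins for the RTN_* constants, not their guaranteed values:

#include <stdio.h>

#define EXAMPLE_TYPE_UNICAST	1	/* stand-in for a route type code */
#define EXAMPLE_TYPE_LOCAL	2	/* stand-in for another route type code */

/* One bit per route type; a single AND tests membership. */
static int match_type(unsigned int addr_type, unsigned short mask)
{
	return !!(mask & (1 << addr_type));
}

int main(void)
{
	/* Hypothetical rule that accepts LOCAL addresses only. */
	unsigned short mask = 1 << EXAMPLE_TYPE_LOCAL;

	printf("%d %d\n", match_type(EXAMPLE_TYPE_LOCAL, mask),
	       match_type(EXAMPLE_TYPE_UNICAST, mask));		/* prints 1 0 */
	return 0;
}
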
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
new file mode 100644
index 000000000000..a0fea847cb72
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -0,0 +1,117 @@
1/* Kernel module to match AH parameters. */
2/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/ip.h>
12
13#include <linux/netfilter_ipv4/ipt_ah.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15
16MODULE_LICENSE("GPL");
17MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
18MODULE_DESCRIPTION("iptables AH SPI match module");
19
20#ifdef DEBUG_CONNTRACK
21#define duprintf(format, args...) printk(format , ## args)
22#else
23#define duprintf(format, args...)
24#endif
25
26/* Returns 1 if the spi is matched by the range, 0 otherwise */
27static inline int
28spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
29{
30 int r=0;
31 duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
32 min,spi,max);
33 r=(spi >= min && spi <= max) ^ invert;
34 duprintf(" result %s\n",r? "PASS" : "FAILED");
35 return r;
36}
37
38static int
39match(const struct sk_buff *skb,
40 const struct net_device *in,
41 const struct net_device *out,
42 const void *matchinfo,
43 int offset,
44 int *hotdrop)
45{
46 struct ip_auth_hdr _ahdr, *ah;
47 const struct ipt_ah *ahinfo = matchinfo;
48
49 /* Must not be a fragment. */
50 if (offset)
51 return 0;
52
53 ah = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
54 sizeof(_ahdr), &_ahdr);
55 if (ah == NULL) {
56 /* We've been asked to examine this packet, and we
57 * can't. Hence, no choice but to drop.
58 */
59 duprintf("Dropping evil AH tinygram.\n");
60 *hotdrop = 1;
61 return 0;
62 }
63
64 return spi_match(ahinfo->spis[0], ahinfo->spis[1],
65 ntohl(ah->spi),
66 !!(ahinfo->invflags & IPT_AH_INV_SPI));
67}
68
69/* Called when user tries to insert an entry of this type. */
70static int
71checkentry(const char *tablename,
72 const struct ipt_ip *ip,
73 void *matchinfo,
74 unsigned int matchinfosize,
75 unsigned int hook_mask)
76{
77 const struct ipt_ah *ahinfo = matchinfo;
78
79 /* Must specify proto == AH, and no unknown invflags */
80 if (ip->proto != IPPROTO_AH || (ip->invflags & IPT_INV_PROTO)) {
81 duprintf("ipt_ah: Protocol %u != %u\n", ip->proto,
82 IPPROTO_AH);
83 return 0;
84 }
85 if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_ah))) {
86 duprintf("ipt_ah: matchsize %u != %u\n",
87 matchinfosize, IPT_ALIGN(sizeof(struct ipt_ah)));
88 return 0;
89 }
90 if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
91 duprintf("ipt_ah: unknown flags %X\n",
92 ahinfo->invflags);
93 return 0;
94 }
95
96 return 1;
97}
98
99static struct ipt_match ah_match = {
100 .name = "ah",
101 .match = &match,
102 .checkentry = &checkentry,
103 .me = THIS_MODULE,
104};
105
106static int __init init(void)
107{
108 return ipt_register_match(&ah_match);
109}
110
111static void __exit cleanup(void)
112{
113 ipt_unregister_match(&ah_match);
114}
115
116module_init(init);
117module_exit(cleanup);
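
spi_match() above is a plain inclusive range check XORed with the inversion flag, the pattern most of the simple matches in this directory share. A user-space sketch with a made-up SPI range:

#include <stdio.h>
#include <stdint.h>

/* Inclusive range test, optionally inverted. */
static int spi_match(uint32_t min, uint32_t max, uint32_t spi, int invert)
{
	return (spi >= min && spi <= max) ^ invert;
}

int main(void)
{
	/* Hypothetical rule: --ahspi 256:512, optionally negated. */
	printf("%d\n", spi_match(256, 512, 300, 0));	/* 1: inside the range */
	printf("%d\n", spi_match(256, 512, 300, 1));	/* 0: inverted */
	return 0;
}
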
diff --git a/net/ipv4/netfilter/ipt_comment.c b/net/ipv4/netfilter/ipt_comment.c
new file mode 100644
index 000000000000..6b76a1ea5245
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_comment.c
@@ -0,0 +1,59 @@
1/*
2 * Implements a dummy match to allow attaching comments to rules
3 *
4 * 2003-05-13 Brad Fisher (brad@info-link.net)
5 */
6
7#include <linux/module.h>
8#include <linux/skbuff.h>
9#include <linux/netfilter_ipv4/ip_tables.h>
10#include <linux/netfilter_ipv4/ipt_comment.h>
11
12MODULE_AUTHOR("Brad Fisher <brad@info-link.net>");
13MODULE_DESCRIPTION("iptables comment match module");
14MODULE_LICENSE("GPL");
15
16static int
17match(const struct sk_buff *skb,
18 const struct net_device *in,
19 const struct net_device *out,
20 const void *matchinfo,
21 int offset,
22 int *hotdrop)
23{
24 /* We always match */
25 return 1;
26}
27
28static int
29checkentry(const char *tablename,
30 const struct ipt_ip *ip,
31 void *matchinfo,
32 unsigned int matchsize,
33 unsigned int hook_mask)
34{
35 /* Check the size */
36 if (matchsize != IPT_ALIGN(sizeof(struct ipt_comment_info)))
37 return 0;
38 return 1;
39}
40
41static struct ipt_match comment_match = {
42 .name = "comment",
43 .match = match,
44 .checkentry = checkentry,
45 .me = THIS_MODULE
46};
47
48static int __init init(void)
49{
50 return ipt_register_match(&comment_match);
51}
52
53static void __exit fini(void)
54{
55 ipt_unregister_match(&comment_match);
56}
57
58module_init(init);
59module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
new file mode 100644
index 000000000000..2706f96cea55
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -0,0 +1,81 @@
1/* This kernel module matches connection mark values set by the
2 * CONNMARK target
3 *
4 * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
5 * by Henrik Nordstrom <hno@marasystems.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/module.h>
23#include <linux/skbuff.h>
24
25MODULE_AUTHOR("Henrik Nordstrom <hno@marasystems.com>");
26MODULE_DESCRIPTION("IP tables connmark match module");
27MODULE_LICENSE("GPL");
28
29#include <linux/netfilter_ipv4/ip_tables.h>
30#include <linux/netfilter_ipv4/ipt_connmark.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h>
32
33static int
34match(const struct sk_buff *skb,
35 const struct net_device *in,
36 const struct net_device *out,
37 const void *matchinfo,
38 int offset,
39 int *hotdrop)
40{
41 const struct ipt_connmark_info *info = matchinfo;
42 enum ip_conntrack_info ctinfo;
43 struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
44 if (!ct)
45 return 0;
46
47 return ((ct->mark & info->mask) == info->mark) ^ info->invert;
48}
49
50static int
51checkentry(const char *tablename,
52 const struct ipt_ip *ip,
53 void *matchinfo,
54 unsigned int matchsize,
55 unsigned int hook_mask)
56{
57 if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
58 return 0;
59
60 return 1;
61}
62
63static struct ipt_match connmark_match = {
64 .name = "connmark",
65 .match = &match,
66 .checkentry = &checkentry,
67 .me = THIS_MODULE
68};
69
70static int __init init(void)
71{
72 return ipt_register_match(&connmark_match);
73}
74
75static void __exit fini(void)
76{
77 ipt_unregister_match(&connmark_match);
78}
79
80module_init(init);
81module_exit(fini);
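
The connmark test above reduces to a single expression: ((ct->mark & info->mask) == info->mark) ^ info->invert. A minimal standalone sketch (hypothetical mark/mask values, plain userspace C, no kernel headers) shows how mask, mark and inversion interact:

#include <stdio.h>

/* Stand-in for the kernel-side test in ipt_connmark.c: match when the
 * masked connection mark equals the configured mark, optionally inverted. */
static int connmark_test(unsigned long ctmark, unsigned long mark,
                         unsigned long mask, int invert)
{
        return ((ctmark & mask) == mark) ^ invert;
}

int main(void)
{
        /* ct->mark 0x1234 against a hypothetical "mark 0x34, mask 0xff" rule */
        printf("%d\n", connmark_test(0x1234, 0x34, 0xff, 0)); /* 1: low byte matches */
        printf("%d\n", connmark_test(0x1235, 0x34, 0xff, 0)); /* 0: low byte differs */
        printf("%d\n", connmark_test(0x1235, 0x34, 0xff, 1)); /* 1: same test, inverted */
        return 0;
}
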
diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c
new file mode 100644
index 000000000000..c1d22801b7cf
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_conntrack.c
@@ -0,0 +1,136 @@
1/* Kernel module to match connection tracking information.
2 * Superset of Rusty's minimalistic state match.
3 *
4 * (C) 2001 Marc Boucher (marc@mbsi.ca).
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/netfilter_ipv4/ip_conntrack.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15#include <linux/netfilter_ipv4/ipt_conntrack.h>
16
17MODULE_LICENSE("GPL");
18MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
19MODULE_DESCRIPTION("iptables connection tracking match module");
20
21static int
22match(const struct sk_buff *skb,
23 const struct net_device *in,
24 const struct net_device *out,
25 const void *matchinfo,
26 int offset,
27 int *hotdrop)
28{
29 const struct ipt_conntrack_info *sinfo = matchinfo;
30 struct ip_conntrack *ct;
31 enum ip_conntrack_info ctinfo;
32 unsigned int statebit;
33
34 ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
35
36#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg))
37
38 if (ct == &ip_conntrack_untracked)
39 statebit = IPT_CONNTRACK_STATE_UNTRACKED;
40 else if (ct)
41 statebit = IPT_CONNTRACK_STATE_BIT(ctinfo);
42 else
43 statebit = IPT_CONNTRACK_STATE_INVALID;
44
45 if(sinfo->flags & IPT_CONNTRACK_STATE) {
46 if (ct) {
47 if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip !=
48 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip)
49 statebit |= IPT_CONNTRACK_STATE_SNAT;
50
51 if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip !=
52 ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip)
53 statebit |= IPT_CONNTRACK_STATE_DNAT;
54 }
55
56 if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE))
57 return 0;
58 }
59
60 if(sinfo->flags & IPT_CONNTRACK_PROTO) {
61 if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO))
62 return 0;
63 }
64
65 if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) {
66 if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC))
67 return 0;
68 }
69
70 if(sinfo->flags & IPT_CONNTRACK_ORIGDST) {
71 if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST))
72 return 0;
73 }
74
75 if(sinfo->flags & IPT_CONNTRACK_REPLSRC) {
76 if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC))
77 return 0;
78 }
79
80 if(sinfo->flags & IPT_CONNTRACK_REPLDST) {
81 if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST))
82 return 0;
83 }
84
85 if(sinfo->flags & IPT_CONNTRACK_STATUS) {
86 if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS))
87 return 0;
88 }
89
90 if(sinfo->flags & IPT_CONNTRACK_EXPIRES) {
91 unsigned long expires;
92
93 if(!ct)
94 return 0;
95
96 expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0;
97
98 if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES))
99 return 0;
100 }
101
102 return 1;
103}
104
105static int check(const char *tablename,
106 const struct ipt_ip *ip,
107 void *matchinfo,
108 unsigned int matchsize,
109 unsigned int hook_mask)
110{
111 if (matchsize != IPT_ALIGN(sizeof(struct ipt_conntrack_info)))
112 return 0;
113
114 return 1;
115}
116
117static struct ipt_match conntrack_match = {
118 .name = "conntrack",
119 .match = &match,
120 .checkentry = &check,
121 .me = THIS_MODULE,
122};
123
124static int __init init(void)
125{
126 need_ip_conntrack();
127 return ipt_register_match(&conntrack_match);
128}
129
130static void __exit fini(void)
131{
132 ipt_unregister_match(&conntrack_match);
133}
134
135module_init(init);
136module_exit(fini);
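
Every per-flag test in match() goes through FWINV(), which XORs the raw comparison with whether the corresponding IPT_CONNTRACK_* bit is set in sinfo->invflags; a non-zero result makes the module return 0 (no match). A small sketch of the same idiom, with a hypothetical flag bit standing in for the real IPT_CONNTRACK_* constants:

#include <stdio.h>

/* Same shape as the FWINV() macro in ipt_conntrack.c: XOR the test result
 * with the boolean-ized inversion flag. */
#define FWINV(test, invflags, flag) ((test) ^ !!((invflags) & (flag)))

#define MY_FLAG_STATE 0x01  /* hypothetical stand-in for an IPT_CONNTRACK_* bit */

int main(void)
{
        int state_test_failed = 1;  /* pretend the state comparison failed */

        /* no inversion: a failed test yields 1, i.e. "reject this packet" */
        printf("%d\n", FWINV(state_test_failed, 0, MY_FLAG_STATE));
        /* inversion bit set: the same failed test yields 0, i.e. keep going */
        printf("%d\n", FWINV(state_test_failed, MY_FLAG_STATE, MY_FLAG_STATE));
        return 0;
}
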
diff --git a/net/ipv4/netfilter/ipt_dscp.c b/net/ipv4/netfilter/ipt_dscp.c
new file mode 100644
index 000000000000..5df52a64a5d4
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_dscp.c
@@ -0,0 +1,63 @@
1/* IP tables module for matching the value of the IPv4 DSCP field
2 *
3 * ipt_dscp.c,v 1.3 2002/08/05 19:00:21 laforge Exp
4 *
5 * (C) 2002 by Harald Welte <laforge@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/module.h>
13#include <linux/skbuff.h>
14
15#include <linux/netfilter_ipv4/ipt_dscp.h>
16#include <linux/netfilter_ipv4/ip_tables.h>
17
18MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
19MODULE_DESCRIPTION("iptables DSCP matching module");
20MODULE_LICENSE("GPL");
21
22static int match(const struct sk_buff *skb, const struct net_device *in,
23 const struct net_device *out, const void *matchinfo,
24 int offset, int *hotdrop)
25{
26 const struct ipt_dscp_info *info = matchinfo;
27 const struct iphdr *iph = skb->nh.iph;
28
29 u_int8_t sh_dscp = ((info->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK);
30
31 return ((iph->tos&IPT_DSCP_MASK) == sh_dscp) ^ info->invert;
32}
33
34static int checkentry(const char *tablename, const struct ipt_ip *ip,
35 void *matchinfo, unsigned int matchsize,
36 unsigned int hook_mask)
37{
38 if (matchsize != IPT_ALIGN(sizeof(struct ipt_dscp_info)))
39 return 0;
40
41 return 1;
42}
43
44static struct ipt_match dscp_match = {
45 .name = "dscp",
46 .match = &match,
47 .checkentry = &checkentry,
48 .me = THIS_MODULE,
49};
50
51static int __init init(void)
52{
53 return ipt_register_match(&dscp_match);
54}
55
56static void __exit fini(void)
57{
58 ipt_unregister_match(&dscp_match);
59
60}
61
62module_init(init);
63module_exit(fini);
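
The DSCP comparison is pure bit arithmetic: the rule's code point is shifted into the position it occupies in the TOS byte and compared against the masked TOS. Assuming the usual header constants (IPT_DSCP_SHIFT of 2 and IPT_DSCP_MASK of 0xfc, i.e. the top six bits of TOS), the logic can be checked in isolation:

#include <stdio.h>

#define DSCP_SHIFT 2     /* assumed value of IPT_DSCP_SHIFT */
#define DSCP_MASK  0xfc  /* assumed value of IPT_DSCP_MASK: top 6 bits of TOS */

/* Mirrors the comparison in ipt_dscp.c's match(). */
static int dscp_test(unsigned char tos, unsigned char dscp, int invert)
{
        unsigned char sh_dscp = (dscp << DSCP_SHIFT) & DSCP_MASK;
        return ((tos & DSCP_MASK) == sh_dscp) ^ invert;
}

int main(void)
{
        /* DSCP 46 (EF) sits in TOS 0xb8; the low two ECN bits are ignored */
        printf("%d\n", dscp_test(0xb8, 46, 0)); /* 1: EF packet, EF rule */
        printf("%d\n", dscp_test(0xba, 46, 0)); /* 1: ECN bits don't matter */
        printf("%d\n", dscp_test(0x00, 46, 0)); /* 0: best-effort packet */
        return 0;
}
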
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
new file mode 100644
index 000000000000..b6f7181e89cc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -0,0 +1,131 @@
1/* IP tables module for matching the value of the IPv4 and TCP ECN bits
2 *
3 * ipt_ecn.c,v 1.3 2002/05/29 15:09:00 laforge Exp
4 *
5 * (C) 2002 by Harald Welte <laforge@gnumonks.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/tcp.h>
15
16#include <linux/netfilter_ipv4/ip_tables.h>
17#include <linux/netfilter_ipv4/ipt_ecn.h>
18
19MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
20MODULE_DESCRIPTION("iptables ECN matching module");
21MODULE_LICENSE("GPL");
22
23static inline int match_ip(const struct sk_buff *skb,
24 const struct ipt_ecn_info *einfo)
25{
26 return ((skb->nh.iph->tos&IPT_ECN_IP_MASK) == einfo->ip_ect);
27}
28
29static inline int match_tcp(const struct sk_buff *skb,
30 const struct ipt_ecn_info *einfo,
31 int *hotdrop)
32{
33 struct tcphdr _tcph, *th;
34
35 /* In practice, TCP match does this, so can't fail. But let's
36 * be good citizens.
37 */
38 th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
39 sizeof(_tcph), &_tcph);
40 if (th == NULL) {
41 *hotdrop = 0;
42 return 0;
43 }
44
45 if (einfo->operation & IPT_ECN_OP_MATCH_ECE) {
46 if (einfo->invert & IPT_ECN_OP_MATCH_ECE) {
47 if (th->ece == 1)
48 return 0;
49 } else {
50 if (th->ece == 0)
51 return 0;
52 }
53 }
54
55 if (einfo->operation & IPT_ECN_OP_MATCH_CWR) {
56 if (einfo->invert & IPT_ECN_OP_MATCH_CWR) {
57 if (th->cwr == 1)
58 return 0;
59 } else {
60 if (th->cwr == 0)
61 return 0;
62 }
63 }
64
65 return 1;
66}
67
68static int match(const struct sk_buff *skb, const struct net_device *in,
69 const struct net_device *out, const void *matchinfo,
70 int offset, int *hotdrop)
71{
72 const struct ipt_ecn_info *info = matchinfo;
73
74 if (info->operation & IPT_ECN_OP_MATCH_IP)
75 if (!match_ip(skb, info))
76 return 0;
77
78 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
79 if (skb->nh.iph->protocol != IPPROTO_TCP)
80 return 0;
81 if (!match_tcp(skb, info, hotdrop))
82 return 0;
83 }
84
85 return 1;
86}
87
88static int checkentry(const char *tablename, const struct ipt_ip *ip,
89 void *matchinfo, unsigned int matchsize,
90 unsigned int hook_mask)
91{
92 const struct ipt_ecn_info *info = matchinfo;
93
94 if (matchsize != IPT_ALIGN(sizeof(struct ipt_ecn_info)))
95 return 0;
96
97 if (info->operation & IPT_ECN_OP_MATCH_MASK)
98 return 0;
99
100 if (info->invert & IPT_ECN_OP_MATCH_MASK)
101 return 0;
102
103 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)
104 && ip->proto != IPPROTO_TCP) {
105 printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for"
106 " non-tcp packets\n");
107 return 0;
108 }
109
110 return 1;
111}
112
113static struct ipt_match ecn_match = {
114 .name = "ecn",
115 .match = &match,
116 .checkentry = &checkentry,
117 .me = THIS_MODULE,
118};
119
120static int __init init(void)
121{
122 return ipt_register_match(&ecn_match);
123}
124
125static void __exit fini(void)
126{
127 ipt_unregister_match(&ecn_match);
128}
129
130module_init(init);
131module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_esp.c b/net/ipv4/netfilter/ipt_esp.c
new file mode 100644
index 000000000000..e1d0dd31e117
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_esp.c
@@ -0,0 +1,118 @@
1/* Kernel module to match ESP parameters. */
2
3/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/ip.h>
13
14#include <linux/netfilter_ipv4/ipt_esp.h>
15#include <linux/netfilter_ipv4/ip_tables.h>
16
17MODULE_LICENSE("GPL");
18MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
19MODULE_DESCRIPTION("iptables ESP SPI match module");
20
21#ifdef DEBUG_CONNTRACK
22#define duprintf(format, args...) printk(format , ## args)
23#else
24#define duprintf(format, args...)
25#endif
26
27/* Returns 1 if the spi is matched by the range, 0 otherwise */
28static inline int
29spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
30{
31 int r=0;
32 duprintf("esp spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
33 min,spi,max);
34 r=(spi >= min && spi <= max) ^ invert;
35 duprintf(" result %s\n",r? "PASS" : "FAILED");
36 return r;
37}
38
39static int
40match(const struct sk_buff *skb,
41 const struct net_device *in,
42 const struct net_device *out,
43 const void *matchinfo,
44 int offset,
45 int *hotdrop)
46{
47 struct ip_esp_hdr _esp, *eh;
48 const struct ipt_esp *espinfo = matchinfo;
49
50 /* Must not be a fragment. */
51 if (offset)
52 return 0;
53
54 eh = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
55 sizeof(_esp), &_esp);
56 if (eh == NULL) {
57 /* We've been asked to examine this packet, and we
58 * can't. Hence, no choice but to drop.
59 */
60 duprintf("Dropping evil ESP tinygram.\n");
61 *hotdrop = 1;
62 return 0;
63 }
64
65 return spi_match(espinfo->spis[0], espinfo->spis[1],
66 ntohl(eh->spi),
67 !!(espinfo->invflags & IPT_ESP_INV_SPI));
68}
69
70/* Called when user tries to insert an entry of this type. */
71static int
72checkentry(const char *tablename,
73 const struct ipt_ip *ip,
74 void *matchinfo,
75 unsigned int matchinfosize,
76 unsigned int hook_mask)
77{
78 const struct ipt_esp *espinfo = matchinfo;
79
80 /* Must specify proto == ESP, and no unknown invflags */
81 if (ip->proto != IPPROTO_ESP || (ip->invflags & IPT_INV_PROTO)) {
82 duprintf("ipt_esp: Protocol %u != %u\n", ip->proto,
83 IPPROTO_ESP);
84 return 0;
85 }
86 if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_esp))) {
87 duprintf("ipt_esp: matchsize %u != %u\n",
88 matchinfosize, IPT_ALIGN(sizeof(struct ipt_esp)));
89 return 0;
90 }
91 if (espinfo->invflags & ~IPT_ESP_INV_MASK) {
92 duprintf("ipt_esp: unknown flags %X\n",
93 espinfo->invflags);
94 return 0;
95 }
96
97 return 1;
98}
99
100static struct ipt_match esp_match = {
101 .name = "esp",
102 .match = &match,
103 .checkentry = &checkentry,
104 .me = THIS_MODULE,
105};
106
107static int __init init(void)
108{
109 return ipt_register_match(&esp_match);
110}
111
112static void __exit cleanup(void)
113{
114 ipt_unregister_match(&esp_match);
115}
116
117module_init(init);
118module_exit(cleanup);
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
new file mode 100644
index 000000000000..f1937190cd77
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -0,0 +1,731 @@
1/* iptables match extension to limit the number of packets per second
2 * separately for each hashbucket (sourceip/sourceport/dstip/dstport)
3 *
4 * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
5 *
6 * $Id: ipt_hashlimit.c 3244 2004-10-20 16:24:29Z laforge@netfilter.org $
7 *
8 * Development of this code was funded by Astaro AG, http://www.astaro.com/
9 *
10 * based on ipt_limit.c by:
11 * Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>
12 * Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr>
13 * Rusty Russell <rusty@rustcorp.com.au>
14 *
15 * The general idea is to create a hash table for every dstip and have a
16 * separate limit counter per tuple. This way you can do something like 'limit
17 * the number of syn packets for each of my internal addresses'.
18 *
19 * Ideally this would just be implemented as a general 'hash' match, which would
20 * allow us to attach any iptables target to its hash buckets. But this is
21 * not possible in the current iptables architecture. As always, pkttables for
22 * 2.7.x will help ;)
23 */
24#include <linux/module.h>
25#include <linux/skbuff.h>
26#include <linux/spinlock.h>
27#include <linux/random.h>
28#include <linux/jhash.h>
29#include <linux/slab.h>
30#include <linux/vmalloc.h>
31#include <linux/tcp.h>
32#include <linux/udp.h>
33#include <linux/sctp.h>
34#include <linux/proc_fs.h>
35#include <linux/seq_file.h>
36#include <linux/list.h>
37
38#include <linux/netfilter_ipv4/ip_tables.h>
39#include <linux/netfilter_ipv4/ipt_hashlimit.h>
40#include <linux/netfilter_ipv4/lockhelp.h>
41
42/* FIXME: this is just for IP_NF_ASSERT */
43#include <linux/netfilter_ipv4/ip_conntrack.h>
44
45MODULE_LICENSE("GPL");
46MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
47MODULE_DESCRIPTION("iptables match for limiting per hash-bucket");
48
49/* need to declare this at the top */
50static struct proc_dir_entry *hashlimit_procdir;
51static struct file_operations dl_file_ops;
52
53/* hash table crap */
54
55struct dsthash_dst {
56 u_int32_t src_ip;
57 u_int32_t dst_ip;
58 /* ports have to be consecutive !!! */
59 u_int16_t src_port;
60 u_int16_t dst_port;
61};
62
63struct dsthash_ent {
64 /* static / read-only parts in the beginning */
65 struct hlist_node node;
66 struct dsthash_dst dst;
67
68 /* modified structure members in the end */
69 unsigned long expires; /* precalculated expiry time */
70 struct {
71 unsigned long prev; /* last modification */
72 u_int32_t credit;
73 u_int32_t credit_cap, cost;
74 } rateinfo;
75};
76
77struct ipt_hashlimit_htable {
78 struct hlist_node node; /* global list of all htables */
79 atomic_t use;
80
81 struct hashlimit_cfg cfg; /* config */
82
83 /* used internally */
84 spinlock_t lock; /* lock for list_head */
85 u_int32_t rnd; /* random seed for hash */
86 struct timer_list timer; /* timer for gc */
87 atomic_t count; /* number entries in table */
88
89 /* seq_file stuff */
90 struct proc_dir_entry *pde;
91
92 struct hlist_head hash[0]; /* hashtable itself */
93};
94
95static DECLARE_LOCK(hashlimit_lock); /* protects htables list */
96static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
97static HLIST_HEAD(hashlimit_htables);
98static kmem_cache_t *hashlimit_cachep;
99
100static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
101{
102 return (ent->dst.dst_ip == b->dst_ip
103 && ent->dst.dst_port == b->dst_port
104 && ent->dst.src_port == b->src_port
105 && ent->dst.src_ip == b->src_ip);
106}
107
108static inline u_int32_t
109hash_dst(const struct ipt_hashlimit_htable *ht, const struct dsthash_dst *dst)
110{
111 return (jhash_3words(dst->dst_ip, (dst->dst_port<<16 | dst->src_port),
112 dst->src_ip, ht->rnd) % ht->cfg.size);
113}
114
115static inline struct dsthash_ent *
116__dsthash_find(const struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst)
117{
118 struct dsthash_ent *ent;
119 struct hlist_node *pos;
120 u_int32_t hash = hash_dst(ht, dst);
121
122 if (!hlist_empty(&ht->hash[hash]))
123 hlist_for_each_entry(ent, pos, &ht->hash[hash], node) {
124 if (dst_cmp(ent, dst)) {
125 return ent;
126 }
127 }
128
129 return NULL;
130}
131
132/* allocate dsthash_ent, initialize dst, put in htable and lock it */
133static struct dsthash_ent *
134__dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst)
135{
136 struct dsthash_ent *ent;
137
138 /* initialize hash with random val at the time we allocate
139 * the first hashtable entry */
140 if (!ht->rnd)
141 get_random_bytes(&ht->rnd, 4);
142
143 if (ht->cfg.max &&
144 atomic_read(&ht->count) >= ht->cfg.max) {
145 /* FIXME: do something. question is what.. */
146 if (net_ratelimit())
147 printk(KERN_WARNING
148 "ipt_hashlimit: max count of %u reached\n",
149 ht->cfg.max);
150 return NULL;
151 }
152
153 ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
154 if (!ent) {
155 if (net_ratelimit())
156 printk(KERN_ERR
157 "ipt_hashlimit: can't allocate dsthash_ent\n");
158 return NULL;
159 }
160
161 atomic_inc(&ht->count);
162
163 ent->dst.dst_ip = dst->dst_ip;
164 ent->dst.dst_port = dst->dst_port;
165 ent->dst.src_ip = dst->src_ip;
166 ent->dst.src_port = dst->src_port;
167
168 hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
169
170 return ent;
171}
172
173static inline void
174__dsthash_free(struct ipt_hashlimit_htable *ht, struct dsthash_ent *ent)
175{
176 hlist_del(&ent->node);
177 kmem_cache_free(hashlimit_cachep, ent);
178 atomic_dec(&ht->count);
179}
180static void htable_gc(unsigned long htlong);
181
182static int htable_create(struct ipt_hashlimit_info *minfo)
183{
184 int i;
185 unsigned int size;
186 struct ipt_hashlimit_htable *hinfo;
187
188 if (minfo->cfg.size)
189 size = minfo->cfg.size;
190 else {
191 size = (((num_physpages << PAGE_SHIFT) / 16384)
192 / sizeof(struct list_head));
193 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
194 size = 8192;
195 if (size < 16)
196 size = 16;
197 }
198 /* FIXME: don't use vmalloc() here or anywhere else -HW */
199 hinfo = vmalloc(sizeof(struct ipt_hashlimit_htable)
200 + (sizeof(struct list_head) * size));
201 if (!hinfo) {
202 printk(KERN_ERR "ipt_hashlimit: Unable to create hashtable\n");
203 return -1;
204 }
205 minfo->hinfo = hinfo;
206
207 /* copy match config into hashtable config */
208 memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg));
209 hinfo->cfg.size = size;
210 if (!hinfo->cfg.max)
211 hinfo->cfg.max = 8 * hinfo->cfg.size;
212 else if (hinfo->cfg.max < hinfo->cfg.size)
213 hinfo->cfg.max = hinfo->cfg.size;
214
215 for (i = 0; i < hinfo->cfg.size; i++)
216 INIT_HLIST_HEAD(&hinfo->hash[i]);
217
218 atomic_set(&hinfo->count, 0);
219 atomic_set(&hinfo->use, 1);
220 hinfo->rnd = 0;
221 spin_lock_init(&hinfo->lock);
222 hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir);
223 if (!hinfo->pde) {
224 vfree(hinfo);
225 return -1;
226 }
227 hinfo->pde->proc_fops = &dl_file_ops;
228 hinfo->pde->data = hinfo;
229
230 init_timer(&hinfo->timer);
231 hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval);
232 hinfo->timer.data = (unsigned long )hinfo;
233 hinfo->timer.function = htable_gc;
234 add_timer(&hinfo->timer);
235
236 LOCK_BH(&hashlimit_lock);
237 hlist_add_head(&hinfo->node, &hashlimit_htables);
238 UNLOCK_BH(&hashlimit_lock);
239
240 return 0;
241}
242
243static int select_all(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he)
244{
245 return 1;
246}
247
248static int select_gc(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he)
249{
250 return (jiffies >= he->expires);
251}
252
253static void htable_selective_cleanup(struct ipt_hashlimit_htable *ht,
254 int (*select)(struct ipt_hashlimit_htable *ht,
255 struct dsthash_ent *he))
256{
257 int i;
258
259 IP_NF_ASSERT(ht->cfg.size && ht->cfg.max);
260
261 /* lock hash table and iterate over it */
262 spin_lock_bh(&ht->lock);
263 for (i = 0; i < ht->cfg.size; i++) {
264 struct dsthash_ent *dh;
265 struct hlist_node *pos, *n;
266 hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) {
267 if ((*select)(ht, dh))
268 __dsthash_free(ht, dh);
269 }
270 }
271 spin_unlock_bh(&ht->lock);
272}
273
274/* hash table garbage collector, run by timer */
275static void htable_gc(unsigned long htlong)
276{
277 struct ipt_hashlimit_htable *ht = (struct ipt_hashlimit_htable *)htlong;
278
279 htable_selective_cleanup(ht, select_gc);
280
281 /* re-add the timer accordingly */
282 ht->timer.expires = jiffies + msecs_to_jiffies(ht->cfg.gc_interval);
283 add_timer(&ht->timer);
284}
285
286static void htable_destroy(struct ipt_hashlimit_htable *hinfo)
287{
288 /* remove timer, if it is pending */
289 if (timer_pending(&hinfo->timer))
290 del_timer(&hinfo->timer);
291
292 /* remove proc entry */
293 remove_proc_entry(hinfo->pde->name, hashlimit_procdir);
294
295 htable_selective_cleanup(hinfo, select_all);
296 vfree(hinfo);
297}
298
299static struct ipt_hashlimit_htable *htable_find_get(char *name)
300{
301 struct ipt_hashlimit_htable *hinfo;
302 struct hlist_node *pos;
303
304 LOCK_BH(&hashlimit_lock);
305 hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) {
306 if (!strcmp(name, hinfo->pde->name)) {
307 atomic_inc(&hinfo->use);
308 UNLOCK_BH(&hashlimit_lock);
309 return hinfo;
310 }
311 }
312 UNLOCK_BH(&hashlimit_lock);
313
314 return NULL;
315}
316
317static void htable_put(struct ipt_hashlimit_htable *hinfo)
318{
319 if (atomic_dec_and_test(&hinfo->use)) {
320 LOCK_BH(&hashlimit_lock);
321 hlist_del(&hinfo->node);
322 UNLOCK_BH(&hashlimit_lock);
323 htable_destroy(hinfo);
324 }
325}
326
327
328/* The algorithm used is the Simple Token Bucket Filter (TBF)
329 * see net/sched/sch_tbf.c in the linux source tree
330 */
331
332/* Rusty: This is my (non-mathematically-inclined) understanding of
333 this algorithm. The `average rate' in jiffies becomes your initial
334 amount of credit `credit' and the most credit you can ever have
335 `credit_cap'. The `peak rate' becomes the cost of passing the
336 test, `cost'.
337
338 `prev' tracks the last packet hit: you gain one credit per jiffy.
339 If you get credit balance more than this, the extra credit is
340 discarded. Every time the match passes, you lose `cost' credits;
341 if you don't have that many, the test fails.
342
343 See Alexey's formal explanation in net/sched/sch_tbf.c.
344
345 To get the maximum range, we multiply by this factor (ie. you get N
346 credits per jiffy). We want to allow a rate as low as 1 per day
347 (slowest userspace tool allows), which means
348 CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
349*/
350#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
351
352/* Repeated shift and or gives us all 1s, final shift and add 1 gives
353 * us the power of 2 below the theoretical max, so GCC simply does a
354 * shift. */
355#define _POW2_BELOW2(x) ((x)|((x)>>1))
356#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
357#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
358#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
359#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
360#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
361
362#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
363
364/* Precision saver. */
365static inline u_int32_t
366user2credits(u_int32_t user)
367{
368 /* If multiplying would overflow... */
369 if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
370 /* Divide first. */
371 return (user / IPT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
372
373 return (user * HZ * CREDITS_PER_JIFFY) / IPT_HASHLIMIT_SCALE;
374}
375
376static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
377{
378 dh->rateinfo.credit += (now - xchg(&dh->rateinfo.prev, now))
379 * CREDITS_PER_JIFFY;
380 if (dh->rateinfo.credit > dh->rateinfo.credit_cap)
381 dh->rateinfo.credit = dh->rateinfo.credit_cap;
382}
383
384static inline int get_ports(const struct sk_buff *skb, int offset,
385 u16 ports[2])
386{
387 union {
388 struct tcphdr th;
389 struct udphdr uh;
390 sctp_sctphdr_t sctph;
391 } hdr_u, *ptr_u;
392
393 /* Must not be a fragment. */
394 if (offset)
395 return 1;
396
397 /* Must be big enough to read ports (both UDP and TCP have
398 them at the start). */
399 ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u);
400 if (!ptr_u)
401 return 1;
402
403 switch (skb->nh.iph->protocol) {
404 case IPPROTO_TCP:
405 ports[0] = ptr_u->th.source;
406 ports[1] = ptr_u->th.dest;
407 break;
408 case IPPROTO_UDP:
409 ports[0] = ptr_u->uh.source;
410 ports[1] = ptr_u->uh.dest;
411 break;
412 case IPPROTO_SCTP:
413 ports[0] = ptr_u->sctph.source;
414 ports[1] = ptr_u->sctph.dest;
415 break;
416 default:
417 /* all other protocols don't support per-port hash
418 * buckets */
419 ports[0] = ports[1] = 0;
420 break;
421 }
422
423 return 0;
424}
425
426
427static int
428hashlimit_match(const struct sk_buff *skb,
429 const struct net_device *in,
430 const struct net_device *out,
431 const void *matchinfo,
432 int offset,
433 int *hotdrop)
434{
435 struct ipt_hashlimit_info *r =
436 ((struct ipt_hashlimit_info *)matchinfo)->u.master;
437 struct ipt_hashlimit_htable *hinfo = r->hinfo;
438 unsigned long now = jiffies;
439 struct dsthash_ent *dh;
440 struct dsthash_dst dst;
441
442 /* build 'dst' according to hinfo->cfg and current packet */
443 memset(&dst, 0, sizeof(dst));
444 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DIP)
445 dst.dst_ip = skb->nh.iph->daddr;
446 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SIP)
447 dst.src_ip = skb->nh.iph->saddr;
448 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT
449 ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) {
450 u_int16_t ports[2];
451 if (get_ports(skb, offset, ports)) {
452 /* We've been asked to examine this packet, and we
453 can't. Hence, no choice but to drop. */
454 *hotdrop = 1;
455 return 0;
456 }
457 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT)
458 dst.src_port = ports[0];
459 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT)
460 dst.dst_port = ports[1];
461 }
462
463 spin_lock_bh(&hinfo->lock);
464 dh = __dsthash_find(hinfo, &dst);
465 if (!dh) {
466 dh = __dsthash_alloc_init(hinfo, &dst);
467
468 if (!dh) {
469 /* enomem... don't match == DROP */
470 if (net_ratelimit())
471 printk(KERN_ERR "%s: ENOMEM\n", __FUNCTION__);
472 spin_unlock_bh(&hinfo->lock);
473 return 0;
474 }
475
476 dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
477
478 dh->rateinfo.prev = jiffies;
479 dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
480 hinfo->cfg.burst);
481 dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg *
482 hinfo->cfg.burst);
483 dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
484
485 spin_unlock_bh(&hinfo->lock);
486 return 1;
487 }
488
489 /* update expiration timeout */
490 dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
491
492 rateinfo_recalc(dh, now);
493 if (dh->rateinfo.credit >= dh->rateinfo.cost) {
494 /* We're underlimit. */
495 dh->rateinfo.credit -= dh->rateinfo.cost;
496 spin_unlock_bh(&hinfo->lock);
497 return 1;
498 }
499
500 spin_unlock_bh(&hinfo->lock);
501
502 /* default case: we're overlimit, thus don't match */
503 return 0;
504}
505
506static int
507hashlimit_checkentry(const char *tablename,
508 const struct ipt_ip *ip,
509 void *matchinfo,
510 unsigned int matchsize,
511 unsigned int hook_mask)
512{
513 struct ipt_hashlimit_info *r = matchinfo;
514
515 if (matchsize != IPT_ALIGN(sizeof(struct ipt_hashlimit_info)))
516 return 0;
517
518 /* Check for overflow. */
519 if (r->cfg.burst == 0
520 || user2credits(r->cfg.avg * r->cfg.burst) <
521 user2credits(r->cfg.avg)) {
522 printk(KERN_ERR "ipt_hashlimit: Overflow, try lower: %u/%u\n",
523 r->cfg.avg, r->cfg.burst);
524 return 0;
525 }
526
527 if (r->cfg.mode == 0
528 || r->cfg.mode > (IPT_HASHLIMIT_HASH_DPT
529 |IPT_HASHLIMIT_HASH_DIP
530 |IPT_HASHLIMIT_HASH_SIP
531 |IPT_HASHLIMIT_HASH_SPT))
532 return 0;
533
534 if (!r->cfg.gc_interval)
535 return 0;
536
537 if (!r->cfg.expire)
538 return 0;
539
540 /* This is the best we've got: We cannot release and re-grab lock,
541 * since checkentry() is called before ip_tables.c grabs ipt_mutex.
542 * We also cannot grab the hashtable spinlock, since htable_create will
543 * call vmalloc, and that can sleep. And we cannot just re-search
544 * the list of htable's in htable_create(), since then we would
545 * create duplicate proc files. -HW */
546 down(&hlimit_mutex);
547 r->hinfo = htable_find_get(r->name);
548 if (!r->hinfo && (htable_create(r) != 0)) {
549 up(&hlimit_mutex);
550 return 0;
551 }
552 up(&hlimit_mutex);
553
554 /* Ugly hack: For SMP, we only want to use one set */
555 r->u.master = r;
556
557 return 1;
558}
559
560static void
561hashlimit_destroy(void *matchinfo, unsigned int matchsize)
562{
563 struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo;
564
565 htable_put(r->hinfo);
566}
567
568static struct ipt_match ipt_hashlimit = {
569 .name = "hashlimit",
570 .match = hashlimit_match,
571 .checkentry = hashlimit_checkentry,
572 .destroy = hashlimit_destroy,
573 .me = THIS_MODULE
574};
575
576/* PROC stuff */
577
578static void *dl_seq_start(struct seq_file *s, loff_t *pos)
579{
580 struct proc_dir_entry *pde = s->private;
581 struct ipt_hashlimit_htable *htable = pde->data;
582 unsigned int *bucket;
583
584 spin_lock_bh(&htable->lock);
585 if (*pos >= htable->cfg.size)
586 return NULL;
587
588 bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC);
589 if (!bucket)
590 return ERR_PTR(-ENOMEM);
591
592 *bucket = *pos;
593 return bucket;
594}
595
596static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
597{
598 struct proc_dir_entry *pde = s->private;
599 struct ipt_hashlimit_htable *htable = pde->data;
600 unsigned int *bucket = (unsigned int *)v;
601
602 *pos = ++(*bucket);
603 if (*pos >= htable->cfg.size) {
604 kfree(v);
605 return NULL;
606 }
607 return bucket;
608}
609
610static void dl_seq_stop(struct seq_file *s, void *v)
611{
612 struct proc_dir_entry *pde = s->private;
613 struct ipt_hashlimit_htable *htable = pde->data;
614 unsigned int *bucket = (unsigned int *)v;
615
616 kfree(bucket);
617
618 spin_unlock_bh(&htable->lock);
619}
620
621static inline int dl_seq_real_show(struct dsthash_ent *ent, struct seq_file *s)
622{
623 /* recalculate to show accurate numbers */
624 rateinfo_recalc(ent, jiffies);
625
626 return seq_printf(s, "%ld %u.%u.%u.%u:%u->%u.%u.%u.%u:%u %u %u %u\n",
627 (long)(ent->expires - jiffies)/HZ,
628 NIPQUAD(ent->dst.src_ip), ntohs(ent->dst.src_port),
629 NIPQUAD(ent->dst.dst_ip), ntohs(ent->dst.dst_port),
630 ent->rateinfo.credit, ent->rateinfo.credit_cap,
631 ent->rateinfo.cost);
632}
633
634static int dl_seq_show(struct seq_file *s, void *v)
635{
636 struct proc_dir_entry *pde = s->private;
637 struct ipt_hashlimit_htable *htable = pde->data;
638 unsigned int *bucket = (unsigned int *)v;
639 struct dsthash_ent *ent;
640 struct hlist_node *pos;
641
642 if (!hlist_empty(&htable->hash[*bucket]))
643 hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) {
644 if (dl_seq_real_show(ent, s)) {
645 /* buffer was filled and unable to print that tuple */
646 return 1;
647 }
648 }
649
650 return 0;
651}
652
653static struct seq_operations dl_seq_ops = {
654 .start = dl_seq_start,
655 .next = dl_seq_next,
656 .stop = dl_seq_stop,
657 .show = dl_seq_show
658};
659
660static int dl_proc_open(struct inode *inode, struct file *file)
661{
662 int ret = seq_open(file, &dl_seq_ops);
663
664 if (!ret) {
665 struct seq_file *sf = file->private_data;
666 sf->private = PDE(inode);
667 }
668 return ret;
669}
670
671static struct file_operations dl_file_ops = {
672 .owner = THIS_MODULE,
673 .open = dl_proc_open,
674 .read = seq_read,
675 .llseek = seq_lseek,
676 .release = seq_release
677};
678
679static int init_or_fini(int fini)
680{
681 int ret = 0;
682
683 if (fini)
684 goto cleanup;
685
686 if (ipt_register_match(&ipt_hashlimit)) {
687 ret = -EINVAL;
688 goto cleanup_nothing;
689 }
690
691 hashlimit_cachep = kmem_cache_create("ipt_hashlimit",
692 sizeof(struct dsthash_ent), 0,
693 0, NULL, NULL);
694 if (!hashlimit_cachep) {
695 printk(KERN_ERR "Unable to create ipt_hashlimit slab cache\n");
696 ret = -ENOMEM;
697 goto cleanup_unreg_match;
698 }
699
700 hashlimit_procdir = proc_mkdir("ipt_hashlimit", proc_net);
701 if (!hashlimit_procdir) {
702 printk(KERN_ERR "Unable to create proc dir entry\n");
703 ret = -ENOMEM;
704 goto cleanup_free_slab;
705 }
706
707 return ret;
708
709cleanup:
710 remove_proc_entry("ipt_hashlimit", proc_net);
711cleanup_free_slab:
712 kmem_cache_destroy(hashlimit_cachep);
713cleanup_unreg_match:
714 ipt_unregister_match(&ipt_hashlimit);
715cleanup_nothing:
716 return ret;
717
718}
719
720static int __init init(void)
721{
722 return init_or_fini(0);
723}
724
725static void __exit fini(void)
726{
727 init_or_fini(1);
728}
729
730module_init(init);
731module_exit(fini);
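
The CREDITS_PER_JIFFY / user2credits() machinery above is plain integer arithmetic and can be exercised outside the kernel. The sketch below assumes HZ of 1000 and an IPT_HASHLIMIT_SCALE of 10000, and a hypothetical rule of 5 packets per second with burst 5, stored (as the userspace tool typically does) as avg = SCALE / rate; all of these numbers are illustrative only:

#include <stdio.h>

#define HZ 1000                /* assumed tick rate for the example */
#define HASHLIMIT_SCALE 10000  /* assumed value of IPT_HASHLIMIT_SCALE */

/* Same helpers as in ipt_hashlimit.c: the largest power of two such that
 * CREDITS_PER_JIFFY * HZ * 60 * 60 * 24 still fits in 32 bits. */
#define MAX_CPJ (0xFFFFFFFFu / (HZ * 60 * 60 * 24))
#define _POW2_BELOW2(x)  ((x) | ((x) >> 1))
#define _POW2_BELOW4(x)  (_POW2_BELOW2(x) | _POW2_BELOW2((x) >> 2))
#define _POW2_BELOW8(x)  (_POW2_BELOW4(x) | _POW2_BELOW4((x) >> 4))
#define _POW2_BELOW16(x) (_POW2_BELOW8(x) | _POW2_BELOW8((x) >> 8))
#define _POW2_BELOW32(x) (_POW2_BELOW16(x) | _POW2_BELOW16((x) >> 16))
#define POW2_BELOW32(x)  ((_POW2_BELOW32(x) >> 1) + 1)
#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)

/* user2credits() as in the module: "user" is a rate expressed in units of
 * 1/HASHLIMIT_SCALE packets per second. */
static unsigned int user2credits(unsigned int user)
{
        if (user > 0xFFFFFFFFu / (HZ * CREDITS_PER_JIFFY))
                return (user / HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
        return (user * HZ * CREDITS_PER_JIFFY) / HASHLIMIT_SCALE;
}

int main(void)
{
        unsigned int avg = HASHLIMIT_SCALE / 5;  /* hypothetical 5/second rule */
        unsigned int burst = 5;

        printf("credits per jiffy: %u\n", (unsigned int)CREDITS_PER_JIFFY);
        printf("cost per packet:   %u\n", user2credits(avg));
        printf("initial credit:    %u\n", user2credits(avg * burst));
        return 0;
}

With these assumptions the bucket starts at 32000 credits and each matched packet costs 6400, so a fresh tuple can pass its burst of five packets back to back and then refills one packet's worth of credit every 200 jiffies, i.e. five per second.
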
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
new file mode 100644
index 000000000000..33fdf364d3d3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -0,0 +1,113 @@
1/* iptables module to match on related connections */
2/*
3 * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * 19 Mar 2002 Harald Welte <laforge@gnumonks.org>:
10 * - Port to newnat infrastructure
11 */
12
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/netfilter.h>
16#include <linux/netfilter_ipv4/ip_conntrack.h>
17#include <linux/netfilter_ipv4/ip_conntrack_core.h>
18#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
19#include <linux/netfilter_ipv4/ip_tables.h>
20#include <linux/netfilter_ipv4/ipt_helper.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>");
24MODULE_DESCRIPTION("iptables helper match module");
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(format, args...)
30#endif
31
32static int
33match(const struct sk_buff *skb,
34 const struct net_device *in,
35 const struct net_device *out,
36 const void *matchinfo,
37 int offset,
38 int *hotdrop)
39{
40 const struct ipt_helper_info *info = matchinfo;
41 struct ip_conntrack *ct;
42 enum ip_conntrack_info ctinfo;
43 int ret = info->invert;
44
45 ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
46 if (!ct) {
47 DEBUGP("ipt_helper: Eek! invalid conntrack?\n");
48 return ret;
49 }
50
51 if (!ct->master) {
52 DEBUGP("ipt_helper: conntrack %p has no master\n", ct);
53 return ret;
54 }
55
56 READ_LOCK(&ip_conntrack_lock);
57 if (!ct->master->helper) {
58 DEBUGP("ipt_helper: master ct %p has no helper\n",
59 ct->master);
60 goto out_unlock;
61 }
62
63 DEBUGP("master's name = %s , info->name = %s\n",
64 ct->master->helper->name, info->name);
65
66 if (info->name[0] == '\0')
67 ret ^= 1;
68 else
69 ret ^= !strncmp(ct->master->helper->name, info->name,
70 strlen(ct->master->helper->name));
71out_unlock:
72 READ_UNLOCK(&ip_conntrack_lock);
73 return ret;
74}
75
76static int check(const char *tablename,
77 const struct ipt_ip *ip,
78 void *matchinfo,
79 unsigned int matchsize,
80 unsigned int hook_mask)
81{
82 struct ipt_helper_info *info = matchinfo;
83
84 info->name[29] = '\0';
85
86 /* verify size */
87 if (matchsize != IPT_ALIGN(sizeof(struct ipt_helper_info)))
88 return 0;
89
90 return 1;
91}
92
93static struct ipt_match helper_match = {
94 .name = "helper",
95 .match = &match,
96 .checkentry = &check,
97 .me = THIS_MODULE,
98};
99
100static int __init init(void)
101{
102 need_ip_conntrack();
103 return ipt_register_match(&helper_match);
104}
105
106static void __exit fini(void)
107{
108 ipt_unregister_match(&helper_match);
109}
110
111module_init(init);
112module_exit(fini);
113
diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c
new file mode 100644
index 000000000000..b835b7b2e560
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_iprange.c
@@ -0,0 +1,99 @@
1/*
2 * iptables module to match IP address ranges
3 *
4 * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/ip.h>
13#include <linux/netfilter_ipv4/ip_tables.h>
14#include <linux/netfilter_ipv4/ipt_iprange.h>
15
16MODULE_LICENSE("GPL");
17MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
18MODULE_DESCRIPTION("iptables arbitrary IP range match module");
19
20#if 0
21#define DEBUGP printk
22#else
23#define DEBUGP(format, args...)
24#endif
25
26static int
27match(const struct sk_buff *skb,
28 const struct net_device *in,
29 const struct net_device *out,
30 const void *matchinfo,
31 int offset, int *hotdrop)
32{
33 const struct ipt_iprange_info *info = matchinfo;
34 const struct iphdr *iph = skb->nh.iph;
35
36 if (info->flags & IPRANGE_SRC) {
37 if (((ntohl(iph->saddr) < ntohl(info->src.min_ip))
38 || (ntohl(iph->saddr) > ntohl(info->src.max_ip)))
39 ^ !!(info->flags & IPRANGE_SRC_INV)) {
40 DEBUGP("src IP %u.%u.%u.%u NOT in range %s"
41 "%u.%u.%u.%u-%u.%u.%u.%u\n",
42 NIPQUAD(iph->saddr),
43 info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
44 NIPQUAD(info->src.min_ip),
45 NIPQUAD(info->src.max_ip));
46 return 0;
47 }
48 }
49 if (info->flags & IPRANGE_DST) {
50 if (((ntohl(iph->daddr) < ntohl(info->dst.min_ip))
51 || (ntohl(iph->daddr) > ntohl(info->dst.max_ip)))
52 ^ !!(info->flags & IPRANGE_DST_INV)) {
53 DEBUGP("dst IP %u.%u.%u.%u NOT in range %s"
54 "%u.%u.%u.%u-%u.%u.%u.%u\n",
55 NIPQUAD(iph->daddr),
56 info->flags & IPRANGE_DST_INV ? "(INV) " : "",
57 NIPQUAD(info->dst.min_ip),
58 NIPQUAD(info->dst.max_ip));
59 return 0;
60 }
61 }
62 return 1;
63}
64
65static int check(const char *tablename,
66 const struct ipt_ip *ip,
67 void *matchinfo,
68 unsigned int matchsize,
69 unsigned int hook_mask)
70{
71 /* verify size */
72 if (matchsize != IPT_ALIGN(sizeof(struct ipt_iprange_info)))
73 return 0;
74
75 return 1;
76}
77
78static struct ipt_match iprange_match =
79{
80 .list = { NULL, NULL },
81 .name = "iprange",
82 .match = &match,
83 .checkentry = &check,
84 .destroy = NULL,
85 .me = THIS_MODULE
86};
87
88static int __init init(void)
89{
90 return ipt_register_match(&iprange_match);
91}
92
93static void __exit fini(void)
94{
95 ipt_unregister_match(&iprange_match);
96}
97
98module_init(init);
99module_exit(fini);
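
The comparisons above go through ntohl() first because an IPv4 range check is only meaningful on host-order integers; comparing the raw network-order words goes wrong as soon as the range crosses an octet boundary. A tiny standalone check with hypothetical addresses (the raw compare prints 0 on a little-endian host, while the ntohl() compare always prints 1):

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
        /* 10.0.0.200 should fall inside the range 10.0.0.100 - 10.0.1.50 */
        unsigned int min = inet_addr("10.0.0.100");  /* network byte order */
        unsigned int max = inet_addr("10.0.1.50");
        unsigned int ip  = inet_addr("10.0.0.200");

        printf("raw network-order compare: %d\n", ip >= min && ip <= max);
        printf("host-order compare:        %d\n",
               ntohl(ip) >= ntohl(min) && ntohl(ip) <= ntohl(max));
        return 0;
}
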
diff --git a/net/ipv4/netfilter/ipt_length.c b/net/ipv4/netfilter/ipt_length.c
new file mode 100644
index 000000000000..4eabcfbda9d1
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_length.c
@@ -0,0 +1,64 @@
1/* Kernel module to match packet length. */
2/* (C) 1999-2001 James Morris <jmorris@intercode.com.au>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/module.h>
10#include <linux/skbuff.h>
11
12#include <linux/netfilter_ipv4/ipt_length.h>
13#include <linux/netfilter_ipv4/ip_tables.h>
14
15MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
16MODULE_DESCRIPTION("IP tables packet length matching module");
17MODULE_LICENSE("GPL");
18
19static int
20match(const struct sk_buff *skb,
21 const struct net_device *in,
22 const struct net_device *out,
23 const void *matchinfo,
24 int offset,
25 int *hotdrop)
26{
27 const struct ipt_length_info *info = matchinfo;
28 u_int16_t pktlen = ntohs(skb->nh.iph->tot_len);
29
30 return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
31}
32
33static int
34checkentry(const char *tablename,
35 const struct ipt_ip *ip,
36 void *matchinfo,
37 unsigned int matchsize,
38 unsigned int hook_mask)
39{
40 if (matchsize != IPT_ALIGN(sizeof(struct ipt_length_info)))
41 return 0;
42
43 return 1;
44}
45
46static struct ipt_match length_match = {
47 .name = "length",
48 .match = &match,
49 .checkentry = &checkentry,
50 .me = THIS_MODULE,
51};
52
53static int __init init(void)
54{
55 return ipt_register_match(&length_match);
56}
57
58static void __exit fini(void)
59{
60 ipt_unregister_match(&length_match);
61}
62
63module_init(init);
64module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c
new file mode 100644
index 000000000000..0c24dcc703a5
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_limit.c
@@ -0,0 +1,157 @@
1/* Kernel module to control the rate
2 *
3 * 2 September 1999: Changed from the target RATE to the match
4 * `limit', removed logging. Did I mention that
5 * Alexey is a fucking genius?
6 * Rusty Russell (rusty@rustcorp.com.au). */
7
8/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>
9 * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 */
15
16#include <linux/module.h>
17#include <linux/skbuff.h>
18#include <linux/spinlock.h>
19#include <linux/interrupt.h>
20
21#include <linux/netfilter_ipv4/ip_tables.h>
22#include <linux/netfilter_ipv4/ipt_limit.h>
23
24MODULE_LICENSE("GPL");
25MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>");
26MODULE_DESCRIPTION("iptables rate limit match");
27
28/* The algorithm used is the Simple Token Bucket Filter (TBF)
29 * see net/sched/sch_tbf.c in the linux source tree
30 */
31
32static DEFINE_SPINLOCK(limit_lock);
33
34/* Rusty: This is my (non-mathematically-inclined) understanding of
35 this algorithm. The `average rate' in jiffies becomes your initial
36 amount of credit `credit' and the most credit you can ever have
37 `credit_cap'. The `peak rate' becomes the cost of passing the
38 test, `cost'.
39
40 `prev' tracks the last packet hit: you gain one credit per jiffy.
41 If you get credit balance more than this, the extra credit is
42 discarded. Every time the match passes, you lose `cost' credits;
43 if you don't have that many, the test fails.
44
45 See Alexey's formal explanation in net/sched/sch_tbf.c.
46
47 To get the maximum range, we multiply by this factor (ie. you get N
48 credits per jiffy). We want to allow a rate as low as 1 per day
49 (slowest userspace tool allows), which means
50 CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */
51#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
52
53/* Repeated shift and or gives us all 1s, final shift and add 1 gives
54 * us the power of 2 below the theoretical max, so GCC simply does a
55 * shift. */
56#define _POW2_BELOW2(x) ((x)|((x)>>1))
57#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
58#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
59#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
60#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
61#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
62
63#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
64
65static int
66ipt_limit_match(const struct sk_buff *skb,
67 const struct net_device *in,
68 const struct net_device *out,
69 const void *matchinfo,
70 int offset,
71 int *hotdrop)
72{
73 struct ipt_rateinfo *r = ((struct ipt_rateinfo *)matchinfo)->master;
74 unsigned long now = jiffies;
75
76 spin_lock_bh(&limit_lock);
77 r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY;
78 if (r->credit > r->credit_cap)
79 r->credit = r->credit_cap;
80
81 if (r->credit >= r->cost) {
82 /* We're not limited. */
83 r->credit -= r->cost;
84 spin_unlock_bh(&limit_lock);
85 return 1;
86 }
87
88 spin_unlock_bh(&limit_lock);
89 return 0;
90}
91
92/* Precision saver. */
93static u_int32_t
94user2credits(u_int32_t user)
95{
96 /* If multiplying would overflow... */
97 if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
98 /* Divide first. */
99 return (user / IPT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
100
101 return (user * HZ * CREDITS_PER_JIFFY) / IPT_LIMIT_SCALE;
102}
103
104static int
105ipt_limit_checkentry(const char *tablename,
106 const struct ipt_ip *ip,
107 void *matchinfo,
108 unsigned int matchsize,
109 unsigned int hook_mask)
110{
111 struct ipt_rateinfo *r = matchinfo;
112
113 if (matchsize != IPT_ALIGN(sizeof(struct ipt_rateinfo)))
114 return 0;
115
116 /* Check for overflow. */
117 if (r->burst == 0
118 || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
119 printk("Overflow in ipt_limit, try lower: %u/%u\n",
120 r->avg, r->burst);
121 return 0;
122 }
123
124 /* User avg in seconds * IPT_LIMIT_SCALE: convert to jiffies *
125 128. */
126 r->prev = jiffies;
127 r->credit = user2credits(r->avg * r->burst); /* Credits full. */
128 r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */
129 r->cost = user2credits(r->avg);
130
131 /* For SMP, we only want to use one set of counters. */
132 r->master = r;
133
134 return 1;
135}
136
137static struct ipt_match ipt_limit_reg = {
138 .name = "limit",
139 .match = ipt_limit_match,
140 .checkentry = ipt_limit_checkentry,
141 .me = THIS_MODULE,
142};
143
144static int __init init(void)
145{
146 if (ipt_register_match(&ipt_limit_reg))
147 return -EINVAL;
148 return 0;
149}
150
151static void __exit fini(void)
152{
153 ipt_unregister_match(&ipt_limit_reg);
154}
155
156module_init(init);
157module_exit(fini);
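
The token-bucket comment above becomes easier to follow with concrete numbers. Assuming HZ of 1000 and IPT_LIMIT_SCALE of 10000 (which makes CREDITS_PER_JIFFY work out to 32), a hypothetical "2 packets/second, burst 3" rule gives cost = 16000 and credit_cap = 48000; the sketch below replays the bookkeeping from ipt_limit_match() with those hand-computed constants:

#include <stdio.h>

/* Hand-computed for the illustration (HZ = 1000, scale = 10000):
 * hypothetical rule of 2 packets per second with burst 3. */
#define CREDITS_PER_JIFFY 32
#define COST       16000   /* user2credits(avg): credits per matched packet */
#define CREDIT_CAP 48000   /* user2credits(avg * burst): bucket size */

int main(void)
{
        unsigned int credit = CREDIT_CAP;  /* bucket starts full */
        unsigned long prev = 0, now = 0;   /* fake jiffies clock */
        int i;

        /* five back-to-back packets: only the first three pass */
        for (i = 0; i < 5; i++) {
                credit += (now - prev) * CREDITS_PER_JIFFY;
                if (credit > CREDIT_CAP)
                        credit = CREDIT_CAP;
                prev = now;
                if (credit >= COST) {
                        credit -= COST;
                        printf("packet %d: pass (credit now %u)\n", i, credit);
                } else {
                        printf("packet %d: over limit\n", i);
                }
        }

        /* 500 jiffies later one packet's worth of credit (500 * 32) is back */
        now += 500;
        credit += (now - prev) * CREDITS_PER_JIFFY;
        if (credit > CREDIT_CAP)
                credit = CREDIT_CAP;
        printf("after 500 jiffies: credit %u -> %s\n", credit,
               credit >= COST ? "pass" : "over limit");
        return 0;
}
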
diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c
new file mode 100644
index 000000000000..11a459e33f25
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_mac.c
@@ -0,0 +1,79 @@
1/* Kernel module to match MAC address parameters. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/if_ether.h>
14
15#include <linux/netfilter_ipv4/ipt_mac.h>
16#include <linux/netfilter_ipv4/ip_tables.h>
17
18MODULE_LICENSE("GPL");
19MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
20MODULE_DESCRIPTION("iptables mac matching module");
21
22static int
23match(const struct sk_buff *skb,
24 const struct net_device *in,
25 const struct net_device *out,
26 const void *matchinfo,
27 int offset,
28 int *hotdrop)
29{
30 const struct ipt_mac_info *info = matchinfo;
31
32 /* Is mac pointer valid? */
33 return (skb->mac.raw >= skb->head
34 && (skb->mac.raw + ETH_HLEN) <= skb->data
35 /* If so, compare... */
36 && ((memcmp(eth_hdr(skb)->h_source, info->srcaddr, ETH_ALEN)
37 == 0) ^ info->invert));
38}
39
40static int
41ipt_mac_checkentry(const char *tablename,
42 const struct ipt_ip *ip,
43 void *matchinfo,
44 unsigned int matchsize,
45 unsigned int hook_mask)
46{
47 /* FORWARD isn't always valid, but it's nice to be able to do --RR */
48 if (hook_mask
49 & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
50 | (1 << NF_IP_FORWARD))) {
51 printk("ipt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
52 return 0;
53 }
54
55 if (matchsize != IPT_ALIGN(sizeof(struct ipt_mac_info)))
56 return 0;
57
58 return 1;
59}
60
61static struct ipt_match mac_match = {
62 .name = "mac",
63 .match = &match,
64 .checkentry = &ipt_mac_checkentry,
65 .me = THIS_MODULE,
66};
67
68static int __init init(void)
69{
70 return ipt_register_match(&mac_match);
71}
72
73static void __exit fini(void)
74{
75 ipt_unregister_match(&mac_match);
76}
77
78module_init(init);
79module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
new file mode 100644
index 000000000000..8955728127b9
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_mark.c
@@ -0,0 +1,64 @@
1/* Kernel module to match NFMARK values. */
2
3/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/module.h>
11#include <linux/skbuff.h>
12
13#include <linux/netfilter_ipv4/ipt_mark.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15
16MODULE_LICENSE("GPL");
17MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
18MODULE_DESCRIPTION("iptables mark matching module");
19
20static int
21match(const struct sk_buff *skb,
22 const struct net_device *in,
23 const struct net_device *out,
24 const void *matchinfo,
25 int offset,
26 int *hotdrop)
27{
28 const struct ipt_mark_info *info = matchinfo;
29
30 return ((skb->nfmark & info->mask) == info->mark) ^ info->invert;
31}
32
33static int
34checkentry(const char *tablename,
35 const struct ipt_ip *ip,
36 void *matchinfo,
37 unsigned int matchsize,
38 unsigned int hook_mask)
39{
40 if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
41 return 0;
42
43 return 1;
44}
45
46static struct ipt_match mark_match = {
47 .name = "mark",
48 .match = &match,
49 .checkentry = &checkentry,
50 .me = THIS_MODULE,
51};
52
53static int __init init(void)
54{
55 return ipt_register_match(&mark_match);
56}
57
58static void __exit fini(void)
59{
60 ipt_unregister_match(&mark_match);
61}
62
63module_init(init);
64module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c
new file mode 100644
index 000000000000..99e8188162e2
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_multiport.c
@@ -0,0 +1,212 @@
1/* Kernel module to match one of a list of TCP/UDP ports: ports are in
2 the same place so we can treat them as equal. */
3
4/* (C) 1999-2001 Paul `Rusty' Russell
5 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/module.h>
13#include <linux/types.h>
14#include <linux/udp.h>
15#include <linux/skbuff.h>
16
17#include <linux/netfilter_ipv4/ipt_multiport.h>
18#include <linux/netfilter_ipv4/ip_tables.h>
19
20MODULE_LICENSE("GPL");
21MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
22MODULE_DESCRIPTION("iptables multiple port match module");
23
24#if 0
25#define duprintf(format, args...) printk(format , ## args)
26#else
27#define duprintf(format, args...)
28#endif
29
30/* Returns 1 if the port is matched by the test, 0 otherwise. */
31static inline int
32ports_match(const u_int16_t *portlist, enum ipt_multiport_flags flags,
33 u_int8_t count, u_int16_t src, u_int16_t dst)
34{
35 unsigned int i;
36 for (i=0; i<count; i++) {
37 if (flags != IPT_MULTIPORT_DESTINATION
38 && portlist[i] == src)
39 return 1;
40
41 if (flags != IPT_MULTIPORT_SOURCE
42 && portlist[i] == dst)
43 return 1;
44 }
45
46 return 0;
47}
48
49/* Returns 1 if the port is matched by the test, 0 otherwise. */
50static inline int
51ports_match_v1(const struct ipt_multiport_v1 *minfo,
52 u_int16_t src, u_int16_t dst)
53{
54 unsigned int i;
55 u_int16_t s, e;
56
57 for (i=0; i < minfo->count; i++) {
58 s = minfo->ports[i];
59
60 if (minfo->pflags[i]) {
61 /* range port matching */
62 e = minfo->ports[++i];
63 duprintf("src or dst matches with %d-%d?\n", s, e);
64
65 if (minfo->flags == IPT_MULTIPORT_SOURCE
66 && src >= s && src <= e)
67 return 1 ^ minfo->invert;
68 if (minfo->flags == IPT_MULTIPORT_DESTINATION
69 && dst >= s && dst <= e)
70 return 1 ^ minfo->invert;
71 if (minfo->flags == IPT_MULTIPORT_EITHER
72 && ((dst >= s && dst <= e)
73 || (src >= s && src <= e)))
74 return 1 ^ minfo->invert;
75 } else {
76 /* exact port matching */
77 duprintf("src or dst matches with %d?\n", s);
78
79 if (minfo->flags == IPT_MULTIPORT_SOURCE
80 && src == s)
81 return 1 ^ minfo->invert;
82 if (minfo->flags == IPT_MULTIPORT_DESTINATION
83 && dst == s)
84 return 1 ^ minfo->invert;
85 if (minfo->flags == IPT_MULTIPORT_EITHER
86 && (src == s || dst == s))
87 return 1 ^ minfo->invert;
88 }
89 }
90
91 return minfo->invert;
92}
93
94static int
95match(const struct sk_buff *skb,
96 const struct net_device *in,
97 const struct net_device *out,
98 const void *matchinfo,
99 int offset,
100 int *hotdrop)
101{
102 u16 _ports[2], *pptr;
103 const struct ipt_multiport *multiinfo = matchinfo;
104
105 if (offset)
106 return 0;
107
108 pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
109 sizeof(_ports), _ports);
110 if (pptr == NULL) {
111 /* We've been asked to examine this packet, and we
112 * can't. Hence, no choice but to drop.
113 */
114 duprintf("ipt_multiport:"
115 " Dropping evil offset=0 tinygram.\n");
116 *hotdrop = 1;
117 return 0;
118 }
119
120 return ports_match(multiinfo->ports,
121 multiinfo->flags, multiinfo->count,
122 ntohs(pptr[0]), ntohs(pptr[1]));
123}
124
125static int
126match_v1(const struct sk_buff *skb,
127 const struct net_device *in,
128 const struct net_device *out,
129 const void *matchinfo,
130 int offset,
131 int *hotdrop)
132{
133 u16 _ports[2], *pptr;
134 const struct ipt_multiport_v1 *multiinfo = matchinfo;
135
136 if (offset)
137 return 0;
138
139 pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
140 sizeof(_ports), _ports);
141 if (pptr == NULL) {
142 /* We've been asked to examine this packet, and we
143 * can't. Hence, no choice but to drop.
144 */
145 duprintf("ipt_multiport:"
146 " Dropping evil offset=0 tinygram.\n");
147 *hotdrop = 1;
148 return 0;
149 }
150
151 return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1]));
152}
153
154/* Called when user tries to insert an entry of this type. */
155static int
156checkentry(const char *tablename,
157 const struct ipt_ip *ip,
158 void *matchinfo,
159 unsigned int matchsize,
160 unsigned int hook_mask)
161{
162 return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport)));
163}
164
165static int
166checkentry_v1(const char *tablename,
167 const struct ipt_ip *ip,
168 void *matchinfo,
169 unsigned int matchsize,
170 unsigned int hook_mask)
171{
172 return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport_v1)));
173}
174
175static struct ipt_match multiport_match = {
176 .name = "multiport",
177 .revision = 0,
178 .match = &match,
179 .checkentry = &checkentry,
180 .me = THIS_MODULE,
181};
182
183static struct ipt_match multiport_match_v1 = {
184 .name = "multiport",
185 .revision = 1,
186 .match = &match_v1,
187 .checkentry = &checkentry_v1,
188 .me = THIS_MODULE,
189};
190
191static int __init init(void)
192{
193 int err;
194
195 err = ipt_register_match(&multiport_match);
196 if (!err) {
197 err = ipt_register_match(&multiport_match_v1);
198 if (err)
199 ipt_unregister_match(&multiport_match);
200 }
201
202 return err;
203}
204
205static void __exit fini(void)
206{
207 ipt_unregister_match(&multiport_match);
208 ipt_unregister_match(&multiport_match_v1);
209}
210
211module_init(init);
212module_exit(fini);
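
In the v1 format the ports[] array mixes single ports and ranges: when pflags[i] is set, ports[i] and ports[i+1] form an inclusive range and consume two slots, which is why the loop bumps i a second time. A simplified standalone sketch of that walk (hypothetical port list, ignoring the source/destination/invert handling):

#include <stdio.h>

/* Simplified stand-in for the v1 multiport data: enough to show how ranges
 * and single ports share the ports[] array. */
struct mp {
        unsigned char count;
        unsigned short ports[8];
        unsigned char pflags[8];  /* non-zero: ports[i]..ports[i+1] is a range */
};

static int port_in_list(const struct mp *m, unsigned short p)
{
        unsigned int i;

        for (i = 0; i < m->count; i++) {
                unsigned short s = m->ports[i];

                if (m->pflags[i]) {           /* range entry uses two slots */
                        unsigned short e = m->ports[++i];
                        if (p >= s && p <= e)
                                return 1;
                } else if (p == s) {          /* exact entry */
                        return 1;
                }
        }
        return 0;
}

int main(void)
{
        /* hypothetical list "80,8000:8100,443": four slots, one range */
        struct mp m = {
                .count  = 4,
                .ports  = { 80, 8000, 8100, 443 },
                .pflags = { 0, 1, 0, 0 },
        };

        printf("%d %d %d %d\n",
               port_in_list(&m, 80),    /* 1 */
               port_in_list(&m, 8055),  /* 1: inside the range */
               port_in_list(&m, 8101),  /* 0: just past the range */
               port_in_list(&m, 443));  /* 1 */
        return 0;
}
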
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
new file mode 100644
index 000000000000..3b9065e06381
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -0,0 +1,217 @@
1/* Kernel module to match various things tied to sockets associated with
2 locally generated outgoing packets. */
3
4/* (C) 2000 Marc Boucher <marc@mbsi.ca>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/file.h>
14#include <net/sock.h>
15
16#include <linux/netfilter_ipv4/ipt_owner.h>
17#include <linux/netfilter_ipv4/ip_tables.h>
18
19MODULE_LICENSE("GPL");
20MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
21MODULE_DESCRIPTION("iptables owner match");
22
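/* Illustrative usage, not part of the original module: the owner match is
 * meant for locally generated traffic, e.g.
 *   iptables -A OUTPUT -m owner --uid-owner 1000 -j ACCEPT
 *   iptables -A OUTPUT -m owner --cmd-owner wget -j LOG
 * The uid and command name are made-up examples; see checkentry() below for
 * the hook and SMP restrictions.
 */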
23static int
24match_comm(const struct sk_buff *skb, const char *comm)
25{
26 struct task_struct *g, *p;
27 struct files_struct *files;
28 int i;
29
30 read_lock(&tasklist_lock);
31 do_each_thread(g, p) {
32 if(strncmp(p->comm, comm, sizeof(p->comm)))
33 continue;
34
35 task_lock(p);
36 files = p->files;
37 if(files) {
38 spin_lock(&files->file_lock);
39 for (i=0; i < files->max_fds; i++) {
40 if (fcheck_files(files, i) ==
41 skb->sk->sk_socket->file) {
42 spin_unlock(&files->file_lock);
43 task_unlock(p);
44 read_unlock(&tasklist_lock);
45 return 1;
46 }
47 }
48 spin_unlock(&files->file_lock);
49 }
50 task_unlock(p);
51 } while_each_thread(g, p);
52 read_unlock(&tasklist_lock);
53 return 0;
54}
55
56static int
57match_pid(const struct sk_buff *skb, pid_t pid)
58{
59 struct task_struct *p;
60 struct files_struct *files;
61 int i;
62
63 read_lock(&tasklist_lock);
64 p = find_task_by_pid(pid);
65 if (!p)
66 goto out;
67 task_lock(p);
68 files = p->files;
69 if(files) {
70 spin_lock(&files->file_lock);
71 for (i=0; i < files->max_fds; i++) {
72 if (fcheck_files(files, i) ==
73 skb->sk->sk_socket->file) {
74 spin_unlock(&files->file_lock);
75 task_unlock(p);
76 read_unlock(&tasklist_lock);
77 return 1;
78 }
79 }
80 spin_unlock(&files->file_lock);
81 }
82 task_unlock(p);
83out:
84 read_unlock(&tasklist_lock);
85 return 0;
86}
87
88static int
89match_sid(const struct sk_buff *skb, pid_t sid)
90{
91 struct task_struct *g, *p;
92 struct file *file = skb->sk->sk_socket->file;
93 int i, found=0;
94
95 read_lock(&tasklist_lock);
96 do_each_thread(g, p) {
97 struct files_struct *files;
98 if (p->signal->session != sid)
99 continue;
100
101 task_lock(p);
102 files = p->files;
103 if (files) {
104 spin_lock(&files->file_lock);
105 for (i=0; i < files->max_fds; i++) {
106 if (fcheck_files(files, i) == file) {
107 found = 1;
108 break;
109 }
110 }
111 spin_unlock(&files->file_lock);
112 }
113 task_unlock(p);
114 if (found)
115 goto out;
116 } while_each_thread(g, p);
117out:
118 read_unlock(&tasklist_lock);
119
120 return found;
121}
122
123static int
124match(const struct sk_buff *skb,
125 const struct net_device *in,
126 const struct net_device *out,
127 const void *matchinfo,
128 int offset,
129 int *hotdrop)
130{
131 const struct ipt_owner_info *info = matchinfo;
132
133 if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file)
134 return 0;
135
136 if(info->match & IPT_OWNER_UID) {
137 if ((skb->sk->sk_socket->file->f_uid != info->uid) ^
138 !!(info->invert & IPT_OWNER_UID))
139 return 0;
140 }
141
142 if(info->match & IPT_OWNER_GID) {
143 if ((skb->sk->sk_socket->file->f_gid != info->gid) ^
144 !!(info->invert & IPT_OWNER_GID))
145 return 0;
146 }
147
148 if(info->match & IPT_OWNER_PID) {
149 if (!match_pid(skb, info->pid) ^
150 !!(info->invert & IPT_OWNER_PID))
151 return 0;
152 }
153
154 if(info->match & IPT_OWNER_SID) {
155 if (!match_sid(skb, info->sid) ^
156 !!(info->invert & IPT_OWNER_SID))
157 return 0;
158 }
159
160 if(info->match & IPT_OWNER_COMM) {
161 if (!match_comm(skb, info->comm) ^
162 !!(info->invert & IPT_OWNER_COMM))
163 return 0;
164 }
165
166 return 1;
167}
168
169static int
170checkentry(const char *tablename,
171 const struct ipt_ip *ip,
172 void *matchinfo,
173 unsigned int matchsize,
174 unsigned int hook_mask)
175{
176 if (hook_mask
177 & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
178 printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
179 return 0;
180 }
181
182 if (matchsize != IPT_ALIGN(sizeof(struct ipt_owner_info))) {
183 printk("Matchsize %u != %Zu\n", matchsize,
184 IPT_ALIGN(sizeof(struct ipt_owner_info)));
185 return 0;
186 }
187#ifdef CONFIG_SMP
188 /* files->file_lock can not be used in a BH */
189 if (((struct ipt_owner_info *)matchinfo)->match
190 & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
191 printk("ipt_owner: pid, sid and command matching is broken "
192 "on SMP.\n");
193 return 0;
194 }
195#endif
196 return 1;
197}
198
199static struct ipt_match owner_match = {
200 .name = "owner",
201 .match = &match,
202 .checkentry = &checkentry,
203 .me = THIS_MODULE,
204};
205
206static int __init init(void)
207{
208 return ipt_register_match(&owner_match);
209}
210
211static void __exit fini(void)
212{
213 ipt_unregister_match(&owner_match);
214}
215
216module_init(init);
217module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
new file mode 100644
index 000000000000..1a53924041fc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_physdev.c
@@ -0,0 +1,134 @@
1/* Kernel module to match the bridge port in and
2 * out device for IP packets coming into contact with a bridge. */
3
4/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/netfilter_ipv4/ipt_physdev.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15#include <linux/netfilter_bridge.h>
16#define MATCH 1
17#define NOMATCH 0
18
19MODULE_LICENSE("GPL");
20MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
21MODULE_DESCRIPTION("iptables bridge physical device match module");
22
23static int
24match(const struct sk_buff *skb,
25 const struct net_device *in,
26 const struct net_device *out,
27 const void *matchinfo,
28 int offset,
29 int *hotdrop)
30{
31 int i;
32 static const char nulldevname[IFNAMSIZ];
33 const struct ipt_physdev_info *info = matchinfo;
34 unsigned int ret;
35 const char *indev, *outdev;
36 struct nf_bridge_info *nf_bridge;
37
38 /* Not a bridged IP packet or no info available yet:
39 * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if
40 * the destination device will be a bridge. */
41 if (!(nf_bridge = skb->nf_bridge)) {
42 /* Return MATCH if the invert flags of the used options are on */
43 if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) &&
44 !(info->invert & IPT_PHYSDEV_OP_BRIDGED))
45 return NOMATCH;
46 if ((info->bitmask & IPT_PHYSDEV_OP_ISIN) &&
47 !(info->invert & IPT_PHYSDEV_OP_ISIN))
48 return NOMATCH;
49 if ((info->bitmask & IPT_PHYSDEV_OP_ISOUT) &&
50 !(info->invert & IPT_PHYSDEV_OP_ISOUT))
51 return NOMATCH;
52 if ((info->bitmask & IPT_PHYSDEV_OP_IN) &&
53 !(info->invert & IPT_PHYSDEV_OP_IN))
54 return NOMATCH;
55 if ((info->bitmask & IPT_PHYSDEV_OP_OUT) &&
56 !(info->invert & IPT_PHYSDEV_OP_OUT))
57 return NOMATCH;
58 return MATCH;
59 }
60
61 /* This only makes sense in the FORWARD and POSTROUTING chains */
62 if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) &&
63 (!!(nf_bridge->mask & BRNF_BRIDGED) ^
64 !(info->invert & IPT_PHYSDEV_OP_BRIDGED)))
65 return NOMATCH;
66
67 if ((info->bitmask & IPT_PHYSDEV_OP_ISIN &&
68 (!nf_bridge->physindev ^ !!(info->invert & IPT_PHYSDEV_OP_ISIN))) ||
69 (info->bitmask & IPT_PHYSDEV_OP_ISOUT &&
70 (!nf_bridge->physoutdev ^ !!(info->invert & IPT_PHYSDEV_OP_ISOUT))))
71 return NOMATCH;
72
73 if (!(info->bitmask & IPT_PHYSDEV_OP_IN))
74 goto match_outdev;
75 indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname;
76 for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
77 ret |= (((const unsigned int *)indev)[i]
78 ^ ((const unsigned int *)info->physindev)[i])
79 & ((const unsigned int *)info->in_mask)[i];
80 }
81
82 if ((ret == 0) ^ !(info->invert & IPT_PHYSDEV_OP_IN))
83 return NOMATCH;
84
85match_outdev:
86 if (!(info->bitmask & IPT_PHYSDEV_OP_OUT))
87 return MATCH;
88 outdev = nf_bridge->physoutdev ?
89 nf_bridge->physoutdev->name : nulldevname;
90 for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
91 ret |= (((const unsigned int *)outdev)[i]
92 ^ ((const unsigned int *)info->physoutdev)[i])
93 & ((const unsigned int *)info->out_mask)[i];
94 }
95
96 return (ret != 0) ^ !(info->invert & IPT_PHYSDEV_OP_OUT);
97}
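/* Illustrative note (an assumption, not spelled out in this file): the
 * word-wise XOR/AND loops above implement a masked name compare, which is how
 * userspace expresses prefix wildcards, e.g.
 *   iptables -A FORWARD -m physdev --physdev-in eth+ -j ACCEPT
 * where only the bytes covered by in_mask (the "eth" prefix) are compared.
 */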
98
99static int
100checkentry(const char *tablename,
101 const struct ipt_ip *ip,
102 void *matchinfo,
103 unsigned int matchsize,
104 unsigned int hook_mask)
105{
106 const struct ipt_physdev_info *info = matchinfo;
107
108 if (matchsize != IPT_ALIGN(sizeof(struct ipt_physdev_info)))
109 return 0;
110 if (!(info->bitmask & IPT_PHYSDEV_OP_MASK) ||
111 info->bitmask & ~IPT_PHYSDEV_OP_MASK)
112 return 0;
113 return 1;
114}
115
116static struct ipt_match physdev_match = {
117 .name = "physdev",
118 .match = &match,
119 .checkentry = &checkentry,
120 .me = THIS_MODULE,
121};
122
123static int __init init(void)
124{
125 return ipt_register_match(&physdev_match);
126}
127
128static void __exit fini(void)
129{
130 ipt_unregister_match(&physdev_match);
131}
132
133module_init(init);
134module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_pkttype.c b/net/ipv4/netfilter/ipt_pkttype.c
new file mode 100644
index 000000000000..8ddb1dc5e5ae
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_pkttype.c
@@ -0,0 +1,70 @@
1/* (C) 1999-2001 Michal Ludvig <michal@logix.cz>
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 as
5 * published by the Free Software Foundation.
6 */
7
8#include <linux/module.h>
9#include <linux/skbuff.h>
10#include <linux/if_ether.h>
11#include <linux/if_packet.h>
12
13#include <linux/netfilter_ipv4/ipt_pkttype.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15
16MODULE_LICENSE("GPL");
17MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>");
18MODULE_DESCRIPTION("IP tables match to match on linklayer packet type");
19
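/* Illustrative usage, not part of the original file:
 *   iptables -A INPUT -m pkttype --pkt-type broadcast -j DROP
 * Userspace maps unicast/broadcast/multicast onto the PACKET_* values
 * compared against skb->pkt_type below.
 */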
20static int match(const struct sk_buff *skb,
21 const struct net_device *in,
22 const struct net_device *out,
23 const void *matchinfo,
24 int offset,
25 int *hotdrop)
26{
27 const struct ipt_pkttype_info *info = matchinfo;
28
29 return (skb->pkt_type == info->pkttype) ^ info->invert;
30}
31
32static int checkentry(const char *tablename,
33 const struct ipt_ip *ip,
34 void *matchinfo,
35 unsigned int matchsize,
36 unsigned int hook_mask)
37{
38/*
39 if (hook_mask
40 & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
41 | (1 << NF_IP_FORWARD))) {
42 printk("ipt_pkttype: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
43 return 0;
44 }
45*/
46 if (matchsize != IPT_ALIGN(sizeof(struct ipt_pkttype_info)))
47 return 0;
48
49 return 1;
50}
51
52static struct ipt_match pkttype_match = {
53 .name = "pkttype",
54 .match = &match,
55 .checkentry = &checkentry,
56 .me = THIS_MODULE,
57};
58
59static int __init init(void)
60{
61 return ipt_register_match(&pkttype_match);
62}
63
64static void __exit fini(void)
65{
66 ipt_unregister_match(&pkttype_match);
67}
68
69module_init(init);
70module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_realm.c b/net/ipv4/netfilter/ipt_realm.c
new file mode 100644
index 000000000000..54a6897ebaa6
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_realm.c
@@ -0,0 +1,76 @@
1/* IP tables module for matching the routing realm
2 *
3 * $Id: ipt_realm.c,v 1.3 2004/03/05 13:25:40 laforge Exp $
4 *
5 * (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/netdevice.h>
15#include <net/route.h>
16
17#include <linux/netfilter_ipv4/ipt_realm.h>
18#include <linux/netfilter_ipv4/ip_tables.h>
19
20MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>");
21MODULE_LICENSE("GPL");
22MODULE_DESCRIPTION("iptables realm match");
23
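/* Illustrative usage, not part of the original file: realms come from the
 * routing table, e.g. with iproute2
 *   ip route add 10.1.0.0/16 via 192.168.0.1 realm 3
 * and can then be matched with
 *   iptables -A FORWARD -m realm --realm 3 -j ACCEPT
 * The addresses and realm number are made up for illustration.
 */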
24static int
25match(const struct sk_buff *skb,
26 const struct net_device *in,
27 const struct net_device *out,
28 const void *matchinfo,
29 int offset,
30 int *hotdrop)
31{
32 const struct ipt_realm_info *info = matchinfo;
33 struct dst_entry *dst = skb->dst;
34
35 return (info->id == (dst->tclassid & info->mask)) ^ info->invert;
36}
37
38static int check(const char *tablename,
39 const struct ipt_ip *ip,
40 void *matchinfo,
41 unsigned int matchsize,
42 unsigned int hook_mask)
43{
44 if (hook_mask
45 & ~((1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) |
46 (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN))) {
47 printk("ipt_realm: only valid for POST_ROUTING, LOCAL_OUT, "
48 "LOCAL_IN or FORWARD.\n");
49 return 0;
50 }
51 if (matchsize != IPT_ALIGN(sizeof(struct ipt_realm_info))) {
52 printk("ipt_realm: invalid matchsize.\n");
53 return 0;
54 }
55 return 1;
56}
57
58static struct ipt_match realm_match = {
59 .name = "realm",
60 .match = match,
61 .checkentry = check,
62 .me = THIS_MODULE
63};
64
65static int __init init(void)
66{
67 return ipt_register_match(&realm_match);
68}
69
70static void __exit fini(void)
71{
72 ipt_unregister_match(&realm_match);
73}
74
75module_init(init);
76module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
new file mode 100644
index 000000000000..25ab9fabdcba
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -0,0 +1,1002 @@
1/* Kernel module to check if the source address has been seen recently. */
2/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */
3/* Author: Stephen Frost <sfrost@snowman.net> */
4/* Project Page: http://snowman.net/projects/ipt_recent/ */
5/* This software is distributed under the terms of the GPL, Version 2 */
6/* This copyright does not cover user programs that use kernel services
7 * by normal system calls. */
8
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/proc_fs.h>
12#include <linux/spinlock.h>
13#include <linux/interrupt.h>
14#include <asm/uaccess.h>
15#include <linux/ctype.h>
16#include <linux/ip.h>
17#include <linux/vmalloc.h>
18#include <linux/moduleparam.h>
19
20#include <linux/netfilter_ipv4/ip_tables.h>
21#include <linux/netfilter_ipv4/ipt_recent.h>
22
23#undef DEBUG
24#define HASH_LOG 9
25
26/* Defaults, these can be overridden on the module command-line. */
27static int ip_list_tot = 100;
28static int ip_pkt_list_tot = 20;
29static int ip_list_hash_size = 0;
30static int ip_list_perms = 0644;
31#ifdef DEBUG
32static int debug = 1;
33#endif
34
35static char version[] =
36KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>. http://snowman.net/projects/ipt_recent/\n";
37
38MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>");
39MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER);
40MODULE_LICENSE("GPL");
41module_param(ip_list_tot, int, 0400);
42module_param(ip_pkt_list_tot, int, 0400);
43module_param(ip_list_hash_size, int, 0400);
44module_param(ip_list_perms, int, 0400);
45#ifdef DEBUG
46module_param(debug, int, 0600);
47MODULE_PARM_DESC(debug,"debugging level, defaults to 1");
48#endif
49MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list");
50MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember");
51MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs");
52MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files");
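/* Illustrative module load, not part of the original file: the parameters
 * above can be overridden at load time, e.g.
 *   modprobe ipt_recent ip_list_tot=200 ip_pkt_list_tot=40
 * which tracks up to 200 addresses with 40 packet timestamps each.
 */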
53
54/* Structure of our list of recently seen addresses. */
55struct recent_ip_list {
56 u_int32_t addr;
57 u_int8_t ttl;
58 unsigned long last_seen;
59 unsigned long *last_pkts;
60 u_int32_t oldest_pkt;
61 u_int32_t hash_entry;
62 u_int32_t time_pos;
63};
64
65struct time_info_list {
66 u_int32_t position;
67 u_int32_t time;
68};
69
70/* Structure of our linked list of tables of recent lists. */
71struct recent_ip_tables {
72 char name[IPT_RECENT_NAME_LEN];
73 int count;
74 int time_pos;
75 struct recent_ip_list *table;
76 struct recent_ip_tables *next;
77 spinlock_t list_lock;
78 int *hash_table;
79 struct time_info_list *time_info;
80#ifdef CONFIG_PROC_FS
81 struct proc_dir_entry *status_proc;
82#endif /* CONFIG_PROC_FS */
83};
84
85/* Our current list of addresses we have recently seen.
86 * Only added to on a --set, and only updated on --set || --update
87 */
88static struct recent_ip_tables *r_tables = NULL;
89
90/* We protect r_list with this spinlock so two processors are not modifying
91 * the list at the same time.
92 */
93static DEFINE_SPINLOCK(recent_lock);
94
95#ifdef CONFIG_PROC_FS
96/* Our /proc/net/ipt_recent entry */
97static struct proc_dir_entry *proc_net_ipt_recent = NULL;
98#endif
99
100/* Function declaration for later. */
101static int
102match(const struct sk_buff *skb,
103 const struct net_device *in,
104 const struct net_device *out,
105 const void *matchinfo,
106 int offset,
107 int *hotdrop);
108
109/* Function to hash a given address into the hash table of table_size size */
110static int hash_func(unsigned int addr, int table_size)
111{
112 int result = 0;
113 unsigned int value = addr;
114 do { result ^= value; } while((value >>= HASH_LOG));
115
116#ifdef DEBUG
117 if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n",
118 result & (table_size - 1),
119 addr,
120 table_size);
121#endif
122
123 return(result & (table_size - 1));
124}
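/* Worked example, added for illustration: with HASH_LOG = 9,
 * addr = 0x0a000001 and table_size = 512, the loop XOR-folds the address in
 * 9-bit steps,
 *   0x0a000001 ^ 0x00050000 ^ 0x00000280 ^ 0x00000001 = 0x0a050280,
 * and the function returns 0x0a050280 & 0x1ff = 0x80.  table_size is assumed
 * to be a power of two, as enforced in init() below.
 */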
125
126#ifdef CONFIG_PROC_FS
127/* This is the function which produces the output for our /proc output
128 * interface which lists each IP address, the last seen time and the
129 * other recent times the address was seen.
130 */
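/* Example of one output line, following the sprintf formats below (the values
 * are invented for illustration):
 *   src=192.168.0.5 ttl: 64 last_seen: 4323010 oldest_pkt: 2 last_pkts: 4322700, 4322950
 */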
131
132static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
133{
134 int len = 0, count, last_len = 0, pkt_count;
135 off_t pos = 0;
136 off_t begin = 0;
137 struct recent_ip_tables *curr_table;
138
139 curr_table = (struct recent_ip_tables*) data;
140
141 spin_lock_bh(&curr_table->list_lock);
142 for(count = 0; count < ip_list_tot; count++) {
143 if(!curr_table->table[count].addr) continue;
144 last_len = len;
145 len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr));
146 len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl);
147 len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen);
148 len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt);
149 len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]);
150 for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) {
151 if(!curr_table->table[count].last_pkts[pkt_count]) break;
152 len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]);
153 }
154 len += sprintf(buffer+len,"\n");
155 pos = begin + len;
156 if(pos < offset) { len = 0; begin = pos; }
157 if(pos > offset + length) { len = last_len; break; }
158 }
159
160 *start = buffer + (offset - begin);
161 len -= (offset - begin);
162 if(len > length) len = length;
163
164 spin_unlock_bh(&curr_table->list_lock);
165 return len;
166}
167
168/* ip_recent_ctrl provides an interface for users to modify the table
169 * directly. This allows adding entries, removing entries, and
170 * flushing the entire table.
171 * This is done by opening up the appropriate table for writing and
172 * sending one of:
173 * xx.xx.xx.xx -- Add entry to table with current time
174 * +xx.xx.xx.xx -- Add entry to table with current time
175 * -xx.xx.xx.xx -- Remove entry from table
176 * clear -- Flush table, remove all entries
177 */
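/* Illustrative shell usage, not part of the original file, assuming a table
 * named "badguys" was created by a rule using --name badguys:
 *   echo +10.0.0.1 > /proc/net/ipt_recent/badguys   (add or update 10.0.0.1)
 *   echo -10.0.0.1 > /proc/net/ipt_recent/badguys   (remove 10.0.0.1)
 *   echo clear     > /proc/net/ipt_recent/badguys   (flush the table)
 */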
178
179static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data)
180{
181 static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff };
182 u_int32_t val;
183 int base, used = 0;
184 char c, *cp;
185 union iaddr {
186 uint8_t bytes[4];
187 uint32_t word;
188 } res;
189 uint8_t *pp = res.bytes;
190 int digit;
191
192 char buffer[20];
193 int len, check_set = 0, count;
194 u_int32_t addr = 0;
195 struct sk_buff *skb;
196 struct ipt_recent_info *info;
197 struct recent_ip_tables *curr_table;
198
199 curr_table = (struct recent_ip_tables*) data;
200
201 if(size > 20) len = 20; else len = size;
202
203 if(copy_from_user(buffer,input,len)) return -EFAULT;
204
205 if(len < 20) buffer[len] = '\0';
206
207#ifdef DEBUG
208 if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer);
209#endif
210
211 cp = buffer;
212 while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; }
213
214 /* Check if we are asked to flush the entire table */
215 if(!memcmp(cp,"clear",5)) {
216 used += 5;
217 spin_lock_bh(&curr_table->list_lock);
218 curr_table->time_pos = 0;
219 for(count = 0; count < ip_list_hash_size; count++) {
220 curr_table->hash_table[count] = -1;
221 }
222 for(count = 0; count < ip_list_tot; count++) {
223 curr_table->table[count].last_seen = 0;
224 curr_table->table[count].addr = 0;
225 curr_table->table[count].ttl = 0;
226 memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
227 curr_table->table[count].oldest_pkt = 0;
228 curr_table->table[count].time_pos = 0;
229 curr_table->time_info[count].position = count;
230 curr_table->time_info[count].time = 0;
231 }
232 spin_unlock_bh(&curr_table->list_lock);
233 return used;
234 }
235
236 check_set = IPT_RECENT_SET;
237 switch(*cp) {
238 case '+': check_set = IPT_RECENT_SET; cp++; used++; break;
239 case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break;
240 default: if(!isdigit(*cp)) return (used+1); break;
241 }
242
243#ifdef DEBUG
244 if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set);
245#endif
246 /* Get addr (effectively inet_aton()) */
247	/* Shamelessly stolen from libc; a function in the kernel for doing
248 * this would, of course, be greatly preferred, but our options appear
249 * to be rather limited, so we will just do it ourselves here.
250 */
251 res.word = 0;
252
253 c = *cp;
254 for(;;) {
255 if(!isdigit(c)) return used;
256 val = 0; base = 10; digit = 0;
257 if(c == '0') {
258 c = *++cp;
259 if(c == 'x' || c == 'X') base = 16, c = *++cp;
260 else { base = 8; digit = 1; }
261 }
262 for(;;) {
263 if(isascii(c) && isdigit(c)) {
264 if(base == 8 && (c == '8' || c == '0')) return used;
265 val = (val * base) + (c - '0');
266 c = *++cp;
267 digit = 1;
268 } else if(base == 16 && isascii(c) && isxdigit(c)) {
269 val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A'));
270 c = *++cp;
271 digit = 1;
272 } else break;
273 }
274 if(c == '.') {
275 if(pp > res.bytes + 2 || val > 0xff) return used;
276 *pp++ = val;
277 c = *++cp;
278 } else break;
279 }
280 used = cp - buffer;
281 if(c != '\0' && (!isascii(c) || !isspace(c))) return used;
282 if(c == '\n') used++;
283 if(!digit) return used;
284
285 if(val > max[pp - res.bytes]) return used;
286 addr = res.word | htonl(val);
287
288 if(!addr && check_set == IPT_RECENT_SET) return used;
289
290#ifdef DEBUG
291 if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used);
292#endif
293
294 /* Set up and just call match */
295 info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL);
296 if(!info) { return -ENOMEM; }
297 info->seconds = 0;
298 info->hit_count = 0;
299 info->check_set = check_set;
300 info->invert = 0;
301 info->side = IPT_RECENT_SOURCE;
302 strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN);
303 info->name[IPT_RECENT_NAME_LEN-1] = '\0';
304
305 skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL);
306 if (!skb) {
307 used = -ENOMEM;
308 goto out_free_info;
309 }
310 skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL);
311 if (!skb->nh.iph) {
312 used = -ENOMEM;
313 goto out_free_skb;
314 }
315
316 skb->nh.iph->saddr = addr;
317 skb->nh.iph->daddr = 0;
318 /* Clear ttl since we have no way of knowing it */
319 skb->nh.iph->ttl = 0;
320 match(skb,NULL,NULL,info,0,NULL);
321
322 kfree(skb->nh.iph);
323out_free_skb:
324 kfree(skb);
325out_free_info:
326 kfree(info);
327
328#ifdef DEBUG
329 if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used);
330#endif
331 return used;
332}
333
334#endif /* CONFIG_PROC_FS */
335
336/* 'match' is our primary function, called by the kernel whenever a rule is
337 * hit with our module as an option to it.
338 * What this function does depends on what was specifically asked of it by
339 * the user:
340 * --set -- Add or update last seen time of the source address of the packet
341 * -- matchinfo->check_set == IPT_RECENT_SET
342 * --rcheck -- Just check if the source address is in the list
343 * -- matchinfo->check_set == IPT_RECENT_CHECK
344 * --update -- If the source address is in the list, update last_seen
345 * -- matchinfo->check_set == IPT_RECENT_UPDATE
346 * --remove -- If the source address is in the list, remove it
347 * -- matchinfo->check_set == IPT_RECENT_REMOVE
348 * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds
349 * -- matchinfo->seconds
350 * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times
351 * -- matchinfo->hit_count
352 * --seconds and --hitcount can be combined
353 */
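/* Illustrative rule pair, not part of the original file, showing how the
 * options above are commonly combined (the list name and numbers are made up):
 *   iptables -A INPUT -p tcp --dport 22 -m recent --name ssh --set
 *   iptables -A INPUT -p tcp --dport 22 -m recent --name ssh \
 *            --update --seconds 60 --hitcount 4 -j DROP
 */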
354static int
355match(const struct sk_buff *skb,
356 const struct net_device *in,
357 const struct net_device *out,
358 const void *matchinfo,
359 int offset,
360 int *hotdrop)
361{
362 int pkt_count, hits_found, ans;
363 unsigned long now;
364 const struct ipt_recent_info *info = matchinfo;
365 u_int32_t addr = 0, time_temp;
366 u_int8_t ttl = skb->nh.iph->ttl;
367 int *hash_table;
368 int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1;
369 struct time_info_list *time_info;
370 struct recent_ip_tables *curr_table;
371 struct recent_ip_tables *last_table;
372 struct recent_ip_list *r_list;
373
374#ifdef DEBUG
375 if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n");
376#endif
377
378 /* Default is false ^ info->invert */
379 ans = info->invert;
380
381#ifdef DEBUG
382 if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name);
383#endif
384
385 /* if out != NULL then routing has been done and TTL changed.
386	 * We change it back here internally to match what came in before routing. */
387 if(out) ttl++;
388
389 /* Find the right table */
390 spin_lock_bh(&recent_lock);
391 curr_table = r_tables;
392 while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) );
393
394#ifdef DEBUG
395 if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name);
396#endif
397
398 spin_unlock_bh(&recent_lock);
399
400 /* Table with this name not found, match impossible */
401 if(!curr_table) { return ans; }
402
403 /* Make sure no one is changing the list while we work with it */
404 spin_lock_bh(&curr_table->list_lock);
405
406 r_list = curr_table->table;
407 if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr;
408
409 if(!addr) {
410#ifdef DEBUG
411 if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr);
412#endif
413 spin_unlock_bh(&curr_table->list_lock);
414 return ans;
415 }
416
417#ifdef DEBUG
418 if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl);
419#endif
420
421 /* Get jiffies now in case they changed while we were waiting for a lock */
422 now = jiffies;
423 hash_table = curr_table->hash_table;
424 time_info = curr_table->time_info;
425
426 orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size);
427 /* Hash entry at this result used */
428 /* Check for TTL match if requested. If TTL is zero then a match would never
429 * happen, so match regardless of existing TTL in that case. Zero means the
430 * entry was added via the /proc interface anyway, so we will just use the
431 * first TTL we get for that IP address. */
432 if(info->check_set & IPT_RECENT_TTL) {
433 while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr &&
434 (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) {
435 /* Collision in hash table */
436 hash_result = (hash_result + 1) % ip_list_hash_size;
437 }
438 } else {
439 while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) {
440 /* Collision in hash table */
441 hash_result = (hash_result + 1) % ip_list_hash_size;
442 }
443 }
444
445 if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) {
446 /* IP not in list and not asked to SET */
447 spin_unlock_bh(&curr_table->list_lock);
448 return ans;
449 }
450
451 /* Check if we need to handle the collision, do not need to on REMOVE */
452 if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) {
453#ifdef DEBUG
454 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n",
455 orig_hash_result,
456 hash_result,
457 r_list[hash_table[orig_hash_result]].addr,
458 addr);
459#endif
460
461 /* We had a collision.
462 * orig_hash_result is where we started, hash_result is where we ended up.
463 * So, swap them because we are likely to see the same guy again sooner */
464#ifdef DEBUG
465 if(debug) {
466 printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]);
467 printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n",
468 r_list[hash_table[orig_hash_result]].hash_entry);
469 }
470#endif
471
472 r_list[hash_table[orig_hash_result]].hash_entry = hash_result;
473
474
475 temp = hash_table[orig_hash_result];
476#ifdef DEBUG
477 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]);
478#endif
479 hash_table[orig_hash_result] = hash_table[hash_result];
480 hash_table[hash_result] = temp;
481 temp = hash_result;
482 hash_result = orig_hash_result;
483 orig_hash_result = temp;
484 time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result];
485 if(hash_table[hash_result] != -1) {
486 r_list[hash_table[hash_result]].hash_entry = hash_result;
487 time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
488 }
489
490#ifdef DEBUG
491 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n");
492#endif
493 }
494
495 if(hash_table[hash_result] == -1) {
496#ifdef DEBUG
497 if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n",
498 hash_result, addr);
499#endif
500
501 /* New item found and IPT_RECENT_SET, so we need to add it */
502 location = time_info[curr_table->time_pos].position;
503 hash_table[r_list[location].hash_entry] = -1;
504 hash_table[hash_result] = location;
505 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
506 r_list[location].time_pos = curr_table->time_pos;
507 r_list[location].addr = addr;
508 r_list[location].ttl = ttl;
509 r_list[location].last_seen = now;
510 r_list[location].oldest_pkt = 1;
511 r_list[location].last_pkts[0] = now;
512 r_list[location].hash_entry = hash_result;
513 time_info[curr_table->time_pos].time = r_list[location].last_seen;
514 curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot;
515
516 ans = !info->invert;
517 } else {
518#ifdef DEBUG
519 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n",
520 hash_result,
521 addr);
522#endif
523
524 /* Existing item found */
525 location = hash_table[hash_result];
526		/* We have a match on the address; now make sure it meets all
527		 * requirements for a full match. */
528 if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) {
529 if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert;
530 if(info->seconds && !info->hit_count) {
531 if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert;
532 }
533 if(info->seconds && info->hit_count) {
534 for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
535 if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++;
536 }
537 if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
538 }
539 if(info->hit_count && !info->seconds) {
540 for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
541 if(r_list[location].last_pkts[pkt_count] == 0) break;
542 hits_found++;
543 }
544 if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
545 }
546 }
547#ifdef DEBUG
548 if(debug) {
549 if(ans)
550 printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr);
551 else
552 printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr);
553 }
554#endif
555
556 /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the
557 * current timestamp to the last_seen. */
558 if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) {
559#ifdef DEBUG
560 if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n");
561#endif
562 /* Have to update our time info */
563 time_loc = r_list[location].time_pos;
564 time_info[time_loc].time = now;
565 time_info[time_loc].position = location;
566 while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
567 time_temp = time_info[time_loc].time;
568 time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
569 time_info[(time_loc+1)%ip_list_tot].time = time_temp;
570 time_temp = time_info[time_loc].position;
571 time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
572 time_info[(time_loc+1)%ip_list_tot].position = time_temp;
573 r_list[time_info[time_loc].position].time_pos = time_loc;
574 r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
575 time_loc = (time_loc+1) % ip_list_tot;
576 }
577 r_list[location].time_pos = time_loc;
578 r_list[location].ttl = ttl;
579 r_list[location].last_pkts[r_list[location].oldest_pkt] = now;
580			r_list[location].oldest_pkt = (r_list[location].oldest_pkt + 1) % ip_pkt_list_tot;
581 r_list[location].last_seen = now;
582 }
583 /* If we have been asked to remove the entry from the list, just set it to 0 */
584 if(info->check_set & IPT_RECENT_REMOVE) {
585#ifdef DEBUG
586 if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result);
587#endif
588 /* Check if this is part of a collision chain */
589 while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) {
590 orig_hash_result++;
591 if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) {
592 /* Found collision chain, how deep does this rabbit hole go? */
593#ifdef DEBUG
594 if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n");
595#endif
596 end_collision_chain = orig_hash_result;
597 }
598 }
599 if(end_collision_chain != -1) {
600#ifdef DEBUG
601 if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n");
602#endif
603 /* Part of a collision chain, swap it with the end of the chain
604 * before removing. */
605 r_list[hash_table[end_collision_chain]].hash_entry = hash_result;
606 temp = hash_table[end_collision_chain];
607 hash_table[end_collision_chain] = hash_table[hash_result];
608 hash_table[hash_result] = temp;
609 time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
610 hash_result = end_collision_chain;
611 r_list[hash_table[hash_result]].hash_entry = hash_result;
612 time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
613 }
614 location = hash_table[hash_result];
615 hash_table[r_list[location].hash_entry] = -1;
616 time_loc = r_list[location].time_pos;
617 time_info[time_loc].time = 0;
618 time_info[time_loc].position = location;
619 while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
620 time_temp = time_info[time_loc].time;
621 time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
622 time_info[(time_loc+1)%ip_list_tot].time = time_temp;
623 time_temp = time_info[time_loc].position;
624 time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
625 time_info[(time_loc+1)%ip_list_tot].position = time_temp;
626 r_list[time_info[time_loc].position].time_pos = time_loc;
627 r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
628 time_loc = (time_loc+1) % ip_list_tot;
629 }
630 r_list[location].time_pos = time_loc;
631 r_list[location].last_seen = 0;
632 r_list[location].addr = 0;
633 r_list[location].ttl = 0;
634 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
635 r_list[location].oldest_pkt = 0;
636 ans = !info->invert;
637 }
638 spin_unlock_bh(&curr_table->list_lock);
639 return ans;
640 }
641
642 spin_unlock_bh(&curr_table->list_lock);
643#ifdef DEBUG
644 if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n");
645#endif
646 return ans;
647}
648
649/* This function is to verify that the rule given during the userspace iptables
650 * command is correct.
651 * If the command is valid then we check if the table name referred to by the
652 * rule exists, if not it is created.
653 */
654static int
655checkentry(const char *tablename,
656 const struct ipt_ip *ip,
657 void *matchinfo,
658 unsigned int matchsize,
659 unsigned int hook_mask)
660{
661 int flag = 0, c;
662 unsigned long *hold;
663 const struct ipt_recent_info *info = matchinfo;
664 struct recent_ip_tables *curr_table, *find_table, *last_table;
665
666#ifdef DEBUG
667 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n");
668#endif
669
670 if (matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return 0;
671
672 /* seconds and hit_count only valid for CHECK/UPDATE */
673 if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; }
674 if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; }
675 if(info->check_set & IPT_RECENT_CHECK) flag++;
676 if(info->check_set & IPT_RECENT_UPDATE) flag++;
677
678 /* One and only one of these should ever be set */
679 if(flag != 1) return 0;
680
681 /* Name must be set to something */
682 if(!info->name || !info->name[0]) return 0;
683
684 /* Things look good, create a list for this if it does not exist */
685 /* Lock the linked list while we play with it */
686 spin_lock_bh(&recent_lock);
687
688 /* Look for an entry with this name already created */
689 /* Finds the end of the list and the entry before the end if current name does not exist */
690 find_table = r_tables;
691 while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
692
693 /* If a table already exists just increment the count on that table and return */
694 if(find_table) {
695#ifdef DEBUG
696 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name);
697#endif
698 find_table->count++;
699 spin_unlock_bh(&recent_lock);
700 return 1;
701 }
702
703 spin_unlock_bh(&recent_lock);
704
705 /* Table with this name not found */
706 /* Allocate memory for new linked list item */
707
708#ifdef DEBUG
709 if(debug) {
710 printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name);
711		printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for linked-list entry.\n",sizeof(struct recent_ip_tables));
712 }
713#endif
714
715 curr_table = vmalloc(sizeof(struct recent_ip_tables));
716 if(curr_table == NULL) return 0;
717
718 spin_lock_init(&curr_table->list_lock);
719 curr_table->next = NULL;
720 curr_table->count = 1;
721 curr_table->time_pos = 0;
722 strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN);
723 curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0';
724
725 /* Allocate memory for this table and the list of packets in each entry. */
726#ifdef DEBUG
727 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n",
728 sizeof(struct recent_ip_list)*ip_list_tot,
729 info->name);
730#endif
731
732 curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot);
733 if(curr_table->table == NULL) { vfree(curr_table); return 0; }
734 memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
735#ifdef DEBUG
736 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
737 sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot);
738#endif
739
740 hold = vmalloc(sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot);
741#ifdef DEBUG
742 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
743#endif
744 if(hold == NULL) {
745 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n");
746 vfree(curr_table->table);
747 vfree(curr_table);
748 return 0;
749 }
750 for(c = 0; c < ip_list_tot; c++) {
751 curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot;
752 }
753
754 /* Allocate memory for the hash table */
755#ifdef DEBUG
756 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n",
757 sizeof(int)*ip_list_hash_size);
758#endif
759
760 curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size);
761 if(!curr_table->hash_table) {
762 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n");
763 vfree(hold);
764 vfree(curr_table->table);
765 vfree(curr_table);
766 return 0;
767 }
768
769 for(c = 0; c < ip_list_hash_size; c++) {
770 curr_table->hash_table[c] = -1;
771 }
772
773 /* Allocate memory for the time info */
774#ifdef DEBUG
775 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n",
776 sizeof(struct time_info_list)*ip_list_tot);
777#endif
778
779 curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot);
780 if(!curr_table->time_info) {
781 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n");
782 vfree(curr_table->hash_table);
783 vfree(hold);
784 vfree(curr_table->table);
785 vfree(curr_table);
786 return 0;
787 }
788 for(c = 0; c < ip_list_tot; c++) {
789 curr_table->time_info[c].position = c;
790 curr_table->time_info[c].time = 0;
791 }
792
793 /* Put the new table in place */
794 spin_lock_bh(&recent_lock);
795 find_table = r_tables;
796 while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
797
798 /* If a table already exists just increment the count on that table and return */
799 if(find_table) {
800 find_table->count++;
801 spin_unlock_bh(&recent_lock);
802#ifdef DEBUG
803 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name);
804#endif
805 vfree(curr_table->time_info);
806 vfree(curr_table->hash_table);
807 vfree(hold);
808 vfree(curr_table->table);
809 vfree(curr_table);
810 return 1;
811 }
812 if(!last_table) r_tables = curr_table; else last_table->next = curr_table;
813
814 spin_unlock_bh(&recent_lock);
815
816#ifdef CONFIG_PROC_FS
817 /* Create our proc 'status' entry. */
818 curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent);
819 if (!curr_table->status_proc) {
820 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n");
821 /* Destroy the created table */
822 spin_lock_bh(&recent_lock);
823 last_table = NULL;
824 curr_table = r_tables;
825 if(!curr_table) {
826#ifdef DEBUG
827 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n");
828#endif
829 spin_unlock_bh(&recent_lock);
830 return 0;
831 }
832 while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
833 if(!curr_table) {
834#ifdef DEBUG
835 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n");
836#endif
837 spin_unlock_bh(&recent_lock);
838 return 0;
839 }
840 if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
841 spin_unlock_bh(&recent_lock);
842 vfree(curr_table->time_info);
843 vfree(curr_table->hash_table);
844 vfree(hold);
845 vfree(curr_table->table);
846 vfree(curr_table);
847 return 0;
848 }
849
850 curr_table->status_proc->owner = THIS_MODULE;
851 curr_table->status_proc->data = curr_table;
852 wmb();
853 curr_table->status_proc->read_proc = ip_recent_get_info;
854 curr_table->status_proc->write_proc = ip_recent_ctrl;
855#endif /* CONFIG_PROC_FS */
856
857#ifdef DEBUG
858 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n");
859#endif
860
861 return 1;
862}
863
864/* This function is called in the event that a rule matching this module is
865 * removed.
866 * When this happens we need to check if there are no other rules matching
867 * the table given. If that is the case then we remove the table and clean
868 * up its memory.
869 */
870static void
871destroy(void *matchinfo, unsigned int matchsize)
872{
873 const struct ipt_recent_info *info = matchinfo;
874 struct recent_ip_tables *curr_table, *last_table;
875
876#ifdef DEBUG
877 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n");
878#endif
879
880 if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return;
881
882 /* Lock the linked list while we play with it */
883 spin_lock_bh(&recent_lock);
884
885 /* Look for an entry with this name already created */
886 /* Finds the end of the list and the entry before the end if current name does not exist */
887 last_table = NULL;
888 curr_table = r_tables;
889 if(!curr_table) {
890#ifdef DEBUG
891 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n");
892#endif
893 spin_unlock_bh(&recent_lock);
894 return;
895 }
896 while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
897
898 /* If a table does not exist then do nothing and return */
899 if(!curr_table) {
900#ifdef DEBUG
901 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n");
902#endif
903 spin_unlock_bh(&recent_lock);
904 return;
905 }
906
907 curr_table->count--;
908
909	/* If count is still non-zero then there are still rules referencing it so we do nothing */
910 if(curr_table->count) {
911#ifdef DEBUG
912 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n");
913#endif
914 spin_unlock_bh(&recent_lock);
915 return;
916 }
917
918#ifdef DEBUG
919 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n");
920#endif
921
922 /* Count must be zero so we remove this table from the list */
923 if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
924
925 spin_unlock_bh(&recent_lock);
926
927	/* Take and release the lock so any late-runners still using this table after
928	 * we removed it from the list have finished, then free everything. */
929 spin_lock_bh(&curr_table->list_lock);
930 spin_unlock_bh(&curr_table->list_lock);
931
932#ifdef CONFIG_PROC_FS
933 if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent);
934#endif /* CONFIG_PROC_FS */
935 vfree(curr_table->table[0].last_pkts);
936 vfree(curr_table->table);
937 vfree(curr_table->hash_table);
938 vfree(curr_table->time_info);
939 vfree(curr_table);
940
941#ifdef DEBUG
942 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n");
943#endif
944
945 return;
946}
947
948/* This is the structure we pass to ipt_register to register our
949 * module with iptables.
950 */
951static struct ipt_match recent_match = {
952 .name = "recent",
953 .match = &match,
954 .checkentry = &checkentry,
955 .destroy = &destroy,
956 .me = THIS_MODULE
957};
958
959/* Kernel module initialization. */
960static int __init init(void)
961{
962 int err, count;
963
964 printk(version);
965#ifdef CONFIG_PROC_FS
966 proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net);
967 if(!proc_net_ipt_recent) return -ENOMEM;
968#endif
969
970 if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) {
971 printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n");
972 ip_list_hash_size = 0;
973 }
974
975 if(!ip_list_hash_size) {
976 ip_list_hash_size = ip_list_tot*3;
977 count = 2*2;
978 while(ip_list_hash_size > count) count = count*2;
979 ip_list_hash_size = count;
980 }
981
982#ifdef DEBUG
983 if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size);
984#endif
985
986 err = ipt_register_match(&recent_match);
987 if (err)
988 remove_proc_entry("ipt_recent", proc_net);
989 return err;
990}
991
992/* Kernel module destruction. */
993static void __exit fini(void)
994{
995 ipt_unregister_match(&recent_match);
996
997 remove_proc_entry("ipt_recent",proc_net);
998}
999
1000/* Register our module with the kernel. */
1001module_init(init);
1002module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_sctp.c b/net/ipv4/netfilter/ipt_sctp.c
new file mode 100644
index 000000000000..fe2b327bcaa4
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_sctp.c
@@ -0,0 +1,203 @@
1#include <linux/module.h>
2#include <linux/skbuff.h>
3#include <net/ip.h>
4#include <linux/sctp.h>
5
6#include <linux/netfilter_ipv4/ip_tables.h>
7#include <linux/netfilter_ipv4/ipt_sctp.h>
8
9#ifdef DEBUG_SCTP
10#define duprintf(format, args...) printk(format , ## args)
11#else
12#define duprintf(format, args...)
13#endif
14
15#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
16 || (!!((invflag) & (option)) ^ (cond)))
17
18static int
19match_flags(const struct ipt_sctp_flag_info *flag_info,
20 const int flag_count,
21 u_int8_t chunktype,
22 u_int8_t chunkflags)
23{
24 int i;
25
26 for (i = 0; i < flag_count; i++) {
27 if (flag_info[i].chunktype == chunktype) {
28 return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag;
29 }
30 }
31
32 return 1;
33}
34
35static int
36match_packet(const struct sk_buff *skb,
37 const u_int32_t *chunkmap,
38 int chunk_match_type,
39 const struct ipt_sctp_flag_info *flag_info,
40 const int flag_count,
41 int *hotdrop)
42{
43 int offset;
44 u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)];
45 sctp_chunkhdr_t _sch, *sch;
46
47#ifdef DEBUG_SCTP
48 int i = 0;
49#endif
50
51 if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) {
52 SCTP_CHUNKMAP_COPY(chunkmapcopy, chunkmap);
53 }
54
55 offset = skb->nh.iph->ihl * 4 + sizeof (sctp_sctphdr_t);
56 do {
57 sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch);
58 if (sch == NULL) {
59 duprintf("Dropping invalid SCTP packet.\n");
60 *hotdrop = 1;
61 return 0;
62 }
63
64 duprintf("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d\tflags: %x\n",
65 ++i, offset, sch->type, htons(sch->length), sch->flags);
66
67 offset += (htons(sch->length) + 3) & ~3;
68
69 duprintf("skb->len: %d\toffset: %d\n", skb->len, offset);
70
71 if (SCTP_CHUNKMAP_IS_SET(chunkmap, sch->type)) {
72 switch (chunk_match_type) {
73 case SCTP_CHUNK_MATCH_ANY:
74 if (match_flags(flag_info, flag_count,
75 sch->type, sch->flags)) {
76 return 1;
77 }
78 break;
79
80 case SCTP_CHUNK_MATCH_ALL:
81 if (match_flags(flag_info, flag_count,
82 sch->type, sch->flags)) {
83 SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type);
84 }
85 break;
86
87 case SCTP_CHUNK_MATCH_ONLY:
88 if (!match_flags(flag_info, flag_count,
89 sch->type, sch->flags)) {
90 return 0;
91 }
92 break;
93 }
94 } else {
95 switch (chunk_match_type) {
96 case SCTP_CHUNK_MATCH_ONLY:
97 return 0;
98 }
99 }
100 } while (offset < skb->len);
101
102 switch (chunk_match_type) {
103 case SCTP_CHUNK_MATCH_ALL:
104		return SCTP_CHUNKMAP_IS_CLEAR(chunkmapcopy);
105 case SCTP_CHUNK_MATCH_ANY:
106 return 0;
107 case SCTP_CHUNK_MATCH_ONLY:
108 return 1;
109 }
110
111 /* This will never be reached, but required to stop compiler whine */
112 return 0;
113}
114
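/* Illustrative usage, not part of the original file: the chunk match types
 * above correspond to the all/any/only keywords in userspace, e.g.
 *   iptables -A INPUT -p sctp -m sctp --dport 2905 -j ACCEPT
 *   iptables -A INPUT -p sctp -m sctp --chunk-types any INIT,ABORT -j DROP
 */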
115static int
116match(const struct sk_buff *skb,
117 const struct net_device *in,
118 const struct net_device *out,
119 const void *matchinfo,
120 int offset,
121 int *hotdrop)
122{
123 const struct ipt_sctp_info *info;
124 sctp_sctphdr_t _sh, *sh;
125
126 info = (const struct ipt_sctp_info *)matchinfo;
127
128 if (offset) {
129 duprintf("Dropping non-first fragment.. FIXME\n");
130 return 0;
131 }
132
133 sh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_sh), &_sh);
134 if (sh == NULL) {
135		duprintf("Dropping evil SCTP offset=0 tinygram.\n");
136 *hotdrop = 1;
137 return 0;
138 }
139 duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest));
140
141 return SCCHECK(((ntohs(sh->source) >= info->spts[0])
142 && (ntohs(sh->source) <= info->spts[1])),
143 IPT_SCTP_SRC_PORTS, info->flags, info->invflags)
144 && SCCHECK(((ntohs(sh->dest) >= info->dpts[0])
145 && (ntohs(sh->dest) <= info->dpts[1])),
146 IPT_SCTP_DEST_PORTS, info->flags, info->invflags)
147 && SCCHECK(match_packet(skb, info->chunkmap, info->chunk_match_type,
148 info->flag_info, info->flag_count,
149 hotdrop),
150 IPT_SCTP_CHUNK_TYPES, info->flags, info->invflags);
151}
152
153static int
154checkentry(const char *tablename,
155 const struct ipt_ip *ip,
156 void *matchinfo,
157 unsigned int matchsize,
158 unsigned int hook_mask)
159{
160 const struct ipt_sctp_info *info;
161
162 info = (const struct ipt_sctp_info *)matchinfo;
163
164 return ip->proto == IPPROTO_SCTP
165 && !(ip->invflags & IPT_INV_PROTO)
166 && matchsize == IPT_ALIGN(sizeof(struct ipt_sctp_info))
167 && !(info->flags & ~IPT_SCTP_VALID_FLAGS)
168 && !(info->invflags & ~IPT_SCTP_VALID_FLAGS)
169 && !(info->invflags & ~info->flags)
170 && ((!(info->flags & IPT_SCTP_CHUNK_TYPES)) ||
171 (info->chunk_match_type &
172 (SCTP_CHUNK_MATCH_ALL
173 | SCTP_CHUNK_MATCH_ANY
174 | SCTP_CHUNK_MATCH_ONLY)));
175}
176
177static struct ipt_match sctp_match =
178{
179 .list = { NULL, NULL},
180 .name = "sctp",
181 .match = &match,
182 .checkentry = &checkentry,
183 .destroy = NULL,
184 .me = THIS_MODULE
185};
186
187static int __init init(void)
188{
189 return ipt_register_match(&sctp_match);
190}
191
192static void __exit fini(void)
193{
194 ipt_unregister_match(&sctp_match);
195}
196
197module_init(init);
198module_exit(fini);
199
200MODULE_LICENSE("GPL");
201MODULE_AUTHOR("Kiran Kumar Immidi");
202MODULE_DESCRIPTION("Match for SCTP protocol packets");
203
diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c
new file mode 100644
index 000000000000..b1511b97ea5f
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_state.c
@@ -0,0 +1,74 @@
1/* Kernel module to match connection tracking information. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/netfilter_ipv4/ip_conntrack.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15#include <linux/netfilter_ipv4/ipt_state.h>
16
17MODULE_LICENSE("GPL");
18MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
19MODULE_DESCRIPTION("iptables connection tracking state match module");
20
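/* Illustrative usage, not part of the original file:
 *   iptables -A INPUT -m state --state ESTABLISHED,RELATED -j ACCEPT
 *   iptables -A INPUT -m state --state NEW -p tcp --dport 22 -j ACCEPT
 * Userspace translates the state names into the IPT_STATE_* bits tested below.
 */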
21static int
22match(const struct sk_buff *skb,
23 const struct net_device *in,
24 const struct net_device *out,
25 const void *matchinfo,
26 int offset,
27 int *hotdrop)
28{
29 const struct ipt_state_info *sinfo = matchinfo;
30 enum ip_conntrack_info ctinfo;
31 unsigned int statebit;
32
33 if (skb->nfct == &ip_conntrack_untracked.ct_general)
34 statebit = IPT_STATE_UNTRACKED;
35 else if (!ip_conntrack_get(skb, &ctinfo))
36 statebit = IPT_STATE_INVALID;
37 else
38 statebit = IPT_STATE_BIT(ctinfo);
39
40 return (sinfo->statemask & statebit);
41}
42
43static int check(const char *tablename,
44 const struct ipt_ip *ip,
45 void *matchinfo,
46 unsigned int matchsize,
47 unsigned int hook_mask)
48{
49 if (matchsize != IPT_ALIGN(sizeof(struct ipt_state_info)))
50 return 0;
51
52 return 1;
53}
54
55static struct ipt_match state_match = {
56 .name = "state",
57 .match = &match,
58 .checkentry = &check,
59 .me = THIS_MODULE,
60};
61
62static int __init init(void)
63{
64 need_ip_conntrack();
65 return ipt_register_match(&state_match);
66}
67
68static void __exit fini(void)
69{
70 ipt_unregister_match(&state_match);
71}
72
73module_init(init);
74module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_tcpmss.c b/net/ipv4/netfilter/ipt_tcpmss.c
new file mode 100644
index 000000000000..4dc9b16ab4a3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_tcpmss.c
@@ -0,0 +1,127 @@
1/* Kernel module to match TCP MSS values. */
2
3/* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <net/tcp.h>
13
14#include <linux/netfilter_ipv4/ipt_tcpmss.h>
15#include <linux/netfilter_ipv4/ip_tables.h>
16
17#define TH_SYN 0x02
18
19MODULE_LICENSE("GPL");
20MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
21MODULE_DESCRIPTION("iptables TCP MSS match module");
22
23/* Returns 1 if the mss option is set and matched by the range, 0 otherwise */
24static inline int
25mssoption_match(u_int16_t min, u_int16_t max,
26 const struct sk_buff *skb,
27 int invert,
28 int *hotdrop)
29{
30 struct tcphdr _tcph, *th;
31	/* tcp.doff is only 4 bits, i.e. max 15 * 4 bytes */
32 u8 _opt[15 * 4 - sizeof(_tcph)], *op;
33 unsigned int i, optlen;
34
35 /* If we don't have the whole header, drop packet. */
36 th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
37 sizeof(_tcph), &_tcph);
38 if (th == NULL)
39 goto dropit;
40
41 /* Malformed. */
42 if (th->doff*4 < sizeof(*th))
43 goto dropit;
44
45 optlen = th->doff*4 - sizeof(*th);
46 if (!optlen)
47 goto out;
48
49 /* Truncated options. */
50 op = skb_header_pointer(skb, skb->nh.iph->ihl * 4 + sizeof(*th),
51 optlen, _opt);
52 if (op == NULL)
53 goto dropit;
54
55 for (i = 0; i < optlen; ) {
56 if (op[i] == TCPOPT_MSS
57 && (optlen - i) >= TCPOLEN_MSS
58 && op[i+1] == TCPOLEN_MSS) {
59 u_int16_t mssval;
60
61 mssval = (op[i+2] << 8) | op[i+3];
62
63 return (mssval >= min && mssval <= max) ^ invert;
64 }
65 if (op[i] < 2) i++;
66 else i += op[i+1]?:1;
67 }
68out:
69 return invert;
70
71 dropit:
72 *hotdrop = 1;
73 return 0;
74}
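/*
 * Editor's note, not part of the original patch: a self-contained sketch of
 * the TCP option walk performed by mssoption_match() above, operating on a
 * plain byte buffer instead of an skb.  Returns the MSS value, or -1 if no
 * MSS option is present.  Option kinds 0 (end of list) and 1 (NOP) are one
 * byte; every other option carries a length byte, and MSS (kind 2) is four
 * bytes total with a big-endian 16-bit value.
 */
#include <stdio.h>

static int tcp_opt_mss(const unsigned char *opt, unsigned int optlen)
{
	unsigned int i = 0;

	while (i < optlen) {
		if (opt[i] == 0)			/* end of option list */
			break;
		if (opt[i] == 1) {			/* NOP: single byte */
			i++;
			continue;
		}
		if (optlen - i < 2 || opt[i + 1] < 2)	/* malformed length */
			break;
		if (opt[i] == 2 && opt[i + 1] == 4 && optlen - i >= 4)
			return (opt[i + 2] << 8) | opt[i + 3];
		i += opt[i + 1];
	}
	return -1;
}

int main(void)
{
	/* NOP, NOP, MSS = 1460 */
	const unsigned char opts[] = { 1, 1, 2, 4, 0x05, 0xb4 };

	printf("mss = %d\n", tcp_opt_mss(opts, sizeof(opts)));
	return 0;
}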
75
76static int
77match(const struct sk_buff *skb,
78 const struct net_device *in,
79 const struct net_device *out,
80 const void *matchinfo,
81 int offset,
82 int *hotdrop)
83{
84 const struct ipt_tcpmss_match_info *info = matchinfo;
85
86 return mssoption_match(info->mss_min, info->mss_max, skb,
87 info->invert, hotdrop);
88}
89
90static int
91checkentry(const char *tablename,
92 const struct ipt_ip *ip,
93 void *matchinfo,
94 unsigned int matchsize,
95 unsigned int hook_mask)
96{
97 if (matchsize != IPT_ALIGN(sizeof(struct ipt_tcpmss_match_info)))
98 return 0;
99
100 /* Must specify -p tcp */
101 if (ip->proto != IPPROTO_TCP || (ip->invflags & IPT_INV_PROTO)) {
102 printk("tcpmss: Only works on TCP packets\n");
103 return 0;
104 }
105
106 return 1;
107}
108
109static struct ipt_match tcpmss_match = {
110 .name = "tcpmss",
111 .match = &match,
112 .checkentry = &checkentry,
113 .me = THIS_MODULE,
114};
115
116static int __init init(void)
117{
118 return ipt_register_match(&tcpmss_match);
119}
120
121static void __exit fini(void)
122{
123 ipt_unregister_match(&tcpmss_match);
124}
125
126module_init(init);
127module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c
new file mode 100644
index 000000000000..086a1bb61e3e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_tos.c
@@ -0,0 +1,64 @@
1/* Kernel module to match TOS values. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13
14#include <linux/netfilter_ipv4/ipt_tos.h>
15#include <linux/netfilter_ipv4/ip_tables.h>
16
17MODULE_LICENSE("GPL");
18MODULE_DESCRIPTION("iptables TOS match module");
19
20static int
21match(const struct sk_buff *skb,
22 const struct net_device *in,
23 const struct net_device *out,
24 const void *matchinfo,
25 int offset,
26 int *hotdrop)
27{
28 const struct ipt_tos_info *info = matchinfo;
29
30 return (skb->nh.iph->tos == info->tos) ^ info->invert;
31}
32
33static int
34checkentry(const char *tablename,
35 const struct ipt_ip *ip,
36 void *matchinfo,
37 unsigned int matchsize,
38 unsigned int hook_mask)
39{
40 if (matchsize != IPT_ALIGN(sizeof(struct ipt_tos_info)))
41 return 0;
42
43 return 1;
44}
45
46static struct ipt_match tos_match = {
47 .name = "tos",
48 .match = &match,
49 .checkentry = &checkentry,
50 .me = THIS_MODULE,
51};
52
53static int __init init(void)
54{
55 return ipt_register_match(&tos_match);
56}
57
58static void __exit fini(void)
59{
60 ipt_unregister_match(&tos_match);
61}
62
63module_init(init);
64module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
new file mode 100644
index 000000000000..219aa9de88cc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ttl.c
@@ -0,0 +1,79 @@
1/* IP tables module for matching the value of the TTL
2 *
3 * ipt_ttl.c,v 1.5 2000/11/13 11:16:08 laforge Exp
4 *
5 * (C) 2000,2001 by Harald Welte <laforge@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/module.h>
13#include <linux/skbuff.h>
14
15#include <linux/netfilter_ipv4/ipt_ttl.h>
16#include <linux/netfilter_ipv4/ip_tables.h>
17
18MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
19MODULE_DESCRIPTION("IP tables TTL matching module");
20MODULE_LICENSE("GPL");
21
22static int match(const struct sk_buff *skb, const struct net_device *in,
23 const struct net_device *out, const void *matchinfo,
24 int offset, int *hotdrop)
25{
26 const struct ipt_ttl_info *info = matchinfo;
27
28 switch (info->mode) {
29 case IPT_TTL_EQ:
30 return (skb->nh.iph->ttl == info->ttl);
31 break;
32 case IPT_TTL_NE:
33 return (!(skb->nh.iph->ttl == info->ttl));
34 break;
35 case IPT_TTL_LT:
36 return (skb->nh.iph->ttl < info->ttl);
37 break;
38 case IPT_TTL_GT:
39 return (skb->nh.iph->ttl > info->ttl);
40 break;
41 default:
42 printk(KERN_WARNING "ipt_ttl: unknown mode %d\n",
43 info->mode);
44 return 0;
45 }
46
47 return 0;
48}
49
50static int checkentry(const char *tablename, const struct ipt_ip *ip,
51 void *matchinfo, unsigned int matchsize,
52 unsigned int hook_mask)
53{
54 if (matchsize != IPT_ALIGN(sizeof(struct ipt_ttl_info)))
55 return 0;
56
57 return 1;
58}
59
60static struct ipt_match ttl_match = {
61 .name = "ttl",
62 .match = &match,
63 .checkentry = &checkentry,
64 .me = THIS_MODULE,
65};
66
67static int __init init(void)
68{
69 return ipt_register_match(&ttl_match);
70}
71
72static void __exit fini(void)
73{
74 ipt_unregister_match(&ttl_match);
75
76}
77
78module_init(init);
79module_exit(fini);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
new file mode 100644
index 000000000000..260a4f0a2a90
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -0,0 +1,194 @@
1/*
2 * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
3 *
4 * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
5 * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/moduleparam.h>
15#include <linux/netfilter_ipv4/ip_tables.h>
16
17MODULE_LICENSE("GPL");
18MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
19MODULE_DESCRIPTION("iptables filter table");
20
21#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT))
22
23static struct
24{
25 struct ipt_replace repl;
26 struct ipt_standard entries[3];
27 struct ipt_error term;
28} initial_table __initdata
29= { { "filter", FILTER_VALID_HOOKS, 4,
30 sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
31 { [NF_IP_LOCAL_IN] = 0,
32 [NF_IP_FORWARD] = sizeof(struct ipt_standard),
33 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
34 { [NF_IP_LOCAL_IN] = 0,
35 [NF_IP_FORWARD] = sizeof(struct ipt_standard),
36 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
37 0, NULL, { } },
38 {
39 /* LOCAL_IN */
40 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
41 0,
42 sizeof(struct ipt_entry),
43 sizeof(struct ipt_standard),
44 0, { 0, 0 }, { } },
45 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
46 -NF_ACCEPT - 1 } },
47 /* FORWARD */
48 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
49 0,
50 sizeof(struct ipt_entry),
51 sizeof(struct ipt_standard),
52 0, { 0, 0 }, { } },
53 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
54 -NF_ACCEPT - 1 } },
55 /* LOCAL_OUT */
56 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
57 0,
58 sizeof(struct ipt_entry),
59 sizeof(struct ipt_standard),
60 0, { 0, 0 }, { } },
61 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
62 -NF_ACCEPT - 1 } }
63 },
64 /* ERROR */
65 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
66 0,
67 sizeof(struct ipt_entry),
68 sizeof(struct ipt_error),
69 0, { 0, 0 }, { } },
70 { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
71 { } },
72 "ERROR"
73 }
74 }
75};
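/*
 * Editor's note, not part of the original patch: the initializer above lays
 * out one struct ipt_standard policy entry per valid hook followed by a
 * terminating struct ipt_error, and hook_entry[]/underflow[] are byte
 * offsets into that flat blob.  A tiny sketch of the offset arithmetic; the
 * entry size below is a hypothetical stand-in, not the real
 * sizeof(struct ipt_standard).
 */
#include <stdio.h>

int main(void)
{
	const unsigned int entry_size = 152;	/* stand-in entry size */
	const char *hooks[] = { "LOCAL_IN", "FORWARD", "LOCAL_OUT" };
	unsigned int i;

	for (i = 0; i < 3; i++)
		printf("%-10s entry at offset %u\n", hooks[i], i * entry_size);
	return 0;
}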
76
77static struct ipt_table packet_filter = {
78 .name = "filter",
79 .valid_hooks = FILTER_VALID_HOOKS,
80 .lock = RW_LOCK_UNLOCKED,
81 .me = THIS_MODULE
82};
83
84/* The work comes in here from netfilter.c. */
85static unsigned int
86ipt_hook(unsigned int hook,
87 struct sk_buff **pskb,
88 const struct net_device *in,
89 const struct net_device *out,
90 int (*okfn)(struct sk_buff *))
91{
92 return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
93}
94
95static unsigned int
96ipt_local_out_hook(unsigned int hook,
97 struct sk_buff **pskb,
98 const struct net_device *in,
99 const struct net_device *out,
100 int (*okfn)(struct sk_buff *))
101{
102 /* root is playing with raw sockets. */
103 if ((*pskb)->len < sizeof(struct iphdr)
104 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
105 if (net_ratelimit())
106 printk("ipt_hook: happy cracking.\n");
107 return NF_ACCEPT;
108 }
109
110 return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
111}
112
113static struct nf_hook_ops ipt_ops[] = {
114 {
115 .hook = ipt_hook,
116 .owner = THIS_MODULE,
117 .pf = PF_INET,
118 .hooknum = NF_IP_LOCAL_IN,
119 .priority = NF_IP_PRI_FILTER,
120 },
121 {
122 .hook = ipt_hook,
123 .owner = THIS_MODULE,
124 .pf = PF_INET,
125 .hooknum = NF_IP_FORWARD,
126 .priority = NF_IP_PRI_FILTER,
127 },
128 {
129 .hook = ipt_local_out_hook,
130 .owner = THIS_MODULE,
131 .pf = PF_INET,
132 .hooknum = NF_IP_LOCAL_OUT,
133 .priority = NF_IP_PRI_FILTER,
134 },
135};
136
137/* Default to forward because I got too much mail already. */
138static int forward = NF_ACCEPT;
139module_param(forward, bool, 0000);
140
141static int __init init(void)
142{
143 int ret;
144
145 if (forward < 0 || forward > NF_MAX_VERDICT) {
146 printk("iptables forward must be 0 or 1\n");
147 return -EINVAL;
148 }
149
150 /* Entry 1 is the FORWARD hook */
151 initial_table.entries[1].target.verdict = -forward - 1;
152
153 /* Register table */
154 ret = ipt_register_table(&packet_filter, &initial_table.repl);
155 if (ret < 0)
156 return ret;
157
158 /* Register hooks */
159 ret = nf_register_hook(&ipt_ops[0]);
160 if (ret < 0)
161 goto cleanup_table;
162
163 ret = nf_register_hook(&ipt_ops[1]);
164 if (ret < 0)
165 goto cleanup_hook0;
166
167 ret = nf_register_hook(&ipt_ops[2]);
168 if (ret < 0)
169 goto cleanup_hook1;
170
171 return ret;
172
173 cleanup_hook1:
174 nf_unregister_hook(&ipt_ops[1]);
175 cleanup_hook0:
176 nf_unregister_hook(&ipt_ops[0]);
177 cleanup_table:
178 ipt_unregister_table(&packet_filter);
179
180 return ret;
181}
182
183static void __exit fini(void)
184{
185 unsigned int i;
186
187 for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
188 nf_unregister_hook(&ipt_ops[i]);
189
190 ipt_unregister_table(&packet_filter);
191}
192
193module_init(init);
194module_exit(fini);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
new file mode 100644
index 000000000000..160eb11b6e2f
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -0,0 +1,260 @@
1/*
2 * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
3 *
4 * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
5 * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Extended to all five netfilter hooks by Brad Chapman & Harald Welte
12 */
13#include <linux/config.h>
14#include <linux/module.h>
15#include <linux/netfilter_ipv4/ip_tables.h>
16#include <linux/netdevice.h>
17#include <linux/skbuff.h>
18#include <net/sock.h>
19#include <net/route.h>
20#include <linux/ip.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
24MODULE_DESCRIPTION("iptables mangle table");
25
26#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | \
27 (1 << NF_IP_LOCAL_IN) | \
28 (1 << NF_IP_FORWARD) | \
29 (1 << NF_IP_LOCAL_OUT) | \
30 (1 << NF_IP_POST_ROUTING))
31
32/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */
33static struct
34{
35 struct ipt_replace repl;
36 struct ipt_standard entries[5];
37 struct ipt_error term;
38} initial_table __initdata
39= { { "mangle", MANGLE_VALID_HOOKS, 6,
40 sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error),
41 { [NF_IP_PRE_ROUTING] = 0,
42 [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard),
43 [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2,
44 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
45 [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 },
46 { [NF_IP_PRE_ROUTING] = 0,
47 [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard),
48 [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2,
49 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
50 [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 },
51 0, NULL, { } },
52 {
53 /* PRE_ROUTING */
54 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
55 0,
56 sizeof(struct ipt_entry),
57 sizeof(struct ipt_standard),
58 0, { 0, 0 }, { } },
59 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
60 -NF_ACCEPT - 1 } },
61 /* LOCAL_IN */
62 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
63 0,
64 sizeof(struct ipt_entry),
65 sizeof(struct ipt_standard),
66 0, { 0, 0 }, { } },
67 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
68 -NF_ACCEPT - 1 } },
69 /* FORWARD */
70 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
71 0,
72 sizeof(struct ipt_entry),
73 sizeof(struct ipt_standard),
74 0, { 0, 0 }, { } },
75 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
76 -NF_ACCEPT - 1 } },
77 /* LOCAL_OUT */
78 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
79 0,
80 sizeof(struct ipt_entry),
81 sizeof(struct ipt_standard),
82 0, { 0, 0 }, { } },
83 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
84 -NF_ACCEPT - 1 } },
85 /* POST_ROUTING */
86 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
87 0,
88 sizeof(struct ipt_entry),
89 sizeof(struct ipt_standard),
90 0, { 0, 0 }, { } },
91 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
92 -NF_ACCEPT - 1 } },
93 },
94 /* ERROR */
95 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
96 0,
97 sizeof(struct ipt_entry),
98 sizeof(struct ipt_error),
99 0, { 0, 0 }, { } },
100 { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
101 { } },
102 "ERROR"
103 }
104 }
105};
106
107static struct ipt_table packet_mangler = {
108 .name = "mangle",
109 .valid_hooks = MANGLE_VALID_HOOKS,
110 .lock = RW_LOCK_UNLOCKED,
111 .me = THIS_MODULE,
112};
113
114/* The work comes in here from netfilter.c. */
115static unsigned int
116ipt_route_hook(unsigned int hook,
117 struct sk_buff **pskb,
118 const struct net_device *in,
119 const struct net_device *out,
120 int (*okfn)(struct sk_buff *))
121{
122 return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
123}
124
125static unsigned int
126ipt_local_hook(unsigned int hook,
127 struct sk_buff **pskb,
128 const struct net_device *in,
129 const struct net_device *out,
130 int (*okfn)(struct sk_buff *))
131{
132 unsigned int ret;
133 u_int8_t tos;
134 u_int32_t saddr, daddr;
135 unsigned long nfmark;
136
137 /* root is playing with raw sockets. */
138 if ((*pskb)->len < sizeof(struct iphdr)
139 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
140 if (net_ratelimit())
141 printk("ipt_hook: happy cracking.\n");
142 return NF_ACCEPT;
143 }
144
145 /* Save things which could affect route */
146 nfmark = (*pskb)->nfmark;
147 saddr = (*pskb)->nh.iph->saddr;
148 daddr = (*pskb)->nh.iph->daddr;
149 tos = (*pskb)->nh.iph->tos;
150
151 ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
152 /* Reroute for ANY change. */
153 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE
154 && ((*pskb)->nh.iph->saddr != saddr
155 || (*pskb)->nh.iph->daddr != daddr
156#ifdef CONFIG_IP_ROUTE_FWMARK
157 || (*pskb)->nfmark != nfmark
158#endif
159 || (*pskb)->nh.iph->tos != tos))
160 return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP;
161
162 return ret;
163}
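/*
 * Editor's note, not part of the original patch: ipt_local_hook() above
 * snapshots the fields that can influence routing (source/destination
 * address, TOS and, when fwmark routing is configured, the nfmark) and
 * re-routes the packet if the mangle table changed any of them.  The same
 * decision, isolated as a small pure function over hypothetical before/after
 * snapshots:
 */
#include <stdio.h>

struct route_keys {
	unsigned int saddr, daddr;
	unsigned char tos;
	unsigned long nfmark;
};

static int needs_reroute(const struct route_keys *before,
			 const struct route_keys *after,
			 int fwmark_routing)
{
	return before->saddr != after->saddr
	    || before->daddr != after->daddr
	    || before->tos != after->tos
	    || (fwmark_routing && before->nfmark != after->nfmark);
}

int main(void)
{
	struct route_keys a = { 0x0a000001, 0x0a000002, 0x10, 0 };
	struct route_keys b = a;

	b.tos = 0x00;	/* the mangle table rewrote the TOS field */
	printf("reroute: %d\n", needs_reroute(&a, &b, 0));	/* 1 */
	return 0;
}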
164
165static struct nf_hook_ops ipt_ops[] = {
166 {
167 .hook = ipt_route_hook,
168 .owner = THIS_MODULE,
169 .pf = PF_INET,
170 .hooknum = NF_IP_PRE_ROUTING,
171 .priority = NF_IP_PRI_MANGLE,
172 },
173 {
174 .hook = ipt_route_hook,
175 .owner = THIS_MODULE,
176 .pf = PF_INET,
177 .hooknum = NF_IP_LOCAL_IN,
178 .priority = NF_IP_PRI_MANGLE,
179 },
180 {
181 .hook = ipt_route_hook,
182 .owner = THIS_MODULE,
183 .pf = PF_INET,
184 .hooknum = NF_IP_FORWARD,
185 .priority = NF_IP_PRI_MANGLE,
186 },
187 {
188 .hook = ipt_local_hook,
189 .owner = THIS_MODULE,
190 .pf = PF_INET,
191 .hooknum = NF_IP_LOCAL_OUT,
192 .priority = NF_IP_PRI_MANGLE,
193 },
194 {
195 .hook = ipt_route_hook,
196 .owner = THIS_MODULE,
197 .pf = PF_INET,
198 .hooknum = NF_IP_POST_ROUTING,
199 .priority = NF_IP_PRI_MANGLE,
200 },
201};
202
203static int __init init(void)
204{
205 int ret;
206
207 /* Register table */
208 ret = ipt_register_table(&packet_mangler, &initial_table.repl);
209 if (ret < 0)
210 return ret;
211
212 /* Register hooks */
213 ret = nf_register_hook(&ipt_ops[0]);
214 if (ret < 0)
215 goto cleanup_table;
216
217 ret = nf_register_hook(&ipt_ops[1]);
218 if (ret < 0)
219 goto cleanup_hook0;
220
221 ret = nf_register_hook(&ipt_ops[2]);
222 if (ret < 0)
223 goto cleanup_hook1;
224
225 ret = nf_register_hook(&ipt_ops[3]);
226 if (ret < 0)
227 goto cleanup_hook2;
228
229 ret = nf_register_hook(&ipt_ops[4]);
230 if (ret < 0)
231 goto cleanup_hook3;
232
233 return ret;
234
235 cleanup_hook3:
236 nf_unregister_hook(&ipt_ops[3]);
237 cleanup_hook2:
238 nf_unregister_hook(&ipt_ops[2]);
239 cleanup_hook1:
240 nf_unregister_hook(&ipt_ops[1]);
241 cleanup_hook0:
242 nf_unregister_hook(&ipt_ops[0]);
243 cleanup_table:
244 ipt_unregister_table(&packet_mangler);
245
246 return ret;
247}
248
249static void __exit fini(void)
250{
251 unsigned int i;
252
253 for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
254 nf_unregister_hook(&ipt_ops[i]);
255
256 ipt_unregister_table(&packet_mangler);
257}
258
259module_init(init);
260module_exit(fini);
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
new file mode 100644
index 000000000000..01b4a3c814d3
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -0,0 +1,156 @@
1/*
2 * 'raw' table, which is the very first table hooked in at PRE_ROUTING and LOCAL_OUT.
3 *
4 * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
5 */
6#include <linux/module.h>
7#include <linux/netfilter_ipv4/ip_tables.h>
8
9#define RAW_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))
10
11static struct
12{
13 struct ipt_replace repl;
14 struct ipt_standard entries[2];
15 struct ipt_error term;
16} initial_table __initdata = {
17 .repl = {
18 .name = "raw",
19 .valid_hooks = RAW_VALID_HOOKS,
20 .num_entries = 3,
21 .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error),
22 .hook_entry = {
23 [NF_IP_PRE_ROUTING] = 0,
24 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) },
25 .underflow = {
26 [NF_IP_PRE_ROUTING] = 0,
27 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) },
28 },
29 .entries = {
30 /* PRE_ROUTING */
31 {
32 .entry = {
33 .target_offset = sizeof(struct ipt_entry),
34 .next_offset = sizeof(struct ipt_standard),
35 },
36 .target = {
37 .target = {
38 .u = {
39 .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
40 },
41 },
42 .verdict = -NF_ACCEPT - 1,
43 },
44 },
45
46 /* LOCAL_OUT */
47 {
48 .entry = {
49 .target_offset = sizeof(struct ipt_entry),
50 .next_offset = sizeof(struct ipt_standard),
51 },
52 .target = {
53 .target = {
54 .u = {
55 .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
56 },
57 },
58 .verdict = -NF_ACCEPT - 1,
59 },
60 },
61 },
62 /* ERROR */
63 .term = {
64 .entry = {
65 .target_offset = sizeof(struct ipt_entry),
66 .next_offset = sizeof(struct ipt_error),
67 },
68 .target = {
69 .target = {
70 .u = {
71 .user = {
72 .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)),
73 .name = IPT_ERROR_TARGET,
74 },
75 },
76 },
77 .errorname = "ERROR",
78 },
79 }
80};
81
82static struct ipt_table packet_raw = {
83 .name = "raw",
84 .valid_hooks = RAW_VALID_HOOKS,
85 .lock = RW_LOCK_UNLOCKED,
86 .me = THIS_MODULE
87};
88
89/* The work comes in here from netfilter.c. */
90static unsigned int
91ipt_hook(unsigned int hook,
92 struct sk_buff **pskb,
93 const struct net_device *in,
94 const struct net_device *out,
95 int (*okfn)(struct sk_buff *))
96{
97 return ipt_do_table(pskb, hook, in, out, &packet_raw, NULL);
98}
99
100/* 'raw' is the very first table. */
101static struct nf_hook_ops ipt_ops[] = {
102 {
103 .hook = ipt_hook,
104 .pf = PF_INET,
105 .hooknum = NF_IP_PRE_ROUTING,
106 .priority = NF_IP_PRI_RAW
107 },
108 {
109 .hook = ipt_hook,
110 .pf = PF_INET,
111 .hooknum = NF_IP_LOCAL_OUT,
112 .priority = NF_IP_PRI_RAW
113 },
114};
115
116static int __init init(void)
117{
118 int ret;
119
120 /* Register table */
121 ret = ipt_register_table(&packet_raw, &initial_table.repl);
122 if (ret < 0)
123 return ret;
124
125 /* Register hooks */
126 ret = nf_register_hook(&ipt_ops[0]);
127 if (ret < 0)
128 goto cleanup_table;
129
130 ret = nf_register_hook(&ipt_ops[1]);
131 if (ret < 0)
132 goto cleanup_hook0;
133
134 return ret;
135
136 cleanup_hook0:
137 nf_unregister_hook(&ipt_ops[0]);
138 cleanup_table:
139 ipt_unregister_table(&packet_raw);
140
141 return ret;
142}
143
144static void __exit fini(void)
145{
146 unsigned int i;
147
148 for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
149 nf_unregister_hook(&ipt_ops[i]);
150
151 ipt_unregister_table(&packet_raw);
152}
153
154module_init(init);
155module_exit(fini);
156MODULE_LICENSE("GPL");
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
new file mode 100644
index 000000000000..912bbcc7f415
--- /dev/null
+++ b/net/ipv4/proc.c
@@ -0,0 +1,382 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * This file implements the various access functions for the
7 * PROC file system. It is mainly used for debugging and
8 * statistics.
9 *
10 * Version: $Id: proc.c,v 1.45 2001/05/16 16:45:35 davem Exp $
11 *
12 * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
13 * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
14 * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
15 * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de>
16 *
17 * Fixes:
18 * Alan Cox : UDP sockets show the rxqueue/txqueue
19 * using hint flag for the netinfo.
20 * Pauline Middelink : identd support
21 * Alan Cox : Make /proc safer.
22 * Erik Schoenfelder : /proc/net/snmp
23 * Alan Cox : Handle dead sockets properly.
24 * Gerhard Koerting : Show both timers
25 * Alan Cox : Allow inode to be NULL (kernel socket)
26 * Andi Kleen : Add support for open_requests and
27 *		split functions for more readability.
28 * Andi Kleen : Add support for /proc/net/netstat
29 * Arnaldo C. Melo : Convert to seq_file
30 *
31 * This program is free software; you can redistribute it and/or
32 * modify it under the terms of the GNU General Public License
33 * as published by the Free Software Foundation; either version
34 * 2 of the License, or (at your option) any later version.
35 */
36#include <linux/types.h>
37#include <net/icmp.h>
38#include <net/protocol.h>
39#include <net/tcp.h>
40#include <net/udp.h>
41#include <linux/proc_fs.h>
42#include <linux/seq_file.h>
43#include <net/sock.h>
44#include <net/raw.h>
45
46static int fold_prot_inuse(struct proto *proto)
47{
48 int res = 0;
49 int cpu;
50
51 for (cpu = 0; cpu < NR_CPUS; cpu++)
52 res += proto->stats[cpu].inuse;
53
54 return res;
55}
56
57/*
58 * Report socket allocation statistics [mea@utu.fi]
59 */
60static int sockstat_seq_show(struct seq_file *seq, void *v)
61{
62 /* From net/socket.c */
63 extern void socket_seq_show(struct seq_file *seq);
64
65 socket_seq_show(seq);
66 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
67 fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
68 tcp_tw_count, atomic_read(&tcp_sockets_allocated),
69 atomic_read(&tcp_memory_allocated));
70 seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
71 seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
72 seq_printf(seq, "FRAG: inuse %d memory %d\n", ip_frag_nqueues,
73 atomic_read(&ip_frag_mem));
74 return 0;
75}
76
77static int sockstat_seq_open(struct inode *inode, struct file *file)
78{
79 return single_open(file, sockstat_seq_show, NULL);
80}
81
82static struct file_operations sockstat_seq_fops = {
83 .owner = THIS_MODULE,
84 .open = sockstat_seq_open,
85 .read = seq_read,
86 .llseek = seq_lseek,
87 .release = single_release,
88};
89
90static unsigned long
91fold_field(void *mib[], int offt)
92{
93 unsigned long res = 0;
94 int i;
95
96 for (i = 0; i < NR_CPUS; i++) {
97 if (!cpu_possible(i))
98 continue;
99 res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
100 res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
101 }
102 return res;
103}
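/*
 * Editor's note, not part of the original patch: fold_field() above sums a
 * single MIB field across every possible CPU and across the two per-CPU MIB
 * blocks (mib[0] and mib[1]; roughly, one is bumped from softirq context and
 * one from process context, and only the sum matters here).  A userspace
 * sketch of the same folding, with a plain array standing in for the
 * per-CPU allocations:
 */
#include <stdio.h>

#define NCPUS	4
#define NFIELDS	3

static unsigned long mib[2][NCPUS][NFIELDS];	/* [block][cpu][field] */

static unsigned long fold(int field)
{
	unsigned long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		sum += mib[0][cpu][field] + mib[1][cpu][field];
	return sum;
}

int main(void)
{
	mib[0][0][1] = 5;
	mib[1][3][1] = 7;
	printf("field 1 total = %lu\n", fold(1));	/* prints 12 */
	return 0;
}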
104
105/* snmp items */
106static struct snmp_mib snmp4_ipstats_list[] = {
107 SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES),
108 SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS),
109 SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS),
110 SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
111 SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
112 SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS),
113 SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS),
114 SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS),
115 SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS),
116 SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
117 SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
118 SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS),
119 SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS),
120 SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS),
121 SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
122 SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
123 SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
124 SNMP_MIB_SENTINEL
125};
126
127static struct snmp_mib snmp4_icmp_list[] = {
128 SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS),
129 SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS),
130 SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS),
131 SNMP_MIB_ITEM("InTimeExcds", ICMP_MIB_INTIMEEXCDS),
132 SNMP_MIB_ITEM("InParmProbs", ICMP_MIB_INPARMPROBS),
133 SNMP_MIB_ITEM("InSrcQuenchs", ICMP_MIB_INSRCQUENCHS),
134 SNMP_MIB_ITEM("InRedirects", ICMP_MIB_INREDIRECTS),
135 SNMP_MIB_ITEM("InEchos", ICMP_MIB_INECHOS),
136 SNMP_MIB_ITEM("InEchoReps", ICMP_MIB_INECHOREPS),
137 SNMP_MIB_ITEM("InTimestamps", ICMP_MIB_INTIMESTAMPS),
138 SNMP_MIB_ITEM("InTimestampReps", ICMP_MIB_INTIMESTAMPREPS),
139 SNMP_MIB_ITEM("InAddrMasks", ICMP_MIB_INADDRMASKS),
140 SNMP_MIB_ITEM("InAddrMaskReps", ICMP_MIB_INADDRMASKREPS),
141 SNMP_MIB_ITEM("OutMsgs", ICMP_MIB_OUTMSGS),
142 SNMP_MIB_ITEM("OutErrors", ICMP_MIB_OUTERRORS),
143 SNMP_MIB_ITEM("OutDestUnreachs", ICMP_MIB_OUTDESTUNREACHS),
144 SNMP_MIB_ITEM("OutTimeExcds", ICMP_MIB_OUTTIMEEXCDS),
145 SNMP_MIB_ITEM("OutParmProbs", ICMP_MIB_OUTPARMPROBS),
146 SNMP_MIB_ITEM("OutSrcQuenchs", ICMP_MIB_OUTSRCQUENCHS),
147 SNMP_MIB_ITEM("OutRedirects", ICMP_MIB_OUTREDIRECTS),
148 SNMP_MIB_ITEM("OutEchos", ICMP_MIB_OUTECHOS),
149 SNMP_MIB_ITEM("OutEchoReps", ICMP_MIB_OUTECHOREPS),
150 SNMP_MIB_ITEM("OutTimestamps", ICMP_MIB_OUTTIMESTAMPS),
151 SNMP_MIB_ITEM("OutTimestampReps", ICMP_MIB_OUTTIMESTAMPREPS),
152 SNMP_MIB_ITEM("OutAddrMasks", ICMP_MIB_OUTADDRMASKS),
153 SNMP_MIB_ITEM("OutAddrMaskReps", ICMP_MIB_OUTADDRMASKREPS),
154 SNMP_MIB_SENTINEL
155};
156
157static struct snmp_mib snmp4_tcp_list[] = {
158 SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
159 SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
160 SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX),
161 SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN),
162 SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS),
163 SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS),
164 SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS),
165 SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS),
166 SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB),
167 SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS),
168 SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS),
169 SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
170 SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
171 SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
172 SNMP_MIB_SENTINEL
173};
174
175static struct snmp_mib snmp4_udp_list[] = {
176 SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS),
177 SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS),
178 SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS),
179 SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
180 SNMP_MIB_SENTINEL
181};
182
183static struct snmp_mib snmp4_net_list[] = {
184 SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT),
185 SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV),
186 SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED),
187 SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS),
188 SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED),
189 SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED),
190 SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED),
191 SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS),
192 SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS),
193 SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER),
194 SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED),
195 SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED),
196 SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
197 SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED),
198 SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
199 SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
200 SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
201 SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
202 SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
203 SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
204 SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
205 SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
206 SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
207 SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
208 SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
209 SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
210 SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
211 SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
212 SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
213 SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
214 SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
215 SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
216 SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER),
217 SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
218 SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
219 SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
220 SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO),
221 SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
222 SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
223 SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
224 SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS),
225 SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
226 SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
227 SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
228 SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
229 SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
230 SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
231 SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
232 SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
233 SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
234 SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
235 SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
236 SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
237 SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
238 SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
239 SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
240 SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
241 SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
242 SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
243 SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
244 SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
245 SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT),
246 SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
247 SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
248 SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
249 SNMP_MIB_SENTINEL
250};
251
252/*
253 * Called from the PROCfs module. This outputs /proc/net/snmp.
254 */
255static int snmp_seq_show(struct seq_file *seq, void *v)
256{
257 int i;
258
259 seq_puts(seq, "Ip: Forwarding DefaultTTL");
260
261 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
262 seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
263
264 seq_printf(seq, "\nIp: %d %d",
265 ipv4_devconf.forwarding ? 1 : 2, sysctl_ip_default_ttl);
266
267 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
268 seq_printf(seq, " %lu",
269 fold_field((void **) ip_statistics,
270 snmp4_ipstats_list[i].entry));
271
272 seq_puts(seq, "\nIcmp:");
273 for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
274 seq_printf(seq, " %s", snmp4_icmp_list[i].name);
275
276 seq_puts(seq, "\nIcmp:");
277 for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
278 seq_printf(seq, " %lu",
279 fold_field((void **) icmp_statistics,
280 snmp4_icmp_list[i].entry));
281
282 seq_puts(seq, "\nTcp:");
283 for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
284 seq_printf(seq, " %s", snmp4_tcp_list[i].name);
285
286 seq_puts(seq, "\nTcp:");
287 for (i = 0; snmp4_tcp_list[i].name != NULL; i++) {
288 /* MaxConn field is signed, RFC 2012 */
289 if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
290 seq_printf(seq, " %ld",
291 fold_field((void **) tcp_statistics,
292 snmp4_tcp_list[i].entry));
293 else
294 seq_printf(seq, " %lu",
295 fold_field((void **) tcp_statistics,
296 snmp4_tcp_list[i].entry));
297 }
298
299 seq_puts(seq, "\nUdp:");
300 for (i = 0; snmp4_udp_list[i].name != NULL; i++)
301 seq_printf(seq, " %s", snmp4_udp_list[i].name);
302
303 seq_puts(seq, "\nUdp:");
304 for (i = 0; snmp4_udp_list[i].name != NULL; i++)
305 seq_printf(seq, " %lu",
306 fold_field((void **) udp_statistics,
307 snmp4_udp_list[i].entry));
308
309 seq_putc(seq, '\n');
310 return 0;
311}
312
313static int snmp_seq_open(struct inode *inode, struct file *file)
314{
315 return single_open(file, snmp_seq_show, NULL);
316}
317
318static struct file_operations snmp_seq_fops = {
319 .owner = THIS_MODULE,
320 .open = snmp_seq_open,
321 .read = seq_read,
322 .llseek = seq_lseek,
323 .release = single_release,
324};
325
326/*
327 * Output /proc/net/netstat
328 */
329static int netstat_seq_show(struct seq_file *seq, void *v)
330{
331 int i;
332
333 seq_puts(seq, "TcpExt:");
334 for (i = 0; snmp4_net_list[i].name != NULL; i++)
335 seq_printf(seq, " %s", snmp4_net_list[i].name);
336
337 seq_puts(seq, "\nTcpExt:");
338 for (i = 0; snmp4_net_list[i].name != NULL; i++)
339 seq_printf(seq, " %lu",
340 fold_field((void **) net_statistics,
341 snmp4_net_list[i].entry));
342
343 seq_putc(seq, '\n');
344 return 0;
345}
346
347static int netstat_seq_open(struct inode *inode, struct file *file)
348{
349 return single_open(file, netstat_seq_show, NULL);
350}
351
352static struct file_operations netstat_seq_fops = {
353 .owner = THIS_MODULE,
354 .open = netstat_seq_open,
355 .read = seq_read,
356 .llseek = seq_lseek,
357 .release = single_release,
358};
359
360int __init ip_misc_proc_init(void)
361{
362 int rc = 0;
363
364 if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops))
365 goto out_netstat;
366
367 if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops))
368 goto out_snmp;
369
370 if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops))
371 goto out_sockstat;
372out:
373 return rc;
374out_sockstat:
375 proc_net_remove("snmp");
376out_snmp:
377 proc_net_remove("netstat");
378out_netstat:
379 rc = -ENOMEM;
380 goto out;
381}
382
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
new file mode 100644
index 000000000000..90a587cacaa4
--- /dev/null
+++ b/net/ipv4/protocol.c
@@ -0,0 +1,101 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * INET protocol dispatch tables.
7 *
8 * Version: $Id: protocol.c,v 1.14 2001/05/18 02:25:49 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *
13 * Fixes:
14 * Alan Cox : Ahah! udp icmp errors don't work because
15 * udp_err is never called!
16 * Alan Cox : Added new fields for init and ready for
17 * proper fragmentation (_NO_ 4K limits!)
18 * Richard Colella : Hang on hash collision
19 * Vince Laviano : Modified inet_del_protocol() to correctly
20 * maintain copy bit.
21 *
22 * This program is free software; you can redistribute it and/or
23 * modify it under the terms of the GNU General Public License
24 * as published by the Free Software Foundation; either version
25 * 2 of the License, or (at your option) any later version.
26 */
27
28#include <asm/uaccess.h>
29#include <asm/system.h>
30#include <linux/module.h>
31#include <linux/types.h>
32#include <linux/kernel.h>
33#include <linux/sched.h>
34#include <linux/string.h>
35#include <linux/config.h>
36#include <linux/socket.h>
37#include <linux/in.h>
38#include <linux/inet.h>
39#include <linux/netdevice.h>
40#include <linux/timer.h>
41#include <net/ip.h>
42#include <net/protocol.h>
43#include <net/tcp.h>
44#include <linux/skbuff.h>
45#include <net/sock.h>
46#include <net/icmp.h>
47#include <net/udp.h>
48#include <net/ipip.h>
49#include <linux/igmp.h>
50
51struct net_protocol *inet_protos[MAX_INET_PROTOS];
52static DEFINE_SPINLOCK(inet_proto_lock);
53
54/*
55 * Add a protocol handler to the hash tables
56 */
57
58int inet_add_protocol(struct net_protocol *prot, unsigned char protocol)
59{
60 int hash, ret;
61
62 hash = protocol & (MAX_INET_PROTOS - 1);
63
64 spin_lock_bh(&inet_proto_lock);
65 if (inet_protos[hash]) {
66 ret = -1;
67 } else {
68 inet_protos[hash] = prot;
69 ret = 0;
70 }
71 spin_unlock_bh(&inet_proto_lock);
72
73 return ret;
74}
75
76/*
77 * Remove a protocol from the hash tables.
78 */
79
80int inet_del_protocol(struct net_protocol *prot, unsigned char protocol)
81{
82 int hash, ret;
83
84 hash = protocol & (MAX_INET_PROTOS - 1);
85
86 spin_lock_bh(&inet_proto_lock);
87 if (inet_protos[hash] == prot) {
88 inet_protos[hash] = NULL;
89 ret = 0;
90 } else {
91 ret = -1;
92 }
93 spin_unlock_bh(&inet_proto_lock);
94
95 synchronize_net();
96
97 return ret;
98}
99
100EXPORT_SYMBOL(inet_add_protocol);
101EXPORT_SYMBOL(inet_del_protocol);
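/*
 * Editor's note, not part of the original patch: inet_add_protocol() and
 * inet_del_protocol() above implement a fixed-size, one-handler-per-slot
 * dispatch table indexed by (protocol & (MAX_INET_PROTOS - 1)).  A minimal
 * userspace model of that single-slot registration, without the locking;
 * MAX_PROTOS and the handler type below are stand-ins.
 */
#include <stdio.h>
#include <stddef.h>

#define MAX_PROTOS 256

typedef int (*proto_handler)(void);
static proto_handler protos[MAX_PROTOS];

static int add_protocol(proto_handler h, unsigned char protocol)
{
	unsigned int hash = protocol & (MAX_PROTOS - 1);

	if (protos[hash])
		return -1;		/* slot already taken */
	protos[hash] = h;
	return 0;
}

static int del_protocol(proto_handler h, unsigned char protocol)
{
	unsigned int hash = protocol & (MAX_PROTOS - 1);

	if (protos[hash] != h)
		return -1;		/* not the registered handler */
	protos[hash] = NULL;
	return 0;
}

static int dummy_handler(void) { return 0; }

int main(void)
{
	printf("add:       %d\n", add_protocol(dummy_handler, 6));	/* 0 */
	printf("add again: %d\n", add_protocol(dummy_handler, 6));	/* -1 */
	printf("del:       %d\n", del_protocol(dummy_handler, 6));	/* 0 */
	return 0;
}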
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
new file mode 100644
index 000000000000..93624a32eb9a
--- /dev/null
+++ b/net/ipv4/raw.c
@@ -0,0 +1,888 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * RAW - implementation of IP "raw" sockets.
7 *
8 * Version: $Id: raw.c,v 1.64 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *
13 * Fixes:
14 * Alan Cox : verify_area() fixed up
15 * Alan Cox : ICMP error handling
16 * Alan Cox : EMSGSIZE if you send too big a packet
17 * Alan Cox : Now uses generic datagrams and shared
18 * skbuff library. No more peek crashes,
19 * no more backlogs
20 * Alan Cox : Checks sk->broadcast.
21 * Alan Cox : Uses skb_free_datagram/skb_copy_datagram
22 * Alan Cox : Raw passes ip options too
23 * Alan Cox : Setsocketopt added
24 * Alan Cox : Fixed error return for broadcasts
25 * Alan Cox : Removed wake_up calls
26 * Alan Cox : Use ttl/tos
27 * Alan Cox : Cleaned up old debugging
28 * Alan Cox : Use new kernel side addresses
29 * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets.
30 * Alan Cox : BSD style RAW socket demultiplexing.
31 * Alan Cox : Beginnings of mrouted support.
32 * Alan Cox : Added IP_HDRINCL option.
33 * Alan Cox : Skip broadcast check if BSDism set.
34 * David S. Miller : New socket lookup architecture.
35 *
36 * This program is free software; you can redistribute it and/or
37 * modify it under the terms of the GNU General Public License
38 * as published by the Free Software Foundation; either version
39 * 2 of the License, or (at your option) any later version.
40 */
41
42#include <linux/config.h>
43#include <asm/atomic.h>
44#include <asm/byteorder.h>
45#include <asm/current.h>
46#include <asm/uaccess.h>
47#include <asm/ioctls.h>
48#include <linux/types.h>
49#include <linux/stddef.h>
50#include <linux/slab.h>
51#include <linux/errno.h>
52#include <linux/aio.h>
53#include <linux/kernel.h>
54#include <linux/spinlock.h>
55#include <linux/sockios.h>
56#include <linux/socket.h>
57#include <linux/in.h>
58#include <linux/mroute.h>
59#include <linux/netdevice.h>
60#include <linux/in_route.h>
61#include <linux/route.h>
62#include <linux/tcp.h>
63#include <linux/skbuff.h>
64#include <net/dst.h>
65#include <net/sock.h>
66#include <linux/gfp.h>
67#include <linux/ip.h>
68#include <linux/net.h>
69#include <net/ip.h>
70#include <net/icmp.h>
71#include <net/udp.h>
72#include <net/raw.h>
73#include <net/snmp.h>
74#include <net/inet_common.h>
75#include <net/checksum.h>
76#include <net/xfrm.h>
77#include <linux/rtnetlink.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80#include <linux/netfilter.h>
81#include <linux/netfilter_ipv4.h>
82
83struct hlist_head raw_v4_htable[RAWV4_HTABLE_SIZE];
84DEFINE_RWLOCK(raw_v4_lock);
85
86static void raw_v4_hash(struct sock *sk)
87{
88 struct hlist_head *head = &raw_v4_htable[inet_sk(sk)->num &
89 (RAWV4_HTABLE_SIZE - 1)];
90
91 write_lock_bh(&raw_v4_lock);
92 sk_add_node(sk, head);
93 sock_prot_inc_use(sk->sk_prot);
94 write_unlock_bh(&raw_v4_lock);
95}
96
97static void raw_v4_unhash(struct sock *sk)
98{
99 write_lock_bh(&raw_v4_lock);
100 if (sk_del_node_init(sk))
101 sock_prot_dec_use(sk->sk_prot);
102 write_unlock_bh(&raw_v4_lock);
103}
104
105struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
106 unsigned long raddr, unsigned long laddr,
107 int dif)
108{
109 struct hlist_node *node;
110
111 sk_for_each_from(sk, node) {
112 struct inet_sock *inet = inet_sk(sk);
113
114 if (inet->num == num &&
115 !(inet->daddr && inet->daddr != raddr) &&
116 !(inet->rcv_saddr && inet->rcv_saddr != laddr) &&
117 !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
118 goto found; /* gotcha */
119 }
120 sk = NULL;
121found:
122 return sk;
123}
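/*
 * Editor's note, not part of the original patch: __raw_v4_lookup() above
 * treats an unset (zero) remote address, local address or bound device as a
 * wildcard.  The per-socket test, isolated over a hypothetical snapshot of
 * the fields it inspects:
 */
#include <stdio.h>

struct raw_key {
	unsigned short num;		/* protocol number the socket is bound to */
	unsigned int daddr;		/* 0 means "any remote address" */
	unsigned int rcv_saddr;		/* 0 means "any local address" */
	int bound_dev_if;		/* 0 means "any device" */
};

static int raw_key_matches(const struct raw_key *k, unsigned short num,
			   unsigned int raddr, unsigned int laddr, int dif)
{
	return k->num == num
	    && !(k->daddr && k->daddr != raddr)
	    && !(k->rcv_saddr && k->rcv_saddr != laddr)
	    && !(k->bound_dev_if && k->bound_dev_if != dif);
}

int main(void)
{
	struct raw_key k = { 1 /* ICMP */, 0, 0, 0 };	/* fully wildcarded */

	printf("matches: %d\n",
	       raw_key_matches(&k, 1, 0x0a000001, 0x0a000002, 3));	/* 1 */
	return 0;
}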
124
125/*
126 * 0 - deliver
127 * 1 - block
128 */
129static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
130{
131 int type;
132
133 if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
134 return 1;
135
136 type = skb->h.icmph->type;
137 if (type < 32) {
138 __u32 data = raw_sk(sk)->filter.data;
139
140 return ((1 << type) & data) != 0;
141 }
142
143 /* Do not block unknown ICMP types */
144 return 0;
145}
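/*
 * Editor's note, not part of the original patch: icmp_filter() above blocks
 * an ICMP type iff the corresponding bit is set in the socket's 32-bit
 * filter word; types >= 32 are always delivered.  A self-contained sketch of
 * the same test:
 */
#include <stdio.h>

/* Returns 1 to block the packet, 0 to deliver it. */
static int icmp_type_blocked(unsigned int filter_data, unsigned int type)
{
	if (type >= 32)
		return 0;		/* unknown types are never filtered */
	return ((1u << type) & filter_data) != 0;
}

int main(void)
{
	unsigned int filter = 1u << 8;	/* filter out echo requests (type 8) */

	printf("type 8 blocked: %d\n", icmp_type_blocked(filter, 8));	/* 1 */
	printf("type 0 blocked: %d\n", icmp_type_blocked(filter, 0));	/* 0 */
	return 0;
}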
146
147/* IP input processing comes here for RAW socket delivery.
148 * Caller owns SKB, so we must make clones.
149 *
150 * RFC 1122: SHOULD pass TOS value up to the transport layer.
151 * -> It does. And not only TOS, but all IP header.
152 */
153void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
154{
155 struct sock *sk;
156 struct hlist_head *head;
157
158 read_lock(&raw_v4_lock);
159 head = &raw_v4_htable[hash];
160 if (hlist_empty(head))
161 goto out;
162 sk = __raw_v4_lookup(__sk_head(head), iph->protocol,
163 iph->saddr, iph->daddr,
164 skb->dev->ifindex);
165
166 while (sk) {
167 if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
168 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
169
170 /* Not releasing hash table! */
171 if (clone)
172 raw_rcv(sk, clone);
173 }
174 sk = __raw_v4_lookup(sk_next(sk), iph->protocol,
175 iph->saddr, iph->daddr,
176 skb->dev->ifindex);
177 }
178out:
179 read_unlock(&raw_v4_lock);
180}
181
182void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
183{
184 struct inet_sock *inet = inet_sk(sk);
185 int type = skb->h.icmph->type;
186 int code = skb->h.icmph->code;
187 int err = 0;
188 int harderr = 0;
189
190 /* Report error on raw socket, if:
191 1. User requested ip_recverr.
192 2. Socket is connected (otherwise the error indication
193	   is useless without ip_recverr and the error is hard.)
194 */
195 if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED)
196 return;
197
198 switch (type) {
199 default:
200 case ICMP_TIME_EXCEEDED:
201 err = EHOSTUNREACH;
202 break;
203 case ICMP_SOURCE_QUENCH:
204 return;
205 case ICMP_PARAMETERPROB:
206 err = EPROTO;
207 harderr = 1;
208 break;
209 case ICMP_DEST_UNREACH:
210 err = EHOSTUNREACH;
211 if (code > NR_ICMP_UNREACH)
212 break;
213 err = icmp_err_convert[code].errno;
214 harderr = icmp_err_convert[code].fatal;
215 if (code == ICMP_FRAG_NEEDED) {
216 harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
217 err = EMSGSIZE;
218 }
219 }
220
221 if (inet->recverr) {
222 struct iphdr *iph = (struct iphdr*)skb->data;
223 u8 *payload = skb->data + (iph->ihl << 2);
224
225 if (inet->hdrincl)
226 payload = skb->data;
227 ip_icmp_error(sk, skb, err, 0, info, payload);
228 }
229
230 if (inet->recverr || harderr) {
231 sk->sk_err = err;
232 sk->sk_error_report(sk);
233 }
234}
235
236static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
237{
238 /* Charge it to the socket. */
239
240 if (sock_queue_rcv_skb(sk, skb) < 0) {
241 /* FIXME: increment a raw drops counter here */
242 kfree_skb(skb);
243 return NET_RX_DROP;
244 }
245
246 return NET_RX_SUCCESS;
247}
248
249int raw_rcv(struct sock *sk, struct sk_buff *skb)
250{
251 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
252 kfree_skb(skb);
253 return NET_RX_DROP;
254 }
255
256 skb_push(skb, skb->data - skb->nh.raw);
257
258 raw_rcv_skb(sk, skb);
259 return 0;
260}
261
262static int raw_send_hdrinc(struct sock *sk, void *from, int length,
263 struct rtable *rt,
264 unsigned int flags)
265{
266 struct inet_sock *inet = inet_sk(sk);
267 int hh_len;
268 struct iphdr *iph;
269 struct sk_buff *skb;
270 int err;
271
272 if (length > rt->u.dst.dev->mtu) {
273 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
274 rt->u.dst.dev->mtu);
275 return -EMSGSIZE;
276 }
277 if (flags&MSG_PROBE)
278 goto out;
279
280 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
281
282 skb = sock_alloc_send_skb(sk, length+hh_len+15,
283 flags&MSG_DONTWAIT, &err);
284 if (skb == NULL)
285 goto error;
286 skb_reserve(skb, hh_len);
287
288 skb->priority = sk->sk_priority;
289 skb->dst = dst_clone(&rt->u.dst);
290
291 skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
292
293 skb->ip_summed = CHECKSUM_NONE;
294
295 skb->h.raw = skb->nh.raw;
296 err = memcpy_fromiovecend((void *)iph, from, 0, length);
297 if (err)
298 goto error_fault;
299
300 /* We don't modify invalid header */
301 if (length >= sizeof(*iph) && iph->ihl * 4 <= length) {
302 if (!iph->saddr)
303 iph->saddr = rt->rt_src;
304 iph->check = 0;
305 iph->tot_len = htons(length);
306 if (!iph->id)
307 ip_select_ident(iph, &rt->u.dst, NULL);
308
309 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
310 }
311
312 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
313 dst_output);
314 if (err > 0)
315 err = inet->recverr ? net_xmit_errno(err) : 0;
316 if (err)
317 goto error;
318out:
319 return 0;
320
321error_fault:
322 err = -EFAULT;
323 kfree_skb(skb);
324error:
325 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
326 return err;
327}
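/*
 * Editor's note, not part of the original patch: raw_send_hdrinc() above
 * recomputes the IP header checksum with ip_fast_csum().  A portable sketch
 * of the same ones'-complement sum over the header bytes (RFC 1071 style);
 * the checksum field itself must be zeroed before summing.  The sample
 * header in main() is illustrative only.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint16_t ip_checksum(const uint8_t *hdr, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)	/* sum 16-bit words, network order */
		sum += ((uint32_t)hdr[i] << 8) | hdr[i + 1];
	if (len & 1)				/* odd trailing byte */
		sum += (uint32_t)hdr[len - 1] << 8;
	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* 20-byte IPv4 header with the checksum field (bytes 10-11) zeroed. */
	uint8_t hdr[20] = { 0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00,
			    0x40, 0x01, 0x00, 0x00, 0xc0, 0xa8, 0x00, 0x01,
			    0xc0, 0xa8, 0x00, 0x02 };

	printf("checksum = 0x%04x\n", (unsigned)ip_checksum(hdr, sizeof(hdr)));
	return 0;
}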
328
329static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
330{
331 struct iovec *iov;
332 u8 __user *type = NULL;
333 u8 __user *code = NULL;
334 int probed = 0;
335 int i;
336
337 if (!msg->msg_iov)
338 return;
339
340 for (i = 0; i < msg->msg_iovlen; i++) {
341 iov = &msg->msg_iov[i];
342 if (!iov)
343 continue;
344
345 switch (fl->proto) {
346 case IPPROTO_ICMP:
347 /* check if one-byte field is readable or not. */
348 if (iov->iov_base && iov->iov_len < 1)
349 break;
350
351 if (!type) {
352 type = iov->iov_base;
353 /* check if code field is readable or not. */
354 if (iov->iov_len > 1)
355 code = type + 1;
356 } else if (!code)
357 code = iov->iov_base;
358
359 if (type && code) {
360 get_user(fl->fl_icmp_type, type);
361 __get_user(fl->fl_icmp_code, code);
362 probed = 1;
363 }
364 break;
365 default:
366 probed = 1;
367 break;
368 }
369 if (probed)
370 break;
371 }
372}
373
374static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
375 size_t len)
376{
377 struct inet_sock *inet = inet_sk(sk);
378 struct ipcm_cookie ipc;
379 struct rtable *rt = NULL;
380 int free = 0;
381 u32 daddr;
382 u32 saddr;
383 u8 tos;
384 int err;
385
386 err = -EMSGSIZE;
387 if (len < 0 || len > 0xFFFF)
388 goto out;
389
390 /*
391 * Check the flags.
392 */
393
394 err = -EOPNOTSUPP;
395 if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */
396 goto out; /* compatibility */
397
398 /*
399 * Get and verify the address.
400 */
401
402 if (msg->msg_namelen) {
403 struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name;
404 err = -EINVAL;
405 if (msg->msg_namelen < sizeof(*usin))
406 goto out;
407 if (usin->sin_family != AF_INET) {
408 static int complained;
409 if (!complained++)
410 printk(KERN_INFO "%s forgot to set AF_INET in "
411 "raw sendmsg. Fix it!\n",
412 current->comm);
413 err = -EAFNOSUPPORT;
414 if (usin->sin_family)
415 goto out;
416 }
417 daddr = usin->sin_addr.s_addr;
418 /* ANK: I did not forget to get protocol from port field.
419	 * I just do not know who uses this weirdness.
420 * IP_HDRINCL is much more convenient.
421 */
422 } else {
423 err = -EDESTADDRREQ;
424 if (sk->sk_state != TCP_ESTABLISHED)
425 goto out;
426 daddr = inet->daddr;
427 }
428
429 ipc.addr = inet->saddr;
430 ipc.opt = NULL;
431 ipc.oif = sk->sk_bound_dev_if;
432
433 if (msg->msg_controllen) {
434 err = ip_cmsg_send(msg, &ipc);
435 if (err)
436 goto out;
437 if (ipc.opt)
438 free = 1;
439 }
440
441 saddr = ipc.addr;
442 ipc.addr = daddr;
443
444 if (!ipc.opt)
445 ipc.opt = inet->opt;
446
447 if (ipc.opt) {
448 err = -EINVAL;
449 /* Linux does not mangle headers on raw sockets,
450		 * so IP options + IP_HDRINCL makes no sense.
451 */
452 if (inet->hdrincl)
453 goto done;
454 if (ipc.opt->srr) {
455 if (!daddr)
456 goto done;
457 daddr = ipc.opt->faddr;
458 }
459 }
460 tos = RT_CONN_FLAGS(sk);
461 if (msg->msg_flags & MSG_DONTROUTE)
462 tos |= RTO_ONLINK;
463
464 if (MULTICAST(daddr)) {
465 if (!ipc.oif)
466 ipc.oif = inet->mc_index;
467 if (!saddr)
468 saddr = inet->mc_addr;
469 }
470
471 {
472 struct flowi fl = { .oif = ipc.oif,
473 .nl_u = { .ip4_u =
474 { .daddr = daddr,
475 .saddr = saddr,
476 .tos = tos } },
477 .proto = inet->hdrincl ? IPPROTO_RAW :
478 sk->sk_protocol,
479 };
480 if (!inet->hdrincl)
481 raw_probe_proto_opt(&fl, msg);
482
483 err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
484 }
485 if (err)
486 goto done;
487
488 err = -EACCES;
489 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
490 goto done;
491
492 if (msg->msg_flags & MSG_CONFIRM)
493 goto do_confirm;
494back_from_confirm:
495
496 if (inet->hdrincl)
497 err = raw_send_hdrinc(sk, msg->msg_iov, len,
498 rt, msg->msg_flags);
499
500 else {
501 if (!ipc.addr)
502 ipc.addr = rt->rt_dst;
503 lock_sock(sk);
504 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
505 &ipc, rt, msg->msg_flags);
506 if (err)
507 ip_flush_pending_frames(sk);
508 else if (!(msg->msg_flags & MSG_MORE))
509 err = ip_push_pending_frames(sk);
510 release_sock(sk);
511 }
512done:
513 if (free)
514 kfree(ipc.opt);
515 ip_rt_put(rt);
516
517out: return err < 0 ? err : len;
518
519do_confirm:
520 dst_confirm(&rt->u.dst);
521 if (!(msg->msg_flags & MSG_PROBE) || len)
522 goto back_from_confirm;
523 err = 0;
524 goto done;
525}
526
527static void raw_close(struct sock *sk, long timeout)
528{
529 /*
530	 * Raw sockets may have direct kernel references. Kill them.
531 */
532 ip_ra_control(sk, 0, NULL);
533
534 sk_common_release(sk);
535}
536
537/* This gets rid of all the nasties in af_inet. -DaveM */
538static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
539{
540 struct inet_sock *inet = inet_sk(sk);
541 struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
542 int ret = -EINVAL;
543 int chk_addr_ret;
544
545 if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
546 goto out;
547 chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
548 ret = -EADDRNOTAVAIL;
549 if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
550 chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
551 goto out;
552 inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
553 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
554 inet->saddr = 0; /* Use device */
555 sk_dst_reset(sk);
556 ret = 0;
557out: return ret;
558}
559
560/*
561 * This should be easy: if there is something there
562 * we return it; otherwise we block.
563 */
564
565static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
566 size_t len, int noblock, int flags, int *addr_len)
567{
568 struct inet_sock *inet = inet_sk(sk);
569 size_t copied = 0;
570 int err = -EOPNOTSUPP;
571 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
572 struct sk_buff *skb;
573
574 if (flags & MSG_OOB)
575 goto out;
576
577 if (addr_len)
578 *addr_len = sizeof(*sin);
579
580 if (flags & MSG_ERRQUEUE) {
581 err = ip_recv_error(sk, msg, len);
582 goto out;
583 }
584
585 skb = skb_recv_datagram(sk, flags, noblock, &err);
586 if (!skb)
587 goto out;
588
589 copied = skb->len;
590 if (len < copied) {
591 msg->msg_flags |= MSG_TRUNC;
592 copied = len;
593 }
594
595 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
596 if (err)
597 goto done;
598
599 sock_recv_timestamp(msg, sk, skb);
600
601 /* Copy the address. */
602 if (sin) {
603 sin->sin_family = AF_INET;
604 sin->sin_addr.s_addr = skb->nh.iph->saddr;
605 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
606 }
607 if (inet->cmsg_flags)
608 ip_cmsg_recv(msg, skb);
609 if (flags & MSG_TRUNC)
610 copied = skb->len;
611done:
612 skb_free_datagram(sk, skb);
613out: return err ? err : copied;
614}
615
616static int raw_init(struct sock *sk)
617{
618 struct raw_sock *rp = raw_sk(sk);
619
620 if (inet_sk(sk)->num == IPPROTO_ICMP)
621 memset(&rp->filter, 0, sizeof(rp->filter));
622 return 0;
623}
624
625static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen)
626{
627 if (optlen > sizeof(struct icmp_filter))
628 optlen = sizeof(struct icmp_filter);
629 if (copy_from_user(&raw_sk(sk)->filter, optval, optlen))
630 return -EFAULT;
631 return 0;
632}
633
634static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen)
635{
636 int len, ret = -EFAULT;
637
638 if (get_user(len, optlen))
639 goto out;
640 ret = -EINVAL;
641 if (len < 0)
642 goto out;
643 if (len > sizeof(struct icmp_filter))
644 len = sizeof(struct icmp_filter);
645 ret = -EFAULT;
646 if (put_user(len, optlen) ||
647 copy_to_user(optval, &raw_sk(sk)->filter, len))
648 goto out;
649 ret = 0;
650out: return ret;
651}
652
653static int raw_setsockopt(struct sock *sk, int level, int optname,
654 char __user *optval, int optlen)
655{
656 if (level != SOL_RAW)
657 return ip_setsockopt(sk, level, optname, optval, optlen);
658
659 if (optname == ICMP_FILTER) {
660 if (inet_sk(sk)->num != IPPROTO_ICMP)
661 return -EOPNOTSUPP;
662 else
663 return raw_seticmpfilter(sk, optval, optlen);
664 }
665 return -ENOPROTOOPT;
666}
667
668static int raw_getsockopt(struct sock *sk, int level, int optname,
669 char __user *optval, int __user *optlen)
670{
671 if (level != SOL_RAW)
672 return ip_getsockopt(sk, level, optname, optval, optlen);
673
674 if (optname == ICMP_FILTER) {
675 if (inet_sk(sk)->num != IPPROTO_ICMP)
676 return -EOPNOTSUPP;
677 else
678 return raw_geticmpfilter(sk, optval, optlen);
679 }
680 return -ENOPROTOOPT;
681}
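From userspace, the ICMP_FILTER option handled above is set at level SOL_RAW and is only accepted on IPPROTO_ICMP sockets, matching the -EOPNOTSUPP check. A minimal sketch, assuming the raw(7) semantics where a set bit in icmp_filter.data suppresses delivery of that ICMP type (here everything except echo replies is dropped):

#include <linux/icmp.h>		/* struct icmp_filter, ICMP_FILTER */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	struct icmp_filter filt;
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* A set bit means "drop this ICMP type" (assumed raw(7) semantics);
	 * only ICMP_ECHOREPLY is left deliverable. */
	filt.data = ~(1U << ICMP_ECHOREPLY);
	if (setsockopt(fd, SOL_RAW, ICMP_FILTER, &filt, sizeof(filt)) < 0)
		perror("setsockopt");
	return 0;
}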
682
683static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
684{
685 switch (cmd) {
686 case SIOCOUTQ: {
687 int amount = atomic_read(&sk->sk_wmem_alloc);
688 return put_user(amount, (int __user *)arg);
689 }
690 case SIOCINQ: {
691 struct sk_buff *skb;
692 int amount = 0;
693
694 spin_lock_irq(&sk->sk_receive_queue.lock);
695 skb = skb_peek(&sk->sk_receive_queue);
696 if (skb != NULL)
697 amount = skb->len;
698 spin_unlock_irq(&sk->sk_receive_queue.lock);
699 return put_user(amount, (int __user *)arg);
700 }
701
702 default:
703#ifdef CONFIG_IP_MROUTE
704 return ipmr_ioctl(sk, cmd, (void __user *)arg);
705#else
706 return -ENOIOCTLCMD;
707#endif
708 }
709}
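The two ioctls above map to SIOCOUTQ (send memory still charged to the socket) and SIOCINQ (the length of the next queued datagram via skb_peek(), not the whole receive queue). A small sketch of querying them from userspace; both values are simply 0 on an idle socket:

#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
	int inq = 0, outq = 0;

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (ioctl(fd, SIOCINQ, &inq) == 0 && ioctl(fd, SIOCOUTQ, &outq) == 0)
		printf("next datagram: %d bytes, unsent write memory: %d bytes\n",
		       inq, outq);
	return 0;
}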
710
711struct proto raw_prot = {
712 .name = "RAW",
713 .owner = THIS_MODULE,
714 .close = raw_close,
715 .connect = ip4_datagram_connect,
716 .disconnect = udp_disconnect,
717 .ioctl = raw_ioctl,
718 .init = raw_init,
719 .setsockopt = raw_setsockopt,
720 .getsockopt = raw_getsockopt,
721 .sendmsg = raw_sendmsg,
722 .recvmsg = raw_recvmsg,
723 .bind = raw_bind,
724 .backlog_rcv = raw_rcv_skb,
725 .hash = raw_v4_hash,
726 .unhash = raw_v4_unhash,
727 .obj_size = sizeof(struct raw_sock),
728};
729
730#ifdef CONFIG_PROC_FS
731struct raw_iter_state {
732 int bucket;
733};
734
735#define raw_seq_private(seq) ((struct raw_iter_state *)(seq)->private)
736
737static struct sock *raw_get_first(struct seq_file *seq)
738{
739 struct sock *sk;
740 struct raw_iter_state* state = raw_seq_private(seq);
741
742 for (state->bucket = 0; state->bucket < RAWV4_HTABLE_SIZE; ++state->bucket) {
743 struct hlist_node *node;
744
745 sk_for_each(sk, node, &raw_v4_htable[state->bucket])
746 if (sk->sk_family == PF_INET)
747 goto found;
748 }
749 sk = NULL;
750found:
751 return sk;
752}
753
754static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
755{
756 struct raw_iter_state* state = raw_seq_private(seq);
757
758 do {
759 sk = sk_next(sk);
760try_again:
761 ;
762 } while (sk && sk->sk_family != PF_INET);
763
764 if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) {
765 sk = sk_head(&raw_v4_htable[state->bucket]);
766 goto try_again;
767 }
768 return sk;
769}
770
771static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
772{
773 struct sock *sk = raw_get_first(seq);
774
775 if (sk)
776 while (pos && (sk = raw_get_next(seq, sk)) != NULL)
777 --pos;
778 return pos ? NULL : sk;
779}
780
781static void *raw_seq_start(struct seq_file *seq, loff_t *pos)
782{
783 read_lock(&raw_v4_lock);
784 return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
785}
786
787static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
788{
789 struct sock *sk;
790
791 if (v == SEQ_START_TOKEN)
792 sk = raw_get_first(seq);
793 else
794 sk = raw_get_next(seq, v);
795 ++*pos;
796 return sk;
797}
798
799static void raw_seq_stop(struct seq_file *seq, void *v)
800{
801 read_unlock(&raw_v4_lock);
802}
803
804static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i)
805{
806 struct inet_sock *inet = inet_sk(sp);
807 unsigned int dest = inet->daddr,
808 src = inet->rcv_saddr;
809 __u16 destp = 0,
810 srcp = inet->num;
811
812 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
813 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p",
814 i, src, srcp, dest, destp, sp->sk_state,
815 atomic_read(&sp->sk_wmem_alloc),
816 atomic_read(&sp->sk_rmem_alloc),
817 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
818 atomic_read(&sp->sk_refcnt), sp);
819 return tmpbuf;
820}
821
822static int raw_seq_show(struct seq_file *seq, void *v)
823{
824 char tmpbuf[129];
825
826 if (v == SEQ_START_TOKEN)
827 seq_printf(seq, "%-127s\n",
828 " sl local_address rem_address st tx_queue "
829 "rx_queue tr tm->when retrnsmt uid timeout "
830 "inode");
831 else {
832 struct raw_iter_state *state = raw_seq_private(seq);
833
834 seq_printf(seq, "%-127s\n",
835 get_raw_sock(v, tmpbuf, state->bucket));
836 }
837 return 0;
838}
839
840static struct seq_operations raw_seq_ops = {
841 .start = raw_seq_start,
842 .next = raw_seq_next,
843 .stop = raw_seq_stop,
844 .show = raw_seq_show,
845};
846
847static int raw_seq_open(struct inode *inode, struct file *file)
848{
849 struct seq_file *seq;
850 int rc = -ENOMEM;
851 struct raw_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
852
853 if (!s)
854 goto out;
855 rc = seq_open(file, &raw_seq_ops);
856 if (rc)
857 goto out_kfree;
858
859 seq = file->private_data;
860 seq->private = s;
861 memset(s, 0, sizeof(*s));
862out:
863 return rc;
864out_kfree:
865 kfree(s);
866 goto out;
867}
868
869static struct file_operations raw_seq_fops = {
870 .owner = THIS_MODULE,
871 .open = raw_seq_open,
872 .read = seq_read,
873 .llseek = seq_lseek,
874 .release = seq_release_private,
875};
876
877int __init raw_proc_init(void)
878{
879 if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops))
880 return -ENOMEM;
881 return 0;
882}
883
884void __init raw_proc_exit(void)
885{
886 proc_net_remove("raw");
887}
888#endif /* CONFIG_PROC_FS */
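raw_proc_init() registers /proc/net/raw, whose records are produced by raw_seq_show()/get_raw_sock() in the column layout shown in the header string above. A trivial, hedged sketch that just echoes the file; parsing the individual columns is left out:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/raw", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* First line is the header from raw_seq_show(); the rest are
	 * one 128-character record per bound IPv4 raw socket. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}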
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
new file mode 100644
index 000000000000..9f91a116d919
--- /dev/null
+++ b/net/ipv4/route.c
@@ -0,0 +1,3177 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 *
58 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License
60 * as published by the Free Software Foundation; either version
61 * 2 of the License, or (at your option) any later version.
62 */
63
64#include <linux/config.h>
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
71#include <linux/sched.h>
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
83#include <linux/rtnetlink.h>
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
93#include <net/protocol.h>
94#include <net/ip.h>
95#include <net/route.h>
96#include <net/inetpeer.h>
97#include <net/sock.h>
98#include <net/ip_fib.h>
99#include <net/arp.h>
100#include <net/tcp.h>
101#include <net/icmp.h>
102#include <net/xfrm.h>
103#include <net/ip_mp_alg.h>
104#ifdef CONFIG_SYSCTL
105#include <linux/sysctl.h>
106#endif
107
108#define RT_FL_TOS(oldflp) \
109 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
110
111#define IP_MAX_MTU 0xFFF0
112
113#define RT_GC_TIMEOUT (300*HZ)
114
115static int ip_rt_min_delay = 2 * HZ;
116static int ip_rt_max_delay = 10 * HZ;
117static int ip_rt_max_size;
118static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
119static int ip_rt_gc_interval = 60 * HZ;
120static int ip_rt_gc_min_interval = HZ / 2;
121static int ip_rt_redirect_number = 9;
122static int ip_rt_redirect_load = HZ / 50;
123static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
124static int ip_rt_error_cost = HZ;
125static int ip_rt_error_burst = 5 * HZ;
126static int ip_rt_gc_elasticity = 8;
127static int ip_rt_mtu_expires = 10 * 60 * HZ;
128static int ip_rt_min_pmtu = 512 + 20 + 20;
129static int ip_rt_min_advmss = 256;
130static int ip_rt_secret_interval = 10 * 60 * HZ;
131static unsigned long rt_deadline;
132
133#define RTprint(a...) printk(KERN_DEBUG a)
134
135static struct timer_list rt_flush_timer;
136static struct timer_list rt_periodic_timer;
137static struct timer_list rt_secret_timer;
138
139/*
140 * Interface to generic destination cache.
141 */
142
143static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144static void ipv4_dst_destroy(struct dst_entry *dst);
145static void ipv4_dst_ifdown(struct dst_entry *dst,
146 struct net_device *dev, int how);
147static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148static void ipv4_link_failure(struct sk_buff *skb);
149static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150static int rt_garbage_collect(void);
151
152
153static struct dst_ops ipv4_dst_ops = {
154 .family = AF_INET,
155 .protocol = __constant_htons(ETH_P_IP),
156 .gc = rt_garbage_collect,
157 .check = ipv4_dst_check,
158 .destroy = ipv4_dst_destroy,
159 .ifdown = ipv4_dst_ifdown,
160 .negative_advice = ipv4_negative_advice,
161 .link_failure = ipv4_link_failure,
162 .update_pmtu = ip_rt_update_pmtu,
163 .entry_size = sizeof(struct rtable),
164};
165
166#define ECN_OR_COST(class) TC_PRIO_##class
167
168__u8 ip_tos2prio[16] = {
169 TC_PRIO_BESTEFFORT,
170 ECN_OR_COST(FILLER),
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(BESTEFFORT),
173 TC_PRIO_BULK,
174 ECN_OR_COST(BULK),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_INTERACTIVE,
178 ECN_OR_COST(INTERACTIVE),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
185};
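The table above maps the four TOS bits of the IP header to a packet-scheduler priority band; elsewhere in this tree it is consulted as ip_tos2prio[IPTOS_TOS(tos) >> 1] (rt_tos2priority()). A userspace restatement with the ECN_OR_COST() entries expanded and that indexing assumed:

#include <linux/pkt_sched.h>	/* TC_PRIO_* values */
#include <netinet/ip.h>		/* IPTOS_TOS(), IPTOS_LOWDELAY, ... */
#include <stdio.h>

/* Same table as above, with ECN_OR_COST(x) expanded to TC_PRIO_x. */
static const unsigned char tos2prio[16] = {
	TC_PRIO_BESTEFFORT, TC_PRIO_FILLER,
	TC_PRIO_BESTEFFORT, TC_PRIO_BESTEFFORT,
	TC_PRIO_BULK, TC_PRIO_BULK,
	TC_PRIO_BULK, TC_PRIO_BULK,
	TC_PRIO_INTERACTIVE, TC_PRIO_INTERACTIVE,
	TC_PRIO_INTERACTIVE, TC_PRIO_INTERACTIVE,
	TC_PRIO_INTERACTIVE_BULK, TC_PRIO_INTERACTIVE_BULK,
	TC_PRIO_INTERACTIVE_BULK, TC_PRIO_INTERACTIVE_BULK,
};

int main(void)
{
	unsigned char tos[] = { 0x00, IPTOS_LOWDELAY, IPTOS_THROUGHPUT,
				IPTOS_RELIABILITY };
	unsigned int i;

	for (i = 0; i < sizeof(tos); i++)	/* assumed indexing: IPTOS_TOS() >> 1 */
		printf("tos 0x%02x -> prio %u\n",
		       tos[i], tos2prio[IPTOS_TOS(tos[i]) >> 1]);
	return 0;
}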
186
187
188/*
189 * Route cache.
190 */
191
192/* The locking scheme is rather straightforward:
193 *
194 * 1) Read-Copy Update protects the buckets of the central route hash.
195 * 2) Only writers remove entries, and they hold the lock
196 * as they look at rtable reference counts.
197 * 3) Only readers acquire references to rtable entries,
198 * they do so with atomic increments and with the
199 * lock held.
200 */
201
202struct rt_hash_bucket {
203 struct rtable *chain;
204 spinlock_t lock;
205} __attribute__((__aligned__(8)));
206
207static struct rt_hash_bucket *rt_hash_table;
208static unsigned rt_hash_mask;
209static int rt_hash_log;
210static unsigned int rt_hash_rnd;
211
212struct rt_cache_stat *rt_cache_stat;
213
214static int rt_intern_hash(unsigned hash, struct rtable *rth,
215 struct rtable **res);
216
217static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
218{
219 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
220 & rt_hash_mask);
221}
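rt_hash_code() reduces (daddr, saddr, tos) plus the per-boot random key rt_hash_rnd to a bucket index by masking with rt_hash_mask, so the table size must be a power of two. The sketch below only illustrates that shape; its mixer is a stand-in, not the kernel's jhash_3words(), and the 1024-bucket size and addresses are made up:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Stand-in mixer for illustration only; the kernel uses jhash_3words(). */
static unsigned int mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t rnd)
{
	uint32_t h = a ^ rnd;

	h = h * 0x9e3779b1u ^ b;
	h = h * 0x9e3779b1u ^ c;
	h ^= h >> 16;
	return h;
}

int main(void)
{
	unsigned int rt_hash_mask = 1024 - 1;	/* assumed power-of-two table */
	uint32_t daddr = 0xc0000201;		/* 192.0.2.1 (example) */
	uint32_t saddr = 0xc0000202;		/* 192.0.2.2 (example) */
	uint8_t tos = 0x10;
	uint32_t rnd;

	srandom((unsigned int)time(NULL));
	rnd = (uint32_t)random();		/* plays the role of rt_hash_rnd */
	printf("bucket = %u of %u\n",
	       mix3(daddr, saddr, tos, rnd) & rt_hash_mask, rt_hash_mask + 1);
	return 0;
}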
222
223#ifdef CONFIG_PROC_FS
224struct rt_cache_iter_state {
225 int bucket;
226};
227
228static struct rtable *rt_cache_get_first(struct seq_file *seq)
229{
230 struct rtable *r = NULL;
231 struct rt_cache_iter_state *st = seq->private;
232
233 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
234 rcu_read_lock_bh();
235 r = rt_hash_table[st->bucket].chain;
236 if (r)
237 break;
238 rcu_read_unlock_bh();
239 }
240 return r;
241}
242
243static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
244{
245 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
246
247 r = r->u.rt_next;
248 while (!r) {
249 rcu_read_unlock_bh();
250 if (--st->bucket < 0)
251 break;
252 rcu_read_lock_bh();
253 r = rt_hash_table[st->bucket].chain;
254 }
255 return r;
256}
257
258static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
259{
260 struct rtable *r = rt_cache_get_first(seq);
261
262 if (r)
263 while (pos && (r = rt_cache_get_next(seq, r)))
264 --pos;
265 return pos ? NULL : r;
266}
267
268static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
269{
270 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
271}
272
273static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
274{
275 struct rtable *r = NULL;
276
277 if (v == SEQ_START_TOKEN)
278 r = rt_cache_get_first(seq);
279 else
280 r = rt_cache_get_next(seq, v);
281 ++*pos;
282 return r;
283}
284
285static void rt_cache_seq_stop(struct seq_file *seq, void *v)
286{
287 if (v && v != SEQ_START_TOKEN)
288 rcu_read_unlock_bh();
289}
290
291static int rt_cache_seq_show(struct seq_file *seq, void *v)
292{
293 if (v == SEQ_START_TOKEN)
294 seq_printf(seq, "%-127s\n",
295 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
296 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
297 "HHUptod\tSpecDst");
298 else {
299 struct rtable *r = v;
300 char temp[256];
301
302 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
303 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
304 r->u.dst.dev ? r->u.dst.dev->name : "*",
305 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
306 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
307 r->u.dst.__use, 0, (unsigned long)r->rt_src,
308 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
309 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
310 dst_metric(&r->u.dst, RTAX_WINDOW),
311 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
312 dst_metric(&r->u.dst, RTAX_RTTVAR)),
313 r->fl.fl4_tos,
314 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
315 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
316 dev_queue_xmit) : 0,
317 r->rt_spec_dst);
318 seq_printf(seq, "%-127s\n", temp);
319 }
320 return 0;
321}
322
323static struct seq_operations rt_cache_seq_ops = {
324 .start = rt_cache_seq_start,
325 .next = rt_cache_seq_next,
326 .stop = rt_cache_seq_stop,
327 .show = rt_cache_seq_show,
328};
329
330static int rt_cache_seq_open(struct inode *inode, struct file *file)
331{
332 struct seq_file *seq;
333 int rc = -ENOMEM;
334 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
335
336 if (!s)
337 goto out;
338 rc = seq_open(file, &rt_cache_seq_ops);
339 if (rc)
340 goto out_kfree;
341 seq = file->private_data;
342 seq->private = s;
343 memset(s, 0, sizeof(*s));
344out:
345 return rc;
346out_kfree:
347 kfree(s);
348 goto out;
349}
350
351static struct file_operations rt_cache_seq_fops = {
352 .owner = THIS_MODULE,
353 .open = rt_cache_seq_open,
354 .read = seq_read,
355 .llseek = seq_lseek,
356 .release = seq_release_private,
357};
358
359
360static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
361{
362 int cpu;
363
364 if (*pos == 0)
365 return SEQ_START_TOKEN;
366
367 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
368 if (!cpu_possible(cpu))
369 continue;
370 *pos = cpu+1;
371 return per_cpu_ptr(rt_cache_stat, cpu);
372 }
373 return NULL;
374}
375
376static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
377{
378 int cpu;
379
380 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
381 if (!cpu_possible(cpu))
382 continue;
383 *pos = cpu+1;
384 return per_cpu_ptr(rt_cache_stat, cpu);
385 }
386 return NULL;
387
388}
389
390static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
391{
392
393}
394
395static int rt_cpu_seq_show(struct seq_file *seq, void *v)
396{
397 struct rt_cache_stat *st = v;
398
399 if (v == SEQ_START_TOKEN) {
400 		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
401 return 0;
402 }
403
404 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
405 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
406 atomic_read(&ipv4_dst_ops.entries),
407 st->in_hit,
408 st->in_slow_tot,
409 st->in_slow_mc,
410 st->in_no_route,
411 st->in_brd,
412 st->in_martian_dst,
413 st->in_martian_src,
414
415 st->out_hit,
416 st->out_slow_tot,
417 st->out_slow_mc,
418
419 st->gc_total,
420 st->gc_ignored,
421 st->gc_goal_miss,
422 st->gc_dst_overflow,
423 st->in_hlist_search,
424 st->out_hlist_search
425 );
426 return 0;
427}
428
429static struct seq_operations rt_cpu_seq_ops = {
430 .start = rt_cpu_seq_start,
431 .next = rt_cpu_seq_next,
432 .stop = rt_cpu_seq_stop,
433 .show = rt_cpu_seq_show,
434};
435
436
437static int rt_cpu_seq_open(struct inode *inode, struct file *file)
438{
439 return seq_open(file, &rt_cpu_seq_ops);
440}
441
442static struct file_operations rt_cpu_seq_fops = {
443 .owner = THIS_MODULE,
444 .open = rt_cpu_seq_open,
445 .read = seq_read,
446 .llseek = seq_lseek,
447 .release = seq_release,
448};
449
450#endif /* CONFIG_PROC_FS */
451
452static __inline__ void rt_free(struct rtable *rt)
453{
454 multipath_remove(rt);
455 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
456}
457
458static __inline__ void rt_drop(struct rtable *rt)
459{
460 multipath_remove(rt);
461 ip_rt_put(rt);
462 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
463}
464
465static __inline__ int rt_fast_clean(struct rtable *rth)
466{
467	/* Kill broadcast/multicast entries very aggressively if they
468	   collide in the hash table with more useful entries */
469 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
470 rth->fl.iif && rth->u.rt_next;
471}
472
473static __inline__ int rt_valuable(struct rtable *rth)
474{
475 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
476 rth->u.dst.expires;
477}
478
479static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
480{
481 unsigned long age;
482 int ret = 0;
483
484 if (atomic_read(&rth->u.dst.__refcnt))
485 goto out;
486
487 ret = 1;
488 if (rth->u.dst.expires &&
489 time_after_eq(jiffies, rth->u.dst.expires))
490 goto out;
491
492 age = jiffies - rth->u.dst.lastuse;
493 ret = 0;
494 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
495 (age <= tmo2 && rt_valuable(rth)))
496 goto out;
497 ret = 1;
498out: return ret;
499}
500
501/* Bits of score are:
502 * 31: very valuable
503 * 30: not quite useless
504 * 29..0: usage counter
505 */
506static inline u32 rt_score(struct rtable *rt)
507{
508 u32 score = jiffies - rt->u.dst.lastuse;
509
510 score = ~score & ~(3<<30);
511
512 if (rt_valuable(rt))
513 score |= (1<<31);
514
515 if (!rt->fl.iif ||
516 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
517 score |= (1<<30);
518
519 return score;
520}
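A userspace restatement of the scoring rule: bit 31 marks entries rt_valuable() would keep, bit 30 marks output or non-broadcast/multicast/local routes, and the low 30 bits hold the bitwise-inverted age, so fresher entries score higher. rt_intern_hash() below evicts the lowest-scoring unreferenced entry when a chain grows too long. Illustrative only; the ages are in jiffies and chosen arbitrarily:

#include <stdint.h>
#include <stdio.h>

/* Userspace copy of the rule above: bit 31 = valuable, bit 30 = output or
 * unicast, low 30 bits = inverted age. */
static uint32_t score(uint32_t age, int valuable, int out_or_unicast)
{
	uint32_t s = ~age & ~(3u << 30);

	if (valuable)
		s |= 1u << 31;
	if (out_or_unicast)
		s |= 1u << 30;
	return s;
}

int main(void)
{
	/* A fresh, valuable output route vs. an old input broadcast route. */
	printf("0x%08x vs 0x%08x\n",
	       (unsigned)score(1, 1, 1), (unsigned)score(100000, 0, 0));
	return 0;
}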
521
522static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
523{
524 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
525 fl1->oif == fl2->oif &&
526 fl1->iif == fl2->iif;
527}
528
529#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
530static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
531 struct rtable *expentry,
532 int *removed_count)
533{
534 int passedexpired = 0;
535 struct rtable **nextstep = NULL;
536 struct rtable **rthp = chain_head;
537 struct rtable *rth;
538
539 if (removed_count)
540 *removed_count = 0;
541
542 while ((rth = *rthp) != NULL) {
543 if (rth == expentry)
544 passedexpired = 1;
545
546 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
547 compare_keys(&(*rthp)->fl, &expentry->fl)) {
548 if (*rthp == expentry) {
549 *rthp = rth->u.rt_next;
550 continue;
551 } else {
552 *rthp = rth->u.rt_next;
553 rt_free(rth);
554 if (removed_count)
555 ++(*removed_count);
556 }
557 } else {
558 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
559 passedexpired && !nextstep)
560 nextstep = &rth->u.rt_next;
561
562 rthp = &rth->u.rt_next;
563 }
564 }
565
566 rt_free(expentry);
567 if (removed_count)
568 ++(*removed_count);
569
570 return nextstep;
571}
572#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
573
574
575/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy)
577{
578 static int rover;
579 int i = rover, t;
580 struct rtable *rth, **rthp;
581 unsigned long now = jiffies;
582
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
584 t -= ip_rt_gc_timeout) {
585 unsigned long tmo = ip_rt_gc_timeout;
586
587 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain;
589
590 spin_lock(&rt_hash_table[i].lock);
591 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */
594 if (time_before_eq(now, rth->u.dst.expires)) {
595 tmo >>= 1;
596 rthp = &rth->u.rt_next;
597 continue;
598 }
599 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
600 tmo >>= 1;
601 rthp = &rth->u.rt_next;
602 continue;
603 }
604
605 /* Cleanup aged off entries. */
606#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
607 /* remove all related balanced entries if necessary */
608 if (rth->u.dst.flags & DST_BALANCED) {
609 rthp = rt_remove_balanced_route(
610 &rt_hash_table[i].chain,
611 rth, NULL);
612 if (!rthp)
613 break;
614 } else {
615 *rthp = rth->u.rt_next;
616 rt_free(rth);
617 }
618#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
619 *rthp = rth->u.rt_next;
620 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 }
623 spin_unlock(&rt_hash_table[i].lock);
624
625 /* Fallback loop breaker. */
626 if (time_after(jiffies, now))
627 break;
628 }
629 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
631}
632
633/* This can run from both BH and non-BH contexts, the latter
634 * in the case of a forced flush event.
635 */
636static void rt_run_flush(unsigned long dummy)
637{
638 int i;
639 struct rtable *rth, *next;
640
641 rt_deadline = 0;
642
643 get_random_bytes(&rt_hash_rnd, 4);
644
645 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock);
647 rth = rt_hash_table[i].chain;
648 if (rth)
649 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock);
651
652 for (; rth; rth = next) {
653 next = rth->u.rt_next;
654 rt_free(rth);
655 }
656 }
657}
658
659static DEFINE_SPINLOCK(rt_flush_lock);
660
661void rt_cache_flush(int delay)
662{
663 unsigned long now = jiffies;
664 int user_mode = !in_softirq();
665
666 if (delay < 0)
667 delay = ip_rt_min_delay;
668
669 /* flush existing multipath state*/
670 multipath_flush();
671
672 spin_lock_bh(&rt_flush_lock);
673
674 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
675 long tmo = (long)(rt_deadline - now);
676
677		/* If the flush timer is already running
678		   and the flush request is not immediate (delay > 0):
679
680		   if the deadline has not been reached, prolong the timer to "delay",
681		   otherwise fire it at the deadline.
682 */
683
684 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
685 tmo = 0;
686
687 if (delay > tmo)
688 delay = tmo;
689 }
690
691 if (delay <= 0) {
692 spin_unlock_bh(&rt_flush_lock);
693 rt_run_flush(0);
694 return;
695 }
696
697 if (rt_deadline == 0)
698 rt_deadline = now + ip_rt_max_delay;
699
700 mod_timer(&rt_flush_timer, now+delay);
701 spin_unlock_bh(&rt_flush_lock);
702}
703
704static void rt_secret_rebuild(unsigned long dummy)
705{
706 unsigned long now = jiffies;
707
708 rt_cache_flush(0);
709 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
710}
711
712/*
713 Short description of GC goals.
714
715   We want to build an algorithm that keeps the routing cache
716   at an equilibrium point, where the number of aged-off entries
717   stays approximately equal to the number of newly generated ones.
718
719   The current expiration strength is the variable "expire".
720   We try to adjust it dynamically, so that when the network
721   is idle "expire" is large enough to keep enough warm entries,
722   and when load increases it shrinks to limit the cache size.
723 */
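As a worked example of the first step in rt_garbage_collect() below: the routine initially aims to expire everything beyond ip_rt_gc_elasticity entries per bucket on average, i.e. goal = entries - (ip_rt_gc_elasticity << rt_hash_log). The numbers in this sketch are made up purely to show the arithmetic:

#include <stdio.h>

int main(void)
{
	/* Illustrative numbers only: a 1024-bucket table (rt_hash_log = 10),
	 * ip_rt_gc_elasticity = 8, and 12000 cached entries. */
	int rt_hash_log = 10;
	int ip_rt_gc_elasticity = 8;
	int entries = 12000;
	int goal = entries - (ip_rt_gc_elasticity << rt_hash_log);

	/* 12000 - 8192 = 3808 entries should be expired this round. */
	printf("goal = %d\n", goal);
	return 0;
}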
724
725static int rt_garbage_collect(void)
726{
727 static unsigned long expire = RT_GC_TIMEOUT;
728 static unsigned long last_gc;
729 static int rover;
730 static int equilibrium;
731 struct rtable *rth, **rthp;
732 unsigned long now = jiffies;
733 int goal;
734
735 /*
736 * Garbage collection is pretty expensive,
737 * do not make it too frequently.
738 */
739
740 RT_CACHE_STAT_INC(gc_total);
741
742 if (now - last_gc < ip_rt_gc_min_interval &&
743 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
744 RT_CACHE_STAT_INC(gc_ignored);
745 goto out;
746 }
747
748 /* Calculate number of entries, which we want to expire now. */
749 goal = atomic_read(&ipv4_dst_ops.entries) -
750 (ip_rt_gc_elasticity << rt_hash_log);
751 if (goal <= 0) {
752 if (equilibrium < ipv4_dst_ops.gc_thresh)
753 equilibrium = ipv4_dst_ops.gc_thresh;
754 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
755 if (goal > 0) {
756 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
757 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
758 }
759 } else {
760		/* We are in a dangerous area. Try to reduce the cache really
761		 * aggressively.
762 */
763 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
764 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
765 }
766
767 if (now - last_gc >= ip_rt_gc_min_interval)
768 last_gc = now;
769
770 if (goal <= 0) {
771 equilibrium += goal;
772 goto work_done;
773 }
774
775 do {
776 int i, k;
777
778 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
779 unsigned long tmo = expire;
780
781 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock);
784 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1;
787 rthp = &rth->u.rt_next;
788 continue;
789 }
790#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
791 /* remove all related balanced entries
792 * if necessary
793 */
794 if (rth->u.dst.flags & DST_BALANCED) {
795 int r;
796
797 rthp = rt_remove_balanced_route(
798							&rt_hash_table[k].chain,
799 rth,
800 &r);
801 goal -= r;
802 if (!rthp)
803 break;
804 } else {
805 *rthp = rth->u.rt_next;
806 rt_free(rth);
807 goal--;
808 }
809#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
810 *rthp = rth->u.rt_next;
811 rt_free(rth);
812 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 }
815 spin_unlock_bh(&rt_hash_table[k].lock);
816 if (goal <= 0)
817 break;
818 }
819 rover = k;
820
821 if (goal <= 0)
822 goto work_done;
823
824		/* The goal was not achieved. We stop the process if:
825
826		   - expire has been reduced to zero; otherwise, expire is halved.
827		   - the table is not full.
828		   - we are called from interrupt context.
829		   The jiffies check is just a fallback/debug loop breaker;
830		   we will not spin here for a long time in any case.
831 */
832
833 RT_CACHE_STAT_INC(gc_goal_miss);
834
835 if (expire == 0)
836 break;
837
838 expire >>= 1;
839#if RT_CACHE_DEBUG >= 2
840 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
841 atomic_read(&ipv4_dst_ops.entries), goal, i);
842#endif
843
844 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
845 goto out;
846 } while (!in_softirq() && time_before_eq(jiffies, now));
847
848 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
849 goto out;
850 if (net_ratelimit())
851 printk(KERN_WARNING "dst cache overflow\n");
852 RT_CACHE_STAT_INC(gc_dst_overflow);
853 return 1;
854
855work_done:
856 expire += ip_rt_gc_min_interval;
857 if (expire > ip_rt_gc_timeout ||
858 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
859 expire = ip_rt_gc_timeout;
860#if RT_CACHE_DEBUG >= 2
861 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
862 atomic_read(&ipv4_dst_ops.entries), goal, rover);
863#endif
864out: return 0;
865}
866
867static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
868{
869 struct rtable *rth, **rthp;
870 unsigned long now;
871 struct rtable *cand, **candp;
872 u32 min_score;
873 int chain_length;
874 int attempts = !in_softirq();
875
876restart:
877 chain_length = 0;
878 min_score = ~(u32)0;
879 cand = NULL;
880 candp = NULL;
881 now = jiffies;
882
883 rthp = &rt_hash_table[hash].chain;
884
885 spin_lock_bh(&rt_hash_table[hash].lock);
886 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) &&
889 compare_keys(&rth->fl, &rt->fl)) {
890#else
891 if (compare_keys(&rth->fl, &rt->fl)) {
892#endif
893 /* Put it first */
894 *rthp = rth->u.rt_next;
895 /*
896 * Since lookup is lockfree, the deletion
897 * must be visible to another weakly ordered CPU before
898 * the insertion at the start of the hash chain.
899 */
900 rcu_assign_pointer(rth->u.rt_next,
901 rt_hash_table[hash].chain);
902 /*
903 * Since lookup is lockfree, the update writes
904 * must be ordered for consistency on SMP.
905 */
906 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
907
908 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock);
912
913 rt_drop(rt);
914 *rp = rth;
915 return 0;
916 }
917
918 if (!atomic_read(&rth->u.dst.__refcnt)) {
919 u32 score = rt_score(rth);
920
921 if (score <= min_score) {
922 cand = rth;
923 candp = rthp;
924 min_score = score;
925 }
926 }
927
928 chain_length++;
929
930 rthp = &rth->u.rt_next;
931 }
932
933 if (cand) {
934		/* ip_rt_gc_elasticity used to be the average chain length;
935		 * when exceeded, GC becomes really aggressive.
936 *
937 * The second limit is less certain. At the moment it allows
938 * only 2 entries per bucket. We will see.
939 */
940 if (chain_length > ip_rt_gc_elasticity) {
941 *candp = cand->u.rt_next;
942 rt_free(cand);
943 }
944 }
945
946	/* Try to bind the route to ARP only if it is an output
947	   route or a unicast forwarding path.
948 */
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock);
953
954 if (err != -ENOBUFS) {
955 rt_drop(rt);
956 return err;
957 }
958
959			/* Neighbour tables are full and nothing
960			   can be released. Try to shrink the route cache;
961			   it most likely holds some neighbour records.
962 */
963 if (attempts-- > 0) {
964 int saved_elasticity = ip_rt_gc_elasticity;
965 int saved_int = ip_rt_gc_min_interval;
966 ip_rt_gc_elasticity = 1;
967 ip_rt_gc_min_interval = 0;
968 rt_garbage_collect();
969 ip_rt_gc_min_interval = saved_int;
970 ip_rt_gc_elasticity = saved_elasticity;
971 goto restart;
972 }
973
974 if (net_ratelimit())
975 printk(KERN_WARNING "Neighbour table overflow.\n");
976 rt_drop(rt);
977 return -ENOBUFS;
978 }
979 }
980
981 rt->u.rt_next = rt_hash_table[hash].chain;
982#if RT_CACHE_DEBUG >= 2
983 if (rt->u.rt_next) {
984 struct rtable *trt;
985 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
986 NIPQUAD(rt->rt_dst));
987 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
988 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
989 printk("\n");
990 }
991#endif
992 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock);
994 *rp = rt;
995 return 0;
996}
997
998void rt_bind_peer(struct rtable *rt, int create)
999{
1000 static DEFINE_SPINLOCK(rt_peer_lock);
1001 struct inet_peer *peer;
1002
1003 peer = inet_getpeer(rt->rt_dst, create);
1004
1005 spin_lock_bh(&rt_peer_lock);
1006 if (rt->peer == NULL) {
1007 rt->peer = peer;
1008 peer = NULL;
1009 }
1010 spin_unlock_bh(&rt_peer_lock);
1011 if (peer)
1012 inet_putpeer(peer);
1013}
1014
1015/*
1016 * Peer allocation may fail only in serious out-of-memory conditions. However,
1017 * we can still generate some output.
1018 * Random ID selection looks a bit dangerous because we have no chance of
1019 * selecting an ID that is unique within a reasonable period of time.
1020 * But a broken packet identifier may be better than no packet at all.
1021 */
1022static void ip_select_fb_ident(struct iphdr *iph)
1023{
1024 static DEFINE_SPINLOCK(ip_fb_id_lock);
1025 static u32 ip_fallback_id;
1026 u32 salt;
1027
1028 spin_lock_bh(&ip_fb_id_lock);
1029 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1030 iph->id = htons(salt & 0xFFFF);
1031 ip_fallback_id = salt;
1032 spin_unlock_bh(&ip_fb_id_lock);
1033}
1034
1035void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1036{
1037 struct rtable *rt = (struct rtable *) dst;
1038
1039 if (rt) {
1040 if (rt->peer == NULL)
1041 rt_bind_peer(rt, 1);
1042
1043		/* If a peer is attached to the destination, it is never detached,
1044		   so we need not grab a lock to dereference it.
1045 */
1046 if (rt->peer) {
1047 iph->id = htons(inet_getid(rt->peer, more));
1048 return;
1049 }
1050 } else
1051 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
1052
1053 ip_select_fb_ident(iph);
1054}
1055
1056static void rt_del(unsigned hash, struct rtable *rt)
1057{
1058 struct rtable **rthp;
1059
1060 spin_lock_bh(&rt_hash_table[hash].lock);
1061 ip_rt_put(rt);
1062 for (rthp = &rt_hash_table[hash].chain; *rthp;
1063 rthp = &(*rthp)->u.rt_next)
1064 if (*rthp == rt) {
1065 *rthp = rt->u.rt_next;
1066 rt_free(rt);
1067 break;
1068 }
1069 spin_unlock_bh(&rt_hash_table[hash].lock);
1070}
1071
1072void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1073 u32 saddr, u8 tos, struct net_device *dev)
1074{
1075 int i, k;
1076 struct in_device *in_dev = in_dev_get(dev);
1077 struct rtable *rth, **rthp;
1078 u32 skeys[2] = { saddr, 0 };
1079 int ikeys[2] = { dev->ifindex, 0 };
1080
1081 tos &= IPTOS_RT_MASK;
1082
1083 if (!in_dev)
1084 return;
1085
1086 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1087 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1088 goto reject_redirect;
1089
1090 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1091 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1092 goto reject_redirect;
1093 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1094 goto reject_redirect;
1095 } else {
1096 if (inet_addr_type(new_gw) != RTN_UNICAST)
1097 goto reject_redirect;
1098 }
1099
1100 for (i = 0; i < 2; i++) {
1101 for (k = 0; k < 2; k++) {
1102 unsigned hash = rt_hash_code(daddr,
1103 skeys[i] ^ (ikeys[k] << 5),
1104 tos);
1105
1106 rthp=&rt_hash_table[hash].chain;
1107
1108 rcu_read_lock();
1109 while ((rth = rcu_dereference(*rthp)) != NULL) {
1110 struct rtable *rt;
1111
1112 if (rth->fl.fl4_dst != daddr ||
1113 rth->fl.fl4_src != skeys[i] ||
1114 rth->fl.fl4_tos != tos ||
1115 rth->fl.oif != ikeys[k] ||
1116 rth->fl.iif != 0) {
1117 rthp = &rth->u.rt_next;
1118 continue;
1119 }
1120
1121 if (rth->rt_dst != daddr ||
1122 rth->rt_src != saddr ||
1123 rth->u.dst.error ||
1124 rth->rt_gateway != old_gw ||
1125 rth->u.dst.dev != dev)
1126 break;
1127
1128 dst_hold(&rth->u.dst);
1129 rcu_read_unlock();
1130
1131 rt = dst_alloc(&ipv4_dst_ops);
1132 if (rt == NULL) {
1133 ip_rt_put(rth);
1134 in_dev_put(in_dev);
1135 return;
1136 }
1137
1138 /* Copy all the information. */
1139 *rt = *rth;
1140 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1141 rt->u.dst.__use = 1;
1142 atomic_set(&rt->u.dst.__refcnt, 1);
1143 rt->u.dst.child = NULL;
1144 if (rt->u.dst.dev)
1145 dev_hold(rt->u.dst.dev);
1146 if (rt->idev)
1147 in_dev_hold(rt->idev);
1148 rt->u.dst.obsolete = 0;
1149 rt->u.dst.lastuse = jiffies;
1150 rt->u.dst.path = &rt->u.dst;
1151 rt->u.dst.neighbour = NULL;
1152 rt->u.dst.hh = NULL;
1153 rt->u.dst.xfrm = NULL;
1154
1155 rt->rt_flags |= RTCF_REDIRECTED;
1156
1157 /* Gateway is different ... */
1158 rt->rt_gateway = new_gw;
1159
1160 /* Redirect received -> path was valid */
1161 dst_confirm(&rth->u.dst);
1162
1163 if (rt->peer)
1164 atomic_inc(&rt->peer->refcnt);
1165
1166 if (arp_bind_neighbour(&rt->u.dst) ||
1167 !(rt->u.dst.neighbour->nud_state &
1168 NUD_VALID)) {
1169 if (rt->u.dst.neighbour)
1170 neigh_event_send(rt->u.dst.neighbour, NULL);
1171 ip_rt_put(rth);
1172 rt_drop(rt);
1173 goto do_next;
1174 }
1175
1176 rt_del(hash, rth);
1177 if (!rt_intern_hash(hash, rt, &rt))
1178 ip_rt_put(rt);
1179 goto do_next;
1180 }
1181 rcu_read_unlock();
1182 do_next:
1183 ;
1184 }
1185 }
1186 in_dev_put(in_dev);
1187 return;
1188
1189reject_redirect:
1190#ifdef CONFIG_IP_ROUTE_VERBOSE
1191 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1192 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1193 "%u.%u.%u.%u ignored.\n"
1194 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1195 "tos %02x\n",
1196 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1197 NIPQUAD(saddr), NIPQUAD(daddr), tos);
1198#endif
1199 in_dev_put(in_dev);
1200}
1201
1202static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1203{
1204 struct rtable *rt = (struct rtable*)dst;
1205 struct dst_entry *ret = dst;
1206
1207 if (rt) {
1208 if (dst->obsolete) {
1209 ip_rt_put(rt);
1210 ret = NULL;
1211 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1212 rt->u.dst.expires) {
1213 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1214 rt->fl.fl4_src ^
1215 (rt->fl.oif << 5),
1216 rt->fl.fl4_tos);
1217#if RT_CACHE_DEBUG >= 1
1218 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1219 "%u.%u.%u.%u/%02x dropped\n",
1220 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1221#endif
1222 rt_del(hash, rt);
1223 ret = NULL;
1224 }
1225 }
1226 return ret;
1227}
1228
1229/*
1230 * Algorithm:
1231 * 1. The first ip_rt_redirect_number redirects are sent
1232 * with exponential backoff, then we stop sending them at all,
1233 * assuming that the host ignores our redirects.
1234 * 2. If we did not see packets requiring redirects
1235 * during ip_rt_redirect_silence, we assume that the host
1236 * 	 forgot the redirected route and start sending redirects again.
1237 *
1238 * This algorithm is much cheaper and more intelligent than dumb load limiting
1239 * in icmp.c.
1240 *
1241 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1242 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1243 */
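A small sketch of the resulting send schedule, assuming the defaults above and HZ = 1000 (an assumption; HZ is configuration dependent): the n-th redirect is only sent once rate_last + (ip_rt_redirect_load << rate_tokens) has elapsed, and after ip_rt_redirect_number redirects nothing more is sent until the ip_rt_redirect_silence idle period resets rate_tokens.

#include <stdio.h>

int main(void)
{
	const int hz = 1000;			/* assumed HZ */
	int ip_rt_redirect_load = hz / 50;	/* default from above */
	int ip_rt_redirect_number = 9;		/* default from above */
	int tokens;

	for (tokens = 0; tokens < ip_rt_redirect_number; tokens++)
		printf("redirect %d allowed %d jiffies after the previous one\n",
		       tokens + 1, ip_rt_redirect_load << tokens);
	return 0;
}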
1244
1245void ip_rt_send_redirect(struct sk_buff *skb)
1246{
1247 struct rtable *rt = (struct rtable*)skb->dst;
1248 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1249
1250 if (!in_dev)
1251 return;
1252
1253 if (!IN_DEV_TX_REDIRECTS(in_dev))
1254 goto out;
1255
1256 /* No redirected packets during ip_rt_redirect_silence;
1257 * reset the algorithm.
1258 */
1259 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1260 rt->u.dst.rate_tokens = 0;
1261
1262	/* Too many ignored redirects; do not send anything.
1263	 * Set u.dst.rate_last to the last seen redirected packet.
1264 */
1265 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1266 rt->u.dst.rate_last = jiffies;
1267 goto out;
1268 }
1269
1270 /* Check for load limit; set rate_last to the latest sent
1271 * redirect.
1272 */
1273 if (time_after(jiffies,
1274 (rt->u.dst.rate_last +
1275 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1276 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1277 rt->u.dst.rate_last = jiffies;
1278 ++rt->u.dst.rate_tokens;
1279#ifdef CONFIG_IP_ROUTE_VERBOSE
1280 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1281 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1282 net_ratelimit())
1283 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1284 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1285 NIPQUAD(rt->rt_src), rt->rt_iif,
1286 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1287#endif
1288 }
1289out:
1290 in_dev_put(in_dev);
1291}
1292
1293static int ip_error(struct sk_buff *skb)
1294{
1295 struct rtable *rt = (struct rtable*)skb->dst;
1296 unsigned long now;
1297 int code;
1298
1299 switch (rt->u.dst.error) {
1300 case EINVAL:
1301 default:
1302 goto out;
1303 case EHOSTUNREACH:
1304 code = ICMP_HOST_UNREACH;
1305 break;
1306 case ENETUNREACH:
1307 code = ICMP_NET_UNREACH;
1308 break;
1309 case EACCES:
1310 code = ICMP_PKT_FILTERED;
1311 break;
1312 }
1313
1314 now = jiffies;
1315 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1316 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1317 rt->u.dst.rate_tokens = ip_rt_error_burst;
1318 rt->u.dst.rate_last = now;
1319 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1320 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1321 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1322 }
1323
1324out: kfree_skb(skb);
1325 return 0;
1326}
1327
1328/*
1329 * The last two values are not from the RFC but
1330 * are needed for AMPRnet AX.25 paths.
1331 */
1332
1333static unsigned short mtu_plateau[] =
1334{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1335
1336static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1337{
1338 int i;
1339
1340 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1341 if (old_mtu > mtu_plateau[i])
1342 return mtu_plateau[i];
1343 return 68;
1344}
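Since guess_mtu() is self-contained, it can be exercised directly; the sketch below copies the plateau table and search and walks it for two sample values, e.g. a bogus Fragmentation Needed carrying mtu 0 against a 1400-byte packet steps down to the 576 plateau. The sample inputs are arbitrary:

#include <stdio.h>

/* Same plateau table and search as above. */
static const unsigned short plateau[] =
	{ 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static unsigned short guess(unsigned short old_mtu)
{
	unsigned int i;

	for (i = 0; i < sizeof(plateau) / sizeof(plateau[0]); i++)
		if (old_mtu > plateau[i])
			return plateau[i];
	return 68;
}

int main(void)
{
	/* guess(1400) steps down to 576; guess(200) falls to the 128 plateau. */
	printf("guess(1400) = %u, guess(200) = %u\n", guess(1400), guess(200));
	return 0;
}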
1345
1346unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1347{
1348 int i;
1349 unsigned short old_mtu = ntohs(iph->tot_len);
1350 struct rtable *rth;
1351 u32 skeys[2] = { iph->saddr, 0, };
1352 u32 daddr = iph->daddr;
1353 u8 tos = iph->tos & IPTOS_RT_MASK;
1354 unsigned short est_mtu = 0;
1355
1356 if (ipv4_config.no_pmtu_disc)
1357 return 0;
1358
1359 for (i = 0; i < 2; i++) {
1360 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1361
1362 rcu_read_lock();
1363 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1364 rth = rcu_dereference(rth->u.rt_next)) {
1365 if (rth->fl.fl4_dst == daddr &&
1366 rth->fl.fl4_src == skeys[i] &&
1367 rth->rt_dst == daddr &&
1368 rth->rt_src == iph->saddr &&
1369 rth->fl.fl4_tos == tos &&
1370 rth->fl.iif == 0 &&
1371 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1372 unsigned short mtu = new_mtu;
1373
1374 if (new_mtu < 68 || new_mtu >= old_mtu) {
1375
1376 /* BSD 4.2 compatibility hack :-( */
1377 if (mtu == 0 &&
1378 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1379 old_mtu >= 68 + (iph->ihl << 2))
1380 old_mtu -= iph->ihl << 2;
1381
1382 mtu = guess_mtu(old_mtu);
1383 }
1384 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1385 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1386 dst_confirm(&rth->u.dst);
1387 if (mtu < ip_rt_min_pmtu) {
1388 mtu = ip_rt_min_pmtu;
1389 rth->u.dst.metrics[RTAX_LOCK-1] |=
1390 (1 << RTAX_MTU);
1391 }
1392 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1393 dst_set_expires(&rth->u.dst,
1394 ip_rt_mtu_expires);
1395 }
1396 est_mtu = mtu;
1397 }
1398 }
1399 }
1400 rcu_read_unlock();
1401 }
1402 return est_mtu ? : new_mtu;
1403}
1404
1405static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1406{
1407 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1408 !(dst_metric_locked(dst, RTAX_MTU))) {
1409 if (mtu < ip_rt_min_pmtu) {
1410 mtu = ip_rt_min_pmtu;
1411 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1412 }
1413 dst->metrics[RTAX_MTU-1] = mtu;
1414 dst_set_expires(dst, ip_rt_mtu_expires);
1415 }
1416}
1417
1418static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1419{
1420 return NULL;
1421}
1422
1423static void ipv4_dst_destroy(struct dst_entry *dst)
1424{
1425 struct rtable *rt = (struct rtable *) dst;
1426 struct inet_peer *peer = rt->peer;
1427 struct in_device *idev = rt->idev;
1428
1429 if (peer) {
1430 rt->peer = NULL;
1431 inet_putpeer(peer);
1432 }
1433
1434 if (idev) {
1435 rt->idev = NULL;
1436 in_dev_put(idev);
1437 }
1438}
1439
1440static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1441 int how)
1442{
1443 struct rtable *rt = (struct rtable *) dst;
1444 struct in_device *idev = rt->idev;
1445 if (dev != &loopback_dev && idev && idev->dev == dev) {
1446 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1447 if (loopback_idev) {
1448 rt->idev = loopback_idev;
1449 in_dev_put(idev);
1450 }
1451 }
1452}
1453
1454static void ipv4_link_failure(struct sk_buff *skb)
1455{
1456 struct rtable *rt;
1457
1458 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1459
1460 rt = (struct rtable *) skb->dst;
1461 if (rt)
1462 dst_set_expires(&rt->u.dst, 0);
1463}
1464
1465static int ip_rt_bug(struct sk_buff *skb)
1466{
1467 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1468 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1469 skb->dev ? skb->dev->name : "?");
1470 kfree_skb(skb);
1471 return 0;
1472}
1473
1474/*
1475   We do not cache the source address of the outgoing interface,
1476   because it is used only by the IP RR, TS and SRR options,
1477   so it is out of the fast path.
1478
1479   BTW remember: "addr" is allowed to be unaligned
1480   in IP options!
1481 */
1482
1483void ip_rt_get_source(u8 *addr, struct rtable *rt)
1484{
1485 u32 src;
1486 struct fib_result res;
1487
1488 if (rt->fl.iif == 0)
1489 src = rt->rt_src;
1490 else if (fib_lookup(&rt->fl, &res) == 0) {
1491 src = FIB_RES_PREFSRC(res);
1492 fib_res_put(&res);
1493 } else
1494 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1495 RT_SCOPE_UNIVERSE);
1496 memcpy(addr, &src, 4);
1497}
1498
1499#ifdef CONFIG_NET_CLS_ROUTE
1500static void set_class_tag(struct rtable *rt, u32 tag)
1501{
1502 if (!(rt->u.dst.tclassid & 0xFFFF))
1503 rt->u.dst.tclassid |= tag & 0xFFFF;
1504 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1505 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1506}
1507#endif
1508
1509static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1510{
1511 struct fib_info *fi = res->fi;
1512
1513 if (fi) {
1514 if (FIB_RES_GW(*res) &&
1515 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1516 rt->rt_gateway = FIB_RES_GW(*res);
1517 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1518 sizeof(rt->u.dst.metrics));
1519 if (fi->fib_mtu == 0) {
1520 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1521 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1522 rt->rt_gateway != rt->rt_dst &&
1523 rt->u.dst.dev->mtu > 576)
1524 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1525 }
1526#ifdef CONFIG_NET_CLS_ROUTE
1527 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1528#endif
1529 } else
1530 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1531
1532 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1533 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1534 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1535 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1536 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1537 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1538 ip_rt_min_advmss);
1539 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1540 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1541
1542#ifdef CONFIG_NET_CLS_ROUTE
1543#ifdef CONFIG_IP_MULTIPLE_TABLES
1544 set_class_tag(rt, fib_rules_tclass(res));
1545#endif
1546 set_class_tag(rt, itag);
1547#endif
1548 rt->rt_type = res->type;
1549}
1550
1551static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1552 u8 tos, struct net_device *dev, int our)
1553{
1554 unsigned hash;
1555 struct rtable *rth;
1556 u32 spec_dst;
1557 struct in_device *in_dev = in_dev_get(dev);
1558 u32 itag = 0;
1559
1560 /* Primary sanity checks. */
1561
1562 if (in_dev == NULL)
1563 return -EINVAL;
1564
1565 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1566 skb->protocol != htons(ETH_P_IP))
1567 goto e_inval;
1568
1569 if (ZERONET(saddr)) {
1570 if (!LOCAL_MCAST(daddr))
1571 goto e_inval;
1572 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1573 } else if (fib_validate_source(saddr, 0, tos, 0,
1574 dev, &spec_dst, &itag) < 0)
1575 goto e_inval;
1576
1577 rth = dst_alloc(&ipv4_dst_ops);
1578 if (!rth)
1579 goto e_nobufs;
1580
1581 rth->u.dst.output= ip_rt_bug;
1582
1583 atomic_set(&rth->u.dst.__refcnt, 1);
1584 rth->u.dst.flags= DST_HOST;
1585 if (in_dev->cnf.no_policy)
1586 rth->u.dst.flags |= DST_NOPOLICY;
1587 rth->fl.fl4_dst = daddr;
1588 rth->rt_dst = daddr;
1589 rth->fl.fl4_tos = tos;
1590#ifdef CONFIG_IP_ROUTE_FWMARK
1591 rth->fl.fl4_fwmark= skb->nfmark;
1592#endif
1593 rth->fl.fl4_src = saddr;
1594 rth->rt_src = saddr;
1595#ifdef CONFIG_NET_CLS_ROUTE
1596 rth->u.dst.tclassid = itag;
1597#endif
1598 rth->rt_iif =
1599 rth->fl.iif = dev->ifindex;
1600 rth->u.dst.dev = &loopback_dev;
1601 dev_hold(rth->u.dst.dev);
1602 rth->idev = in_dev_get(rth->u.dst.dev);
1603 rth->fl.oif = 0;
1604 rth->rt_gateway = daddr;
1605 rth->rt_spec_dst= spec_dst;
1606 rth->rt_type = RTN_MULTICAST;
1607 rth->rt_flags = RTCF_MULTICAST;
1608 if (our) {
1609 rth->u.dst.input= ip_local_deliver;
1610 rth->rt_flags |= RTCF_LOCAL;
1611 }
1612
1613#ifdef CONFIG_IP_MROUTE
1614 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1615 rth->u.dst.input = ip_mr_input;
1616#endif
1617 RT_CACHE_STAT_INC(in_slow_mc);
1618
1619 in_dev_put(in_dev);
1620 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1621 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1622
1623e_nobufs:
1624 in_dev_put(in_dev);
1625 return -ENOBUFS;
1626
1627e_inval:
1628 in_dev_put(in_dev);
1629 return -EINVAL;
1630}
1631
1632
1633static void ip_handle_martian_source(struct net_device *dev,
1634 struct in_device *in_dev,
1635 struct sk_buff *skb,
1636 u32 daddr,
1637 u32 saddr)
1638{
1639 RT_CACHE_STAT_INC(in_martian_src);
1640#ifdef CONFIG_IP_ROUTE_VERBOSE
1641 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1642 /*
1643		 *	RFC1812 recommendation: if the source is martian,
1644		 *	the only hint is the MAC header.
1645 */
1646 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1647 "%u.%u.%u.%u, on dev %s\n",
1648 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1649 if (dev->hard_header_len) {
1650 int i;
1651 unsigned char *p = skb->mac.raw;
1652 printk(KERN_WARNING "ll header: ");
1653 for (i = 0; i < dev->hard_header_len; i++, p++) {
1654 printk("%02x", *p);
1655 if (i < (dev->hard_header_len - 1))
1656 printk(":");
1657 }
1658 printk("\n");
1659 }
1660 }
1661#endif
1662}
1663
1664static inline int __mkroute_input(struct sk_buff *skb,
1665 struct fib_result* res,
1666 struct in_device *in_dev,
1667 u32 daddr, u32 saddr, u32 tos,
1668 struct rtable **result)
1669{
1670
1671 struct rtable *rth;
1672 int err;
1673 struct in_device *out_dev;
1674 unsigned flags = 0;
1675 u32 spec_dst, itag;
1676
1677 /* get a working reference to the output device */
1678 out_dev = in_dev_get(FIB_RES_DEV(*res));
1679 if (out_dev == NULL) {
1680 if (net_ratelimit())
1681 printk(KERN_CRIT "Bug in ip_route_input" \
1682 "_slow(). Please, report\n");
1683 return -EINVAL;
1684 }
1685
1686
1687 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1688 in_dev->dev, &spec_dst, &itag);
1689 if (err < 0) {
1690 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1691 saddr);
1692
1693 err = -EINVAL;
1694 goto cleanup;
1695 }
1696
1697 if (err)
1698 flags |= RTCF_DIRECTSRC;
1699
1700 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1701 (IN_DEV_SHARED_MEDIA(out_dev) ||
1702 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1703 flags |= RTCF_DOREDIRECT;
1704
1705 if (skb->protocol != htons(ETH_P_IP)) {
1706		/* Not IP (i.e. ARP). Do not create a route if it is
1707		 * invalid for proxy ARP. DNAT routes are always valid.
1708 */
1709 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1710 err = -EINVAL;
1711 goto cleanup;
1712 }
1713 }
1714
1715
1716 rth = dst_alloc(&ipv4_dst_ops);
1717 if (!rth) {
1718 err = -ENOBUFS;
1719 goto cleanup;
1720 }
1721
1722 rth->u.dst.flags= DST_HOST;
1723#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1724 if (res->fi->fib_nhs > 1)
1725 rth->u.dst.flags |= DST_BALANCED;
1726#endif
1727 if (in_dev->cnf.no_policy)
1728 rth->u.dst.flags |= DST_NOPOLICY;
1729 if (in_dev->cnf.no_xfrm)
1730 rth->u.dst.flags |= DST_NOXFRM;
1731 rth->fl.fl4_dst = daddr;
1732 rth->rt_dst = daddr;
1733 rth->fl.fl4_tos = tos;
1734#ifdef CONFIG_IP_ROUTE_FWMARK
1735 rth->fl.fl4_fwmark= skb->nfmark;
1736#endif
1737 rth->fl.fl4_src = saddr;
1738 rth->rt_src = saddr;
1739 rth->rt_gateway = daddr;
1740 rth->rt_iif =
1741 rth->fl.iif = in_dev->dev->ifindex;
1742 rth->u.dst.dev = (out_dev)->dev;
1743 dev_hold(rth->u.dst.dev);
1744 rth->idev = in_dev_get(rth->u.dst.dev);
1745 rth->fl.oif = 0;
1746 rth->rt_spec_dst= spec_dst;
1747
1748 rth->u.dst.input = ip_forward;
1749 rth->u.dst.output = ip_output;
1750
1751 rt_set_nexthop(rth, res, itag);
1752
1753 rth->rt_flags = flags;
1754
1755 *result = rth;
1756 err = 0;
1757 cleanup:
1758 /* release the working reference to the output device */
1759 in_dev_put(out_dev);
1760 return err;
1761}
1762
1763static inline int ip_mkroute_input_def(struct sk_buff *skb,
1764 struct fib_result* res,
1765 const struct flowi *fl,
1766 struct in_device *in_dev,
1767 u32 daddr, u32 saddr, u32 tos)
1768{
1769 struct rtable* rth;
1770 int err;
1771 unsigned hash;
1772
1773#ifdef CONFIG_IP_ROUTE_MULTIPATH
1774 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1775 fib_select_multipath(fl, res);
1776#endif
1777
1778 /* create a routing cache entry */
1779 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1780 if (err)
1781 return err;
1782 atomic_set(&rth->u.dst.__refcnt, 1);
1783
1784 /* put it into the cache */
1785 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1786 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1787}
1788
1789static inline int ip_mkroute_input(struct sk_buff *skb,
1790 struct fib_result* res,
1791 const struct flowi *fl,
1792 struct in_device *in_dev,
1793 u32 daddr, u32 saddr, u32 tos)
1794{
1795#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1796 struct rtable* rth;
1797 unsigned char hop, hopcount, lasthop;
1798 int err = -EINVAL;
1799 unsigned int hash;
1800
1801 if (res->fi)
1802 hopcount = res->fi->fib_nhs;
1803 else
1804 hopcount = 1;
1805
1806 lasthop = hopcount - 1;
1807
1808 /* distinguish between multipath and singlepath */
1809 if (hopcount < 2)
1810 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1811 saddr, tos);
1812
1813 /* add all alternatives to the routing cache */
1814 for (hop = 0; hop < hopcount; hop++) {
1815 res->nh_sel = hop;
1816
1817 /* create a routing cache entry */
1818 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1819 &rth);
1820 if (err)
1821 return err;
1822
1823 /* put it into the cache */
1824 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1825 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1826 if (err)
1827 return err;
1828
1829 /* forward hop information to multipath impl. */
1830 multipath_set_nhinfo(rth,
1831 FIB_RES_NETWORK(*res),
1832 FIB_RES_NETMASK(*res),
1833 res->prefixlen,
1834 &FIB_RES_NH(*res));
1835
1836		/* the reference count is handled outside only for
1837		 * the last hop
1838 */
1839 if (hop == lasthop)
1840 atomic_set(&(skb->dst->__refcnt), 1);
1841 }
1842 return err;
1843#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1844 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1845#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1846}
1847
1848
1849/*
1850 *	NOTE. We drop all the packets that have local source
1851 *	addresses, because every properly looped back packet
1852 *	must already have the correct destination attached by the output routine.
1853 *
1854 *	This approach solves two big problems:
1855 *	1. Non-simplex devices are handled properly.
1856 *	2. IP spoofing attempts are filtered with a 100% guarantee.
1857 */
1858
1859static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1860 u8 tos, struct net_device *dev)
1861{
1862 struct fib_result res;
1863 struct in_device *in_dev = in_dev_get(dev);
1864 struct flowi fl = { .nl_u = { .ip4_u =
1865 { .daddr = daddr,
1866 .saddr = saddr,
1867 .tos = tos,
1868 .scope = RT_SCOPE_UNIVERSE,
1869#ifdef CONFIG_IP_ROUTE_FWMARK
1870 .fwmark = skb->nfmark
1871#endif
1872 } },
1873 .iif = dev->ifindex };
1874 unsigned flags = 0;
1875 u32 itag = 0;
1876 struct rtable * rth;
1877 unsigned hash;
1878 u32 spec_dst;
1879 int err = -EINVAL;
1880 int free_res = 0;
1881
1882 /* IP on this device is disabled. */
1883
1884 if (!in_dev)
1885 goto out;
1886
1887	/* Check for the most weird martians, which may not be detected
1888	   by fib_lookup.
1889	 */
1890
1891 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1892 goto martian_source;
1893
1894 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1895 goto brd_input;
1896
1897	/* Accept zero addresses only to the limited broadcast;
1898	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1899	 */
1900 if (ZERONET(saddr))
1901 goto martian_source;
1902
1903 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1904 goto martian_destination;
1905
1906 /*
1907 * Now we are ready to route packet.
1908 */
1909 if ((err = fib_lookup(&fl, &res)) != 0) {
1910 if (!IN_DEV_FORWARD(in_dev))
1911 goto e_inval;
1912 goto no_route;
1913 }
1914 free_res = 1;
1915
1916 RT_CACHE_STAT_INC(in_slow_tot);
1917
1918 if (res.type == RTN_BROADCAST)
1919 goto brd_input;
1920
1921 if (res.type == RTN_LOCAL) {
1922 int result;
1923 result = fib_validate_source(saddr, daddr, tos,
1924 loopback_dev.ifindex,
1925 dev, &spec_dst, &itag);
1926 if (result < 0)
1927 goto martian_source;
1928 if (result)
1929 flags |= RTCF_DIRECTSRC;
1930 spec_dst = daddr;
1931 goto local_input;
1932 }
1933
1934 if (!IN_DEV_FORWARD(in_dev))
1935 goto e_inval;
1936 if (res.type != RTN_UNICAST)
1937 goto martian_destination;
1938
1939 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1940 if (err == -ENOBUFS)
1941 goto e_nobufs;
1942 if (err == -EINVAL)
1943 goto e_inval;
1944
1945done:
1946 in_dev_put(in_dev);
1947 if (free_res)
1948 fib_res_put(&res);
1949out: return err;
1950
1951brd_input:
1952 if (skb->protocol != htons(ETH_P_IP))
1953 goto e_inval;
1954
1955 if (ZERONET(saddr))
1956 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1957 else {
1958 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1959 &itag);
1960 if (err < 0)
1961 goto martian_source;
1962 if (err)
1963 flags |= RTCF_DIRECTSRC;
1964 }
1965 flags |= RTCF_BROADCAST;
1966 res.type = RTN_BROADCAST;
1967 RT_CACHE_STAT_INC(in_brd);
1968
1969local_input:
1970 rth = dst_alloc(&ipv4_dst_ops);
1971 if (!rth)
1972 goto e_nobufs;
1973
1974 rth->u.dst.output= ip_rt_bug;
1975
1976 atomic_set(&rth->u.dst.__refcnt, 1);
1977 rth->u.dst.flags= DST_HOST;
1978 if (in_dev->cnf.no_policy)
1979 rth->u.dst.flags |= DST_NOPOLICY;
1980 rth->fl.fl4_dst = daddr;
1981 rth->rt_dst = daddr;
1982 rth->fl.fl4_tos = tos;
1983#ifdef CONFIG_IP_ROUTE_FWMARK
1984 rth->fl.fl4_fwmark= skb->nfmark;
1985#endif
1986 rth->fl.fl4_src = saddr;
1987 rth->rt_src = saddr;
1988#ifdef CONFIG_NET_CLS_ROUTE
1989 rth->u.dst.tclassid = itag;
1990#endif
1991 rth->rt_iif =
1992 rth->fl.iif = dev->ifindex;
1993 rth->u.dst.dev = &loopback_dev;
1994 dev_hold(rth->u.dst.dev);
1995 rth->idev = in_dev_get(rth->u.dst.dev);
1996 rth->rt_gateway = daddr;
1997 rth->rt_spec_dst= spec_dst;
1998 rth->u.dst.input= ip_local_deliver;
1999 rth->rt_flags = flags|RTCF_LOCAL;
2000 if (res.type == RTN_UNREACHABLE) {
2001 rth->u.dst.input= ip_error;
2002 rth->u.dst.error= -err;
2003 rth->rt_flags &= ~RTCF_LOCAL;
2004 }
2005 rth->rt_type = res.type;
2006 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2007 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2008 goto done;
2009
2010no_route:
2011 RT_CACHE_STAT_INC(in_no_route);
2012 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2013 res.type = RTN_UNREACHABLE;
2014 goto local_input;
2015
2016 /*
2017 * Do not cache martian addresses: they should be logged (RFC1812)
2018 */
2019martian_destination:
2020 RT_CACHE_STAT_INC(in_martian_dst);
2021#ifdef CONFIG_IP_ROUTE_VERBOSE
2022 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2023 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2024 "%u.%u.%u.%u, dev %s\n",
2025 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2026#endif
2027e_inval:
2028 err = -EINVAL;
2029 goto done;
2030
2031e_nobufs:
2032 err = -ENOBUFS;
2033 goto done;
2034
2035martian_source:
2036 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2037 goto e_inval;
2038}
2039
2040int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2041 u8 tos, struct net_device *dev)
2042{
2043 struct rtable * rth;
2044 unsigned hash;
2045 int iif = dev->ifindex;
2046
2047 tos &= IPTOS_RT_MASK;
2048 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2049
2050 rcu_read_lock();
2051 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2052 rth = rcu_dereference(rth->u.rt_next)) {
2053 if (rth->fl.fl4_dst == daddr &&
2054 rth->fl.fl4_src == saddr &&
2055 rth->fl.iif == iif &&
2056 rth->fl.oif == 0 &&
2057#ifdef CONFIG_IP_ROUTE_FWMARK
2058 rth->fl.fl4_fwmark == skb->nfmark &&
2059#endif
2060 rth->fl.fl4_tos == tos) {
2061 rth->u.dst.lastuse = jiffies;
2062 dst_hold(&rth->u.dst);
2063 rth->u.dst.__use++;
2064 RT_CACHE_STAT_INC(in_hit);
2065 rcu_read_unlock();
2066 skb->dst = (struct dst_entry*)rth;
2067 return 0;
2068 }
2069 RT_CACHE_STAT_INC(in_hlist_search);
2070 }
2071 rcu_read_unlock();
2072
2073	/* Multicast recognition logic has been moved from the route cache
2074	   to here.  The problem was that too many Ethernet cards have
2075	   broken/missing hardware multicast filters :-(  As a result, a host
2076	   on a multicast network acquires a lot of useless route cache
2077	   entries, e.g. for SDR messages from all over the world.  Now we
2078	   try to get rid of them.  Really, provided the software IP multicast
2079	   filter is organized reasonably (at least, hashed), this does not
2080	   result in a slowdown compared with route cache reject entries.
2081	   Note that multicast routers are not affected, because a route
2082	   cache entry is created eventually.
2083	 */
2084 if (MULTICAST(daddr)) {
2085 struct in_device *in_dev;
2086
2087 rcu_read_lock();
2088 if ((in_dev = __in_dev_get(dev)) != NULL) {
2089 int our = ip_check_mc(in_dev, daddr, saddr,
2090 skb->nh.iph->protocol);
2091 if (our
2092#ifdef CONFIG_IP_MROUTE
2093 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2094#endif
2095 ) {
2096 rcu_read_unlock();
2097 return ip_route_input_mc(skb, daddr, saddr,
2098 tos, dev, our);
2099 }
2100 }
2101 rcu_read_unlock();
2102 return -EINVAL;
2103 }
2104 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2105}
2106
2107static inline int __mkroute_output(struct rtable **result,
2108 struct fib_result* res,
2109 const struct flowi *fl,
2110 const struct flowi *oldflp,
2111 struct net_device *dev_out,
2112 unsigned flags)
2113{
2114 struct rtable *rth;
2115 struct in_device *in_dev;
2116 u32 tos = RT_FL_TOS(oldflp);
2117 int err = 0;
2118
2119 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2120 return -EINVAL;
2121
2122 if (fl->fl4_dst == 0xFFFFFFFF)
2123 res->type = RTN_BROADCAST;
2124 else if (MULTICAST(fl->fl4_dst))
2125 res->type = RTN_MULTICAST;
2126 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2127 return -EINVAL;
2128
2129 if (dev_out->flags & IFF_LOOPBACK)
2130 flags |= RTCF_LOCAL;
2131
2132 /* get work reference to inet device */
2133 in_dev = in_dev_get(dev_out);
2134 if (!in_dev)
2135 return -EINVAL;
2136
2137 if (res->type == RTN_BROADCAST) {
2138 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2139 if (res->fi) {
2140 fib_info_put(res->fi);
2141 res->fi = NULL;
2142 }
2143 } else if (res->type == RTN_MULTICAST) {
2144 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2145 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2146 oldflp->proto))
2147 flags &= ~RTCF_LOCAL;
2148		/* If a multicast route does not exist, use
2149		   the default one, but do not gateway in this case.
2150		   Yes, it is a hack.
2151		 */
2152 if (res->fi && res->prefixlen < 4) {
2153 fib_info_put(res->fi);
2154 res->fi = NULL;
2155 }
2156 }
2157
2158
2159 rth = dst_alloc(&ipv4_dst_ops);
2160 if (!rth) {
2161 err = -ENOBUFS;
2162 goto cleanup;
2163 }
2164
2165 rth->u.dst.flags= DST_HOST;
2166#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2167 if (res->fi) {
2168 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2169 if (res->fi->fib_nhs > 1)
2170 rth->u.dst.flags |= DST_BALANCED;
2171 }
2172#endif
2173 if (in_dev->cnf.no_xfrm)
2174 rth->u.dst.flags |= DST_NOXFRM;
2175 if (in_dev->cnf.no_policy)
2176 rth->u.dst.flags |= DST_NOPOLICY;
2177
2178 rth->fl.fl4_dst = oldflp->fl4_dst;
2179 rth->fl.fl4_tos = tos;
2180 rth->fl.fl4_src = oldflp->fl4_src;
2181 rth->fl.oif = oldflp->oif;
2182#ifdef CONFIG_IP_ROUTE_FWMARK
2183 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2184#endif
2185 rth->rt_dst = fl->fl4_dst;
2186 rth->rt_src = fl->fl4_src;
2187 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2188	/* get references to the devices that are to be held by the routing
2189	   cache entry */
2190 rth->u.dst.dev = dev_out;
2191 dev_hold(dev_out);
2192 rth->idev = in_dev_get(dev_out);
2193 rth->rt_gateway = fl->fl4_dst;
2194 rth->rt_spec_dst= fl->fl4_src;
2195
2196 rth->u.dst.output=ip_output;
2197
2198 RT_CACHE_STAT_INC(out_slow_tot);
2199
2200 if (flags & RTCF_LOCAL) {
2201 rth->u.dst.input = ip_local_deliver;
2202 rth->rt_spec_dst = fl->fl4_dst;
2203 }
2204 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2205 rth->rt_spec_dst = fl->fl4_src;
2206 if (flags & RTCF_LOCAL &&
2207 !(dev_out->flags & IFF_LOOPBACK)) {
2208 rth->u.dst.output = ip_mc_output;
2209 RT_CACHE_STAT_INC(out_slow_mc);
2210 }
2211#ifdef CONFIG_IP_MROUTE
2212 if (res->type == RTN_MULTICAST) {
2213 if (IN_DEV_MFORWARD(in_dev) &&
2214 !LOCAL_MCAST(oldflp->fl4_dst)) {
2215 rth->u.dst.input = ip_mr_input;
2216 rth->u.dst.output = ip_mc_output;
2217 }
2218 }
2219#endif
2220 }
2221
2222 rt_set_nexthop(rth, res, 0);
2223
2224 rth->rt_flags = flags;
2225
2226 *result = rth;
2227 cleanup:
2228 /* release work reference to inet device */
2229 in_dev_put(in_dev);
2230
2231 return err;
2232}
2233
2234static inline int ip_mkroute_output_def(struct rtable **rp,
2235 struct fib_result* res,
2236 const struct flowi *fl,
2237 const struct flowi *oldflp,
2238 struct net_device *dev_out,
2239 unsigned flags)
2240{
2241 struct rtable *rth;
2242 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2243 unsigned hash;
2244 if (err == 0) {
2245 u32 tos = RT_FL_TOS(oldflp);
2246
2247 atomic_set(&rth->u.dst.__refcnt, 1);
2248
2249 hash = rt_hash_code(oldflp->fl4_dst,
2250 oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2251 err = rt_intern_hash(hash, rth, rp);
2252 }
2253
2254 return err;
2255}
2256
2257static inline int ip_mkroute_output(struct rtable** rp,
2258 struct fib_result* res,
2259 const struct flowi *fl,
2260 const struct flowi *oldflp,
2261 struct net_device *dev_out,
2262 unsigned flags)
2263{
2264#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2265 u32 tos = RT_FL_TOS(oldflp);
2266 unsigned char hop;
2267 unsigned hash;
2268 int err = -EINVAL;
2269 struct rtable *rth;
2270
2271 if (res->fi && res->fi->fib_nhs > 1) {
2272 unsigned char hopcount = res->fi->fib_nhs;
2273
2274 for (hop = 0; hop < hopcount; hop++) {
2275 struct net_device *dev2nexthop;
2276
2277 res->nh_sel = hop;
2278
2279 /* hold a work reference to the output device */
2280 dev2nexthop = FIB_RES_DEV(*res);
2281 dev_hold(dev2nexthop);
2282
2283 err = __mkroute_output(&rth, res, fl, oldflp,
2284 dev2nexthop, flags);
2285
2286 if (err != 0)
2287 goto cleanup;
2288
2289 hash = rt_hash_code(oldflp->fl4_dst,
2290 oldflp->fl4_src ^
2291 (oldflp->oif << 5), tos);
2292 err = rt_intern_hash(hash, rth, rp);
2293
2294 /* forward hop information to multipath impl. */
2295 multipath_set_nhinfo(rth,
2296 FIB_RES_NETWORK(*res),
2297 FIB_RES_NETMASK(*res),
2298 res->prefixlen,
2299 &FIB_RES_NH(*res));
2300 cleanup:
2301 /* release work reference to output device */
2302 dev_put(dev2nexthop);
2303
2304 if (err != 0)
2305 return err;
2306 }
2307 atomic_set(&(*rp)->u.dst.__refcnt, 1);
2308 return err;
2309 } else {
2310 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2311 flags);
2312 }
2313#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2314 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2315#endif
2316}
2317
2318/*
2319 * Major route resolver routine.
2320 */
2321
2322static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2323{
2324 u32 tos = RT_FL_TOS(oldflp);
2325 struct flowi fl = { .nl_u = { .ip4_u =
2326 { .daddr = oldflp->fl4_dst,
2327 .saddr = oldflp->fl4_src,
2328 .tos = tos & IPTOS_RT_MASK,
2329 .scope = ((tos & RTO_ONLINK) ?
2330 RT_SCOPE_LINK :
2331 RT_SCOPE_UNIVERSE),
2332#ifdef CONFIG_IP_ROUTE_FWMARK
2333 .fwmark = oldflp->fl4_fwmark
2334#endif
2335 } },
2336 .iif = loopback_dev.ifindex,
2337 .oif = oldflp->oif };
2338 struct fib_result res;
2339 unsigned flags = 0;
2340 struct net_device *dev_out = NULL;
2341 int free_res = 0;
2342 int err;
2343
2344
2345 res.fi = NULL;
2346#ifdef CONFIG_IP_MULTIPLE_TABLES
2347 res.r = NULL;
2348#endif
2349
2350 if (oldflp->fl4_src) {
2351 err = -EINVAL;
2352 if (MULTICAST(oldflp->fl4_src) ||
2353 BADCLASS(oldflp->fl4_src) ||
2354 ZERONET(oldflp->fl4_src))
2355 goto out;
2356
2357 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2358 dev_out = ip_dev_find(oldflp->fl4_src);
2359 if (dev_out == NULL)
2360 goto out;
2361
2362		/* I removed a check for oif == dev_out->oif here.
2363		   It was wrong for two reasons:
2364		   1. ip_dev_find(saddr) can return the wrong iface, if saddr
2365		      is assigned to multiple interfaces.
2366		   2. Moreover, we are allowed to send packets with a saddr
2367		      of another iface. --ANK
2368		 */
2369
2370 if (oldflp->oif == 0
2371 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2372		/* Special hack: the user can direct multicasts
2373		   and limited broadcast via the necessary interface
2374		   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2375		   This hack is not just for fun, it allows
2376		   vic, vat and friends to work.
2377		   They bind the socket to loopback, set ttl to zero
2378		   and expect that it will work.
2379		   From the viewpoint of the routing cache they are broken,
2380		   because we are not allowed to build a multicast path
2381		   with a loopback source addr (look, the routing cache
2382		   cannot know that ttl is zero, so that the packet
2383		   will not leave this host and the route is valid).
2384		   Luckily, this hack is a good workaround.
2385		 */
2386
2387 fl.oif = dev_out->ifindex;
2388 goto make_route;
2389 }
2390 if (dev_out)
2391 dev_put(dev_out);
2392 dev_out = NULL;
2393 }
2394
2395
2396 if (oldflp->oif) {
2397 dev_out = dev_get_by_index(oldflp->oif);
2398 err = -ENODEV;
2399 if (dev_out == NULL)
2400 goto out;
2401 if (__in_dev_get(dev_out) == NULL) {
2402 dev_put(dev_out);
2403 goto out; /* Wrong error code */
2404 }
2405
2406 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2407 if (!fl.fl4_src)
2408 fl.fl4_src = inet_select_addr(dev_out, 0,
2409 RT_SCOPE_LINK);
2410 goto make_route;
2411 }
2412 if (!fl.fl4_src) {
2413 if (MULTICAST(oldflp->fl4_dst))
2414 fl.fl4_src = inet_select_addr(dev_out, 0,
2415 fl.fl4_scope);
2416 else if (!oldflp->fl4_dst)
2417 fl.fl4_src = inet_select_addr(dev_out, 0,
2418 RT_SCOPE_HOST);
2419 }
2420 }
2421
2422 if (!fl.fl4_dst) {
2423 fl.fl4_dst = fl.fl4_src;
2424 if (!fl.fl4_dst)
2425 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2426 if (dev_out)
2427 dev_put(dev_out);
2428 dev_out = &loopback_dev;
2429 dev_hold(dev_out);
2430 fl.oif = loopback_dev.ifindex;
2431 res.type = RTN_LOCAL;
2432 flags |= RTCF_LOCAL;
2433 goto make_route;
2434 }
2435
2436 if (fib_lookup(&fl, &res)) {
2437 res.fi = NULL;
2438 if (oldflp->oif) {
2439		/* Apparently, the routing tables are wrong. Assume
2440		   that the destination is on-link.
2441
2442		   WHY? DW.
2443		   Because we are allowed to send to an iface
2444		   even if it has NO routes and NO assigned
2445		   addresses. When oif is specified, the routing
2446		   tables are looked up with only one purpose:
2447		   to detect whether the destination is gatewayed
2448		   rather than direct. Moreover, if MSG_DONTROUTE is set,
2449		   we send the packet, ignoring both the routing tables
2450		   and the ifaddr state. --ANK
2451
2452
2453		   We could do this even if oif is unknown,
2454		   as IPv6 likely does, but we do not.
2455		 */
2456
2457 if (fl.fl4_src == 0)
2458 fl.fl4_src = inet_select_addr(dev_out, 0,
2459 RT_SCOPE_LINK);
2460 res.type = RTN_UNICAST;
2461 goto make_route;
2462 }
2463 if (dev_out)
2464 dev_put(dev_out);
2465 err = -ENETUNREACH;
2466 goto out;
2467 }
2468 free_res = 1;
2469
2470 if (res.type == RTN_LOCAL) {
2471 if (!fl.fl4_src)
2472 fl.fl4_src = fl.fl4_dst;
2473 if (dev_out)
2474 dev_put(dev_out);
2475 dev_out = &loopback_dev;
2476 dev_hold(dev_out);
2477 fl.oif = dev_out->ifindex;
2478 if (res.fi)
2479 fib_info_put(res.fi);
2480 res.fi = NULL;
2481 flags |= RTCF_LOCAL;
2482 goto make_route;
2483 }
2484
2485#ifdef CONFIG_IP_ROUTE_MULTIPATH
2486 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2487 fib_select_multipath(&fl, &res);
2488 else
2489#endif
2490 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2491 fib_select_default(&fl, &res);
2492
2493 if (!fl.fl4_src)
2494 fl.fl4_src = FIB_RES_PREFSRC(res);
2495
2496 if (dev_out)
2497 dev_put(dev_out);
2498 dev_out = FIB_RES_DEV(res);
2499 dev_hold(dev_out);
2500 fl.oif = dev_out->ifindex;
2501
2502
2503make_route:
2504 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2505
2506
2507 if (free_res)
2508 fib_res_put(&res);
2509 if (dev_out)
2510 dev_put(dev_out);
2511out: return err;
2512}
2513
2514int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2515{
2516 unsigned hash;
2517 struct rtable *rth;
2518
2519 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2520
2521 rcu_read_lock_bh();
2522 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2523 rth = rcu_dereference(rth->u.rt_next)) {
2524 if (rth->fl.fl4_dst == flp->fl4_dst &&
2525 rth->fl.fl4_src == flp->fl4_src &&
2526 rth->fl.iif == 0 &&
2527 rth->fl.oif == flp->oif &&
2528#ifdef CONFIG_IP_ROUTE_FWMARK
2529 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2530#endif
2531 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2532 (IPTOS_RT_MASK | RTO_ONLINK))) {
2533
2534 /* check for multipath routes and choose one if
2535 * necessary
2536 */
2537 if (multipath_select_route(flp, rth, rp)) {
2538 dst_hold(&(*rp)->u.dst);
2539 RT_CACHE_STAT_INC(out_hit);
2540 rcu_read_unlock_bh();
2541 return 0;
2542 }
2543
2544 rth->u.dst.lastuse = jiffies;
2545 dst_hold(&rth->u.dst);
2546 rth->u.dst.__use++;
2547 RT_CACHE_STAT_INC(out_hit);
2548 rcu_read_unlock_bh();
2549 *rp = rth;
2550 return 0;
2551 }
2552 RT_CACHE_STAT_INC(out_hlist_search);
2553 }
2554 rcu_read_unlock_bh();
2555
2556 return ip_route_output_slow(rp, flp);
2557}
2558
2559int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2560{
2561 int err;
2562
2563 if ((err = __ip_route_output_key(rp, flp)) != 0)
2564 return err;
2565
2566 if (flp->proto) {
2567 if (!flp->fl4_src)
2568 flp->fl4_src = (*rp)->rt_src;
2569 if (!flp->fl4_dst)
2570 flp->fl4_dst = (*rp)->rt_dst;
2571 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2572 }
2573
2574 return 0;
2575}
2576
2577int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2578{
2579 return ip_route_output_flow(rp, flp, NULL, 0);
2580}
2581
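/*
 * Build a netlink routing message (RTM_NEWROUTE) describing the route
 * cache entry currently attached to skb->dst; used by inet_rtm_getroute()
 * and ip_rt_dump() below.
 */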
2582static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2583 int nowait)
2584{
2585 struct rtable *rt = (struct rtable*)skb->dst;
2586 struct rtmsg *r;
2587 struct nlmsghdr *nlh;
2588 unsigned char *b = skb->tail;
2589 struct rta_cacheinfo ci;
2590#ifdef CONFIG_IP_MROUTE
2591 struct rtattr *eptr;
2592#endif
2593 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2594 r = NLMSG_DATA(nlh);
2595 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2596 r->rtm_family = AF_INET;
2597 r->rtm_dst_len = 32;
2598 r->rtm_src_len = 0;
2599 r->rtm_tos = rt->fl.fl4_tos;
2600 r->rtm_table = RT_TABLE_MAIN;
2601 r->rtm_type = rt->rt_type;
2602 r->rtm_scope = RT_SCOPE_UNIVERSE;
2603 r->rtm_protocol = RTPROT_UNSPEC;
2604 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2605 if (rt->rt_flags & RTCF_NOTIFY)
2606 r->rtm_flags |= RTM_F_NOTIFY;
2607 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2608 if (rt->fl.fl4_src) {
2609 r->rtm_src_len = 32;
2610 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2611 }
2612 if (rt->u.dst.dev)
2613 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2614#ifdef CONFIG_NET_CLS_ROUTE
2615 if (rt->u.dst.tclassid)
2616 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2617#endif
2618#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2619 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2620 __u32 alg = rt->rt_multipath_alg;
2621
2622 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2623 }
2624#endif
2625 if (rt->fl.iif)
2626 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2627 else if (rt->rt_src != rt->fl.fl4_src)
2628 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2629 if (rt->rt_dst != rt->rt_gateway)
2630 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2631 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2632 goto rtattr_failure;
2633 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2634 ci.rta_used = rt->u.dst.__use;
2635 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2636 if (rt->u.dst.expires)
2637 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2638 else
2639 ci.rta_expires = 0;
2640 ci.rta_error = rt->u.dst.error;
2641 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2642 if (rt->peer) {
2643 ci.rta_id = rt->peer->ip_id_count;
2644 if (rt->peer->tcp_ts_stamp) {
2645 ci.rta_ts = rt->peer->tcp_ts;
2646 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2647 }
2648 }
2649#ifdef CONFIG_IP_MROUTE
2650 eptr = (struct rtattr*)skb->tail;
2651#endif
2652 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2653 if (rt->fl.iif) {
2654#ifdef CONFIG_IP_MROUTE
2655 u32 dst = rt->rt_dst;
2656
2657 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2658 ipv4_devconf.mc_forwarding) {
2659 int err = ipmr_get_route(skb, r, nowait);
2660 if (err <= 0) {
2661 if (!nowait) {
2662 if (err == 0)
2663 return 0;
2664 goto nlmsg_failure;
2665 } else {
2666 if (err == -EMSGSIZE)
2667 goto nlmsg_failure;
2668 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2669 }
2670 }
2671 } else
2672#endif
2673 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2674 }
2675
2676 nlh->nlmsg_len = skb->tail - b;
2677 return skb->len;
2678
2679nlmsg_failure:
2680rtattr_failure:
2681 skb_trim(skb, b - skb->data);
2682 return -1;
2683}
2684
2685int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2686{
2687 struct rtattr **rta = arg;
2688 struct rtmsg *rtm = NLMSG_DATA(nlh);
2689 struct rtable *rt = NULL;
2690 u32 dst = 0;
2691 u32 src = 0;
2692 int iif = 0;
2693 int err = -ENOBUFS;
2694 struct sk_buff *skb;
2695
2696 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2697 if (!skb)
2698 goto out;
2699
2700	/* Reserve room for dummy headers; this skb can pass
2701	   through a good chunk of the routing engine.
2702 */
2703 skb->mac.raw = skb->data;
2704 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2705
2706 if (rta[RTA_SRC - 1])
2707 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2708 if (rta[RTA_DST - 1])
2709 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2710 if (rta[RTA_IIF - 1])
2711 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2712
2713 if (iif) {
2714 struct net_device *dev = __dev_get_by_index(iif);
2715 err = -ENODEV;
2716 if (!dev)
2717 goto out_free;
2718 skb->protocol = htons(ETH_P_IP);
2719 skb->dev = dev;
2720 local_bh_disable();
2721 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2722 local_bh_enable();
2723 rt = (struct rtable*)skb->dst;
2724 if (!err && rt->u.dst.error)
2725 err = -rt->u.dst.error;
2726 } else {
2727 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2728 .saddr = src,
2729 .tos = rtm->rtm_tos } } };
2730 int oif = 0;
2731 if (rta[RTA_OIF - 1])
2732 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2733 fl.oif = oif;
2734 err = ip_route_output_key(&rt, &fl);
2735 }
2736 if (err)
2737 goto out_free;
2738
2739 skb->dst = &rt->u.dst;
2740 if (rtm->rtm_flags & RTM_F_NOTIFY)
2741 rt->rt_flags |= RTCF_NOTIFY;
2742
2743 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2744
2745 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2746 RTM_NEWROUTE, 0);
2747 if (!err)
2748 goto out_free;
2749 if (err < 0) {
2750 err = -EMSGSIZE;
2751 goto out_free;
2752 }
2753
2754 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2755 if (err > 0)
2756 err = 0;
2757out: return err;
2758
2759out_free:
2760 kfree_skb(skb);
2761 goto out;
2762}
2763
2764int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2765{
2766 struct rtable *rt;
2767 int h, s_h;
2768 int idx, s_idx;
2769
2770 s_h = cb->args[0];
2771 s_idx = idx = cb->args[1];
2772 for (h = 0; h <= rt_hash_mask; h++) {
2773 if (h < s_h) continue;
2774 if (h > s_h)
2775 s_idx = 0;
2776 rcu_read_lock_bh();
2777 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2778 rt = rcu_dereference(rt->u.rt_next), idx++) {
2779 if (idx < s_idx)
2780 continue;
2781 skb->dst = dst_clone(&rt->u.dst);
2782 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2783 cb->nlh->nlmsg_seq,
2784 RTM_NEWROUTE, 1) <= 0) {
2785 dst_release(xchg(&skb->dst, NULL));
2786 rcu_read_unlock_bh();
2787 goto done;
2788 }
2789 dst_release(xchg(&skb->dst, NULL));
2790 }
2791 rcu_read_unlock_bh();
2792 }
2793
2794done:
2795 cb->args[0] = h;
2796 cb->args[1] = idx;
2797 return skb->len;
2798}
2799
2800void ip_rt_multicast_event(struct in_device *in_dev)
2801{
2802 rt_cache_flush(0);
2803}
2804
2805#ifdef CONFIG_SYSCTL
2806static int flush_delay;
2807
2808static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2809 struct file *filp, void __user *buffer,
2810 size_t *lenp, loff_t *ppos)
2811{
2812 if (write) {
2813 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2814 rt_cache_flush(flush_delay);
2815 return 0;
2816 }
2817
2818 return -EINVAL;
2819}
2820
2821static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2822 int __user *name,
2823 int nlen,
2824 void __user *oldval,
2825 size_t __user *oldlenp,
2826 void __user *newval,
2827 size_t newlen,
2828 void **context)
2829{
2830 int delay;
2831 if (newlen != sizeof(int))
2832 return -EINVAL;
2833 if (get_user(delay, (int __user *)newval))
2834 return -EFAULT;
2835 rt_cache_flush(delay);
2836 return 0;
2837}
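/*
 * A usage sketch for the two flush handlers above: writing an integer to
 * /proc/sys/net/ipv4/route/flush (e.g. "echo 0 > /proc/sys/net/ipv4/route/flush")
 * stores it in flush_delay and flushes the routing cache with that delay,
 * while the strategy handler covers the binary sysctl(2) path; reading the
 * file simply returns -EINVAL.
 */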
2838
2839ctl_table ipv4_route_table[] = {
2840 {
2841 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2842 .procname = "flush",
2843 .data = &flush_delay,
2844 .maxlen = sizeof(int),
2845 .mode = 0644,
2846 .proc_handler = &ipv4_sysctl_rtcache_flush,
2847 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2848 },
2849 {
2850 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2851 .procname = "min_delay",
2852 .data = &ip_rt_min_delay,
2853 .maxlen = sizeof(int),
2854 .mode = 0644,
2855 .proc_handler = &proc_dointvec_jiffies,
2856 .strategy = &sysctl_jiffies,
2857 },
2858 {
2859 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2860 .procname = "max_delay",
2861 .data = &ip_rt_max_delay,
2862 .maxlen = sizeof(int),
2863 .mode = 0644,
2864 .proc_handler = &proc_dointvec_jiffies,
2865 .strategy = &sysctl_jiffies,
2866 },
2867 {
2868 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2869 .procname = "gc_thresh",
2870 .data = &ipv4_dst_ops.gc_thresh,
2871 .maxlen = sizeof(int),
2872 .mode = 0644,
2873 .proc_handler = &proc_dointvec,
2874 },
2875 {
2876 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2877 .procname = "max_size",
2878 .data = &ip_rt_max_size,
2879 .maxlen = sizeof(int),
2880 .mode = 0644,
2881 .proc_handler = &proc_dointvec,
2882 },
2883 {
2884 /* Deprecated. Use gc_min_interval_ms */
2885
2886 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2887 .procname = "gc_min_interval",
2888 .data = &ip_rt_gc_min_interval,
2889 .maxlen = sizeof(int),
2890 .mode = 0644,
2891 .proc_handler = &proc_dointvec_jiffies,
2892 .strategy = &sysctl_jiffies,
2893 },
2894 {
2895 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2896 .procname = "gc_min_interval_ms",
2897 .data = &ip_rt_gc_min_interval,
2898 .maxlen = sizeof(int),
2899 .mode = 0644,
2900 .proc_handler = &proc_dointvec_ms_jiffies,
2901 .strategy = &sysctl_ms_jiffies,
2902 },
2903 {
2904 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2905 .procname = "gc_timeout",
2906 .data = &ip_rt_gc_timeout,
2907 .maxlen = sizeof(int),
2908 .mode = 0644,
2909 .proc_handler = &proc_dointvec_jiffies,
2910 .strategy = &sysctl_jiffies,
2911 },
2912 {
2913 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2914 .procname = "gc_interval",
2915 .data = &ip_rt_gc_interval,
2916 .maxlen = sizeof(int),
2917 .mode = 0644,
2918 .proc_handler = &proc_dointvec_jiffies,
2919 .strategy = &sysctl_jiffies,
2920 },
2921 {
2922 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2923 .procname = "redirect_load",
2924 .data = &ip_rt_redirect_load,
2925 .maxlen = sizeof(int),
2926 .mode = 0644,
2927 .proc_handler = &proc_dointvec,
2928 },
2929 {
2930 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2931 .procname = "redirect_number",
2932 .data = &ip_rt_redirect_number,
2933 .maxlen = sizeof(int),
2934 .mode = 0644,
2935 .proc_handler = &proc_dointvec,
2936 },
2937 {
2938 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2939 .procname = "redirect_silence",
2940 .data = &ip_rt_redirect_silence,
2941 .maxlen = sizeof(int),
2942 .mode = 0644,
2943 .proc_handler = &proc_dointvec,
2944 },
2945 {
2946 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2947 .procname = "error_cost",
2948 .data = &ip_rt_error_cost,
2949 .maxlen = sizeof(int),
2950 .mode = 0644,
2951 .proc_handler = &proc_dointvec,
2952 },
2953 {
2954 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2955 .procname = "error_burst",
2956 .data = &ip_rt_error_burst,
2957 .maxlen = sizeof(int),
2958 .mode = 0644,
2959 .proc_handler = &proc_dointvec,
2960 },
2961 {
2962 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2963 .procname = "gc_elasticity",
2964 .data = &ip_rt_gc_elasticity,
2965 .maxlen = sizeof(int),
2966 .mode = 0644,
2967 .proc_handler = &proc_dointvec,
2968 },
2969 {
2970 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2971 .procname = "mtu_expires",
2972 .data = &ip_rt_mtu_expires,
2973 .maxlen = sizeof(int),
2974 .mode = 0644,
2975 .proc_handler = &proc_dointvec_jiffies,
2976 .strategy = &sysctl_jiffies,
2977 },
2978 {
2979 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2980 .procname = "min_pmtu",
2981 .data = &ip_rt_min_pmtu,
2982 .maxlen = sizeof(int),
2983 .mode = 0644,
2984 .proc_handler = &proc_dointvec,
2985 },
2986 {
2987 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2988 .procname = "min_adv_mss",
2989 .data = &ip_rt_min_advmss,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = &proc_dointvec,
2993 },
2994 {
2995 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2996 .procname = "secret_interval",
2997 .data = &ip_rt_secret_interval,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = &proc_dointvec_jiffies,
3001 .strategy = &sysctl_jiffies,
3002 },
3003 { .ctl_name = 0 }
3004};
3005#endif
3006
3007#ifdef CONFIG_NET_CLS_ROUTE
3008struct ip_rt_acct *ip_rt_acct;
3009
3010/* This code sucks. But you should have seen it before! --RR */
3011
3012/* IP route accounting ptr for this logical cpu number. */
3013#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
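/*
 * ip_rt_acct is laid out as NR_CPUS consecutive arrays of 256 accounting
 * slots, so IP_RT_ACCT_CPU(i) yields the base of cpu i's array; the /proc
 * reader below adds the per-cpu arrays together one 32-bit word at a time.
 */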
3014
3015#ifdef CONFIG_PROC_FS
3016static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3017 int length, int *eof, void *data)
3018{
3019 unsigned int i;
3020
3021 if ((offset & 3) || (length & 3))
3022 return -EIO;
3023
3024 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3025 *eof = 1;
3026 return 0;
3027 }
3028
3029 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3030 length = sizeof(struct ip_rt_acct) * 256 - offset;
3031 *eof = 1;
3032 }
3033
3034 offset /= sizeof(u32);
3035
3036 if (length > 0) {
3037 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3038 u32 *dst = (u32 *) buffer;
3039
3040 /* Copy first cpu. */
3041 *start = buffer;
3042 memcpy(dst, src, length);
3043
3044 /* Add the other cpus in, one int at a time */
3045 for_each_cpu(i) {
3046 unsigned int j;
3047
3048 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3049
3050 for (j = 0; j < length/4; j++)
3051 dst[j] += src[j];
3052 }
3053 }
3054 return length;
3055}
3056#endif /* CONFIG_PROC_FS */
3057#endif /* CONFIG_NET_CLS_ROUTE */
3058
3059static __initdata unsigned long rhash_entries;
3060static int __init set_rhash_entries(char *str)
3061{
3062 if (!str)
3063 return 0;
3064 rhash_entries = simple_strtoul(str, &str, 0);
3065 return 1;
3066}
3067__setup("rhash_entries=", set_rhash_entries);
3068
3069int __init ip_rt_init(void)
3070{
3071 int i, order, goal, rc = 0;
3072
3073 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3074 (jiffies ^ (jiffies >> 7)));
3075
3076#ifdef CONFIG_NET_CLS_ROUTE
3077 for (order = 0;
3078 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3079 /* NOTHING */;
3080 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3081 if (!ip_rt_acct)
3082 panic("IP: failed to allocate ip_rt_acct\n");
3083 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3084#endif
3085
3086 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3087 sizeof(struct rtable),
3088 0, SLAB_HWCACHE_ALIGN,
3089 NULL, NULL);
3090
3091 if (!ipv4_dst_ops.kmem_cachep)
3092 panic("IP: failed to allocate ip_dst_cache\n");
3093
3094 goal = num_physpages >> (26 - PAGE_SHIFT);
3095 if (rhash_entries)
3096 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
3097 for (order = 0; (1UL << order) < goal; order++)
3098 /* NOTHING */;
3099
3100 do {
3101 rt_hash_mask = (1UL << order) * PAGE_SIZE /
3102 sizeof(struct rt_hash_bucket);
3103 while (rt_hash_mask & (rt_hash_mask - 1))
3104 rt_hash_mask--;
3105 rt_hash_table = (struct rt_hash_bucket *)
3106 __get_free_pages(GFP_ATOMIC, order);
3107 } while (rt_hash_table == NULL && --order > 0);
3108
3109 if (!rt_hash_table)
3110 panic("Failed to allocate IP route cache hash table\n");
3111
3112 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3113 rt_hash_mask,
3114 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3115
3116 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3117 /* NOTHING */;
3118
3119 rt_hash_mask--;
3120 for (i = 0; i <= rt_hash_mask; i++) {
3121 spin_lock_init(&rt_hash_table[i].lock);
3122 rt_hash_table[i].chain = NULL;
3123 }
3124
3125 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3126 ip_rt_max_size = (rt_hash_mask + 1) * 16;
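	/*
	 * A worked sizing example for the code above, assuming a box with
	 * 512 MB of RAM, 4 KB pages and (hypothetically) 8-byte hash buckets:
	 *   goal = num_physpages >> (26 - PAGE_SHIFT) = 131072 >> 14 = 8 pages,
	 *   so order = 3 and the table holds 8 * 4096 / 8 = 4096 buckets
	 *   (already a power of two).  rt_hash_log then becomes 12,
	 *   rt_hash_mask ends up as 4095, gc_thresh as 4096 and
	 *   ip_rt_max_size as 65536 cached routes.
	 */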
3127
3128 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3129 if (!rt_cache_stat)
3130 return -ENOMEM;
3131
3132 devinet_init();
3133 ip_fib_init();
3134
3135 init_timer(&rt_flush_timer);
3136 rt_flush_timer.function = rt_run_flush;
3137 init_timer(&rt_periodic_timer);
3138 rt_periodic_timer.function = rt_check_expire;
3139 init_timer(&rt_secret_timer);
3140 rt_secret_timer.function = rt_secret_rebuild;
3141
3142	/* All the timers started at system startup tend
3143	   to synchronize. Perturb it a bit.
3144	 */
3145 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3146 ip_rt_gc_interval;
3147 add_timer(&rt_periodic_timer);
3148
3149 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3150 ip_rt_secret_interval;
3151 add_timer(&rt_secret_timer);
3152
3153#ifdef CONFIG_PROC_FS
3154 {
3155 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3156 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3157 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3158 proc_net_stat))) {
3159 free_percpu(rt_cache_stat);
3160 return -ENOMEM;
3161 }
3162 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3163 }
3164#ifdef CONFIG_NET_CLS_ROUTE
3165 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3166#endif
3167#endif
3168#ifdef CONFIG_XFRM
3169 xfrm_init();
3170 xfrm4_init();
3171#endif
3172 return rc;
3173}
3174
3175EXPORT_SYMBOL(__ip_select_ident);
3176EXPORT_SYMBOL(ip_route_input);
3177EXPORT_SYMBOL(ip_route_output_key);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
new file mode 100644
index 000000000000..e923d2f021aa
--- /dev/null
+++ b/net/ipv4/syncookies.c
@@ -0,0 +1,279 @@
1/*
2 * Syncookies implementation for the Linux kernel
3 *
4 * Copyright (C) 1997 Andi Kleen
5 * Based on ideas by D.J.Bernstein and Eric Schenk.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * $Id: syncookies.c,v 1.18 2002/02/01 22:01:04 davem Exp $
13 *
14 * Missing: IPv6 support.
15 */
16
17#include <linux/tcp.h>
18#include <linux/slab.h>
19#include <linux/random.h>
20#include <linux/cryptohash.h>
21#include <linux/kernel.h>
22#include <net/tcp.h>
23
24extern int sysctl_tcp_syncookies;
25
26static __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS];
27
28static __init int init_syncookies(void)
29{
30 get_random_bytes(syncookie_secret, sizeof(syncookie_secret));
31 return 0;
32}
33module_init(init_syncookies);
34
35#define COOKIEBITS 24 /* Upper bits store count */
36#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
37
38static u32 cookie_hash(u32 saddr, u32 daddr, u32 sport, u32 dport,
39 u32 count, int c)
40{
41 __u32 tmp[16 + 5 + SHA_WORKSPACE_WORDS];
42
43 memcpy(tmp + 3, syncookie_secret[c], sizeof(syncookie_secret[c]));
44 tmp[0] = saddr;
45 tmp[1] = daddr;
46 tmp[2] = (sport << 16) + dport;
47 tmp[3] = count;
48 sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
49
50 return tmp[17];
51}
52
53static __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr, __u16 sport,
54 __u16 dport, __u32 sseq, __u32 count,
55 __u32 data)
56{
57 /*
58 * Compute the secure sequence number.
59 * The output should be:
60 * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24)
61 * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24).
62 * Where sseq is their sequence number and count increases every
63 * minute by 1.
64 * As an extra hack, we add a small "data" value that encodes the
65 * MSS into the second hash value.
66 */
67
68 return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
69 sseq + (count << COOKIEBITS) +
70 ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
71 & COOKIEMASK));
72}
73
74/*
75 * This retrieves the small "data" value from the syncookie.
76 * If the syncookie is bad, the data returned will be out of
77 * range. This must be checked by the caller.
78 *
79 * The count value used to generate the cookie must be within
80 * "maxdiff" of the current (passed-in) "count". The return value
81 * is (__u32)-1 if this test fails.
82 */
83static __u32 check_tcp_syn_cookie(__u32 cookie, __u32 saddr, __u32 daddr,
84 __u16 sport, __u16 dport, __u32 sseq,
85 __u32 count, __u32 maxdiff)
86{
87 __u32 diff;
88
89 /* Strip away the layers from the cookie */
90 cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
91
92 /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
93 diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
94 if (diff >= maxdiff)
95 return (__u32)-1;
96
97 return (cookie -
98 cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
99 & COOKIEMASK; /* Leaving the data behind */
100}
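/*
 * A worked round trip through the two helpers above (illustrative only):
 * secure_tcp_syn_cookie() returns
 *     H1 + sseq + (count << COOKIEBITS) + ((H2(count) + data) & COOKIEMASK)
 * where H1/H2 are the two cookie_hash() values.  check_tcp_syn_cookie()
 * subtracts H1 + sseq, reads count (mod 256) back out of the top 8 bits,
 * and -- provided the current count is within maxdiff of it -- subtracts
 * H2(count) again, leaving the original "data" (the MSS table index)
 * modulo 2^COOKIEBITS.
 */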
101
102/*
103 * This table has to be sorted and terminated with (__u16)-1.
104 * XXX generate a better table.
105 * Unresolved Issues: HIPPI with a 64k MSS is not well supported.
106 */
107static __u16 const msstab[] = {
108 64 - 1,
109 256 - 1,
110 512 - 1,
111 536 - 1,
112 1024 - 1,
113 1440 - 1,
114 1460 - 1,
115 4312 - 1,
116 (__u16)-1
117};
118/* The number doesn't include the -1 terminator */
119#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
120
121/*
122 * Generate a syncookie. mssp points to the mss, which is returned
123 * rounded down to the value encoded in the cookie.
124 */
125__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
126{
127 struct tcp_sock *tp = tcp_sk(sk);
128 int mssind;
129 const __u16 mss = *mssp;
130
131
132 tp->last_synq_overflow = jiffies;
133
134 /* XXX sort msstab[] by probability? Binary search? */
135 for (mssind = 0; mss > msstab[mssind + 1]; mssind++)
136 ;
137 *mssp = msstab[mssind] + 1;
138
139 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT);
140
141 return secure_tcp_syn_cookie(skb->nh.iph->saddr, skb->nh.iph->daddr,
142 skb->h.th->source, skb->h.th->dest,
143 ntohl(skb->h.th->seq),
144 jiffies / (HZ * 60), mssind);
145}
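/*
 * Worked example of the rounding above: for an incoming MSS of 1400 the
 * loop stops at mssind = 4 (1400 > msstab[4] = 1023 but 1400 <= msstab[5]
 * = 1439), so the cookie encodes index 4 and *mssp is rounded down to
 * msstab[4] + 1 = 1024; cookie_check() below recovers the same 1024.
 */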
146
147/*
148 * This (misnamed) value is the maximum permitted age of a syncookie.
149 * Its ideal value should depend on TCP_TIMEOUT_INIT and
150 * sysctl_tcp_retries1. It's a rather complicated formula (exponential
151 * backoff) to compute at runtime, so it's currently hardcoded here.
152 */
153#define COUNTER_TRIES 4
154/*
155 * Check if an ack sequence number is a valid syncookie.
156 * Return the decoded mss if it is, or 0 if not.
157 */
158static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
159{
160 __u32 seq;
161 __u32 mssind;
162
163 seq = ntohl(skb->h.th->seq)-1;
164 mssind = check_tcp_syn_cookie(cookie,
165 skb->nh.iph->saddr, skb->nh.iph->daddr,
166 skb->h.th->source, skb->h.th->dest,
167 seq, jiffies / (HZ * 60), COUNTER_TRIES);
168
169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
170}
171
172extern struct or_calltable or_ipv4;
173
174static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
175 struct open_request *req,
176 struct dst_entry *dst)
177{
178 struct tcp_sock *tp = tcp_sk(sk);
179 struct sock *child;
180
181 child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
182 if (child)
183 tcp_acceptq_queue(sk, req, child);
184 else
185 tcp_openreq_free(req);
186
187 return child;
188}
189
190struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
191 struct ip_options *opt)
192{
193 struct tcp_sock *tp = tcp_sk(sk);
194 __u32 cookie = ntohl(skb->h.th->ack_seq) - 1;
195 struct sock *ret = sk;
196 struct open_request *req;
197 int mss;
198 struct rtable *rt;
199 __u8 rcv_wscale;
200
201 if (!sysctl_tcp_syncookies || !skb->h.th->ack)
202 goto out;
203
204 if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) ||
205 (mss = cookie_check(skb, cookie)) == 0) {
206 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESFAILED);
207 goto out;
208 }
209
210 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
211
212 req = tcp_openreq_alloc();
213 ret = NULL;
214 if (!req)
215 goto out;
216
217 req->rcv_isn = htonl(skb->h.th->seq) - 1;
218 req->snt_isn = cookie;
219 req->mss = mss;
220 req->rmt_port = skb->h.th->source;
221 req->af.v4_req.loc_addr = skb->nh.iph->daddr;
222 req->af.v4_req.rmt_addr = skb->nh.iph->saddr;
223	req->class = &or_ipv4; /* for safety */
224 req->af.v4_req.opt = NULL;
225
226	/* We threw the options of the initial SYN away, so we hope
227 * the ACK carries the same options again (see RFC1122 4.2.3.8)
228 */
229 if (opt && opt->optlen) {
230 int opt_size = sizeof(struct ip_options) + opt->optlen;
231
232 req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC);
233 if (req->af.v4_req.opt) {
234 if (ip_options_echo(req->af.v4_req.opt, skb)) {
235 kfree(req->af.v4_req.opt);
236 req->af.v4_req.opt = NULL;
237 }
238 }
239 }
240
241 req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0;
242 req->wscale_ok = req->sack_ok = 0;
243 req->expires = 0UL;
244 req->retrans = 0;
245
246 /*
247	 * We need to look up the route here to get at the correct
248	 * window size. Ideally we would make sure that the window size
249 * hasn't changed since we received the original syn, but I see
250 * no easy way to do this.
251 */
252 {
253 struct flowi fl = { .nl_u = { .ip4_u =
254 { .daddr = ((opt && opt->srr) ?
255 opt->faddr :
256 req->af.v4_req.rmt_addr),
257 .saddr = req->af.v4_req.loc_addr,
258 .tos = RT_CONN_FLAGS(sk) } },
259 .proto = IPPROTO_TCP,
260 .uli_u = { .ports =
261 { .sport = skb->h.th->dest,
262 .dport = skb->h.th->source } } };
263 if (ip_route_output_key(&rt, &fl)) {
264 tcp_openreq_free(req);
265 goto out;
266 }
267 }
268
269 /* Try to redo what tcp_v4_send_synack did. */
270 req->window_clamp = dst_metric(&rt->u.dst, RTAX_WINDOW);
271 tcp_select_initial_window(tcp_full_space(sk), req->mss,
272 &req->rcv_wnd, &req->window_clamp,
273 0, &rcv_wscale);
274 /* BTW win scale with syncookies is 0 by definition */
275 req->rcv_wscale = rcv_wscale;
276
277 ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
278out: return ret;
279}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
new file mode 100644
index 000000000000..3aafb298c1c1
--- /dev/null
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -0,0 +1,698 @@
1/*
2 * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
3 *
4 * $Id: sysctl_net_ipv4.c,v 1.50 2001/10/20 00:00:11 davem Exp $
5 *
6 * Begun April 1, 1996, Mike Shaver.
7 * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
8 */
9
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <linux/sysctl.h>
13#include <linux/config.h>
14#include <net/snmp.h>
15#include <net/ip.h>
16#include <net/route.h>
17#include <net/tcp.h>
18
19/* From af_inet.c */
20extern int sysctl_ip_nonlocal_bind;
21
22/* From icmp.c */
23extern int sysctl_icmp_echo_ignore_all;
24extern int sysctl_icmp_echo_ignore_broadcasts;
25extern int sysctl_icmp_ignore_bogus_error_responses;
26
27/* From ip_fragment.c */
28extern int sysctl_ipfrag_low_thresh;
29extern int sysctl_ipfrag_high_thresh;
30extern int sysctl_ipfrag_time;
31extern int sysctl_ipfrag_secret_interval;
32
33/* From ip_output.c */
34extern int sysctl_ip_dynaddr;
35
36/* From icmp.c */
37extern int sysctl_icmp_ratelimit;
38extern int sysctl_icmp_ratemask;
39
40/* From igmp.c */
41extern int sysctl_igmp_max_memberships;
42extern int sysctl_igmp_max_msf;
43
44/* From inetpeer.c */
45extern int inet_peer_threshold;
46extern int inet_peer_minttl;
47extern int inet_peer_maxttl;
48extern int inet_peer_gc_mintime;
49extern int inet_peer_gc_maxtime;
50
51#ifdef CONFIG_SYSCTL
52static int tcp_retr1_max = 255;
53static int ip_local_port_range_min[] = { 1, 1 };
54static int ip_local_port_range_max[] = { 65535, 65535 };
55#endif
56
57struct ipv4_config ipv4_config;
58
59extern ctl_table ipv4_route_table[];
60
61#ifdef CONFIG_SYSCTL
62
63static
64int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
65 void __user *buffer, size_t *lenp, loff_t *ppos)
66{
67 int val = ipv4_devconf.forwarding;
68 int ret;
69
70 ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
71
72 if (write && ipv4_devconf.forwarding != val)
73 inet_forward_change();
74
75 return ret;
76}
77
78static int ipv4_sysctl_forward_strategy(ctl_table *table,
79 int __user *name, int nlen,
80 void __user *oldval, size_t __user *oldlenp,
81 void __user *newval, size_t newlen,
82 void **context)
83{
84 int *valp = table->data;
85 int new;
86
87 if (!newval || !newlen)
88 return 0;
89
90 if (newlen != sizeof(int))
91 return -EINVAL;
92
93 if (get_user(new, (int __user *)newval))
94 return -EFAULT;
95
96 if (new == *valp)
97 return 0;
98
99 if (oldval && oldlenp) {
100 size_t len;
101
102 if (get_user(len, oldlenp))
103 return -EFAULT;
104
105 if (len) {
106 if (len > table->maxlen)
107 len = table->maxlen;
108 if (copy_to_user(oldval, valp, len))
109 return -EFAULT;
110 if (put_user(len, oldlenp))
111 return -EFAULT;
112 }
113 }
114
115 *valp = new;
116 inet_forward_change();
117 return 1;
118}
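/*
 * Usage sketch for the two forwarding handlers above: writing to
 * /proc/sys/net/ipv4/ip_forward (e.g. "echo 1 > /proc/sys/net/ipv4/ip_forward")
 * goes through proc_dointvec as usual, and whenever the stored value
 * actually changes, inet_forward_change() is called to propagate the new
 * setting to the per-device forwarding configuration; the strategy variant
 * does the same for the binary sysctl(2) interface.
 */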
119
120ctl_table ipv4_table[] = {
121 {
122 .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
123 .procname = "tcp_timestamps",
124 .data = &sysctl_tcp_timestamps,
125 .maxlen = sizeof(int),
126 .mode = 0644,
127 .proc_handler = &proc_dointvec
128 },
129 {
130 .ctl_name = NET_IPV4_TCP_WINDOW_SCALING,
131 .procname = "tcp_window_scaling",
132 .data = &sysctl_tcp_window_scaling,
133 .maxlen = sizeof(int),
134 .mode = 0644,
135 .proc_handler = &proc_dointvec
136 },
137 {
138 .ctl_name = NET_IPV4_TCP_SACK,
139 .procname = "tcp_sack",
140 .data = &sysctl_tcp_sack,
141 .maxlen = sizeof(int),
142 .mode = 0644,
143 .proc_handler = &proc_dointvec
144 },
145 {
146 .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE,
147 .procname = "tcp_retrans_collapse",
148 .data = &sysctl_tcp_retrans_collapse,
149 .maxlen = sizeof(int),
150 .mode = 0644,
151 .proc_handler = &proc_dointvec
152 },
153 {
154 .ctl_name = NET_IPV4_FORWARD,
155 .procname = "ip_forward",
156 .data = &ipv4_devconf.forwarding,
157 .maxlen = sizeof(int),
158 .mode = 0644,
159 .proc_handler = &ipv4_sysctl_forward,
160 .strategy = &ipv4_sysctl_forward_strategy
161 },
162 {
163 .ctl_name = NET_IPV4_DEFAULT_TTL,
164 .procname = "ip_default_ttl",
165 .data = &sysctl_ip_default_ttl,
166 .maxlen = sizeof(int),
167 .mode = 0644,
168 .proc_handler = &ipv4_doint_and_flush,
169 .strategy = &ipv4_doint_and_flush_strategy,
170 },
171 {
172 .ctl_name = NET_IPV4_AUTOCONFIG,
173 .procname = "ip_autoconfig",
174 .data = &ipv4_config.autoconfig,
175 .maxlen = sizeof(int),
176 .mode = 0644,
177 .proc_handler = &proc_dointvec
178 },
179 {
180 .ctl_name = NET_IPV4_NO_PMTU_DISC,
181 .procname = "ip_no_pmtu_disc",
182 .data = &ipv4_config.no_pmtu_disc,
183 .maxlen = sizeof(int),
184 .mode = 0644,
185 .proc_handler = &proc_dointvec
186 },
187 {
188 .ctl_name = NET_IPV4_NONLOCAL_BIND,
189 .procname = "ip_nonlocal_bind",
190 .data = &sysctl_ip_nonlocal_bind,
191 .maxlen = sizeof(int),
192 .mode = 0644,
193 .proc_handler = &proc_dointvec
194 },
195 {
196 .ctl_name = NET_IPV4_TCP_SYN_RETRIES,
197 .procname = "tcp_syn_retries",
198 .data = &sysctl_tcp_syn_retries,
199 .maxlen = sizeof(int),
200 .mode = 0644,
201 .proc_handler = &proc_dointvec
202 },
203 {
204 .ctl_name = NET_TCP_SYNACK_RETRIES,
205 .procname = "tcp_synack_retries",
206 .data = &sysctl_tcp_synack_retries,
207 .maxlen = sizeof(int),
208 .mode = 0644,
209 .proc_handler = &proc_dointvec
210 },
211 {
212 .ctl_name = NET_TCP_MAX_ORPHANS,
213 .procname = "tcp_max_orphans",
214 .data = &sysctl_tcp_max_orphans,
215 .maxlen = sizeof(int),
216 .mode = 0644,
217 .proc_handler = &proc_dointvec
218 },
219 {
220 .ctl_name = NET_TCP_MAX_TW_BUCKETS,
221 .procname = "tcp_max_tw_buckets",
222 .data = &sysctl_tcp_max_tw_buckets,
223 .maxlen = sizeof(int),
224 .mode = 0644,
225 .proc_handler = &proc_dointvec
226 },
227 {
228 .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH,
229 .procname = "ipfrag_high_thresh",
230 .data = &sysctl_ipfrag_high_thresh,
231 .maxlen = sizeof(int),
232 .mode = 0644,
233 .proc_handler = &proc_dointvec
234 },
235 {
236 .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
237 .procname = "ipfrag_low_thresh",
238 .data = &sysctl_ipfrag_low_thresh,
239 .maxlen = sizeof(int),
240 .mode = 0644,
241 .proc_handler = &proc_dointvec
242 },
243 {
244 .ctl_name = NET_IPV4_DYNADDR,
245 .procname = "ip_dynaddr",
246 .data = &sysctl_ip_dynaddr,
247 .maxlen = sizeof(int),
248 .mode = 0644,
249 .proc_handler = &proc_dointvec
250 },
251 {
252 .ctl_name = NET_IPV4_IPFRAG_TIME,
253 .procname = "ipfrag_time",
254 .data = &sysctl_ipfrag_time,
255 .maxlen = sizeof(int),
256 .mode = 0644,
257 .proc_handler = &proc_dointvec_jiffies,
258 .strategy = &sysctl_jiffies
259 },
260 {
261 .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME,
262 .procname = "tcp_keepalive_time",
263 .data = &sysctl_tcp_keepalive_time,
264 .maxlen = sizeof(int),
265 .mode = 0644,
266 .proc_handler = &proc_dointvec_jiffies,
267 .strategy = &sysctl_jiffies
268 },
269 {
270 .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES,
271 .procname = "tcp_keepalive_probes",
272 .data = &sysctl_tcp_keepalive_probes,
273 .maxlen = sizeof(int),
274 .mode = 0644,
275 .proc_handler = &proc_dointvec
276 },
277 {
278 .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL,
279 .procname = "tcp_keepalive_intvl",
280 .data = &sysctl_tcp_keepalive_intvl,
281 .maxlen = sizeof(int),
282 .mode = 0644,
283 .proc_handler = &proc_dointvec_jiffies,
284 .strategy = &sysctl_jiffies
285 },
286 {
287 .ctl_name = NET_IPV4_TCP_RETRIES1,
288 .procname = "tcp_retries1",
289 .data = &sysctl_tcp_retries1,
290 .maxlen = sizeof(int),
291 .mode = 0644,
292 .proc_handler = &proc_dointvec_minmax,
293 .strategy = &sysctl_intvec,
294 .extra2 = &tcp_retr1_max
295 },
296 {
297 .ctl_name = NET_IPV4_TCP_RETRIES2,
298 .procname = "tcp_retries2",
299 .data = &sysctl_tcp_retries2,
300 .maxlen = sizeof(int),
301 .mode = 0644,
302 .proc_handler = &proc_dointvec
303 },
304 {
305 .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT,
306 .procname = "tcp_fin_timeout",
307 .data = &sysctl_tcp_fin_timeout,
308 .maxlen = sizeof(int),
309 .mode = 0644,
310 .proc_handler = &proc_dointvec_jiffies,
311 .strategy = &sysctl_jiffies
312 },
313#ifdef CONFIG_SYN_COOKIES
314 {
315 .ctl_name = NET_TCP_SYNCOOKIES,
316 .procname = "tcp_syncookies",
317 .data = &sysctl_tcp_syncookies,
318 .maxlen = sizeof(int),
319 .mode = 0644,
320 .proc_handler = &proc_dointvec
321 },
322#endif
323 {
324 .ctl_name = NET_TCP_TW_RECYCLE,
325 .procname = "tcp_tw_recycle",
326 .data = &sysctl_tcp_tw_recycle,
327 .maxlen = sizeof(int),
328 .mode = 0644,
329 .proc_handler = &proc_dointvec
330 },
331 {
332 .ctl_name = NET_TCP_ABORT_ON_OVERFLOW,
333 .procname = "tcp_abort_on_overflow",
334 .data = &sysctl_tcp_abort_on_overflow,
335 .maxlen = sizeof(int),
336 .mode = 0644,
337 .proc_handler = &proc_dointvec
338 },
339 {
340 .ctl_name = NET_TCP_STDURG,
341 .procname = "tcp_stdurg",
342 .data = &sysctl_tcp_stdurg,
343 .maxlen = sizeof(int),
344 .mode = 0644,
345 .proc_handler = &proc_dointvec
346 },
347 {
348 .ctl_name = NET_TCP_RFC1337,
349 .procname = "tcp_rfc1337",
350 .data = &sysctl_tcp_rfc1337,
351 .maxlen = sizeof(int),
352 .mode = 0644,
353 .proc_handler = &proc_dointvec
354 },
355 {
356 .ctl_name = NET_TCP_MAX_SYN_BACKLOG,
357 .procname = "tcp_max_syn_backlog",
358 .data = &sysctl_max_syn_backlog,
359 .maxlen = sizeof(int),
360 .mode = 0644,
361 .proc_handler = &proc_dointvec
362 },
363 {
364 .ctl_name = NET_IPV4_LOCAL_PORT_RANGE,
365 .procname = "ip_local_port_range",
366 .data = &sysctl_local_port_range,
367 .maxlen = sizeof(sysctl_local_port_range),
368 .mode = 0644,
369 .proc_handler = &proc_dointvec_minmax,
370 .strategy = &sysctl_intvec,
371 .extra1 = ip_local_port_range_min,
372 .extra2 = ip_local_port_range_max
373 },
374 {
375 .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL,
376 .procname = "icmp_echo_ignore_all",
377 .data = &sysctl_icmp_echo_ignore_all,
378 .maxlen = sizeof(int),
379 .mode = 0644,
380 .proc_handler = &proc_dointvec
381 },
382 {
383 .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS,
384 .procname = "icmp_echo_ignore_broadcasts",
385 .data = &sysctl_icmp_echo_ignore_broadcasts,
386 .maxlen = sizeof(int),
387 .mode = 0644,
388 .proc_handler = &proc_dointvec
389 },
390 {
391 .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,
392 .procname = "icmp_ignore_bogus_error_responses",
393 .data = &sysctl_icmp_ignore_bogus_error_responses,
394 .maxlen = sizeof(int),
395 .mode = 0644,
396 .proc_handler = &proc_dointvec
397 },
398 {
399 .ctl_name = NET_IPV4_ROUTE,
400 .procname = "route",
401 .maxlen = 0,
402 .mode = 0555,
403 .child = ipv4_route_table
404 },
405#ifdef CONFIG_IP_MULTICAST
406 {
407 .ctl_name = NET_IPV4_IGMP_MAX_MEMBERSHIPS,
408 .procname = "igmp_max_memberships",
409 .data = &sysctl_igmp_max_memberships,
410 .maxlen = sizeof(int),
411 .mode = 0644,
412 .proc_handler = &proc_dointvec
413 },
414
415#endif
416 {
417 .ctl_name = NET_IPV4_IGMP_MAX_MSF,
418 .procname = "igmp_max_msf",
419 .data = &sysctl_igmp_max_msf,
420 .maxlen = sizeof(int),
421 .mode = 0644,
422 .proc_handler = &proc_dointvec
423 },
424 {
425 .ctl_name = NET_IPV4_INET_PEER_THRESHOLD,
426 .procname = "inet_peer_threshold",
427 .data = &inet_peer_threshold,
428 .maxlen = sizeof(int),
429 .mode = 0644,
430 .proc_handler = &proc_dointvec
431 },
432 {
433 .ctl_name = NET_IPV4_INET_PEER_MINTTL,
434 .procname = "inet_peer_minttl",
435 .data = &inet_peer_minttl,
436 .maxlen = sizeof(int),
437 .mode = 0644,
438 .proc_handler = &proc_dointvec_jiffies,
439 .strategy = &sysctl_jiffies
440 },
441 {
442 .ctl_name = NET_IPV4_INET_PEER_MAXTTL,
443 .procname = "inet_peer_maxttl",
444 .data = &inet_peer_maxttl,
445 .maxlen = sizeof(int),
446 .mode = 0644,
447 .proc_handler = &proc_dointvec_jiffies,
448 .strategy = &sysctl_jiffies
449 },
450 {
451 .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME,
452 .procname = "inet_peer_gc_mintime",
453 .data = &inet_peer_gc_mintime,
454 .maxlen = sizeof(int),
455 .mode = 0644,
456 .proc_handler = &proc_dointvec_jiffies,
457 .strategy = &sysctl_jiffies
458 },
459 {
460 .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME,
461 .procname = "inet_peer_gc_maxtime",
462 .data = &inet_peer_gc_maxtime,
463 .maxlen = sizeof(int),
464 .mode = 0644,
465 .proc_handler = &proc_dointvec_jiffies,
466 .strategy = &sysctl_jiffies
467 },
468 {
469 .ctl_name = NET_TCP_ORPHAN_RETRIES,
470 .procname = "tcp_orphan_retries",
471 .data = &sysctl_tcp_orphan_retries,
472 .maxlen = sizeof(int),
473 .mode = 0644,
474 .proc_handler = &proc_dointvec
475 },
476 {
477 .ctl_name = NET_TCP_FACK,
478 .procname = "tcp_fack",
479 .data = &sysctl_tcp_fack,
480 .maxlen = sizeof(int),
481 .mode = 0644,
482 .proc_handler = &proc_dointvec
483 },
484 {
485 .ctl_name = NET_TCP_REORDERING,
486 .procname = "tcp_reordering",
487 .data = &sysctl_tcp_reordering,
488 .maxlen = sizeof(int),
489 .mode = 0644,
490 .proc_handler = &proc_dointvec
491 },
492 {
493 .ctl_name = NET_TCP_ECN,
494 .procname = "tcp_ecn",
495 .data = &sysctl_tcp_ecn,
496 .maxlen = sizeof(int),
497 .mode = 0644,
498 .proc_handler = &proc_dointvec
499 },
500 {
501 .ctl_name = NET_TCP_DSACK,
502 .procname = "tcp_dsack",
503 .data = &sysctl_tcp_dsack,
504 .maxlen = sizeof(int),
505 .mode = 0644,
506 .proc_handler = &proc_dointvec
507 },
508 {
509 .ctl_name = NET_TCP_MEM,
510 .procname = "tcp_mem",
511 .data = &sysctl_tcp_mem,
512 .maxlen = sizeof(sysctl_tcp_mem),
513 .mode = 0644,
514 .proc_handler = &proc_dointvec
515 },
516 {
517 .ctl_name = NET_TCP_WMEM,
518 .procname = "tcp_wmem",
519 .data = &sysctl_tcp_wmem,
520 .maxlen = sizeof(sysctl_tcp_wmem),
521 .mode = 0644,
522 .proc_handler = &proc_dointvec
523 },
524 {
525 .ctl_name = NET_TCP_RMEM,
526 .procname = "tcp_rmem",
527 .data = &sysctl_tcp_rmem,
528 .maxlen = sizeof(sysctl_tcp_rmem),
529 .mode = 0644,
530 .proc_handler = &proc_dointvec
531 },
532 {
533 .ctl_name = NET_TCP_APP_WIN,
534 .procname = "tcp_app_win",
535 .data = &sysctl_tcp_app_win,
536 .maxlen = sizeof(int),
537 .mode = 0644,
538 .proc_handler = &proc_dointvec
539 },
540 {
541 .ctl_name = NET_TCP_ADV_WIN_SCALE,
542 .procname = "tcp_adv_win_scale",
543 .data = &sysctl_tcp_adv_win_scale,
544 .maxlen = sizeof(int),
545 .mode = 0644,
546 .proc_handler = &proc_dointvec
547 },
548 {
549 .ctl_name = NET_IPV4_ICMP_RATELIMIT,
550 .procname = "icmp_ratelimit",
551 .data = &sysctl_icmp_ratelimit,
552 .maxlen = sizeof(int),
553 .mode = 0644,
554 .proc_handler = &proc_dointvec
555 },
556 {
557 .ctl_name = NET_IPV4_ICMP_RATEMASK,
558 .procname = "icmp_ratemask",
559 .data = &sysctl_icmp_ratemask,
560 .maxlen = sizeof(int),
561 .mode = 0644,
562 .proc_handler = &proc_dointvec
563 },
564 {
565 .ctl_name = NET_TCP_TW_REUSE,
566 .procname = "tcp_tw_reuse",
567 .data = &sysctl_tcp_tw_reuse,
568 .maxlen = sizeof(int),
569 .mode = 0644,
570 .proc_handler = &proc_dointvec
571 },
572 {
573 .ctl_name = NET_TCP_FRTO,
574 .procname = "tcp_frto",
575 .data = &sysctl_tcp_frto,
576 .maxlen = sizeof(int),
577 .mode = 0644,
578 .proc_handler = &proc_dointvec
579 },
580 {
581 .ctl_name = NET_TCP_LOW_LATENCY,
582 .procname = "tcp_low_latency",
583 .data = &sysctl_tcp_low_latency,
584 .maxlen = sizeof(int),
585 .mode = 0644,
586 .proc_handler = &proc_dointvec
587 },
588 {
589 .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL,
590 .procname = "ipfrag_secret_interval",
591 .data = &sysctl_ipfrag_secret_interval,
592 .maxlen = sizeof(int),
593 .mode = 0644,
594 .proc_handler = &proc_dointvec_jiffies,
595 .strategy = &sysctl_jiffies
596 },
597 {
598 .ctl_name = NET_TCP_NO_METRICS_SAVE,
599 .procname = "tcp_no_metrics_save",
600 .data = &sysctl_tcp_nometrics_save,
601 .maxlen = sizeof(int),
602 .mode = 0644,
603 .proc_handler = &proc_dointvec,
604 },
605 {
606 .ctl_name = NET_TCP_WESTWOOD,
607 .procname = "tcp_westwood",
608 .data = &sysctl_tcp_westwood,
609 .maxlen = sizeof(int),
610 .mode = 0644,
611 .proc_handler = &proc_dointvec,
612 },
613 {
614 .ctl_name = NET_TCP_VEGAS,
615 .procname = "tcp_vegas_cong_avoid",
616 .data = &sysctl_tcp_vegas_cong_avoid,
617 .maxlen = sizeof(int),
618 .mode = 0644,
619 .proc_handler = &proc_dointvec,
620 },
621 {
622 .ctl_name = NET_TCP_VEGAS_ALPHA,
623 .procname = "tcp_vegas_alpha",
624 .data = &sysctl_tcp_vegas_alpha,
625 .maxlen = sizeof(int),
626 .mode = 0644,
627 .proc_handler = &proc_dointvec,
628 },
629 {
630 .ctl_name = NET_TCP_VEGAS_BETA,
631 .procname = "tcp_vegas_beta",
632 .data = &sysctl_tcp_vegas_beta,
633 .maxlen = sizeof(int),
634 .mode = 0644,
635 .proc_handler = &proc_dointvec,
636 },
637 {
638 .ctl_name = NET_TCP_VEGAS_GAMMA,
639 .procname = "tcp_vegas_gamma",
640 .data = &sysctl_tcp_vegas_gamma,
641 .maxlen = sizeof(int),
642 .mode = 0644,
643 .proc_handler = &proc_dointvec,
644 },
645 {
646 .ctl_name = NET_TCP_BIC,
647 .procname = "tcp_bic",
648 .data = &sysctl_tcp_bic,
649 .maxlen = sizeof(int),
650 .mode = 0644,
651 .proc_handler = &proc_dointvec,
652 },
653 {
654 .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
655 .procname = "tcp_bic_fast_convergence",
656 .data = &sysctl_tcp_bic_fast_convergence,
657 .maxlen = sizeof(int),
658 .mode = 0644,
659 .proc_handler = &proc_dointvec,
660 },
661 {
662 .ctl_name = NET_TCP_BIC_LOW_WINDOW,
663 .procname = "tcp_bic_low_window",
664 .data = &sysctl_tcp_bic_low_window,
665 .maxlen = sizeof(int),
666 .mode = 0644,
667 .proc_handler = &proc_dointvec,
668 },
669 {
670 .ctl_name = NET_TCP_MODERATE_RCVBUF,
671 .procname = "tcp_moderate_rcvbuf",
672 .data = &sysctl_tcp_moderate_rcvbuf,
673 .maxlen = sizeof(int),
674 .mode = 0644,
675 .proc_handler = &proc_dointvec,
676 },
677 {
678 .ctl_name = NET_TCP_TSO_WIN_DIVISOR,
679 .procname = "tcp_tso_win_divisor",
680 .data = &sysctl_tcp_tso_win_divisor,
681 .maxlen = sizeof(int),
682 .mode = 0644,
683 .proc_handler = &proc_dointvec,
684 },
685 {
686 .ctl_name = NET_TCP_BIC_BETA,
687 .procname = "tcp_bic_beta",
688 .data = &sysctl_tcp_bic_beta,
689 .maxlen = sizeof(int),
690 .mode = 0644,
691 .proc_handler = &proc_dointvec,
692 },
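	/* An all-zero entry terminates the table for the sysctl registration code. */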
693 { .ctl_name = 0 }
694};
695
696#endif /* CONFIG_SYSCTL */
697
698EXPORT_SYMBOL(ipv4_config);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
new file mode 100644
index 000000000000..5cff56af7855
--- /dev/null
+++ b/net/ipv4/tcp.c
@@ -0,0 +1,2386 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed where wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
 99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
196 * improvement.
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
207 * Hirokazu Takahashi : Use copy_from_user() instead of
208 * csum_and_copy_from_user() if possible.
209 *
210 * This program is free software; you can redistribute it and/or
211 * modify it under the terms of the GNU General Public License
212 * as published by the Free Software Foundation; either version
 213 * 2 of the License, or (at your option) any later version.
214 *
215 * Description of States:
216 *
217 * TCP_SYN_SENT sent a connection request, waiting for ack
218 *
219 * TCP_SYN_RECV received a connection request, sent ack,
220 * waiting for final ack in three-way handshake.
221 *
222 * TCP_ESTABLISHED connection established
223 *
224 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
225 * transmission of remaining buffered data
226 *
227 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
228 * to shutdown
229 *
230 * TCP_CLOSING both sides have shutdown but we still have
231 * data we have to finish sending
232 *
233 * TCP_TIME_WAIT timeout to catch resent junk before entering
234 * closed, can only be entered from FIN_WAIT2
235 * or CLOSING. Required because the other end
236 * may not have gotten our last ACK causing it
237 * to retransmit the data packet (which we ignore)
238 *
239 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
240 * us to finish writing our data and to shutdown
241 * (we have to close() to move on to LAST_ACK)
242 *
 243 * TCP_LAST_ACK our side has shutdown after remote has
244 * shutdown. There may still be data in our
245 * buffer that we have to finish sending
246 *
247 * TCP_CLOSE socket is finished
248 */
249
250#include <linux/config.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/smp_lock.h>
257#include <linux/fs.h>
258#include <linux/random.h>
259#include <linux/bootmem.h>
260
261#include <net/icmp.h>
262#include <net/tcp.h>
263#include <net/xfrm.h>
264#include <net/ip.h>
265
266
267#include <asm/uaccess.h>
268#include <asm/ioctls.h>
269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274kmem_cache_t *tcp_openreq_cachep;
275kmem_cache_t *tcp_bucket_cachep;
276kmem_cache_t *tcp_timewait_cachep;
277
278atomic_t tcp_orphan_count = ATOMIC_INIT(0);
279
280int sysctl_tcp_mem[3];
281int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
282int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
283
284EXPORT_SYMBOL(sysctl_tcp_mem);
285EXPORT_SYMBOL(sysctl_tcp_rmem);
286EXPORT_SYMBOL(sysctl_tcp_wmem);
287
288atomic_t tcp_memory_allocated; /* Current allocated memory. */
289atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
290
291EXPORT_SYMBOL(tcp_memory_allocated);
292EXPORT_SYMBOL(tcp_sockets_allocated);
293
294/*
295 * Pressure flag: try to collapse.
 296 * Technical note: it is used by multiple contexts non-atomically.
 297 * All of sk_stream_mem_schedule() is of this nature: accounting
 298 * is strict, actions are advisory and have some latency.
299 */
300int tcp_memory_pressure;
301
302EXPORT_SYMBOL(tcp_memory_pressure);
303
304void tcp_enter_memory_pressure(void)
305{
306 if (!tcp_memory_pressure) {
307 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
308 tcp_memory_pressure = 1;
309 }
310}
311
312EXPORT_SYMBOL(tcp_enter_memory_pressure);
313
314/*
315 * LISTEN is a special case for poll..
316 */
317static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
318 poll_table *wait)
319{
320 return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
321}
322
323/*
324 * Wait for a TCP event.
325 *
326 * Note that we don't need to lock the socket, as the upper poll layers
327 * take care of normal races (between the test and the event) and we don't
328 * go look at any of the socket buffers directly.
329 */
330unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
331{
332 unsigned int mask;
333 struct sock *sk = sock->sk;
334 struct tcp_sock *tp = tcp_sk(sk);
335
336 poll_wait(file, sk->sk_sleep, wait);
337 if (sk->sk_state == TCP_LISTEN)
338 return tcp_listen_poll(sk, wait);
339
340 /* Socket is not locked. We are protected from async events
341 by poll logic and correct handling of state changes
 342 made by other threads is impossible in any case.
343 */
344
345 mask = 0;
346 if (sk->sk_err)
347 mask = POLLERR;
348
349 /*
350 * POLLHUP is certainly not done right. But poll() doesn't
351 * have a notion of HUP in just one direction, and for a
352 * socket the read side is more interesting.
353 *
354 * Some poll() documentation says that POLLHUP is incompatible
 355 * with the POLLOUT/POLLWR flags, so somebody should check all of
 356 * this. But be careful: it tends to be safer to return too many
357 * bits than too few, and you can easily break real applications
358 * if you don't tell them that something has hung up!
359 *
360 * Check-me.
361 *
 362 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
 363 * our fs/select.c). It means that after we receive EOF,
 364 * poll always returns immediately, making it impossible to poll()
 365 * for write() in state CLOSE_WAIT. One solution is evident --- to set
 366 * POLLHUP if and only if shutdown has been made in both directions.
 367 * Actually, it is interesting to look at how Solaris and DUX
 368 * solve this dilemma. I would prefer it if POLLHUP were maskable;
 369 * then we could set it on SND_SHUTDOWN. BTW the examples given
 370 * in Stevens' books assume exactly this behaviour, which explains
 371 * why POLLHUP is incompatible with POLLOUT. --ANK
372 *
373 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
374 * blocking on fresh not-connected or disconnected socket. --ANK
375 */
376 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
377 mask |= POLLHUP;
378 if (sk->sk_shutdown & RCV_SHUTDOWN)
379 mask |= POLLIN | POLLRDNORM;
380
381 /* Connected? */
382 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 383 /* Potential race condition. If the read of tp below is
 384 * reordered above the read of sk->sk_state, we can be spuriously
 385 * woken in SYN_* states. */
386 if ((tp->rcv_nxt != tp->copied_seq) &&
387 (tp->urg_seq != tp->copied_seq ||
388 tp->rcv_nxt != tp->copied_seq + 1 ||
389 sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
390 mask |= POLLIN | POLLRDNORM;
391
392 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
393 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
394 mask |= POLLOUT | POLLWRNORM;
395 } else { /* send SIGIO later */
396 set_bit(SOCK_ASYNC_NOSPACE,
397 &sk->sk_socket->flags);
398 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
399
400 /* Race breaker. If space is freed after
401 * wspace test but before the flags are set,
402 * IO signal will be lost.
403 */
404 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
405 mask |= POLLOUT | POLLWRNORM;
406 }
407 }
408
409 if (tp->urg_data & TCP_URG_VALID)
410 mask |= POLLPRI;
411 }
412 return mask;
413}
414
415int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
416{
417 struct tcp_sock *tp = tcp_sk(sk);
418 int answ;
419
420 switch (cmd) {
421 case SIOCINQ:
422 if (sk->sk_state == TCP_LISTEN)
423 return -EINVAL;
424
425 lock_sock(sk);
426 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
427 answ = 0;
428 else if (sock_flag(sk, SOCK_URGINLINE) ||
429 !tp->urg_data ||
430 before(tp->urg_seq, tp->copied_seq) ||
431 !before(tp->urg_seq, tp->rcv_nxt)) {
432 answ = tp->rcv_nxt - tp->copied_seq;
433
434 /* Subtract 1, if FIN is in queue. */
435 if (answ && !skb_queue_empty(&sk->sk_receive_queue))
436 answ -=
437 ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
438 } else
439 answ = tp->urg_seq - tp->copied_seq;
440 release_sock(sk);
441 break;
442 case SIOCATMARK:
443 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
444 break;
445 case SIOCOUTQ:
446 if (sk->sk_state == TCP_LISTEN)
447 return -EINVAL;
448
449 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
450 answ = 0;
451 else
452 answ = tp->write_seq - tp->snd_una;
453 break;
454 default:
455 return -ENOIOCTLCMD;
456 };
457
458 return put_user(answ, (int __user *)arg);
459}
460
461
462int tcp_listen_start(struct sock *sk)
463{
464 struct inet_sock *inet = inet_sk(sk);
465 struct tcp_sock *tp = tcp_sk(sk);
466 struct tcp_listen_opt *lopt;
467
468 sk->sk_max_ack_backlog = 0;
469 sk->sk_ack_backlog = 0;
470 tp->accept_queue = tp->accept_queue_tail = NULL;
471 rwlock_init(&tp->syn_wait_lock);
472 tcp_delack_init(tp);
473
474 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
475 if (!lopt)
476 return -ENOMEM;
477
478 memset(lopt, 0, sizeof(struct tcp_listen_opt));
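	/* Size the SYN queue: pick the smallest power of two, no smaller
	 * than 64, that covers sysctl_max_syn_backlog. */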
479 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
480 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
481 break;
482 get_random_bytes(&lopt->hash_rnd, 4);
483
484 write_lock_bh(&tp->syn_wait_lock);
485 tp->listen_opt = lopt;
486 write_unlock_bh(&tp->syn_wait_lock);
487
 488 /* There is a race window here: we announce ourselves listening,
 489 * but this transition is still not validated by get_port().
 490 * It is OK, because this socket enters the hash table only
 491 * after validation is complete.
 492 */
493 sk->sk_state = TCP_LISTEN;
494 if (!sk->sk_prot->get_port(sk, inet->num)) {
495 inet->sport = htons(inet->num);
496
497 sk_dst_reset(sk);
498 sk->sk_prot->hash(sk);
499
500 return 0;
501 }
502
503 sk->sk_state = TCP_CLOSE;
504 write_lock_bh(&tp->syn_wait_lock);
505 tp->listen_opt = NULL;
506 write_unlock_bh(&tp->syn_wait_lock);
507 kfree(lopt);
508 return -EADDRINUSE;
509}
510
511/*
512 * This routine closes sockets which have been at least partially
513 * opened, but not yet accepted.
514 */
515
516static void tcp_listen_stop (struct sock *sk)
517{
518 struct tcp_sock *tp = tcp_sk(sk);
519 struct tcp_listen_opt *lopt = tp->listen_opt;
520 struct open_request *acc_req = tp->accept_queue;
521 struct open_request *req;
522 int i;
523
524 tcp_delete_keepalive_timer(sk);
525
526 /* make all the listen_opt local to us */
527 write_lock_bh(&tp->syn_wait_lock);
528 tp->listen_opt = NULL;
529 write_unlock_bh(&tp->syn_wait_lock);
530 tp->accept_queue = tp->accept_queue_tail = NULL;
531
532 if (lopt->qlen) {
533 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
534 while ((req = lopt->syn_table[i]) != NULL) {
535 lopt->syn_table[i] = req->dl_next;
536 lopt->qlen--;
537 tcp_openreq_free(req);
538
 539 /* Following the specs, it would be better either to send a FIN
 540 * (and enter FIN-WAIT-1, i.e. a normal close)
 541 * or to send an active reset (abort).
 542 * Certainly, that is pretty dangerous during a synflood, but it is
 543 * a bad justification for our negligence 8)
 544 * To be honest, we are not able to implement either
 545 * of the variants now. --ANK
 546 */
547 }
548 }
549 }
550 BUG_TRAP(!lopt->qlen);
551
552 kfree(lopt);
553
554 while ((req = acc_req) != NULL) {
555 struct sock *child = req->sk;
556
557 acc_req = req->dl_next;
558
559 local_bh_disable();
560 bh_lock_sock(child);
561 BUG_TRAP(!sock_owned_by_user(child));
562 sock_hold(child);
563
564 tcp_disconnect(child, O_NONBLOCK);
565
566 sock_orphan(child);
567
568 atomic_inc(&tcp_orphan_count);
569
570 tcp_destroy_sock(child);
571
572 bh_unlock_sock(child);
573 local_bh_enable();
574 sock_put(child);
575
576 sk_acceptq_removed(sk);
577 tcp_openreq_fastfree(req);
578 }
579 BUG_TRAP(!sk->sk_ack_backlog);
580}
581
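/* Mark this skb with PSH and remember how far we have pushed, so that
 * forced_push() can tell when the next push is due.
 */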
582static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
583{
584 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
585 tp->pushed_seq = tp->write_seq;
586}
587
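/* Force a push once more than half of the peer's largest observed window
 * has been queued since the last pushed byte.
 */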
588static inline int forced_push(struct tcp_sock *tp)
589{
590 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
591}
592
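/* Append a freshly allocated skb to the write queue: initialise its control
 * block, charge it to the socket, and make it the send head if nothing was
 * pending.
 */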
593static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
594 struct sk_buff *skb)
595{
596 skb->csum = 0;
597 TCP_SKB_CB(skb)->seq = tp->write_seq;
598 TCP_SKB_CB(skb)->end_seq = tp->write_seq;
599 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
600 TCP_SKB_CB(skb)->sacked = 0;
601 skb_header_release(skb);
602 __skb_queue_tail(&sk->sk_write_queue, skb);
603 sk_charge_skb(sk, skb);
604 if (!sk->sk_send_head)
605 sk->sk_send_head = skb;
606 else if (tp->nonagle&TCP_NAGLE_PUSH)
607 tp->nonagle &= ~TCP_NAGLE_PUSH;
608}
609
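/* For MSG_OOB sends, enter urgent mode and advance the urgent pointer to the
 * end of the data queued so far.
 */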
610static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
611 struct sk_buff *skb)
612{
613 if (flags & MSG_OOB) {
614 tp->urg_mode = 1;
615 tp->snd_up = tp->write_seq;
616 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
617 }
618}
619
620static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
621 int mss_now, int nonagle)
622{
623 if (sk->sk_send_head) {
624 struct sk_buff *skb = sk->sk_write_queue.prev;
625 if (!(flags & MSG_MORE) || forced_push(tp))
626 tcp_mark_push(tp, skb);
627 tcp_mark_urg(tp, flags, skb);
628 __tcp_push_pending_frames(sk, tp, mss_now,
629 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
630 }
631}
632
633static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
634 size_t psize, int flags)
635{
636 struct tcp_sock *tp = tcp_sk(sk);
637 int mss_now;
638 int err;
639 ssize_t copied;
640 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
641
642 /* Wait for a connection to finish. */
643 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
644 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
645 goto out_err;
646
647 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
648
649 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
650 copied = 0;
651
652 err = -EPIPE;
653 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
654 goto do_error;
655
656 while (psize > 0) {
657 struct sk_buff *skb = sk->sk_write_queue.prev;
658 struct page *page = pages[poffset / PAGE_SIZE];
659 int copy, i, can_coalesce;
660 int offset = poffset % PAGE_SIZE;
661 int size = min_t(size_t, psize, PAGE_SIZE - offset);
662
663 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
664new_segment:
665 if (!sk_stream_memory_free(sk))
666 goto wait_for_sndbuf;
667
668 skb = sk_stream_alloc_pskb(sk, 0, 0,
669 sk->sk_allocation);
670 if (!skb)
671 goto wait_for_memory;
672
673 skb_entail(sk, tp, skb);
674 copy = mss_now;
675 }
676
677 if (copy > size)
678 copy = size;
679
680 i = skb_shinfo(skb)->nr_frags;
681 can_coalesce = skb_can_coalesce(skb, i, page, offset);
682 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
683 tcp_mark_push(tp, skb);
684 goto new_segment;
685 }
686 if (sk->sk_forward_alloc < copy &&
687 !sk_stream_mem_schedule(sk, copy, 0))
688 goto wait_for_memory;
689
690 if (can_coalesce) {
691 skb_shinfo(skb)->frags[i - 1].size += copy;
692 } else {
693 get_page(page);
694 skb_fill_page_desc(skb, i, page, offset, copy);
695 }
696
697 skb->len += copy;
698 skb->data_len += copy;
699 skb->truesize += copy;
700 sk->sk_wmem_queued += copy;
701 sk->sk_forward_alloc -= copy;
702 skb->ip_summed = CHECKSUM_HW;
703 tp->write_seq += copy;
704 TCP_SKB_CB(skb)->end_seq += copy;
705 skb_shinfo(skb)->tso_segs = 0;
706
707 if (!copied)
708 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
709
710 copied += copy;
711 poffset += copy;
712 if (!(psize -= copy))
713 goto out;
714
715 if (skb->len != mss_now || (flags & MSG_OOB))
716 continue;
717
718 if (forced_push(tp)) {
719 tcp_mark_push(tp, skb);
720 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
721 } else if (skb == sk->sk_send_head)
722 tcp_push_one(sk, mss_now);
723 continue;
724
725wait_for_sndbuf:
726 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
727wait_for_memory:
728 if (copied)
729 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
730
731 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
732 goto do_error;
733
734 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
735 }
736
737out:
738 if (copied)
739 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
740 return copied;
741
742do_error:
743 if (copied)
744 goto out;
745out_err:
746 return sk_stream_error(sk, flags, err);
747}
748
749ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
750 size_t size, int flags)
751{
752 ssize_t res;
753 struct sock *sk = sock->sk;
754
755#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
756
757 if (!(sk->sk_route_caps & NETIF_F_SG) ||
758 !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
759 return sock_no_sendpage(sock, page, offset, size, flags);
760
761#undef TCP_ZC_CSUM_FLAGS
762
763 lock_sock(sk);
764 TCP_CHECK_TIMER(sk);
765 res = do_tcp_sendpages(sk, &page, offset, size, flags);
766 TCP_CHECK_TIMER(sk);
767 release_sock(sk);
768 return res;
769}
770
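/* Per-socket cache of a partially filled page (and the offset into it) used
 * below to coalesce small sendmsg() writes.
 */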
771#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
772#define TCP_OFF(sk) (sk->sk_sndmsg_off)
773
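/* Choose how much linear head room to allocate for a new segment: the cached
 * MSS, clamped so that the head fits a single page when the route supports
 * scatter-gather.
 */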
774static inline int select_size(struct sock *sk, struct tcp_sock *tp)
775{
776 int tmp = tp->mss_cache_std;
777
778 if (sk->sk_route_caps & NETIF_F_SG) {
779 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
780
781 if (tmp >= pgbreak &&
782 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
783 tmp = pgbreak;
784 }
785 return tmp;
786}
787
788int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
789 size_t size)
790{
791 struct iovec *iov;
792 struct tcp_sock *tp = tcp_sk(sk);
793 struct sk_buff *skb;
794 int iovlen, flags;
795 int mss_now;
796 int err, copied;
797 long timeo;
798
799 lock_sock(sk);
800 TCP_CHECK_TIMER(sk);
801
802 flags = msg->msg_flags;
803 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
804
805 /* Wait for a connection to finish. */
806 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
807 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
808 goto out_err;
809
810 /* This should be in poll */
811 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
812
813 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
814
815 /* Ok commence sending. */
816 iovlen = msg->msg_iovlen;
817 iov = msg->msg_iov;
818 copied = 0;
819
820 err = -EPIPE;
821 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
822 goto do_error;
823
824 while (--iovlen >= 0) {
825 int seglen = iov->iov_len;
826 unsigned char __user *from = iov->iov_base;
827
828 iov++;
829
830 while (seglen > 0) {
831 int copy;
832
833 skb = sk->sk_write_queue.prev;
834
835 if (!sk->sk_send_head ||
836 (copy = mss_now - skb->len) <= 0) {
837
838new_segment:
 839 /* Allocate a new segment. If the interface is SG,
 840 * allocate an skb that fits into a single page.
 841 */
842 if (!sk_stream_memory_free(sk))
843 goto wait_for_sndbuf;
844
845 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
846 0, sk->sk_allocation);
847 if (!skb)
848 goto wait_for_memory;
849
850 /*
851 * Check whether we can use HW checksum.
852 */
853 if (sk->sk_route_caps &
854 (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
855 NETIF_F_HW_CSUM))
856 skb->ip_summed = CHECKSUM_HW;
857
858 skb_entail(sk, tp, skb);
859 copy = mss_now;
860 }
861
862 /* Try to append data to the end of skb. */
863 if (copy > seglen)
864 copy = seglen;
865
866 /* Where to copy to? */
867 if (skb_tailroom(skb) > 0) {
868 /* We have some space in skb head. Superb! */
869 if (copy > skb_tailroom(skb))
870 copy = skb_tailroom(skb);
871 if ((err = skb_add_data(skb, from, copy)) != 0)
872 goto do_fault;
873 } else {
874 int merge = 0;
875 int i = skb_shinfo(skb)->nr_frags;
876 struct page *page = TCP_PAGE(sk);
877 int off = TCP_OFF(sk);
878
879 if (skb_can_coalesce(skb, i, page, off) &&
880 off != PAGE_SIZE) {
881 /* We can extend the last page
882 * fragment. */
883 merge = 1;
884 } else if (i == MAX_SKB_FRAGS ||
885 (!i &&
886 !(sk->sk_route_caps & NETIF_F_SG))) {
887 /* Need to add new fragment and cannot
888 * do this because interface is non-SG,
889 * or because all the page slots are
890 * busy. */
891 tcp_mark_push(tp, skb);
892 goto new_segment;
893 } else if (page) {
894 /* If page is cached, align
895 * offset to L1 cache boundary
896 */
897 off = (off + L1_CACHE_BYTES - 1) &
898 ~(L1_CACHE_BYTES - 1);
899 if (off == PAGE_SIZE) {
900 put_page(page);
901 TCP_PAGE(sk) = page = NULL;
902 }
903 }
904
905 if (!page) {
906 /* Allocate new cache page. */
907 if (!(page = sk_stream_alloc_page(sk)))
908 goto wait_for_memory;
909 off = 0;
910 }
911
912 if (copy > PAGE_SIZE - off)
913 copy = PAGE_SIZE - off;
914
915 /* Time to copy data. We are close to
916 * the end! */
917 err = skb_copy_to_page(sk, from, skb, page,
918 off, copy);
919 if (err) {
920 /* If this page was new, give it to the
921 * socket so it does not get leaked.
922 */
923 if (!TCP_PAGE(sk)) {
924 TCP_PAGE(sk) = page;
925 TCP_OFF(sk) = 0;
926 }
927 goto do_error;
928 }
929
930 /* Update the skb. */
931 if (merge) {
932 skb_shinfo(skb)->frags[i - 1].size +=
933 copy;
934 } else {
935 skb_fill_page_desc(skb, i, page, off, copy);
936 if (TCP_PAGE(sk)) {
937 get_page(page);
938 } else if (off + copy < PAGE_SIZE) {
939 get_page(page);
940 TCP_PAGE(sk) = page;
941 }
942 }
943
944 TCP_OFF(sk) = off + copy;
945 }
946
947 if (!copied)
948 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
949
950 tp->write_seq += copy;
951 TCP_SKB_CB(skb)->end_seq += copy;
952 skb_shinfo(skb)->tso_segs = 0;
953
954 from += copy;
955 copied += copy;
956 if ((seglen -= copy) == 0 && iovlen == 0)
957 goto out;
958
959 if (skb->len != mss_now || (flags & MSG_OOB))
960 continue;
961
962 if (forced_push(tp)) {
963 tcp_mark_push(tp, skb);
964 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
965 } else if (skb == sk->sk_send_head)
966 tcp_push_one(sk, mss_now);
967 continue;
968
969wait_for_sndbuf:
970 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
971wait_for_memory:
972 if (copied)
973 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
974
975 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
976 goto do_error;
977
978 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
979 }
980 }
981
982out:
983 if (copied)
984 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
985 TCP_CHECK_TIMER(sk);
986 release_sock(sk);
987 return copied;
988
989do_fault:
990 if (!skb->len) {
991 if (sk->sk_send_head == skb)
992 sk->sk_send_head = NULL;
993 __skb_unlink(skb, skb->list);
994 sk_stream_free_skb(sk, skb);
995 }
996
997do_error:
998 if (copied)
999 goto out;
1000out_err:
1001 err = sk_stream_error(sk, flags, err);
1002 TCP_CHECK_TIMER(sk);
1003 release_sock(sk);
1004 return err;
1005}
1006
1007/*
1008 * Handle reading urgent data. BSD has very simple semantics for
1009 * this, no blocking and very strange errors 8)
1010 */
1011
1012static int tcp_recv_urg(struct sock *sk, long timeo,
1013 struct msghdr *msg, int len, int flags,
1014 int *addr_len)
1015{
1016 struct tcp_sock *tp = tcp_sk(sk);
1017
1018 /* No URG data to read. */
1019 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1020 tp->urg_data == TCP_URG_READ)
1021 return -EINVAL; /* Yes this is right ! */
1022
1023 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1024 return -ENOTCONN;
1025
1026 if (tp->urg_data & TCP_URG_VALID) {
1027 int err = 0;
1028 char c = tp->urg_data;
1029
1030 if (!(flags & MSG_PEEK))
1031 tp->urg_data = TCP_URG_READ;
1032
1033 /* Read urgent data. */
1034 msg->msg_flags |= MSG_OOB;
1035
1036 if (len > 0) {
1037 if (!(flags & MSG_TRUNC))
1038 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1039 len = 1;
1040 } else
1041 msg->msg_flags |= MSG_TRUNC;
1042
1043 return err ? -EFAULT : len;
1044 }
1045
1046 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1047 return 0;
1048
1049 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1050 * the available implementations agree in this case:
1051 * this call should never block, independent of the
1052 * blocking state of the socket.
1053 * Mike <pall@rz.uni-karlsruhe.de>
1054 */
1055 return -EAGAIN;
1056}
1057
1058/* Clean up the receive buffer for full frames taken by the user,
1059 * then send an ACK if necessary. COPIED is the number of bytes
 1060 * tcp_recvmsg has given to the user so far; it speeds up the
1061 * calculation of whether or not we must ACK for the sake of
1062 * a window update.
1063 */
1064static void cleanup_rbuf(struct sock *sk, int copied)
1065{
1066 struct tcp_sock *tp = tcp_sk(sk);
1067 int time_to_ack = 0;
1068
1069#if TCP_DEBUG
1070 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1071
1072 BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1073#endif
1074
1075 if (tcp_ack_scheduled(tp)) {
1076 /* Delayed ACKs frequently hit locked sockets during bulk
1077 * receive. */
1078 if (tp->ack.blocked ||
1079 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1080 tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1081 /*
 1082 * If this read emptied the read buffer, we send an ACK when
 1083 * the connection is not bidirectional, the user has drained
 1084 * the receive buffer, and there was a small segment
 1085 * in the queue.
1086 */
1087 (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1088 !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1089 time_to_ack = 1;
1090 }
1091
1092 /* We send an ACK if we can now advertise a non-zero window
1093 * which has been raised "significantly".
1094 *
1095 * Even if window raised up to infinity, do not send window open ACK
1096 * in states, where we will not receive more. It is useless.
1097 */
1098 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1099 __u32 rcv_window_now = tcp_receive_window(tp);
1100
1101 /* Optimize, __tcp_select_window() is not cheap. */
1102 if (2*rcv_window_now <= tp->window_clamp) {
1103 __u32 new_window = __tcp_select_window(sk);
1104
 1105 /* Send an ACK now if this read freed lots of space
 1106 * in our buffer. new_window is the window we could now advertise;
 1107 * we can do so if it is not less than the current one.
 1108 * "Lots" means "at least twice" here.
1109 */
1110 if (new_window && new_window >= 2 * rcv_window_now)
1111 time_to_ack = 1;
1112 }
1113 }
1114 if (time_to_ack)
1115 tcp_send_ack(sk);
1116}
1117
1118static void tcp_prequeue_process(struct sock *sk)
1119{
1120 struct sk_buff *skb;
1121 struct tcp_sock *tp = tcp_sk(sk);
1122
1123 NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1124
 1125 /* The RX process wants to run with BHs disabled, though it is not
 1126 * strictly necessary */
1127 local_bh_disable();
1128 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1129 sk->sk_backlog_rcv(sk, skb);
1130 local_bh_enable();
1131
1132 /* Clear memory counter. */
1133 tp->ucopy.memory = 0;
1134}
1135
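/* Find the receive-queue skb that covers sequence number 'seq' and report the
 * offset of that byte within it.  A SYN consumes one sequence number, and an
 * skb carrying a FIN is returned even when 'seq' points just past its data.
 */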
1136static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1137{
1138 struct sk_buff *skb;
1139 u32 offset;
1140
1141 skb_queue_walk(&sk->sk_receive_queue, skb) {
1142 offset = seq - TCP_SKB_CB(skb)->seq;
1143 if (skb->h.th->syn)
1144 offset--;
1145 if (offset < skb->len || skb->h.th->fin) {
1146 *off = offset;
1147 return skb;
1148 }
1149 }
1150 return NULL;
1151}
1152
1153/*
1154 * This routine provides an alternative to tcp_recvmsg() for routines
1155 * that would like to handle copying from skbuffs directly in 'sendfile'
1156 * fashion.
1157 * Note:
1158 * - It is assumed that the socket was locked by the caller.
1159 * - The routine does not block.
1160 * - At present, there is no support for reading OOB data
1161 * or for 'peeking' the socket using this routine
1162 * (although both would be easy to implement).
1163 */
1164int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1165 sk_read_actor_t recv_actor)
1166{
1167 struct sk_buff *skb;
1168 struct tcp_sock *tp = tcp_sk(sk);
1169 u32 seq = tp->copied_seq;
1170 u32 offset;
1171 int copied = 0;
1172
1173 if (sk->sk_state == TCP_LISTEN)
1174 return -ENOTCONN;
1175 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1176 if (offset < skb->len) {
1177 size_t used, len;
1178
1179 len = skb->len - offset;
1180 /* Stop reading if we hit a patch of urgent data */
1181 if (tp->urg_data) {
1182 u32 urg_offset = tp->urg_seq - seq;
1183 if (urg_offset < len)
1184 len = urg_offset;
1185 if (!len)
1186 break;
1187 }
1188 used = recv_actor(desc, skb, offset, len);
1189 if (used <= len) {
1190 seq += used;
1191 copied += used;
1192 offset += used;
1193 }
1194 if (offset != skb->len)
1195 break;
1196 }
1197 if (skb->h.th->fin) {
1198 sk_eat_skb(sk, skb);
1199 ++seq;
1200 break;
1201 }
1202 sk_eat_skb(sk, skb);
1203 if (!desc->count)
1204 break;
1205 }
1206 tp->copied_seq = seq;
1207
1208 tcp_rcv_space_adjust(sk);
1209
1210 /* Clean up data we have read: This will do ACK frames. */
1211 if (copied)
1212 cleanup_rbuf(sk, copied);
1213 return copied;
1214}
1215
1216/*
1217 * This routine copies from a sock struct into the user buffer.
1218 *
 1219 * Technical note: in 2.3 we work on a _locked_ socket, so that
 1220 * tricks with *seq access order and skb->users are not required.
 1221 * Probably, the code can easily be improved even more.
1222 */
1223
1224int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1225 size_t len, int nonblock, int flags, int *addr_len)
1226{
1227 struct tcp_sock *tp = tcp_sk(sk);
1228 int copied = 0;
1229 u32 peek_seq;
1230 u32 *seq;
1231 unsigned long used;
1232 int err;
1233 int target; /* Read at least this many bytes */
1234 long timeo;
1235 struct task_struct *user_recv = NULL;
1236
1237 lock_sock(sk);
1238
1239 TCP_CHECK_TIMER(sk);
1240
1241 err = -ENOTCONN;
1242 if (sk->sk_state == TCP_LISTEN)
1243 goto out;
1244
1245 timeo = sock_rcvtimeo(sk, nonblock);
1246
1247 /* Urgent data needs to be handled specially. */
1248 if (flags & MSG_OOB)
1249 goto recv_urg;
1250
1251 seq = &tp->copied_seq;
1252 if (flags & MSG_PEEK) {
1253 peek_seq = tp->copied_seq;
1254 seq = &peek_seq;
1255 }
1256
1257 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1258
1259 do {
1260 struct sk_buff *skb;
1261 u32 offset;
1262
1263 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1264 if (tp->urg_data && tp->urg_seq == *seq) {
1265 if (copied)
1266 break;
1267 if (signal_pending(current)) {
1268 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1269 break;
1270 }
1271 }
1272
1273 /* Next get a buffer. */
1274
1275 skb = skb_peek(&sk->sk_receive_queue);
1276 do {
1277 if (!skb)
1278 break;
1279
1280 /* Now that we have two receive queues this
1281 * shouldn't happen.
1282 */
1283 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1284 printk(KERN_INFO "recvmsg bug: copied %X "
1285 "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1286 break;
1287 }
1288 offset = *seq - TCP_SKB_CB(skb)->seq;
1289 if (skb->h.th->syn)
1290 offset--;
1291 if (offset < skb->len)
1292 goto found_ok_skb;
1293 if (skb->h.th->fin)
1294 goto found_fin_ok;
1295 BUG_TRAP(flags & MSG_PEEK);
1296 skb = skb->next;
1297 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1298
 1299 /* Well, if we have a backlog, try to process it now. */
1300
1301 if (copied >= target && !sk->sk_backlog.tail)
1302 break;
1303
1304 if (copied) {
1305 if (sk->sk_err ||
1306 sk->sk_state == TCP_CLOSE ||
1307 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1308 !timeo ||
1309 signal_pending(current) ||
1310 (flags & MSG_PEEK))
1311 break;
1312 } else {
1313 if (sock_flag(sk, SOCK_DONE))
1314 break;
1315
1316 if (sk->sk_err) {
1317 copied = sock_error(sk);
1318 break;
1319 }
1320
1321 if (sk->sk_shutdown & RCV_SHUTDOWN)
1322 break;
1323
1324 if (sk->sk_state == TCP_CLOSE) {
1325 if (!sock_flag(sk, SOCK_DONE)) {
 1326 /* This occurs when the user tries to read
 1327 * from a never-connected socket.
 1328 */
1329 copied = -ENOTCONN;
1330 break;
1331 }
1332 break;
1333 }
1334
1335 if (!timeo) {
1336 copied = -EAGAIN;
1337 break;
1338 }
1339
1340 if (signal_pending(current)) {
1341 copied = sock_intr_errno(timeo);
1342 break;
1343 }
1344 }
1345
1346 cleanup_rbuf(sk, copied);
1347
1348 if (tp->ucopy.task == user_recv) {
1349 /* Install new reader */
1350 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1351 user_recv = current;
1352 tp->ucopy.task = user_recv;
1353 tp->ucopy.iov = msg->msg_iov;
1354 }
1355
1356 tp->ucopy.len = len;
1357
1358 BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1359 (flags & (MSG_PEEK | MSG_TRUNC)));
1360
 1361 /* Ugly... If the prequeue is not empty, we have to
 1362 * process it before releasing the socket, otherwise
 1363 * ordering will be broken on the second iteration.
 1364 * A more elegant solution is required!!!
1365 *
1366 * Look: we have the following (pseudo)queues:
1367 *
1368 * 1. packets in flight
1369 * 2. backlog
1370 * 3. prequeue
1371 * 4. receive_queue
1372 *
1373 * Each queue can be processed only if the next ones
1374 * are empty. At this point we have empty receive_queue.
 1375 * But the prequeue _can_ be non-empty after the 2nd iteration,
1376 * when we jumped to start of loop because backlog
1377 * processing added something to receive_queue.
1378 * We cannot release_sock(), because backlog contains
1379 * packets arrived _after_ prequeued ones.
1380 *
 1381 * In short, the algorithm is clear --- process all
 1382 * the queues in order. We could do it more directly,
 1383 * requeueing packets from the backlog to the prequeue if it
 1384 * is not empty. That is more elegant, but eats cycles,
 1385 * unfortunately.
1386 */
1387 if (skb_queue_len(&tp->ucopy.prequeue))
1388 goto do_prequeue;
1389
1390 /* __ Set realtime policy in scheduler __ */
1391 }
1392
1393 if (copied >= target) {
1394 /* Do not sleep, just process backlog. */
1395 release_sock(sk);
1396 lock_sock(sk);
1397 } else
1398 sk_wait_data(sk, &timeo);
1399
1400 if (user_recv) {
1401 int chunk;
1402
1403 /* __ Restore normal policy in scheduler __ */
1404
1405 if ((chunk = len - tp->ucopy.len) != 0) {
1406 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1407 len -= chunk;
1408 copied += chunk;
1409 }
1410
1411 if (tp->rcv_nxt == tp->copied_seq &&
1412 skb_queue_len(&tp->ucopy.prequeue)) {
1413do_prequeue:
1414 tcp_prequeue_process(sk);
1415
1416 if ((chunk = len - tp->ucopy.len) != 0) {
1417 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1418 len -= chunk;
1419 copied += chunk;
1420 }
1421 }
1422 }
1423 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1424 if (net_ratelimit())
1425 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1426 current->comm, current->pid);
1427 peek_seq = tp->copied_seq;
1428 }
1429 continue;
1430
1431 found_ok_skb:
1432 /* Ok so how much can we use? */
1433 used = skb->len - offset;
1434 if (len < used)
1435 used = len;
1436
1437 /* Do we have urgent data here? */
1438 if (tp->urg_data) {
1439 u32 urg_offset = tp->urg_seq - *seq;
1440 if (urg_offset < used) {
1441 if (!urg_offset) {
1442 if (!sock_flag(sk, SOCK_URGINLINE)) {
1443 ++*seq;
1444 offset++;
1445 used--;
1446 if (!used)
1447 goto skip_copy;
1448 }
1449 } else
1450 used = urg_offset;
1451 }
1452 }
1453
1454 if (!(flags & MSG_TRUNC)) {
1455 err = skb_copy_datagram_iovec(skb, offset,
1456 msg->msg_iov, used);
1457 if (err) {
1458 /* Exception. Bailout! */
1459 if (!copied)
1460 copied = -EFAULT;
1461 break;
1462 }
1463 }
1464
1465 *seq += used;
1466 copied += used;
1467 len -= used;
1468
1469 tcp_rcv_space_adjust(sk);
1470
1471skip_copy:
1472 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1473 tp->urg_data = 0;
1474 tcp_fast_path_check(sk, tp);
1475 }
1476 if (used + offset < skb->len)
1477 continue;
1478
1479 if (skb->h.th->fin)
1480 goto found_fin_ok;
1481 if (!(flags & MSG_PEEK))
1482 sk_eat_skb(sk, skb);
1483 continue;
1484
1485 found_fin_ok:
1486 /* Process the FIN. */
1487 ++*seq;
1488 if (!(flags & MSG_PEEK))
1489 sk_eat_skb(sk, skb);
1490 break;
1491 } while (len > 0);
1492
1493 if (user_recv) {
1494 if (skb_queue_len(&tp->ucopy.prequeue)) {
1495 int chunk;
1496
1497 tp->ucopy.len = copied > 0 ? len : 0;
1498
1499 tcp_prequeue_process(sk);
1500
1501 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1502 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1503 len -= chunk;
1504 copied += chunk;
1505 }
1506 }
1507
1508 tp->ucopy.task = NULL;
1509 tp->ucopy.len = 0;
1510 }
1511
1512 /* According to UNIX98, msg_name/msg_namelen are ignored
 1513 * on a connected socket. I was just happy when I found this 8) --ANK
1514 */
1515
1516 /* Clean up data we have read: This will do ACK frames. */
1517 cleanup_rbuf(sk, copied);
1518
1519 TCP_CHECK_TIMER(sk);
1520 release_sock(sk);
1521 return copied;
1522
1523out:
1524 TCP_CHECK_TIMER(sk);
1525 release_sock(sk);
1526 return err;
1527
1528recv_urg:
1529 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1530 goto out;
1531}
1532
1533/*
1534 * State processing on a close. This implements the state shift for
1535 * sending our FIN frame. Note that we only send a FIN for some
1536 * states. A shutdown() may have already sent the FIN, or we may be
1537 * closed.
1538 */
1539
1540static unsigned char new_state[16] = {
1541 /* current state: new state: action: */
1542 /* (Invalid) */ TCP_CLOSE,
1543 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1544 /* TCP_SYN_SENT */ TCP_CLOSE,
1545 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1546 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1547 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1548 /* TCP_TIME_WAIT */ TCP_CLOSE,
1549 /* TCP_CLOSE */ TCP_CLOSE,
1550 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1551 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1552 /* TCP_LISTEN */ TCP_CLOSE,
1553 /* TCP_CLOSING */ TCP_CLOSING,
1554};
1555
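/* Each new_state[] entry packs the next state (TCP_STATE_MASK bits) with an
 * optional TCP_ACTION_FIN flag; tcp_close_state() applies the transition and
 * reports whether a FIN still needs to be sent.
 */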
1556static int tcp_close_state(struct sock *sk)
1557{
1558 int next = (int)new_state[sk->sk_state];
1559 int ns = next & TCP_STATE_MASK;
1560
1561 tcp_set_state(sk, ns);
1562
1563 return next & TCP_ACTION_FIN;
1564}
1565
1566/*
1567 * Shutdown the sending side of a connection. Much like close except
 1568 * that we don't receive shutdown or sock_set_flag(sk, SOCK_DEAD).
1569 */
1570
1571void tcp_shutdown(struct sock *sk, int how)
1572{
1573 /* We need to grab some memory, and put together a FIN,
1574 * and then put it into the queue to be sent.
1575 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1576 */
1577 if (!(how & SEND_SHUTDOWN))
1578 return;
1579
1580 /* If we've already sent a FIN, or it's a closed state, skip this. */
1581 if ((1 << sk->sk_state) &
1582 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1583 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1584 /* Clear out any half completed packets. FIN if needed. */
1585 if (tcp_close_state(sk))
1586 tcp_send_fin(sk);
1587 }
1588}
1589
1590/*
1591 * At this point, there should be no process reference to this
1592 * socket, and thus no user references at all. Therefore we
1593 * can assume the socket waitqueue is inactive and nobody will
1594 * try to jump onto it.
1595 */
1596void tcp_destroy_sock(struct sock *sk)
1597{
1598 BUG_TRAP(sk->sk_state == TCP_CLOSE);
1599 BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1600
1601 /* It cannot be in hash table! */
1602 BUG_TRAP(sk_unhashed(sk));
1603
 1604 /* If it has a non-zero inet_sk(sk)->num, it must be bound */
1605 BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1606
1607 sk->sk_prot->destroy(sk);
1608
1609 sk_stream_kill_queues(sk);
1610
1611 xfrm_sk_free_policy(sk);
1612
1613#ifdef INET_REFCNT_DEBUG
1614 if (atomic_read(&sk->sk_refcnt) != 1) {
1615 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1616 sk, atomic_read(&sk->sk_refcnt));
1617 }
1618#endif
1619
1620 atomic_dec(&tcp_orphan_count);
1621 sock_put(sk);
1622}
1623
1624void tcp_close(struct sock *sk, long timeout)
1625{
1626 struct sk_buff *skb;
1627 int data_was_unread = 0;
1628
1629 lock_sock(sk);
1630 sk->sk_shutdown = SHUTDOWN_MASK;
1631
1632 if (sk->sk_state == TCP_LISTEN) {
1633 tcp_set_state(sk, TCP_CLOSE);
1634
1635 /* Special case. */
1636 tcp_listen_stop(sk);
1637
1638 goto adjudge_to_death;
1639 }
1640
1641 /* We need to flush the recv. buffs. We do this only on the
1642 * descriptor close, not protocol-sourced closes, because the
1643 * reader process may not have drained the data yet!
1644 */
1645 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1646 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1647 skb->h.th->fin;
1648 data_was_unread += len;
1649 __kfree_skb(skb);
1650 }
1651
1652 sk_stream_mem_reclaim(sk);
1653
1654 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1655 * 3.10, we send a RST here because data was lost. To
1656 * witness the awful effects of the old behavior of always
1657 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1658 * a bulk GET in an FTP client, suspend the process, wait
1659 * for the client to advertise a zero window, then kill -9
1660 * the FTP client, wheee... Note: timeout is always zero
1661 * in such a case.
1662 */
1663 if (data_was_unread) {
1664 /* Unread data was tossed, zap the connection. */
1665 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1666 tcp_set_state(sk, TCP_CLOSE);
1667 tcp_send_active_reset(sk, GFP_KERNEL);
1668 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1669 /* Check zero linger _after_ checking for unread data. */
1670 sk->sk_prot->disconnect(sk, 0);
1671 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1672 } else if (tcp_close_state(sk)) {
1673 /* We FIN if the application ate all the data before
1674 * zapping the connection.
1675 */
1676
1677 /* RED-PEN. Formally speaking, we have broken TCP state
1678 * machine. State transitions:
1679 *
1680 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1681 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1682 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1683 *
1684 * are legal only when FIN has been sent (i.e. in window),
1685 * rather than queued out of window. Purists blame.
1686 *
1687 * F.e. "RFC state" is ESTABLISHED,
1688 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1689 *
 1690 * The visible deviations are that sometimes
 1691 * we enter the time-wait state when it is not really required
 1692 * (harmless), and do not send active resets when they are
 1693 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
 1694 * they look like CLOSING or LAST_ACK to Linux).
 1695 * Probably, I missed some more small holes.
1696 * --ANK
1697 */
1698 tcp_send_fin(sk);
1699 }
1700
1701 sk_stream_wait_close(sk, timeout);
1702
1703adjudge_to_death:
1704 /* It is the last release_sock in its life. It will remove backlog. */
1705 release_sock(sk);
1706
1707
1708 /* Now socket is owned by kernel and we acquire BH lock
1709 to finish close. No need to check for user refs.
1710 */
1711 local_bh_disable();
1712 bh_lock_sock(sk);
1713 BUG_TRAP(!sock_owned_by_user(sk));
1714
1715 sock_hold(sk);
1716 sock_orphan(sk);
1717
1718 /* This is a (useful) BSD violation of the RFC. There is a
1719 * problem with TCP as specified, in that the other end could
1720 * keep a socket open forever with no application left at this end.
1721 * We use a 3 minute timeout (about the same as BSD) and then kill
1722 * our end. If they send after that then tough - BUT: long enough
1723 * that we won't repeat the old "4*rto = almost no time - whoops,
1724 * reset" mistake.
1725 *
1726 * Nope, it was not a mistake. It is really the desired behaviour,
1727 * e.g. on HTTP servers, where such sockets are useless but
1728 * consume significant resources. Let's do it with the special
1729 * linger2 option. --ANK
1730 */
1731
1732 if (sk->sk_state == TCP_FIN_WAIT2) {
1733 struct tcp_sock *tp = tcp_sk(sk);
1734 if (tp->linger2 < 0) {
1735 tcp_set_state(sk, TCP_CLOSE);
1736 tcp_send_active_reset(sk, GFP_ATOMIC);
1737 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1738 } else {
1739 int tmo = tcp_fin_time(tp);
1740
1741 if (tmo > TCP_TIMEWAIT_LEN) {
1742 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1743 } else {
1744 atomic_inc(&tcp_orphan_count);
1745 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1746 goto out;
1747 }
1748 }
1749 }
1750 if (sk->sk_state != TCP_CLOSE) {
1751 sk_stream_mem_reclaim(sk);
1752 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1753 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1754 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1755 if (net_ratelimit())
1756 printk(KERN_INFO "TCP: too many orphaned "
1757 "sockets\n");
1758 tcp_set_state(sk, TCP_CLOSE);
1759 tcp_send_active_reset(sk, GFP_ATOMIC);
1760 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1761 }
1762 }
1763 atomic_inc(&tcp_orphan_count);
1764
1765 if (sk->sk_state == TCP_CLOSE)
1766 tcp_destroy_sock(sk);
1767 /* Otherwise, socket is reprieved until protocol close. */
1768
1769out:
1770 bh_unlock_sock(sk);
1771 local_bh_enable();
1772 sock_put(sk);
1773}
1774
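For reference, the zero-linger branch near the top of tcp_close() (sock_flag(sk, SOCK_LINGER) with a zero sk_lingertime) is the path an application reaches by enabling SO_LINGER with a zero timeout before close(). A minimal userspace sketch, assuming fd is an already-connected TCP socket:

    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* Close a connected TCP socket abortively: with l_onoff=1 and
     * l_linger=0, close() takes the zero-linger path in tcp_close()
     * and the connection is reset instead of going through the normal
     * FIN handshake and TIME_WAIT. */
    static int abortive_close(int fd)
    {
            struct linger lin;

            memset(&lin, 0, sizeof(lin));
            lin.l_onoff = 1;          /* linger enabled ...      */
            lin.l_linger = 0;         /* ... with a zero timeout */
            if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin)) < 0)
                    return -1;
            return close(fd);
    }

Calling abortive_close() is also what the data_was_unread branch effectively does on the application's behalf when the receive queue was not drained.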
1775/* These states need RST on ABORT according to RFC793 */
1776
1777static inline int tcp_need_reset(int state)
1778{
1779 return (1 << state) &
1780 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1781 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1782}
1783
1784int tcp_disconnect(struct sock *sk, int flags)
1785{
1786 struct inet_sock *inet = inet_sk(sk);
1787 struct tcp_sock *tp = tcp_sk(sk);
1788 int err = 0;
1789 int old_state = sk->sk_state;
1790
1791 if (old_state != TCP_CLOSE)
1792 tcp_set_state(sk, TCP_CLOSE);
1793
1794 /* ABORT function of RFC793 */
1795 if (old_state == TCP_LISTEN) {
1796 tcp_listen_stop(sk);
1797 } else if (tcp_need_reset(old_state) ||
1798 (tp->snd_nxt != tp->write_seq &&
1799 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1800 /* The last check adjusts for the discrepancy between Linux and the
1801 * RFC states.
1802 */
1803 tcp_send_active_reset(sk, gfp_any());
1804 sk->sk_err = ECONNRESET;
1805 } else if (old_state == TCP_SYN_SENT)
1806 sk->sk_err = ECONNRESET;
1807
1808 tcp_clear_xmit_timers(sk);
1809 __skb_queue_purge(&sk->sk_receive_queue);
1810 sk_stream_writequeue_purge(sk);
1811 __skb_queue_purge(&tp->out_of_order_queue);
1812
1813 inet->dport = 0;
1814
1815 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1816 inet_reset_saddr(sk);
1817
1818 sk->sk_shutdown = 0;
1819 sock_reset_flag(sk, SOCK_DONE);
1820 tp->srtt = 0;
1821 if ((tp->write_seq += tp->max_window + 2) == 0)
1822 tp->write_seq = 1;
1823 tp->backoff = 0;
1824 tp->snd_cwnd = 2;
1825 tp->probes_out = 0;
1826 tp->packets_out = 0;
1827 tp->snd_ssthresh = 0x7fffffff;
1828 tp->snd_cwnd_cnt = 0;
1829 tcp_set_ca_state(tp, TCP_CA_Open);
1830 tcp_clear_retrans(tp);
1831 tcp_delack_init(tp);
1832 sk->sk_send_head = NULL;
1833 tp->rx_opt.saw_tstamp = 0;
1834 tcp_sack_reset(&tp->rx_opt);
1835 __sk_dst_reset(sk);
1836
1837 BUG_TRAP(!inet->num || tp->bind_hash);
1838
1839 sk->sk_error_report(sk);
1840 return err;
1841}
1842
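tcp_disconnect() is not called directly by applications; one way userspace reaches it is connect() with an address family of AF_UNSPEC, which inet_stream_connect() maps to the protocol's disconnect hook. A hedged sketch, assuming fd is a connected TCP socket:

    #include <string.h>
    #include <sys/socket.h>

    /* Dissolve the association on a connected TCP socket by
     * "connecting" to AF_UNSPEC; the protocol disconnect hook
     * (tcp_disconnect above) does the actual work. */
    static int tcp_unconnect(int fd)
    {
            struct sockaddr sa;

            memset(&sa, 0, sizeof(sa));
            sa.sa_family = AF_UNSPEC;
            return connect(fd, &sa, sizeof(sa));
    }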
1843/*
1844 * Wait for an incoming connection, avoid race
1845 * conditions. This must be called with the socket locked.
1846 */
1847static int wait_for_connect(struct sock *sk, long timeo)
1848{
1849 struct tcp_sock *tp = tcp_sk(sk);
1850 DEFINE_WAIT(wait);
1851 int err;
1852
1853 /*
1854 * True wake-one mechanism for incoming connections: only
1855 * one process gets woken up, not the 'whole herd'.
1856 * Since we do not 'race & poll' for established sockets
1857 * anymore, the common case will execute the loop only once.
1858 *
1859 * Subtle issue: "add_wait_queue_exclusive()" will be added
1860 * after any current non-exclusive waiters, and we know that
1861 * it will always _stay_ after any new non-exclusive waiters
1862 * because all non-exclusive waiters are added at the
1863 * beginning of the wait-queue. As such, it's ok to "drop"
1864 * our exclusiveness temporarily when we get woken up without
1865 * having to remove and re-insert us on the wait queue.
1866 */
1867 for (;;) {
1868 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1869 TASK_INTERRUPTIBLE);
1870 release_sock(sk);
1871 if (!tp->accept_queue)
1872 timeo = schedule_timeout(timeo);
1873 lock_sock(sk);
1874 err = 0;
1875 if (tp->accept_queue)
1876 break;
1877 err = -EINVAL;
1878 if (sk->sk_state != TCP_LISTEN)
1879 break;
1880 err = sock_intr_errno(timeo);
1881 if (signal_pending(current))
1882 break;
1883 err = -EAGAIN;
1884 if (!timeo)
1885 break;
1886 }
1887 finish_wait(sk->sk_sleep, &wait);
1888 return err;
1889}
1890
1891/*
1892 * This will accept the next outstanding connection.
1893 */
1894
1895struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1896{
1897 struct tcp_sock *tp = tcp_sk(sk);
1898 struct open_request *req;
1899 struct sock *newsk;
1900 int error;
1901
1902 lock_sock(sk);
1903
1904 /* We need to make sure that this socket is listening,
1905 * and that it has something pending.
1906 */
1907 error = -EINVAL;
1908 if (sk->sk_state != TCP_LISTEN)
1909 goto out;
1910
1911 /* Find already established connection */
1912 if (!tp->accept_queue) {
1913 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1914
1915 /* If this is a non-blocking socket, don't sleep */
1916 error = -EAGAIN;
1917 if (!timeo)
1918 goto out;
1919
1920 error = wait_for_connect(sk, timeo);
1921 if (error)
1922 goto out;
1923 }
1924
1925 req = tp->accept_queue;
1926 if ((tp->accept_queue = req->dl_next) == NULL)
1927 tp->accept_queue_tail = NULL;
1928
1929 newsk = req->sk;
1930 sk_acceptq_removed(sk);
1931 tcp_openreq_fastfree(req);
1932 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1933 release_sock(sk);
1934 return newsk;
1935
1936out:
1937 release_sock(sk);
1938 *err = error;
1939 return NULL;
1940}
1941
1942/*
1943 * Socket option code for TCP.
1944 */
1945int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1946 int optlen)
1947{
1948 struct tcp_sock *tp = tcp_sk(sk);
1949 int val;
1950 int err = 0;
1951
1952 if (level != SOL_TCP)
1953 return tp->af_specific->setsockopt(sk, level, optname,
1954 optval, optlen);
1955
1956 if (optlen < sizeof(int))
1957 return -EINVAL;
1958
1959 if (get_user(val, (int __user *)optval))
1960 return -EFAULT;
1961
1962 lock_sock(sk);
1963
1964 switch (optname) {
1965 case TCP_MAXSEG:
1966 /* Values greater than interface MTU won't take effect. However
1967 * at the point when this call is done we typically don't yet
1968 * know which interface is going to be used */
1969 if (val < 8 || val > MAX_TCP_WINDOW) {
1970 err = -EINVAL;
1971 break;
1972 }
1973 tp->rx_opt.user_mss = val;
1974 break;
1975
1976 case TCP_NODELAY:
1977 if (val) {
1978 /* TCP_NODELAY is weaker than TCP_CORK, so that
1979 * this option on corked socket is remembered, but
1980 * it is not activated until cork is cleared.
1981 *
1982 * However, when TCP_NODELAY is set we make
1983 * an explicit push, which overrides even TCP_CORK
1984 * for currently queued segments.
1985 */
1986 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1987 tcp_push_pending_frames(sk, tp);
1988 } else {
1989 tp->nonagle &= ~TCP_NAGLE_OFF;
1990 }
1991 break;
1992
1993 case TCP_CORK:
1994 /* When set indicates to always queue non-full frames.
1995 * Later the user clears this option and we transmit
1996 * any pending partial frames in the queue. This is
1997 * meant to be used alongside sendfile() to get properly
1998 * filled frames when the user (for example) must write
1999 * out headers with a write() call first and then use
2000 * sendfile to send out the data parts.
2001 *
2002 * TCP_CORK can be set together with TCP_NODELAY and it is
2003 * stronger than TCP_NODELAY.
2004 */
2005 if (val) {
2006 tp->nonagle |= TCP_NAGLE_CORK;
2007 } else {
2008 tp->nonagle &= ~TCP_NAGLE_CORK;
2009 if (tp->nonagle&TCP_NAGLE_OFF)
2010 tp->nonagle |= TCP_NAGLE_PUSH;
2011 tcp_push_pending_frames(sk, tp);
2012 }
2013 break;
2014
2015 case TCP_KEEPIDLE:
2016 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2017 err = -EINVAL;
2018 else {
2019 tp->keepalive_time = val * HZ;
2020 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2021 !((1 << sk->sk_state) &
2022 (TCPF_CLOSE | TCPF_LISTEN))) {
2023 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2024 if (tp->keepalive_time > elapsed)
2025 elapsed = tp->keepalive_time - elapsed;
2026 else
2027 elapsed = 0;
2028 tcp_reset_keepalive_timer(sk, elapsed);
2029 }
2030 }
2031 break;
2032 case TCP_KEEPINTVL:
2033 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2034 err = -EINVAL;
2035 else
2036 tp->keepalive_intvl = val * HZ;
2037 break;
2038 case TCP_KEEPCNT:
2039 if (val < 1 || val > MAX_TCP_KEEPCNT)
2040 err = -EINVAL;
2041 else
2042 tp->keepalive_probes = val;
2043 break;
2044 case TCP_SYNCNT:
2045 if (val < 1 || val > MAX_TCP_SYNCNT)
2046 err = -EINVAL;
2047 else
2048 tp->syn_retries = val;
2049 break;
2050
2051 case TCP_LINGER2:
2052 if (val < 0)
2053 tp->linger2 = -1;
2054 else if (val > sysctl_tcp_fin_timeout / HZ)
2055 tp->linger2 = 0;
2056 else
2057 tp->linger2 = val * HZ;
2058 break;
2059
2060 case TCP_DEFER_ACCEPT:
2061 tp->defer_accept = 0;
2062 if (val > 0) {
2063 /* Translate value in seconds to number of
2064 * retransmits */
2065 while (tp->defer_accept < 32 &&
2066 val > ((TCP_TIMEOUT_INIT / HZ) <<
2067 tp->defer_accept))
2068 tp->defer_accept++;
2069 tp->defer_accept++;
2070 }
2071 break;
2072
2073 case TCP_WINDOW_CLAMP:
2074 if (!val) {
2075 if (sk->sk_state != TCP_CLOSE) {
2076 err = -EINVAL;
2077 break;
2078 }
2079 tp->window_clamp = 0;
2080 } else
2081 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2082 SOCK_MIN_RCVBUF / 2 : val;
2083 break;
2084
2085 case TCP_QUICKACK:
2086 if (!val) {
2087 tp->ack.pingpong = 1;
2088 } else {
2089 tp->ack.pingpong = 0;
2090 if ((1 << sk->sk_state) &
2091 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2092 tcp_ack_scheduled(tp)) {
2093 tp->ack.pending |= TCP_ACK_PUSHED;
2094 cleanup_rbuf(sk, 1);
2095 if (!(val & 1))
2096 tp->ack.pingpong = 1;
2097 }
2098 }
2099 break;
2100
2101 default:
2102 err = -ENOPROTOOPT;
2103 break;
2104 };
2105 release_sock(sk);
2106 return err;
2107}
2108
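The TCP_DEFER_ACCEPT case above converts a timeout given in seconds into a count of SYN-ACK retransmission periods, each period being TCP_TIMEOUT_INIT/HZ (3 seconds in this tree) and doubling per step. A small standalone sketch of that mapping, assuming the 3-second base:

    #include <stdio.h>

    #define TIMEOUT_INIT_SECS 3   /* TCP_TIMEOUT_INIT / HZ in this tree */

    /* Mirror of the seconds -> retransmission-periods translation done
     * for TCP_DEFER_ACCEPT in tcp_setsockopt(). */
    static int defer_accept_periods(int secs)
    {
            int periods = 0;

            if (secs <= 0)
                    return 0;
            while (periods < 32 && secs > (TIMEOUT_INIT_SECS << periods))
                    periods++;
            return periods + 1;
    }

    int main(void)
    {
            int secs;

            /* 1..3s -> 1 period, 4..6s -> 2, 7..12s -> 3, 13..24s -> 4, ... */
            for (secs = 1; secs <= 30; secs++)
                    printf("%2d s -> %d period(s)\n",
                           secs, defer_accept_periods(secs));
            return 0;
    }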
2109/* Return information about state of tcp endpoint in API format. */
2110void tcp_get_info(struct sock *sk, struct tcp_info *info)
2111{
2112 struct tcp_sock *tp = tcp_sk(sk);
2113 u32 now = tcp_time_stamp;
2114
2115 memset(info, 0, sizeof(*info));
2116
2117 info->tcpi_state = sk->sk_state;
2118 info->tcpi_ca_state = tp->ca_state;
2119 info->tcpi_retransmits = tp->retransmits;
2120 info->tcpi_probes = tp->probes_out;
2121 info->tcpi_backoff = tp->backoff;
2122
2123 if (tp->rx_opt.tstamp_ok)
2124 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2125 if (tp->rx_opt.sack_ok)
2126 info->tcpi_options |= TCPI_OPT_SACK;
2127 if (tp->rx_opt.wscale_ok) {
2128 info->tcpi_options |= TCPI_OPT_WSCALE;
2129 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2130 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2131 }
2132
2133 if (tp->ecn_flags&TCP_ECN_OK)
2134 info->tcpi_options |= TCPI_OPT_ECN;
2135
2136 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2137 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2138 info->tcpi_snd_mss = tp->mss_cache_std;
2139 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2140
2141 info->tcpi_unacked = tp->packets_out;
2142 info->tcpi_sacked = tp->sacked_out;
2143 info->tcpi_lost = tp->lost_out;
2144 info->tcpi_retrans = tp->retrans_out;
2145 info->tcpi_fackets = tp->fackets_out;
2146
2147 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2148 info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2149 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2150
2151 info->tcpi_pmtu = tp->pmtu_cookie;
2152 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2153 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2154 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2155 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2156 info->tcpi_snd_cwnd = tp->snd_cwnd;
2157 info->tcpi_advmss = tp->advmss;
2158 info->tcpi_reordering = tp->reordering;
2159
2160 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2161 info->tcpi_rcv_space = tp->rcvq_space.space;
2162
2163 info->tcpi_total_retrans = tp->total_retrans;
2164}
2165
2166EXPORT_SYMBOL_GPL(tcp_get_info);
2167
2168int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2169 int __user *optlen)
2170{
2171 struct tcp_sock *tp = tcp_sk(sk);
2172 int val, len;
2173
2174 if (level != SOL_TCP)
2175 return tp->af_specific->getsockopt(sk, level, optname,
2176 optval, optlen);
2177
2178 if (get_user(len, optlen))
2179 return -EFAULT;
2180
2181 len = min_t(unsigned int, len, sizeof(int));
2182
2183 if (len < 0)
2184 return -EINVAL;
2185
2186 switch (optname) {
2187 case TCP_MAXSEG:
2188 val = tp->mss_cache_std;
2189 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2190 val = tp->rx_opt.user_mss;
2191 break;
2192 case TCP_NODELAY:
2193 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2194 break;
2195 case TCP_CORK:
2196 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2197 break;
2198 case TCP_KEEPIDLE:
2199 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2200 break;
2201 case TCP_KEEPINTVL:
2202 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2203 break;
2204 case TCP_KEEPCNT:
2205 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2206 break;
2207 case TCP_SYNCNT:
2208 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2209 break;
2210 case TCP_LINGER2:
2211 val = tp->linger2;
2212 if (val >= 0)
2213 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2214 break;
2215 case TCP_DEFER_ACCEPT:
2216 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2217 (tp->defer_accept - 1));
2218 break;
2219 case TCP_WINDOW_CLAMP:
2220 val = tp->window_clamp;
2221 break;
2222 case TCP_INFO: {
2223 struct tcp_info info;
2224
2225 if (get_user(len, optlen))
2226 return -EFAULT;
2227
2228 tcp_get_info(sk, &info);
2229
2230 len = min_t(unsigned int, len, sizeof(info));
2231 if (put_user(len, optlen))
2232 return -EFAULT;
2233 if (copy_to_user(optval, &info, len))
2234 return -EFAULT;
2235 return 0;
2236 }
2237 case TCP_QUICKACK:
2238 val = !tp->ack.pingpong;
2239 break;
2240 default:
2241 return -ENOPROTOOPT;
2242 };
2243
2244 if (put_user(len, optlen))
2245 return -EFAULT;
2246 if (copy_to_user(optval, &val, len))
2247 return -EFAULT;
2248 return 0;
2249}
2250
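The TCP_INFO branch above is how ss-style tools obtain the counters filled in by tcp_get_info(); note that tcpi_rtt and tcpi_rttvar are already converted to microseconds (internally srtt is kept shifted left by 3 and mdev by 2). A minimal userspace read, assuming a connected TCP socket fd and a netinet/tcp.h that exposes struct tcp_info:

    #include <stdio.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    /* Print a few fields of struct tcp_info for a connected socket.
     * The kernel copies back at most the length we pass in, so older
     * and newer struct layouts simply differ in how much gets filled. */
    static int print_tcp_info(int fd)
    {
            struct tcp_info info;
            socklen_t len = sizeof(info);

            if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
                    return -1;
            printf("state=%u rtt=%uus rttvar=%uus snd_cwnd=%u retrans=%u\n",
                   info.tcpi_state, info.tcpi_rtt, info.tcpi_rttvar,
                   info.tcpi_snd_cwnd, info.tcpi_total_retrans);
            return 0;
    }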
2251
2252extern void __skb_cb_too_small_for_tcp(int, int);
2253extern void tcpdiag_init(void);
2254
2255static __initdata unsigned long thash_entries;
2256static int __init set_thash_entries(char *str)
2257{
2258 if (!str)
2259 return 0;
2260 thash_entries = simple_strtoul(str, &str, 0);
2261 return 1;
2262}
2263__setup("thash_entries=", set_thash_entries);
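The __setup() hook above lets the size of the established hash table be forced from the boot command line instead of being auto-sized from available memory, for example (value purely illustrative):

    thash_entries=131072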
2264
2265void __init tcp_init(void)
2266{
2267 struct sk_buff *skb = NULL;
2268 int order, i;
2269
2270 if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2271 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2272 sizeof(skb->cb));
2273
2274 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2275 sizeof(struct open_request),
2276 0, SLAB_HWCACHE_ALIGN,
2277 NULL, NULL);
2278 if (!tcp_openreq_cachep)
2279 panic("tcp_init: Cannot alloc open_request cache.");
2280
2281 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2282 sizeof(struct tcp_bind_bucket),
2283 0, SLAB_HWCACHE_ALIGN,
2284 NULL, NULL);
2285 if (!tcp_bucket_cachep)
2286 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2287
2288 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2289 sizeof(struct tcp_tw_bucket),
2290 0, SLAB_HWCACHE_ALIGN,
2291 NULL, NULL);
2292 if (!tcp_timewait_cachep)
2293 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2294
2295 /* Size and allocate the main established and bind bucket
2296 * hash tables.
2297 *
2298 * The methodology is similar to that of the buffer cache.
2299 */
2300 tcp_ehash = (struct tcp_ehash_bucket *)
2301 alloc_large_system_hash("TCP established",
2302 sizeof(struct tcp_ehash_bucket),
2303 thash_entries,
2304 (num_physpages >= 128 * 1024) ?
2305 (25 - PAGE_SHIFT) :
2306 (27 - PAGE_SHIFT),
2307 HASH_HIGHMEM,
2308 &tcp_ehash_size,
2309 NULL,
2310 0);
2311 tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2312 for (i = 0; i < (tcp_ehash_size << 1); i++) {
2313 rwlock_init(&tcp_ehash[i].lock);
2314 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2315 }
2316
2317 tcp_bhash = (struct tcp_bind_hashbucket *)
2318 alloc_large_system_hash("TCP bind",
2319 sizeof(struct tcp_bind_hashbucket),
2320 tcp_ehash_size,
2321 (num_physpages >= 128 * 1024) ?
2322 (25 - PAGE_SHIFT) :
2323 (27 - PAGE_SHIFT),
2324 HASH_HIGHMEM,
2325 &tcp_bhash_size,
2326 NULL,
2327 64 * 1024);
2328 tcp_bhash_size = 1 << tcp_bhash_size;
2329 for (i = 0; i < tcp_bhash_size; i++) {
2330 spin_lock_init(&tcp_bhash[i].lock);
2331 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2332 }
2333
2334 /* Try to be a bit smarter and adjust defaults depending
2335 * on available memory.
2336 */
2337 for (order = 0; ((1 << order) << PAGE_SHIFT) <
2338 (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2339 order++)
2340 ;
2341 if (order > 4) {
2342 sysctl_local_port_range[0] = 32768;
2343 sysctl_local_port_range[1] = 61000;
2344 sysctl_tcp_max_tw_buckets = 180000;
2345 sysctl_tcp_max_orphans = 4096 << (order - 4);
2346 sysctl_max_syn_backlog = 1024;
2347 } else if (order < 3) {
2348 sysctl_local_port_range[0] = 1024 * (3 - order);
2349 sysctl_tcp_max_tw_buckets >>= (3 - order);
2350 sysctl_tcp_max_orphans >>= (3 - order);
2351 sysctl_max_syn_backlog = 128;
2352 }
2353 tcp_port_rover = sysctl_local_port_range[0] - 1;
2354
2355 sysctl_tcp_mem[0] = 768 << order;
2356 sysctl_tcp_mem[1] = 1024 << order;
2357 sysctl_tcp_mem[2] = 1536 << order;
2358
2359 if (order < 3) {
2360 sysctl_tcp_wmem[2] = 64 * 1024;
2361 sysctl_tcp_rmem[0] = PAGE_SIZE;
2362 sysctl_tcp_rmem[1] = 43689;
2363 sysctl_tcp_rmem[2] = 2 * 43689;
2364 }
2365
2366 printk(KERN_INFO "TCP: Hash tables configured "
2367 "(established %d bind %d)\n",
2368 tcp_ehash_size << 1, tcp_bhash_size);
2369}
2370
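The sizing at the end of tcp_init() keys everything off "order", the power-of-two number of pages spanned by the bind-bucket table, and derives the tcp_mem pressure thresholds from it. A rough standalone sketch of that arithmetic, assuming 4 KiB pages and a 32-byte bucket (both are assumptions; the real values depend on the architecture and struct layout):

    #include <stdio.h>

    /* Approximate the sizing logic in tcp_init(): find the smallest
     * 'order' such that (1 << order) pages cover the bind hash table,
     * then derive the tcp_mem thresholds from it. */
    int main(void)
    {
            const unsigned long page_size = 4096;     /* assumption */
            const unsigned long bucket_size = 32;     /* assumption */
            unsigned long bhash_size = 65536;         /* example table size */
            int order;

            for (order = 0; ((1UL << order) * page_size) <
                            bhash_size * bucket_size; order++)
                    ;

            printf("order=%d tcp_mem = { %d, %d, %d } pages\n",
                   order, 768 << order, 1024 << order, 1536 << order);
            return 0;
    }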
2371EXPORT_SYMBOL(tcp_accept);
2372EXPORT_SYMBOL(tcp_close);
2373EXPORT_SYMBOL(tcp_destroy_sock);
2374EXPORT_SYMBOL(tcp_disconnect);
2375EXPORT_SYMBOL(tcp_getsockopt);
2376EXPORT_SYMBOL(tcp_ioctl);
2377EXPORT_SYMBOL(tcp_openreq_cachep);
2378EXPORT_SYMBOL(tcp_poll);
2379EXPORT_SYMBOL(tcp_read_sock);
2380EXPORT_SYMBOL(tcp_recvmsg);
2381EXPORT_SYMBOL(tcp_sendmsg);
2382EXPORT_SYMBOL(tcp_sendpage);
2383EXPORT_SYMBOL(tcp_setsockopt);
2384EXPORT_SYMBOL(tcp_shutdown);
2385EXPORT_SYMBOL(tcp_statistics);
2386EXPORT_SYMBOL(tcp_timewait_cachep);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
new file mode 100644
index 000000000000..313c1408da33
--- /dev/null
+++ b/net/ipv4/tcp_diag.c
@@ -0,0 +1,802 @@
1/*
2 * tcp_diag.c Module for monitoring TCP sockets.
3 *
4 * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
5 *
6 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/fcntl.h>
18#include <linux/random.h>
19#include <linux/cache.h>
20#include <linux/init.h>
21#include <linux/time.h>
22
23#include <net/icmp.h>
24#include <net/tcp.h>
25#include <net/ipv6.h>
26#include <net/inet_common.h>
27
28#include <linux/inet.h>
29#include <linux/stddef.h>
30
31#include <linux/tcp_diag.h>
32
33struct tcpdiag_entry
34{
35 u32 *saddr;
36 u32 *daddr;
37 u16 sport;
38 u16 dport;
39 u16 family;
40 u16 userlocks;
41};
42
43static struct sock *tcpnl;
44
45
46#define TCPDIAG_PUT(skb, attrtype, attrlen) \
47({ int rtalen = RTA_LENGTH(attrlen); \
48 struct rtattr *rta; \
49 if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
50 rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
51 rta->rta_type = attrtype; \
52 rta->rta_len = rtalen; \
53 RTA_DATA(rta); })
54
55static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
56 int ext, u32 pid, u32 seq, u16 nlmsg_flags)
57{
58 struct inet_sock *inet = inet_sk(sk);
59 struct tcp_sock *tp = tcp_sk(sk);
60 struct tcpdiagmsg *r;
61 struct nlmsghdr *nlh;
62 struct tcp_info *info = NULL;
63 struct tcpdiag_meminfo *minfo = NULL;
64 struct tcpvegas_info *vinfo = NULL;
65 unsigned char *b = skb->tail;
66
67 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
68 nlh->nlmsg_flags = nlmsg_flags;
69 r = NLMSG_DATA(nlh);
70 if (sk->sk_state != TCP_TIME_WAIT) {
71 if (ext & (1<<(TCPDIAG_MEMINFO-1)))
72 minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo));
73 if (ext & (1<<(TCPDIAG_INFO-1)))
74 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
75
76 if ((tcp_is_westwood(tp) || tcp_is_vegas(tp))
77 && (ext & (1<<(TCPDIAG_VEGASINFO-1))))
78 vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo));
79 }
80 r->tcpdiag_family = sk->sk_family;
81 r->tcpdiag_state = sk->sk_state;
82 r->tcpdiag_timer = 0;
83 r->tcpdiag_retrans = 0;
84
85 r->id.tcpdiag_if = sk->sk_bound_dev_if;
86 r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk;
87 r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
88
89 if (r->tcpdiag_state == TCP_TIME_WAIT) {
90 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk;
91 long tmo = tw->tw_ttd - jiffies;
92 if (tmo < 0)
93 tmo = 0;
94
95 r->id.tcpdiag_sport = tw->tw_sport;
96 r->id.tcpdiag_dport = tw->tw_dport;
97 r->id.tcpdiag_src[0] = tw->tw_rcv_saddr;
98 r->id.tcpdiag_dst[0] = tw->tw_daddr;
99 r->tcpdiag_state = tw->tw_substate;
100 r->tcpdiag_timer = 3;
101 r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ;
102 r->tcpdiag_rqueue = 0;
103 r->tcpdiag_wqueue = 0;
104 r->tcpdiag_uid = 0;
105 r->tcpdiag_inode = 0;
106#ifdef CONFIG_IP_TCPDIAG_IPV6
107 if (r->tcpdiag_family == AF_INET6) {
108 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
109 &tw->tw_v6_rcv_saddr);
110 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
111 &tw->tw_v6_daddr);
112 }
113#endif
114 nlh->nlmsg_len = skb->tail - b;
115 return skb->len;
116 }
117
118 r->id.tcpdiag_sport = inet->sport;
119 r->id.tcpdiag_dport = inet->dport;
120 r->id.tcpdiag_src[0] = inet->rcv_saddr;
121 r->id.tcpdiag_dst[0] = inet->daddr;
122
123#ifdef CONFIG_IP_TCPDIAG_IPV6
124 if (r->tcpdiag_family == AF_INET6) {
125 struct ipv6_pinfo *np = inet6_sk(sk);
126
127 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
128 &np->rcv_saddr);
129 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
130 &np->daddr);
131 }
132#endif
133
134#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ
135
136 if (tp->pending == TCP_TIME_RETRANS) {
137 r->tcpdiag_timer = 1;
138 r->tcpdiag_retrans = tp->retransmits;
139 r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
140 } else if (tp->pending == TCP_TIME_PROBE0) {
141 r->tcpdiag_timer = 4;
142 r->tcpdiag_retrans = tp->probes_out;
143 r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
144 } else if (timer_pending(&sk->sk_timer)) {
145 r->tcpdiag_timer = 2;
146 r->tcpdiag_retrans = tp->probes_out;
147 r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
148 } else {
149 r->tcpdiag_timer = 0;
150 r->tcpdiag_expires = 0;
151 }
152#undef EXPIRES_IN_MS
153
154 r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq;
155 r->tcpdiag_wqueue = tp->write_seq - tp->snd_una;
156 r->tcpdiag_uid = sock_i_uid(sk);
157 r->tcpdiag_inode = sock_i_ino(sk);
158
159 if (minfo) {
160 minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc);
161 minfo->tcpdiag_wmem = sk->sk_wmem_queued;
162 minfo->tcpdiag_fmem = sk->sk_forward_alloc;
163 minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc);
164 }
165
166 if (info)
167 tcp_get_info(sk, info);
168
169 if (vinfo) {
170 if (tcp_is_vegas(tp)) {
171 vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
172 vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
173 vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
174 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
175 } else {
176 vinfo->tcpv_enabled = 0;
177 vinfo->tcpv_rttcnt = 0;
178 vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
179 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
180 }
181 }
182
183 nlh->nlmsg_len = skb->tail - b;
184 return skb->len;
185
186nlmsg_failure:
187 skb_trim(skb, b - skb->data);
188 return -1;
189}
190
191extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
192 int dif);
193#ifdef CONFIG_IP_TCPDIAG_IPV6
194extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
195 struct in6_addr *daddr, u16 dport,
196 int dif);
197#else
198static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
199 struct in6_addr *daddr, u16 dport,
200 int dif)
201{
202 return NULL;
203}
204#endif
205
206static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
207{
208 int err;
209 struct sock *sk;
210 struct tcpdiagreq *req = NLMSG_DATA(nlh);
211 struct sk_buff *rep;
212
213 if (req->tcpdiag_family == AF_INET) {
214 sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport,
215 req->id.tcpdiag_src[0], req->id.tcpdiag_sport,
216 req->id.tcpdiag_if);
217 }
218#ifdef CONFIG_IP_TCPDIAG_IPV6
219 else if (req->tcpdiag_family == AF_INET6) {
220 sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport,
221 (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport,
222 req->id.tcpdiag_if);
223 }
224#endif
225 else {
226 return -EINVAL;
227 }
228
229 if (sk == NULL)
230 return -ENOENT;
231
232 err = -ESTALE;
233 if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE ||
234 req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) &&
235 ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] ||
236 (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1]))
237 goto out;
238
239 err = -ENOMEM;
240 rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+
241 sizeof(struct tcpdiag_meminfo)+
242 sizeof(struct tcp_info)+64), GFP_KERNEL);
243 if (!rep)
244 goto out;
245
246 if (tcpdiag_fill(rep, sk, req->tcpdiag_ext,
247 NETLINK_CB(in_skb).pid,
248 nlh->nlmsg_seq, 0) <= 0)
249 BUG();
250
251 err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
252 if (err > 0)
253 err = 0;
254
255out:
256 if (sk) {
257 if (sk->sk_state == TCP_TIME_WAIT)
258 tcp_tw_put((struct tcp_tw_bucket*)sk);
259 else
260 sock_put(sk);
261 }
262 return err;
263}
264
265static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
266{
267 int words = bits >> 5;
268
269 bits &= 0x1f;
270
271 if (words) {
272 if (memcmp(a1, a2, words << 2))
273 return 0;
274 }
275 if (bits) {
276 __u32 w1, w2;
277 __u32 mask;
278
279 w1 = a1[words];
280 w2 = a2[words];
281
282 mask = htonl((0xffffffff) << (32 - bits));
283
284 if ((w1 ^ w2) & mask)
285 return 0;
286 }
287
288 return 1;
289}
290
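bitstring_match() is a plain prefix comparison over 32-bit words in network byte order, used below by the S_COND/D_COND bytecode ops. A small self-contained check of the same idea, assuming IPv4-style single-word addresses:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    /* Same prefix test as bitstring_match() for a one-word (IPv4)
     * address: compare only the top 'bits' bits, in network byte order. */
    static int prefix_match(const char *a, const char *b, int bits)
    {
            uint32_t w1, w2, mask;

            if (inet_pton(AF_INET, a, &w1) != 1 ||
                inet_pton(AF_INET, b, &w2) != 1)
                    return 0;
            if (bits == 0)
                    return 1;
            mask = htonl(0xffffffffUL << (32 - bits));
            return !((w1 ^ w2) & mask);
    }

    int main(void)
    {
            printf("%d\n", prefix_match("10.1.2.3", "10.1.0.0", 16));   /* 1 */
            printf("%d\n", prefix_match("10.2.2.3", "10.1.0.0", 16));   /* 0 */
            return 0;
    }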
291
292static int tcpdiag_bc_run(const void *bc, int len,
293 const struct tcpdiag_entry *entry)
294{
295 while (len > 0) {
296 int yes = 1;
297 const struct tcpdiag_bc_op *op = bc;
298
299 switch (op->code) {
300 case TCPDIAG_BC_NOP:
301 break;
302 case TCPDIAG_BC_JMP:
303 yes = 0;
304 break;
305 case TCPDIAG_BC_S_GE:
306 yes = entry->sport >= op[1].no;
307 break;
308 case TCPDIAG_BC_S_LE:
309 yes = entry->dport <= op[1].no;
310 break;
311 case TCPDIAG_BC_D_GE:
312 yes = entry->dport >= op[1].no;
313 break;
314 case TCPDIAG_BC_D_LE:
315 yes = entry->dport <= op[1].no;
316 break;
317 case TCPDIAG_BC_AUTO:
318 yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
319 break;
320 case TCPDIAG_BC_S_COND:
321 case TCPDIAG_BC_D_COND:
322 {
323 struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1);
324 u32 *addr;
325
326 if (cond->port != -1 &&
327 cond->port != (op->code == TCPDIAG_BC_S_COND ?
328 entry->sport : entry->dport)) {
329 yes = 0;
330 break;
331 }
332
333 if (cond->prefix_len == 0)
334 break;
335
336 if (op->code == TCPDIAG_BC_S_COND)
337 addr = entry->saddr;
338 else
339 addr = entry->daddr;
340
341 if (bitstring_match(addr, cond->addr, cond->prefix_len))
342 break;
343 if (entry->family == AF_INET6 &&
344 cond->family == AF_INET) {
345 if (addr[0] == 0 && addr[1] == 0 &&
346 addr[2] == htonl(0xffff) &&
347 bitstring_match(addr+3, cond->addr, cond->prefix_len))
348 break;
349 }
350 yes = 0;
351 break;
352 }
353 }
354
355 if (yes) {
356 len -= op->yes;
357 bc += op->yes;
358 } else {
359 len -= op->no;
360 bc += op->no;
361 }
362 }
363 return (len == 0);
364}
365
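The filter above is a tiny forward-only bytecode: every op carries a "yes" and a "no" byte offset, a passing condition advances by "yes" and a failing one by "no", landing exactly on the end of the program accepts, and jumping past it rejects. The following userspace sketch models only that control flow with a single made-up op code, assuming the conventional {u8 code; u8 yes; u16 no} layout of struct tcpdiag_bc_op where a condition's operand lives in the next 4-byte slot:

    #include <stdio.h>
    #include <stdint.h>

    struct bc_op {
            uint8_t  code;   /* 1 = NOP, 2 = "source port >= operand" */
            uint8_t  yes;    /* byte offset to take on success        */
            uint16_t no;     /* byte offset to take on failure        */
    };

    /* Simplified model of tcpdiag_bc_run(): walk the program until we
     * run off the end (len == 0, accept) or overshoot it (reject). */
    static int bc_run(const unsigned char *bc, int len, uint16_t sport)
    {
            while (len > 0) {
                    const struct bc_op *op = (const struct bc_op *)bc;
                    int yes = 1;

                    if (op->code == 2)
                            yes = sport >= op[1].no;  /* operand in next slot */
                    if (yes) {
                            bc  += op->yes;
                            len -= op->yes;
                    } else {
                            bc  += op->no;
                            len -= op->no;
                    }
            }
            return len == 0;
    }

    int main(void)
    {
            /* Program: "sport >= 1024", one 8-byte op (condition + operand).
             * yes = 8 lands on the end (accept); no = 12 jumps past it. */
            struct bc_op prog[2] = {
                    { .code = 2, .yes = 8, .no = 12 },
                    { .code = 0, .yes = 0, .no = 1024 },   /* operand slot */
            };

            printf("sport 80   -> %d\n",
                   bc_run((unsigned char *)prog, sizeof(prog), 80));
            printf("sport 5000 -> %d\n",
                   bc_run((unsigned char *)prog, sizeof(prog), 5000));
            return 0;
    }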
366static int valid_cc(const void *bc, int len, int cc)
367{
368 while (len >= 0) {
369 const struct tcpdiag_bc_op *op = bc;
370
371 if (cc > len)
372 return 0;
373 if (cc == len)
374 return 1;
375 if (op->yes < 4)
376 return 0;
377 len -= op->yes;
378 bc += op->yes;
379 }
380 return 0;
381}
382
383static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len)
384{
385 const unsigned char *bc = bytecode;
386 int len = bytecode_len;
387
388 while (len > 0) {
389 struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
390
391//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
392 switch (op->code) {
393 case TCPDIAG_BC_AUTO:
394 case TCPDIAG_BC_S_COND:
395 case TCPDIAG_BC_D_COND:
396 case TCPDIAG_BC_S_GE:
397 case TCPDIAG_BC_S_LE:
398 case TCPDIAG_BC_D_GE:
399 case TCPDIAG_BC_D_LE:
400 if (op->yes < 4 || op->yes > len+4)
401 return -EINVAL;
402 case TCPDIAG_BC_JMP:
403 if (op->no < 4 || op->no > len+4)
404 return -EINVAL;
405 if (op->no < len &&
406 !valid_cc(bytecode, bytecode_len, len-op->no))
407 return -EINVAL;
408 break;
409 case TCPDIAG_BC_NOP:
410 if (op->yes < 4 || op->yes > len+4)
411 return -EINVAL;
412 break;
413 default:
414 return -EINVAL;
415 }
416 bc += op->yes;
417 len -= op->yes;
418 }
419 return len == 0 ? 0 : -EINVAL;
420}
421
422static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
423 struct netlink_callback *cb)
424{
425 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
426
427 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
428 struct tcpdiag_entry entry;
429 struct rtattr *bc = (struct rtattr *)(r + 1);
430 struct inet_sock *inet = inet_sk(sk);
431
432 entry.family = sk->sk_family;
433#ifdef CONFIG_IP_TCPDIAG_IPV6
434 if (entry.family == AF_INET6) {
435 struct ipv6_pinfo *np = inet6_sk(sk);
436
437 entry.saddr = np->rcv_saddr.s6_addr32;
438 entry.daddr = np->daddr.s6_addr32;
439 } else
440#endif
441 {
442 entry.saddr = &inet->rcv_saddr;
443 entry.daddr = &inet->daddr;
444 }
445 entry.sport = inet->num;
446 entry.dport = ntohs(inet->dport);
447 entry.userlocks = sk->sk_userlocks;
448
449 if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
450 return 0;
451 }
452
453 return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid,
454 cb->nlh->nlmsg_seq, NLM_F_MULTI);
455}
456
457static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
458 struct open_request *req,
459 u32 pid, u32 seq)
460{
461 struct inet_sock *inet = inet_sk(sk);
462 unsigned char *b = skb->tail;
463 struct tcpdiagmsg *r;
464 struct nlmsghdr *nlh;
465 long tmo;
466
467 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
468 nlh->nlmsg_flags = NLM_F_MULTI;
469 r = NLMSG_DATA(nlh);
470
471 r->tcpdiag_family = sk->sk_family;
472 r->tcpdiag_state = TCP_SYN_RECV;
473 r->tcpdiag_timer = 1;
474 r->tcpdiag_retrans = req->retrans;
475
476 r->id.tcpdiag_if = sk->sk_bound_dev_if;
477 r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req;
478 r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
479
480 tmo = req->expires - jiffies;
481 if (tmo < 0)
482 tmo = 0;
483
484 r->id.tcpdiag_sport = inet->sport;
485 r->id.tcpdiag_dport = req->rmt_port;
486 r->id.tcpdiag_src[0] = req->af.v4_req.loc_addr;
487 r->id.tcpdiag_dst[0] = req->af.v4_req.rmt_addr;
488 r->tcpdiag_expires = jiffies_to_msecs(tmo),
489 r->tcpdiag_rqueue = 0;
490 r->tcpdiag_wqueue = 0;
491 r->tcpdiag_uid = sock_i_uid(sk);
492 r->tcpdiag_inode = 0;
493#ifdef CONFIG_IP_TCPDIAG_IPV6
494 if (r->tcpdiag_family == AF_INET6) {
495 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
496 &req->af.v6_req.loc_addr);
497 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
498 &req->af.v6_req.rmt_addr);
499 }
500#endif
501 nlh->nlmsg_len = skb->tail - b;
502
503 return skb->len;
504
505nlmsg_failure:
506 skb_trim(skb, b - skb->data);
507 return -1;
508}
509
510static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
511 struct netlink_callback *cb)
512{
513 struct tcpdiag_entry entry;
514 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
515 struct tcp_sock *tp = tcp_sk(sk);
516 struct tcp_listen_opt *lopt;
517 struct rtattr *bc = NULL;
518 struct inet_sock *inet = inet_sk(sk);
519 int j, s_j;
520 int reqnum, s_reqnum;
521 int err = 0;
522
523 s_j = cb->args[3];
524 s_reqnum = cb->args[4];
525
526 if (s_j > 0)
527 s_j--;
528
529 entry.family = sk->sk_family;
530
531 read_lock_bh(&tp->syn_wait_lock);
532
533 lopt = tp->listen_opt;
534 if (!lopt || !lopt->qlen)
535 goto out;
536
537 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
538 bc = (struct rtattr *)(r + 1);
539 entry.sport = inet->num;
540 entry.userlocks = sk->sk_userlocks;
541 }
542
543 for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
544 struct open_request *req, *head = lopt->syn_table[j];
545
546 reqnum = 0;
547 for (req = head; req; reqnum++, req = req->dl_next) {
548 if (reqnum < s_reqnum)
549 continue;
550 if (r->id.tcpdiag_dport != req->rmt_port &&
551 r->id.tcpdiag_dport)
552 continue;
553
554 if (bc) {
555 entry.saddr =
556#ifdef CONFIG_IP_TCPDIAG_IPV6
557 (entry.family == AF_INET6) ?
558 req->af.v6_req.loc_addr.s6_addr32 :
559#endif
560 &req->af.v4_req.loc_addr;
561 entry.daddr =
562#ifdef CONFIG_IP_TCPDIAG_IPV6
563 (entry.family == AF_INET6) ?
564 req->af.v6_req.rmt_addr.s6_addr32 :
565#endif
566 &req->af.v4_req.rmt_addr;
567 entry.dport = ntohs(req->rmt_port);
568
569 if (!tcpdiag_bc_run(RTA_DATA(bc),
570 RTA_PAYLOAD(bc), &entry))
571 continue;
572 }
573
574 err = tcpdiag_fill_req(skb, sk, req,
575 NETLINK_CB(cb->skb).pid,
576 cb->nlh->nlmsg_seq);
577 if (err < 0) {
578 cb->args[3] = j + 1;
579 cb->args[4] = reqnum;
580 goto out;
581 }
582 }
583
584 s_reqnum = 0;
585 }
586
587out:
588 read_unlock_bh(&tp->syn_wait_lock);
589
590 return err;
591}
592
593static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb)
594{
595 int i, num;
596 int s_i, s_num;
597 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
598
599 s_i = cb->args[1];
600 s_num = num = cb->args[2];
601
602 if (cb->args[0] == 0) {
603 if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV)))
604 goto skip_listen_ht;
605 tcp_listen_lock();
606 for (i = s_i; i < TCP_LHTABLE_SIZE; i++) {
607 struct sock *sk;
608 struct hlist_node *node;
609
610 num = 0;
611 sk_for_each(sk, node, &tcp_listening_hash[i]) {
612 struct inet_sock *inet = inet_sk(sk);
613
614 if (num < s_num) {
615 num++;
616 continue;
617 }
618
619 if (r->id.tcpdiag_sport != inet->sport &&
620 r->id.tcpdiag_sport)
621 goto next_listen;
622
623 if (!(r->tcpdiag_states&TCPF_LISTEN) ||
624 r->id.tcpdiag_dport ||
625 cb->args[3] > 0)
626 goto syn_recv;
627
628 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
629 tcp_listen_unlock();
630 goto done;
631 }
632
633syn_recv:
634 if (!(r->tcpdiag_states&TCPF_SYN_RECV))
635 goto next_listen;
636
637 if (tcpdiag_dump_reqs(skb, sk, cb) < 0) {
638 tcp_listen_unlock();
639 goto done;
640 }
641
642next_listen:
643 cb->args[3] = 0;
644 cb->args[4] = 0;
645 ++num;
646 }
647
648 s_num = 0;
649 cb->args[3] = 0;
650 cb->args[4] = 0;
651 }
652 tcp_listen_unlock();
653skip_listen_ht:
654 cb->args[0] = 1;
655 s_i = num = s_num = 0;
656 }
657
658 if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV)))
659 return skb->len;
660
661 for (i = s_i; i < tcp_ehash_size; i++) {
662 struct tcp_ehash_bucket *head = &tcp_ehash[i];
663 struct sock *sk;
664 struct hlist_node *node;
665
666 if (i > s_i)
667 s_num = 0;
668
669 read_lock_bh(&head->lock);
670
671 num = 0;
672 sk_for_each(sk, node, &head->chain) {
673 struct inet_sock *inet = inet_sk(sk);
674
675 if (num < s_num)
676 goto next_normal;
677 if (!(r->tcpdiag_states & (1 << sk->sk_state)))
678 goto next_normal;
679 if (r->id.tcpdiag_sport != inet->sport &&
680 r->id.tcpdiag_sport)
681 goto next_normal;
682 if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
683 goto next_normal;
684 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
685 read_unlock_bh(&head->lock);
686 goto done;
687 }
688next_normal:
689 ++num;
690 }
691
692 if (r->tcpdiag_states&TCPF_TIME_WAIT) {
693 sk_for_each(sk, node,
694 &tcp_ehash[i + tcp_ehash_size].chain) {
695 struct inet_sock *inet = inet_sk(sk);
696
697 if (num < s_num)
698 goto next_dying;
699 if (r->id.tcpdiag_sport != inet->sport &&
700 r->id.tcpdiag_sport)
701 goto next_dying;
702 if (r->id.tcpdiag_dport != inet->dport &&
703 r->id.tcpdiag_dport)
704 goto next_dying;
705 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
706 read_unlock_bh(&head->lock);
707 goto done;
708 }
709next_dying:
710 ++num;
711 }
712 }
713 read_unlock_bh(&head->lock);
714 }
715
716done:
717 cb->args[1] = i;
718 cb->args[2] = num;
719 return skb->len;
720}
721
722static int tcpdiag_dump_done(struct netlink_callback *cb)
723{
724 return 0;
725}
726
727
728static __inline__ int
729tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
730{
731 if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
732 return 0;
733
734 if (nlh->nlmsg_type != TCPDIAG_GETSOCK)
735 goto err_inval;
736
737 if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len)
738 goto err_inval;
739
740 if (nlh->nlmsg_flags&NLM_F_DUMP) {
741 if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) {
742 struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq));
743 if (rta->rta_type != TCPDIAG_REQ_BYTECODE ||
744 rta->rta_len < 8 ||
745 rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq)))
746 goto err_inval;
747 if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
748 goto err_inval;
749 }
750 return netlink_dump_start(tcpnl, skb, nlh,
751 tcpdiag_dump,
752 tcpdiag_dump_done);
753 } else {
754 return tcpdiag_get_exact(skb, nlh);
755 }
756
757err_inval:
758 return -EINVAL;
759}
760
761
762static inline void tcpdiag_rcv_skb(struct sk_buff *skb)
763{
764 int err;
765 struct nlmsghdr * nlh;
766
767 if (skb->len >= NLMSG_SPACE(0)) {
768 nlh = (struct nlmsghdr *)skb->data;
769 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
770 return;
771 err = tcpdiag_rcv_msg(skb, nlh);
772 if (err || nlh->nlmsg_flags & NLM_F_ACK)
773 netlink_ack(skb, nlh, err);
774 }
775}
776
777static void tcpdiag_rcv(struct sock *sk, int len)
778{
779 struct sk_buff *skb;
780
781 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
782 tcpdiag_rcv_skb(skb);
783 kfree_skb(skb);
784 }
785}
786
787static int __init tcpdiag_init(void)
788{
789 tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv);
790 if (tcpnl == NULL)
791 return -ENOMEM;
792 return 0;
793}
794
795static void __exit tcpdiag_exit(void)
796{
797 sock_release(tcpnl->sk_socket);
798}
799
800module_init(tcpdiag_init);
801module_exit(tcpdiag_exit);
802MODULE_LICENSE("GPL");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
new file mode 100644
index 000000000000..250492735902
--- /dev/null
+++ b/net/ipv4/tcp_input.c
@@ -0,0 +1,4959 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 */
22
23/*
24 * Changes:
25 * Pedro Roque : Fast Retransmit/Recovery.
26 * Two receive queues.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
30 * Header prediction.
31 * Variable renaming.
32 *
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
46 * timestamps.
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
49 * data segments.
50 * Andi Kleen: Make sure we never ack data there is not
51 * enough room for. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
57 * fast path.
58 * J Hadi Salim: ECN support
59 * Andrei Gurtov,
60 * Pasi Sarolahti,
61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission
62 * engine. Lots of bugs are found.
63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
64 * Angelo Dell'Aera: TCP Westwood+ support
65 */
66
67#include <linux/config.h>
68#include <linux/mm.h>
69#include <linux/module.h>
70#include <linux/sysctl.h>
71#include <net/tcp.h>
72#include <net/inet_common.h>
73#include <linux/ipsec.h>
74#include <asm/unaligned.h>
75
76int sysctl_tcp_timestamps = 1;
77int sysctl_tcp_window_scaling = 1;
78int sysctl_tcp_sack = 1;
79int sysctl_tcp_fack = 1;
80int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
81int sysctl_tcp_ecn;
82int sysctl_tcp_dsack = 1;
83int sysctl_tcp_app_win = 31;
84int sysctl_tcp_adv_win_scale = 2;
85
86int sysctl_tcp_stdurg;
87int sysctl_tcp_rfc1337;
88int sysctl_tcp_max_orphans = NR_FILE;
89int sysctl_tcp_frto;
90int sysctl_tcp_nometrics_save;
91int sysctl_tcp_westwood;
92int sysctl_tcp_vegas_cong_avoid;
93
94int sysctl_tcp_moderate_rcvbuf = 1;
95
96/* Default values of the Vegas variables, in fixed-point representation
97 * with V_PARAM_SHIFT bits to the right of the binary point.
98 */
99#define V_PARAM_SHIFT 1
100int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
101int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
102int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
103int sysctl_tcp_bic = 1;
104int sysctl_tcp_bic_fast_convergence = 1;
105int sysctl_tcp_bic_low_window = 14;
106int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
107
108#define FLAG_DATA 0x01 /* Incoming frame contained data. */
109#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
110#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
111#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
112#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
113#define FLAG_DATA_SACKED 0x20 /* New SACK. */
114#define FLAG_ECE 0x40 /* ECE in this ACK */
115#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */
116#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
117
118#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
119#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
120#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
121#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
122
123#define IsReno(tp) ((tp)->rx_opt.sack_ok == 0)
124#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2)
125#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4)
126
127#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
128
129/* Adapt the MSS value used to make delayed ack decision to the
130 * real world.
131 */
132static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
133 struct sk_buff *skb)
134{
135 unsigned int len, lss;
136
137 lss = tp->ack.last_seg_size;
138 tp->ack.last_seg_size = 0;
139
140 /* skb->len may jitter because of SACKs, even if peer
141 * sends good full-sized frames.
142 */
143 len = skb->len;
144 if (len >= tp->ack.rcv_mss) {
145 tp->ack.rcv_mss = len;
146 } else {
147 /* Otherwise, we make a more careful check, taking into account
148 * that the SACK block size is variable.
149 *
150 * "len" is invariant segment length, including TCP header.
151 */
152 len += skb->data - skb->h.raw;
153 if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
154 /* If PSH is not set, packet should be
155 * full sized, provided peer TCP is not badly broken.
156 * This observation (if it is correct 8)) allows us
157 * to handle super-low MTU links fairly.
158 */
159 (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
160 !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) {
161 /* Subtract also invariant (if peer is RFC compliant),
162 * tcp header plus fixed timestamp option length.
163 * Resulting "len" is MSS free of SACK jitter.
164 */
165 len -= tp->tcp_header_len;
166 tp->ack.last_seg_size = len;
167 if (len == lss) {
168 tp->ack.rcv_mss = len;
169 return;
170 }
171 }
172 tp->ack.pending |= TCP_ACK_PUSHED;
173 }
174}
175
176static void tcp_incr_quickack(struct tcp_sock *tp)
177{
178 unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss);
179
180 if (quickacks==0)
181 quickacks=2;
182 if (quickacks > tp->ack.quick)
183 tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
184}
185
186void tcp_enter_quickack_mode(struct tcp_sock *tp)
187{
188 tcp_incr_quickack(tp);
189 tp->ack.pingpong = 0;
190 tp->ack.ato = TCP_ATO_MIN;
191}
192
193/* Send ACKs quickly, if "quick" count is not exhausted
194 * and the session is not interactive.
195 */
196
197static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp)
198{
199 return (tp->ack.quick && !tp->ack.pingpong);
200}
201
202/* Buffer size and advertised window tuning.
203 *
204 * 1. Tuning sk->sk_sndbuf, when connection enters established state.
205 */
206
207static void tcp_fixup_sndbuf(struct sock *sk)
208{
209 int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
210 sizeof(struct sk_buff);
211
212 if (sk->sk_sndbuf < 3 * sndmem)
213 sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
214}
215
216/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
217 *
218 * All of tcp_full_space() is split into two parts: the "network" buffer, allocated
219 * forward and advertised in receiver window (tp->rcv_wnd) and
220 * "application buffer", required to isolate scheduling/application
221 * latencies from network.
222 * window_clamp is maximal advertised window. It can be less than
223 * tcp_full_space(), in this case tcp_full_space() - window_clamp
224 * is reserved for "application" buffer. The smaller window_clamp is,
225 * the smoother our behaviour from the viewpoint of the network, but the
226 * lower the throughput and the higher the sensitivity of the connection to losses. 8)
227 *
228 * rcv_ssthresh is a stricter window_clamp, used in the "slow start"
229 * phase to predict further behaviour of this connection.
230 * It is used for two goals:
231 * - to enforce header prediction at sender, even when application
232 * requires some significant "application buffer". It is check #1.
233 * - to prevent pruning of receive queue because of misprediction
234 * of receiver window. Check #2.
235 *
236 * The scheme does not work when the sender sends good segments opening
237 * the window and then starts to feed us spaghetti. But it should work
238 * in common situations. Otherwise, we have to rely on queue collapsing.
239 */
240
241/* Slow part of check#2. */
242static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
243 struct sk_buff *skb)
244{
245 /* Optimize this! */
246 int truesize = tcp_win_from_space(skb->truesize)/2;
247 int window = tcp_full_space(sk)/2;
248
249 while (tp->rcv_ssthresh <= window) {
250 if (truesize <= skb->len)
251 return 2*tp->ack.rcv_mss;
252
253 truesize >>= 1;
254 window >>= 1;
255 }
256 return 0;
257}
258
259static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
260 struct sk_buff *skb)
261{
262 /* Check #1 */
263 if (tp->rcv_ssthresh < tp->window_clamp &&
264 (int)tp->rcv_ssthresh < tcp_space(sk) &&
265 !tcp_memory_pressure) {
266 int incr;
267
268 /* Check #2. Increase window, if skb with such overhead
269 * will fit to rcvbuf in future.
270 */
271 if (tcp_win_from_space(skb->truesize) <= skb->len)
272 incr = 2*tp->advmss;
273 else
274 incr = __tcp_grow_window(sk, tp, skb);
275
276 if (incr) {
277 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
278 tp->ack.quick |= 1;
279 }
280 }
281}
282
283/* 3. Tuning rcvbuf, when connection enters established state. */
284
285static void tcp_fixup_rcvbuf(struct sock *sk)
286{
287 struct tcp_sock *tp = tcp_sk(sk);
288 int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
289
290 /* Try to select rcvbuf so that 4 mss-sized segments
291 * will fit to the window and the corresponding skbs will fit to our rcvbuf.
292 * (was 3; 4 is minimum to allow fast retransmit to work.)
293 */
294 while (tcp_win_from_space(rcvmem) < tp->advmss)
295 rcvmem += 128;
296 if (sk->sk_rcvbuf < 4 * rcvmem)
297 sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
298}
299
300/* 4. Try to fixup all. It is done immediately after the connection enters
301 * established state.
302 */
303static void tcp_init_buffer_space(struct sock *sk)
304{
305 struct tcp_sock *tp = tcp_sk(sk);
306 int maxwin;
307
308 if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
309 tcp_fixup_rcvbuf(sk);
310 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
311 tcp_fixup_sndbuf(sk);
312
313 tp->rcvq_space.space = tp->rcv_wnd;
314
315 maxwin = tcp_full_space(sk);
316
317 if (tp->window_clamp >= maxwin) {
318 tp->window_clamp = maxwin;
319
320 if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
321 tp->window_clamp = max(maxwin -
322 (maxwin >> sysctl_tcp_app_win),
323 4 * tp->advmss);
324 }
325
326 /* Force reservation of one segment. */
327 if (sysctl_tcp_app_win &&
328 tp->window_clamp > 2 * tp->advmss &&
329 tp->window_clamp + tp->advmss > maxwin)
330 tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
331
332 tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
333 tp->snd_cwnd_stamp = tcp_time_stamp;
334}
335
336static void init_bictcp(struct tcp_sock *tp)
337{
338 tp->bictcp.cnt = 0;
339
340 tp->bictcp.last_max_cwnd = 0;
341 tp->bictcp.last_cwnd = 0;
342 tp->bictcp.last_stamp = 0;
343}
344
345/* 5. Recalculate window clamp after socket hit its memory bounds. */
346static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
347{
348 struct sk_buff *skb;
349 unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
350 int ofo_win = 0;
351
352 tp->ack.quick = 0;
353
354 skb_queue_walk(&tp->out_of_order_queue, skb) {
355 ofo_win += skb->len;
356 }
357
358 /* If overcommit is due to out of order segments,
359 * do not clamp window. Try to expand rcvbuf instead.
360 */
361 if (ofo_win) {
362 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
363 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
364 !tcp_memory_pressure &&
365 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
366 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
367 sysctl_tcp_rmem[2]);
368 }
369 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
370 app_win += ofo_win;
371 if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
372 app_win >>= 1;
373 if (app_win > tp->ack.rcv_mss)
374 app_win -= tp->ack.rcv_mss;
375 app_win = max(app_win, 2U*tp->advmss);
376
377 if (!ofo_win)
378 tp->window_clamp = min(tp->window_clamp, app_win);
379 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
380 }
381}
382
383/* Receiver "autotuning" code.
384 *
385 * The algorithm for RTT estimation w/o timestamps is based on
386 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
387 * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
388 *
389 * More detail on this code can be found at
390 * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
391 * though this reference is out of date. A new paper
392 * is pending.
393 */
394static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
395{
396 u32 new_sample = tp->rcv_rtt_est.rtt;
397 long m = sample;
398
399 if (m == 0)
400 m = 1;
401
402 if (new_sample != 0) {
403 /* If we took larger samples in the non-timestamp
404 * case, we could grossly overestimate the RTT, especially
405 * with chatty applications or bulk transfer apps which
406 * are stalled on filesystem I/O.
407 *
408 * Also, since we are only going for a minimum in the
409 * non-timestamp case, we do not smooth things out;
410 * otherwise, with timestamps disabled, convergence takes
411 * too long.
412 */
413 if (!win_dep) {
414 m -= (new_sample >> 3);
415 new_sample += m;
416 } else if (m < new_sample)
417 new_sample = m << 3;
418 } else {
419 /* No previous measurement. */
420 new_sample = m << 3;
421 }
422
423 if (tp->rcv_rtt_est.rtt != new_sample)
424 tp->rcv_rtt_est.rtt = new_sample;
425}
426
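The receiver-side estimate above is kept scaled by 8: the timestamp path (win_dep == 0) runs a gain-1/8 EWMA on the scaled value, while the per-window path only ever lowers the estimate toward small samples. A standalone sketch of the same arithmetic (the comparison in the windowed path is against the scaled value, exactly as in the code above):

    #include <stdio.h>

    /* Same fixed-point update as tcp_rcv_rtt_update(): 'est' holds the
     * RTT scaled by 8.  win_dep == 0 is the EWMA (timestamp) path,
     * win_dep != 0 is the per-window path. */
    static unsigned int rcv_rtt_update(unsigned int est, long sample, int win_dep)
    {
            long m = sample ? sample : 1;

            if (est == 0)
                    return m << 3;      /* first measurement, scale by 8 */
            if (!win_dep) {
                    m -= est >> 3;      /* error against the smoothed value */
                    return est + m;     /* est' = est - est/8 + sample      */
            }
            if (m < est)                /* windowed path: only move down    */
                    return m << 3;
            return est;
    }

    int main(void)
    {
            unsigned int est = 0;
            long samples[] = { 40, 48, 36, 44 };  /* RTT samples in jiffies */
            int i;

            for (i = 0; i < 4; i++) {
                    est = rcv_rtt_update(est, samples[i], 0);
                    printf("sample %ld -> est/8 = %u\n", samples[i], est >> 3);
            }
            return 0;
    }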
427static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
428{
429 if (tp->rcv_rtt_est.time == 0)
430 goto new_measure;
431 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
432 return;
433 tcp_rcv_rtt_update(tp,
434 jiffies - tp->rcv_rtt_est.time,
435 1);
436
437new_measure:
438 tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
439 tp->rcv_rtt_est.time = tcp_time_stamp;
440}
441
442static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb)
443{
444 if (tp->rx_opt.rcv_tsecr &&
445 (TCP_SKB_CB(skb)->end_seq -
446 TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss))
447 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
448}
449
450/*
451 * This function should be called every time data is copied to user space.
452 * It calculates the appropriate TCP receive buffer space.
453 */
454void tcp_rcv_space_adjust(struct sock *sk)
455{
456 struct tcp_sock *tp = tcp_sk(sk);
457 int time;
458 int space;
459
460 if (tp->rcvq_space.time == 0)
461 goto new_measure;
462
463 time = tcp_time_stamp - tp->rcvq_space.time;
464 if (time < (tp->rcv_rtt_est.rtt >> 3) ||
465 tp->rcv_rtt_est.rtt == 0)
466 return;
467
468 space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
469
470 space = max(tp->rcvq_space.space, space);
471
472 if (tp->rcvq_space.space != space) {
473 int rcvmem;
474
475 tp->rcvq_space.space = space;
476
477 if (sysctl_tcp_moderate_rcvbuf) {
478 int new_clamp = space;
479
480 /* Receive space grows, normalize in order to
481 * take into account packet headers and sk_buff
482 * structure overhead.
483 */
484 space /= tp->advmss;
485 if (!space)
486 space = 1;
487 rcvmem = (tp->advmss + MAX_TCP_HEADER +
488 16 + sizeof(struct sk_buff));
489 while (tcp_win_from_space(rcvmem) < tp->advmss)
490 rcvmem += 128;
491 space *= rcvmem;
492 space = min(space, sysctl_tcp_rmem[2]);
493 if (space > sk->sk_rcvbuf) {
494 sk->sk_rcvbuf = space;
495
496 /* Make the window clamp follow along. */
497 tp->window_clamp = new_clamp;
498 }
499 }
500 }
501
502new_measure:
503 tp->rcvq_space.seq = tp->copied_seq;
504 tp->rcvq_space.time = tcp_time_stamp;
505}
506
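tcp_rcv_space_adjust() implements the DRS idea referenced above: assume the sender may double its rate every RTT, so provision receive buffer for twice the data copied to userspace in the last RTT, then inflate by the per-segment overhead. A rough worked sketch of that provisioning, with assumed values standing in for tp->advmss and the real skb/header overhead:

    #include <stdio.h>

    int main(void)
    {
            /* Assumed values; the kernel uses tp->advmss and the real
             * MAX_TCP_HEADER plus sizeof(struct sk_buff) here. */
            const int advmss = 1460;
            const int per_skb_overhead = 1500;     /* headers + sk_buff, rough */
            const int copied_last_rtt = 64 * 1024; /* bytes read by the app    */

            int space = 2 * copied_last_rtt;       /* allow one doubling/RTT   */
            int segs = space / advmss;
            int rcvbuf = segs * (advmss + per_skb_overhead);

            printf("copied %d -> target window %d, rcvbuf about %d bytes\n",
                   copied_last_rtt, space, rcvbuf);
            return 0;
    }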
507/* There is something which you must keep in mind when you analyze the
508 * behavior of the tp->ato delayed ack timeout interval. When a
509 * connection starts up, we want to ack as quickly as possible. The
510 * problem is that "good" TCP's do slow start at the beginning of data
511 * transmission. The means that until we send the first few ACK's the
512 * sender will sit on his end and only queue most of his data, because
513 * he can only send snd_cwnd unacked packets at any given time. For
514 * each ACK we send, he increments snd_cwnd and transmits more of his
515 * queue. -DaveM
516 */
517static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
518{
519 u32 now;
520
521 tcp_schedule_ack(tp);
522
523 tcp_measure_rcv_mss(tp, skb);
524
525 tcp_rcv_rtt_measure(tp);
526
527 now = tcp_time_stamp;
528
529 if (!tp->ack.ato) {
530 /* The _first_ data packet received, initialize
531 * delayed ACK engine.
532 */
533 tcp_incr_quickack(tp);
534 tp->ack.ato = TCP_ATO_MIN;
535 } else {
536 int m = now - tp->ack.lrcvtime;
537
538 if (m <= TCP_ATO_MIN/2) {
539 /* The fastest case is the first. */
540 tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2;
541 } else if (m < tp->ack.ato) {
542 tp->ack.ato = (tp->ack.ato>>1) + m;
543 if (tp->ack.ato > tp->rto)
544 tp->ack.ato = tp->rto;
545 } else if (m > tp->rto) {
546			/* Too long gap. Apparently sender failed to
547 * restart window, so that we send ACKs quickly.
548 */
549 tcp_incr_quickack(tp);
550 sk_stream_mem_reclaim(sk);
551 }
552 }
553 tp->ack.lrcvtime = now;
554
555 TCP_ECN_check_ce(tp, skb);
556
557 if (skb->len >= 128)
558 tcp_grow_window(sk, tp, skb);
559}
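/* Illustration of the ato update above, assuming TCP_ATO_MIN corresponds to
 * roughly 40 jiffies (an assumption for the example): if ato is currently
 * ~200 and data starts arriving back to back (m near 0), successive packets
 * move ato to 100+20 = 120, then 80, 60, 50, ... converging toward
 * TCP_ATO_MIN. A gap larger than rto instead re-enters quickack mode,
 * since the sender has apparently restarted its window.
 */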
560
561/* When starting a new connection, pin down the current choice of
562 * congestion algorithm.
563 */
564void tcp_ca_init(struct tcp_sock *tp)
565{
566 if (sysctl_tcp_westwood)
567 tp->adv_cong = TCP_WESTWOOD;
568 else if (sysctl_tcp_bic)
569 tp->adv_cong = TCP_BIC;
570 else if (sysctl_tcp_vegas_cong_avoid) {
571 tp->adv_cong = TCP_VEGAS;
572 tp->vegas.baseRTT = 0x7fffffff;
573 tcp_vegas_enable(tp);
574 }
575}
576
577/* Do RTT sampling needed for Vegas.
578 * Basically we:
579 * o min-filter RTT samples from within an RTT to get the current
580 * propagation delay + queuing delay (we are min-filtering to try to
581 * avoid the effects of delayed ACKs)
582 * o min-filter RTT samples from a much longer window (forever for now)
583 * to find the propagation delay (baseRTT)
584 */
585static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
586{
587 __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
588
589 /* Filter to find propagation delay: */
590 if (vrtt < tp->vegas.baseRTT)
591 tp->vegas.baseRTT = vrtt;
592
593 /* Find the min RTT during the last RTT to find
594 * the current prop. delay + queuing delay:
595 */
596 tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
597 tp->vegas.cntRTT++;
598}
599
600/* Called to compute a smoothed rtt estimate. The data fed to this
601 * routine either comes from timestamps, or from segments that were
602 * known _not_ to have been retransmitted [see Karn/Partridge
603 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
604 * piece by Van Jacobson.
605 * NOTE: the next three routines used to be one big routine.
606 * To save cycles in the RFC 1323 implementation it was better to break
607 * it up into three procedures. -- erics
608 */
609static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
610{
611 long m = mrtt; /* RTT */
612
613 if (tcp_vegas_enabled(tp))
614 vegas_rtt_calc(tp, mrtt);
615
616 /* The following amusing code comes from Jacobson's
617 * article in SIGCOMM '88. Note that rtt and mdev
618 * are scaled versions of rtt and mean deviation.
619	 * This is designed to be as fast as possible;
620 * m stands for "measurement".
621 *
622	 * In a 1990 paper the rto value is changed to:
623 * RTO = rtt + 4 * mdev
624 *
625 * Funny. This algorithm seems to be very broken.
626	 * These formulae increase RTO when it should be decreased, increase it
627	 * too slowly when it should be increased quickly, decrease it too quickly,
628	 * etc. I guess in BSD RTO takes ONE value, so it absolutely
629	 * does not matter how to _calculate_ it. Seems it was a trap
630	 * that VJ failed to avoid. 8)
631 */
632 if(m == 0)
633 m = 1;
634 if (tp->srtt != 0) {
635 m -= (tp->srtt >> 3); /* m is now error in rtt est */
636 tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
637 if (m < 0) {
638 m = -m; /* m is now abs(error) */
639 m -= (tp->mdev >> 2); /* similar update on mdev */
640 /* This is similar to one of Eifel findings.
641 * Eifel blocks mdev updates when rtt decreases.
642 * This solution is a bit different: we use finer gain
643 * for mdev in this case (alpha*beta).
644 * Like Eifel it also prevents growth of rto,
645 * but also it limits too fast rto decreases,
646 * happening in pure Eifel.
647 */
648 if (m > 0)
649 m >>= 3;
650 } else {
651 m -= (tp->mdev >> 2); /* similar update on mdev */
652 }
653 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
654 if (tp->mdev > tp->mdev_max) {
655 tp->mdev_max = tp->mdev;
656 if (tp->mdev_max > tp->rttvar)
657 tp->rttvar = tp->mdev_max;
658 }
659 if (after(tp->snd_una, tp->rtt_seq)) {
660 if (tp->mdev_max < tp->rttvar)
661 tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2;
662 tp->rtt_seq = tp->snd_nxt;
663 tp->mdev_max = TCP_RTO_MIN;
664 }
665 } else {
666 /* no previous measure. */
667 tp->srtt = m<<3; /* take the measured time to be rtt */
668 tp->mdev = m<<1; /* make sure rto = 3*rtt */
669 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
670 tp->rtt_seq = tp->snd_nxt;
671 }
672
673 tcp_westwood_update_rtt(tp, tp->srtt >> 3);
674}
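/* A worked example of the scaled arithmetic above (the numbers are assumed):
 * srtt is kept scaled by 8 and mdev/rttvar effectively by 4. Suppose
 * srtt = 800 (~100 ticks) and mdev = 80 (~20 ticks), and a measurement of
 * 140 ticks arrives. Then m = 140 - 100 = 40, srtt becomes 840 (~105 ticks),
 * mdev becomes 80 + (40 - 20) = 100 (~25 ticks), and if rttvar follows mdev
 * the resulting rto = srtt/8 + rttvar = 105 + 100 = 205 ticks, i.e. roughly
 * rtt + 4*mdev as in the 1990 formula quoted above.
 */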
675
676/* Calculate rto without backoff. This is the second half of Van Jacobson's
677 * routine referred to above.
678 */
679static inline void tcp_set_rto(struct tcp_sock *tp)
680{
681 /* Old crap is replaced with new one. 8)
682 *
683 * More seriously:
684	 * 1. If rtt variance happened to be less than 50 msec, it is a hallucination.
685 * It cannot be less due to utterly erratic ACK generation made
686 * at least by solaris and freebsd. "Erratic ACKs" has _nothing_
687 * to do with delayed acks, because at cwnd>2 true delack timeout
688 * is invisible. Actually, Linux-2.4 also generates erratic
689	 *    ACKs in some circumstances.
690 */
691 tp->rto = (tp->srtt >> 3) + tp->rttvar;
692
693 /* 2. Fixups made earlier cannot be right.
694 * If we do not estimate RTO correctly without them,
695 * all the algo is pure shit and should be replaced
696	 *    with a correct one, which is exactly what we pretend to do.
697 */
698}
699
700/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
701 * guarantees that rto is higher.
702 */
703static inline void tcp_bound_rto(struct tcp_sock *tp)
704{
705 if (tp->rto > TCP_RTO_MAX)
706 tp->rto = TCP_RTO_MAX;
707}
708
709/* Save metrics learned by this TCP session.
710   This function is called only when TCP finishes successfully,
711 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
712 */
713void tcp_update_metrics(struct sock *sk)
714{
715 struct tcp_sock *tp = tcp_sk(sk);
716 struct dst_entry *dst = __sk_dst_get(sk);
717
718 if (sysctl_tcp_nometrics_save)
719 return;
720
721 dst_confirm(dst);
722
723 if (dst && (dst->flags&DST_HOST)) {
724 int m;
725
726 if (tp->backoff || !tp->srtt) {
727 /* This session failed to estimate rtt. Why?
728 * Probably, no packets returned in time.
729 * Reset our results.
730 */
731 if (!(dst_metric_locked(dst, RTAX_RTT)))
732 dst->metrics[RTAX_RTT-1] = 0;
733 return;
734 }
735
736 m = dst_metric(dst, RTAX_RTT) - tp->srtt;
737
738		/* If the newly calculated rtt is larger than the stored one,
739		 * store the new one. Otherwise, use EWMA. Remember,
740 * rtt overestimation is always better than underestimation.
741 */
742 if (!(dst_metric_locked(dst, RTAX_RTT))) {
743 if (m <= 0)
744 dst->metrics[RTAX_RTT-1] = tp->srtt;
745 else
746 dst->metrics[RTAX_RTT-1] -= (m>>3);
747 }
748
749 if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
750 if (m < 0)
751 m = -m;
752
753 /* Scale deviation to rttvar fixed point */
754 m >>= 1;
755 if (m < tp->mdev)
756 m = tp->mdev;
757
758 if (m >= dst_metric(dst, RTAX_RTTVAR))
759 dst->metrics[RTAX_RTTVAR-1] = m;
760 else
761 dst->metrics[RTAX_RTTVAR-1] -=
762 (dst->metrics[RTAX_RTTVAR-1] - m)>>2;
763 }
764
765 if (tp->snd_ssthresh >= 0xFFFF) {
766 /* Slow start still did not finish. */
767 if (dst_metric(dst, RTAX_SSTHRESH) &&
768 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
769 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
770 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
771 if (!dst_metric_locked(dst, RTAX_CWND) &&
772 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
773 dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
774 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
775 tp->ca_state == TCP_CA_Open) {
776 /* Cong. avoidance phase, cwnd is reliable. */
777 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
778 dst->metrics[RTAX_SSTHRESH-1] =
779 max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
780 if (!dst_metric_locked(dst, RTAX_CWND))
781 dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1;
782 } else {
783			/* Else slow start did not finish, cwnd is nonsense,
784			   ssthresh may also be invalid.
785 */
786 if (!dst_metric_locked(dst, RTAX_CWND))
787 dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1;
788 if (dst->metrics[RTAX_SSTHRESH-1] &&
789 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
790 tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1])
791 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
792 }
793
794 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
795 if (dst->metrics[RTAX_REORDERING-1] < tp->reordering &&
796 tp->reordering != sysctl_tcp_reordering)
797 dst->metrics[RTAX_REORDERING-1] = tp->reordering;
798 }
799 }
800}
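/* Rough example of the RTAX_RTT smoothing above (the values are assumed):
 * if the cached destination RTT is 960 and this session measured
 * srtt = 800, then m = 160 > 0 and the cache moves 1/8 of the way down,
 * to 960 - 160/8 = 940. If the session's srtt had been larger than the
 * cache, the cached value would simply be replaced, since RTT
 * overestimation is preferred to underestimation.
 */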
801
802/* Numbers are taken from RFC2414. */
803__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
804{
805 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
806
807 if (!cwnd) {
808 if (tp->mss_cache_std > 1460)
809 cwnd = 2;
810 else
811 cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
812 }
813 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
814}
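/* Examples of the RFC2414 table above (the MSS values are illustrative):
 * with no cached RTAX_INITCWND, mss_cache_std = 536 gives cwnd = 4,
 * 1460 gives cwnd = 3, and a jumbo 8960 gives cwnd = 2, always capped
 * by snd_cwnd_clamp.
 */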
815
816/* Initialize metrics on socket. */
817
818static void tcp_init_metrics(struct sock *sk)
819{
820 struct tcp_sock *tp = tcp_sk(sk);
821 struct dst_entry *dst = __sk_dst_get(sk);
822
823 if (dst == NULL)
824 goto reset;
825
826 dst_confirm(dst);
827
828 if (dst_metric_locked(dst, RTAX_CWND))
829 tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
830 if (dst_metric(dst, RTAX_SSTHRESH)) {
831 tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
832 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
833 tp->snd_ssthresh = tp->snd_cwnd_clamp;
834 }
835 if (dst_metric(dst, RTAX_REORDERING) &&
836 tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
837 tp->rx_opt.sack_ok &= ~2;
838 tp->reordering = dst_metric(dst, RTAX_REORDERING);
839 }
840
841 if (dst_metric(dst, RTAX_RTT) == 0)
842 goto reset;
843
844 if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
845 goto reset;
846
847 /* Initial rtt is determined from SYN,SYN-ACK.
848 * The segment is small and rtt may appear much
849	 * less than the real one. Use per-dst memory
850 * to make it more realistic.
851 *
852	 * A bit of theory. RTT is the time that passes after a "normal" sized
853	 * packet is sent until it is ACKed. In normal circumstances sending small
854	 * packets forces the peer to delay ACKs and the calculation is correct too.
855	 * The algorithm is adaptive and, provided we follow specs, it
856	 * NEVER underestimates RTT. BUT! If the peer tries some clever
857	 * tricks, sort of "quick acks", for long enough to decrease RTT
858	 * to a low value, and then abruptly stops doing it and starts to delay
859	 * ACKs, expect trouble.
860 */
861 if (dst_metric(dst, RTAX_RTT) > tp->srtt) {
862 tp->srtt = dst_metric(dst, RTAX_RTT);
863 tp->rtt_seq = tp->snd_nxt;
864 }
865 if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) {
866 tp->mdev = dst_metric(dst, RTAX_RTTVAR);
867 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
868 }
869 tcp_set_rto(tp);
870 tcp_bound_rto(tp);
871 if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
872 goto reset;
873 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
874 tp->snd_cwnd_stamp = tcp_time_stamp;
875 return;
876
877reset:
878 /* Play conservative. If timestamps are not
879 * supported, TCP will fail to recalculate correct
880 * rtt, if initial rto is too small. FORGET ALL AND RESET!
881 */
882 if (!tp->rx_opt.saw_tstamp && tp->srtt) {
883 tp->srtt = 0;
884 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
885 tp->rto = TCP_TIMEOUT_INIT;
886 }
887}
888
889static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
890{
891 if (metric > tp->reordering) {
892 tp->reordering = min(TCP_MAX_REORDERING, metric);
893
894		/* This exciting event is worth remembering. 8) */
895 if (ts)
896 NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER);
897 else if (IsReno(tp))
898 NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER);
899 else if (IsFack(tp))
900 NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER);
901 else
902 NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
903#if FASTRETRANS_DEBUG > 1
904 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
905 tp->rx_opt.sack_ok, tp->ca_state,
906 tp->reordering,
907 tp->fackets_out,
908 tp->sacked_out,
909 tp->undo_marker ? tp->undo_retrans : 0);
910#endif
911		/* Disable FACK for now. */
912 tp->rx_opt.sack_ok &= ~2;
913 }
914}
915
916/* This procedure tags the retransmission queue when SACKs arrive.
917 *
918 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
919 * Packets in queue with these bits set are counted in variables
920 * sacked_out, retrans_out and lost_out, correspondingly.
921 *
922 * Valid combinations are:
923 * Tag InFlight Description
924 * 0 1 - orig segment is in flight.
925 * S 0 - nothing flies, orig reached receiver.
926 * L 0 - nothing flies, orig lost by net.
927 * R 2 - both orig and retransmit are in flight.
928 * L|R 1 - orig is lost, retransmit is in flight.
929 * S|R 1 - orig reached receiver, retrans is still in flight.
930 * (L|S|R is logically valid, it could occur when L|R is sacked,
931 *    but it is equivalent to plain S and the code short-circuits it to S.
932 * L|S is logically invalid, it would mean -1 packet in flight 8))
933 *
934 * These 6 states form a finite state machine, controlled by the following events:
935 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
936 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
937 * 3. Loss detection event of one of three flavors:
938 * A. Scoreboard estimator decided the packet is lost.
939 * A'. Reno "three dupacks" marks head of queue lost.
940 *	A''. Its FACK modification, head until snd.fack is lost.
941 * B. SACK arrives sacking data transmitted after never retransmitted
942 * hole was sent out.
943 * C. SACK arrives sacking SND.NXT at the moment, when the
944 * segment was retransmitted.
945 * 4. D-SACK added new rule: D-SACK changes any tag to S.
946 *
947 * It is pleasant to note that the state diagram turns out to be commutative,
948 * so that we are allowed not to be bothered by order of our actions,
949 * when multiple events arrive simultaneously. (see the function below).
950 *
951 * Reordering detection.
952 * --------------------
953 * The reordering metric is the maximal distance by which a packet can be
954 * displaced in the packet stream. With SACKs we can estimate it:
955 *
956 * 1. SACK fills old hole and the corresponding segment was not
957 * ever retransmitted -> reordering. Alas, we cannot use it
958 * when segment was retransmitted.
959 * 2. The last flaw is solved with D-SACK. D-SACK arrives
960 *	   for a retransmitted and already SACKed segment -> reordering.
961 * Neither of these heuristics is used in the Loss state, when we cannot
962 * account for retransmits accurately.
963 */
964static int
965tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
966{
967 struct tcp_sock *tp = tcp_sk(sk);
968 unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
969 struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
970 int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
971 int reord = tp->packets_out;
972 int prior_fackets;
973 u32 lost_retrans = 0;
974 int flag = 0;
975 int i;
976
977 /* So, SACKs for already sent large segments will be lost.
978	 * Not good, but the alternative is to resegment the queue. */
979 if (sk->sk_route_caps & NETIF_F_TSO) {
980 sk->sk_route_caps &= ~NETIF_F_TSO;
981 sock_set_flag(sk, SOCK_NO_LARGESEND);
982 tp->mss_cache = tp->mss_cache_std;
983 }
984
985 if (!tp->sacked_out)
986 tp->fackets_out = 0;
987 prior_fackets = tp->fackets_out;
988
989 for (i=0; i<num_sacks; i++, sp++) {
990 struct sk_buff *skb;
991 __u32 start_seq = ntohl(sp->start_seq);
992 __u32 end_seq = ntohl(sp->end_seq);
993 int fack_count = 0;
994 int dup_sack = 0;
995
996 /* Check for D-SACK. */
997 if (i == 0) {
998 u32 ack = TCP_SKB_CB(ack_skb)->ack_seq;
999
1000 if (before(start_seq, ack)) {
1001 dup_sack = 1;
1002 tp->rx_opt.sack_ok |= 4;
1003 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
1004 } else if (num_sacks > 1 &&
1005 !after(end_seq, ntohl(sp[1].end_seq)) &&
1006 !before(start_seq, ntohl(sp[1].start_seq))) {
1007 dup_sack = 1;
1008 tp->rx_opt.sack_ok |= 4;
1009 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
1010 }
1011
1012 /* D-SACK for already forgotten data...
1013 * Do dumb counting. */
1014 if (dup_sack &&
1015 !after(end_seq, prior_snd_una) &&
1016 after(end_seq, tp->undo_marker))
1017 tp->undo_retrans--;
1018
1019 /* Eliminate too old ACKs, but take into
1020 * account more or less fresh ones, they can
1021 * contain valid SACK info.
1022 */
1023 if (before(ack, prior_snd_una - tp->max_window))
1024 return 0;
1025 }
1026
1027 /* Event "B" in the comment above. */
1028 if (after(end_seq, tp->high_seq))
1029 flag |= FLAG_DATA_LOST;
1030
1031 sk_stream_for_retrans_queue(skb, sk) {
1032 u8 sacked = TCP_SKB_CB(skb)->sacked;
1033 int in_sack;
1034
1035 /* The retransmission queue is always in order, so
1036 * we can short-circuit the walk early.
1037 */
1038 if(!before(TCP_SKB_CB(skb)->seq, end_seq))
1039 break;
1040
1041 fack_count += tcp_skb_pcount(skb);
1042
1043 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1044 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1045
1046 /* Account D-SACK for retransmitted packet. */
1047 if ((dup_sack && in_sack) &&
1048 (sacked & TCPCB_RETRANS) &&
1049 after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
1050 tp->undo_retrans--;
1051
1052 /* The frame is ACKed. */
1053 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) {
1054 if (sacked&TCPCB_RETRANS) {
1055 if ((dup_sack && in_sack) &&
1056 (sacked&TCPCB_SACKED_ACKED))
1057 reord = min(fack_count, reord);
1058 } else {
1059 /* If it was in a hole, we detected reordering. */
1060 if (fack_count < prior_fackets &&
1061 !(sacked&TCPCB_SACKED_ACKED))
1062 reord = min(fack_count, reord);
1063 }
1064
1065 /* Nothing to do; acked frame is about to be dropped. */
1066 continue;
1067 }
1068
1069 if ((sacked&TCPCB_SACKED_RETRANS) &&
1070 after(end_seq, TCP_SKB_CB(skb)->ack_seq) &&
1071 (!lost_retrans || after(end_seq, lost_retrans)))
1072 lost_retrans = end_seq;
1073
1074 if (!in_sack)
1075 continue;
1076
1077 if (!(sacked&TCPCB_SACKED_ACKED)) {
1078 if (sacked & TCPCB_SACKED_RETRANS) {
1079 /* If the segment is not tagged as lost,
1080 * we do not clear RETRANS, believing
1081 * that retransmission is still in flight.
1082 */
1083 if (sacked & TCPCB_LOST) {
1084 TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
1085 tp->lost_out -= tcp_skb_pcount(skb);
1086 tp->retrans_out -= tcp_skb_pcount(skb);
1087 }
1088 } else {
1089 /* New sack for not retransmitted frame,
1090 * which was in hole. It is reordering.
1091 */
1092 if (!(sacked & TCPCB_RETRANS) &&
1093 fack_count < prior_fackets)
1094 reord = min(fack_count, reord);
1095
1096 if (sacked & TCPCB_LOST) {
1097 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1098 tp->lost_out -= tcp_skb_pcount(skb);
1099 }
1100 }
1101
1102 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
1103 flag |= FLAG_DATA_SACKED;
1104 tp->sacked_out += tcp_skb_pcount(skb);
1105
1106 if (fack_count > tp->fackets_out)
1107 tp->fackets_out = fack_count;
1108 } else {
1109 if (dup_sack && (sacked&TCPCB_RETRANS))
1110 reord = min(fack_count, reord);
1111 }
1112
1113 /* D-SACK. We can detect redundant retransmission
1114 * in S|R and plain R frames and clear it.
1115 * undo_retrans is decreased above, L|R frames
1116 * are accounted above as well.
1117 */
1118 if (dup_sack &&
1119 (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
1120 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1121 tp->retrans_out -= tcp_skb_pcount(skb);
1122 }
1123 }
1124 }
1125
1126 /* Check for lost retransmit. This superb idea is
1127 * borrowed from "ratehalving". Event "C".
1128 * Later note: FACK people cheated me again 8),
1129 * we have to account for reordering! Ugly,
1130 * but should help.
1131 */
1132 if (lost_retrans && tp->ca_state == TCP_CA_Recovery) {
1133 struct sk_buff *skb;
1134
1135 sk_stream_for_retrans_queue(skb, sk) {
1136 if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
1137 break;
1138 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1139 continue;
1140 if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) &&
1141 after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) &&
1142 (IsFack(tp) ||
1143 !before(lost_retrans,
1144 TCP_SKB_CB(skb)->ack_seq + tp->reordering *
1145 tp->mss_cache_std))) {
1146 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1147 tp->retrans_out -= tcp_skb_pcount(skb);
1148
1149 if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
1150 tp->lost_out += tcp_skb_pcount(skb);
1151 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1152 flag |= FLAG_DATA_SACKED;
1153 NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
1154 }
1155 }
1156 }
1157 }
1158
1159 tp->left_out = tp->sacked_out + tp->lost_out;
1160
1161 if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss)
1162 tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0);
1163
1164#if FASTRETRANS_DEBUG > 0
1165 BUG_TRAP((int)tp->sacked_out >= 0);
1166 BUG_TRAP((int)tp->lost_out >= 0);
1167 BUG_TRAP((int)tp->retrans_out >= 0);
1168 BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0);
1169#endif
1170 return flag;
1171}
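/* A rough reading of the reordering bookkeeping above (the numbers are
 * assumed): if the forward-most SACK so far already covered the 8th segment
 * of the queue (fackets_out = 8) and a new SACK fills a never-retransmitted
 * hole only 3 segments from the head, reord drops to 3 and, outside Loss
 * state, tcp_update_reordering() is fed (8 + 1) - 3 = 6, the distance by
 * which that segment was displaced.
 */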
1172
1173/* RTO occurred, but do not yet enter loss state. Instead, transmit two new
1174 * segments to see from the next ACKs whether any data was really missing.
1175 * If the RTO was spurious, new ACKs should arrive.
1176 */
1177void tcp_enter_frto(struct sock *sk)
1178{
1179 struct tcp_sock *tp = tcp_sk(sk);
1180 struct sk_buff *skb;
1181
1182 tp->frto_counter = 1;
1183
1184 if (tp->ca_state <= TCP_CA_Disorder ||
1185 tp->snd_una == tp->high_seq ||
1186 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1187 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1188 if (!tcp_westwood_ssthresh(tp))
1189 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
1190 }
1191
1192 /* Have to clear retransmission markers here to keep the bookkeeping
1193 * in shape, even though we are not yet in Loss state.
1194 * If something was really lost, it is eventually caught up
1195 * in tcp_enter_frto_loss.
1196 */
1197 tp->retrans_out = 0;
1198 tp->undo_marker = tp->snd_una;
1199 tp->undo_retrans = 0;
1200
1201 sk_stream_for_retrans_queue(skb, sk) {
1202 TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS;
1203 }
1204 tcp_sync_left_out(tp);
1205
1206 tcp_set_ca_state(tp, TCP_CA_Open);
1207 tp->frto_highmark = tp->snd_nxt;
1208}
1209
1210/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
1211 * which indicates that we should follow the traditional RTO recovery,
1212 * i.e. mark everything lost and do go-back-N retransmission.
1213 */
1214static void tcp_enter_frto_loss(struct sock *sk)
1215{
1216 struct tcp_sock *tp = tcp_sk(sk);
1217 struct sk_buff *skb;
1218 int cnt = 0;
1219
1220 tp->sacked_out = 0;
1221 tp->lost_out = 0;
1222 tp->fackets_out = 0;
1223
1224 sk_stream_for_retrans_queue(skb, sk) {
1225 cnt += tcp_skb_pcount(skb);
1226 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1227 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
1228
1229 /* Do not mark those segments lost that were
1230 * forward transmitted after RTO
1231 */
1232 if (!after(TCP_SKB_CB(skb)->end_seq,
1233 tp->frto_highmark)) {
1234 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1235 tp->lost_out += tcp_skb_pcount(skb);
1236 }
1237 } else {
1238 tp->sacked_out += tcp_skb_pcount(skb);
1239 tp->fackets_out = cnt;
1240 }
1241 }
1242 tcp_sync_left_out(tp);
1243
1244 tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1;
1245 tp->snd_cwnd_cnt = 0;
1246 tp->snd_cwnd_stamp = tcp_time_stamp;
1247 tp->undo_marker = 0;
1248 tp->frto_counter = 0;
1249
1250 tp->reordering = min_t(unsigned int, tp->reordering,
1251 sysctl_tcp_reordering);
1252 tcp_set_ca_state(tp, TCP_CA_Loss);
1253 tp->high_seq = tp->frto_highmark;
1254 TCP_ECN_queue_cwr(tp);
1255
1256 init_bictcp(tp);
1257}
1258
1259void tcp_clear_retrans(struct tcp_sock *tp)
1260{
1261 tp->left_out = 0;
1262 tp->retrans_out = 0;
1263
1264 tp->fackets_out = 0;
1265 tp->sacked_out = 0;
1266 tp->lost_out = 0;
1267
1268 tp->undo_marker = 0;
1269 tp->undo_retrans = 0;
1270}
1271
1272/* Enter Loss state. If "how" is not zero, forget all SACK information
1273 * and reset tags completely, otherwise preserve SACKs. If receiver
1274 * dropped its ofo queue, we will know this due to reneging detection.
1275 */
1276void tcp_enter_loss(struct sock *sk, int how)
1277{
1278 struct tcp_sock *tp = tcp_sk(sk);
1279 struct sk_buff *skb;
1280 int cnt = 0;
1281
1282 /* Reduce ssthresh if it has not yet been made inside this window. */
1283 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
1284 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1285 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1286 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
1287 }
1288 tp->snd_cwnd = 1;
1289 tp->snd_cwnd_cnt = 0;
1290 tp->snd_cwnd_stamp = tcp_time_stamp;
1291
1292 tcp_clear_retrans(tp);
1293
1294 /* Push undo marker, if it was plain RTO and nothing
1295 * was retransmitted. */
1296 if (!how)
1297 tp->undo_marker = tp->snd_una;
1298
1299 sk_stream_for_retrans_queue(skb, sk) {
1300 cnt += tcp_skb_pcount(skb);
1301 if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
1302 tp->undo_marker = 0;
1303 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
1304 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
1305 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1306 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1307 tp->lost_out += tcp_skb_pcount(skb);
1308 } else {
1309 tp->sacked_out += tcp_skb_pcount(skb);
1310 tp->fackets_out = cnt;
1311 }
1312 }
1313 tcp_sync_left_out(tp);
1314
1315 tp->reordering = min_t(unsigned int, tp->reordering,
1316 sysctl_tcp_reordering);
1317 tcp_set_ca_state(tp, TCP_CA_Loss);
1318 tp->high_seq = tp->snd_nxt;
1319 TCP_ECN_queue_cwr(tp);
1320}
1321
1322static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
1323{
1324 struct sk_buff *skb;
1325
1326 /* If ACK arrived pointing to a remembered SACK,
1327 * it means that our remembered SACKs do not reflect
1328	 * the real state of the receiver, i.e.
1329 * receiver _host_ is heavily congested (or buggy).
1330 * Do processing similar to RTO timeout.
1331 */
1332 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
1333 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
1334 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
1335
1336 tcp_enter_loss(sk, 1);
1337 tp->retransmits++;
1338 tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
1339 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1340 return 1;
1341 }
1342 return 0;
1343}
1344
1345static inline int tcp_fackets_out(struct tcp_sock *tp)
1346{
1347 return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
1348}
1349
1350static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb)
1351{
1352 return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto);
1353}
1354
1355static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
1356{
1357 return tp->packets_out &&
1358 tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue));
1359}
1360
1361/* Linux NewReno/SACK/FACK/ECN state machine.
1362 * --------------------------------------
1363 *
1364 * "Open" Normal state, no dubious events, fast path.
1365 * "Disorder"   In all respects it is "Open",
1366 *		but requires a bit more attention. It is entered when
1367 *		we see some SACKs or dupacks. It is split off from "Open"
1368 *		mainly to move some processing from the fast path to the slow one.
1369 * "CWR" CWND was reduced due to some Congestion Notification event.
1370 * It can be ECN, ICMP source quench, local device congestion.
1371 * "Recovery" CWND was reduced, we are fast-retransmitting.
1372 * "Loss" CWND was reduced due to RTO timeout or SACK reneging.
1373 *
1374 * tcp_fastretrans_alert() is entered:
1375 * - each incoming ACK, if state is not "Open"
1376 * - when arrived ACK is unusual, namely:
1377 * * SACK
1378 * * Duplicate ACK.
1379 * * ECN ECE.
1380 *
1381 * Counting packets in flight is pretty simple.
1382 *
1383 * in_flight = packets_out - left_out + retrans_out
1384 *
1385 * packets_out is SND.NXT-SND.UNA counted in packets.
1386 *
1387 * retrans_out is number of retransmitted segments.
1388 *
1389 *	left_out is the number of segments that left the network, but are not ACKed yet.
1390 *
1391 * left_out = sacked_out + lost_out
1392 *
1393 *	sacked_out: Packets which arrived at the receiver out of order
1394 *		and hence not ACKed. With SACKs this number is simply the
1395 *		amount of SACKed data. Even without SACKs
1396 *		it is easy to give a pretty reliable estimate of this number
1397 *		by counting duplicate ACKs.
1398 *
1399 * lost_out: Packets lost by network. TCP has no explicit
1400 *		"loss notification" feedback from the network (for now).
1401 *		It means that this number can only be _guessed_.
1402 *		Actually, it is the heuristic used to predict losses that
1403 *		distinguishes the different algorithms.
1404 *
1405 * F.e. after RTO, when all the queue is considered as lost,
1406 * lost_out = packets_out and in_flight = retrans_out.
1407 *
1408 * Essentially, we have now two algorithms counting
1409 * lost packets.
1410 *
1411 *		FACK: It is the simplest heuristic. As soon as we decide
1412 *		that something is lost, we decide that _all_ not-SACKed
1413 *		packets up to the most forward SACK are lost. I.e.
1414 *		lost_out = fackets_out - sacked_out and left_out = fackets_out.
1415 *		It is an absolutely correct estimate, if the network does not reorder
1416 * packets. And it loses any connection to reality when reordering
1417 * takes place. We use FACK by default until reordering
1418 * is suspected on the path to this destination.
1419 *
1420 * NewReno: when Recovery is entered, we assume that one segment
1421 * is lost (classic Reno). While we are in Recovery and
1422 * a partial ACK arrives, we assume that one more packet
1423 *		is lost (NewReno). These heuristics are the same in NewReno
1424 * and SACK.
1425 *
1426 * Imagine, that's all! Forget about all this shamanism about CWND inflation,
1427 * deflation etc. CWND is the real congestion window, never inflated; it changes
1428 * only according to classic VJ rules.
1429 *
1430 * The really tricky (and carefully tuned) part of the algorithm
1431 * is hidden in the functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
1432 * The first determines the moment _when_ we should reduce CWND and,
1433 * hence, slow down forward transmission. In fact, it determines the moment
1434 * when we decide that hole is caused by loss, rather than by a reorder.
1435 *
1436 * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
1437 * holes, caused by lost packets.
1438 *
1439 * And the most logically complicated part of the algorithm is the undo
1440 * heuristics. We detect false retransmits due to both too early
1441 * fast retransmit (reordering) and underestimated RTO, analyzing
1442 * timestamps and D-SACKs. When we detect that some segments were
1443 * retransmitted by mistake and CWND reduction was wrong, we undo
1444 * window reduction and abort recovery phase. This logic is hidden
1445 * inside several functions named tcp_try_undo_<something>.
1446 */
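/* A small worked example of the accounting above (the numbers are assumed):
 * with packets_out = 10, of which 3 are SACKed (sacked_out = 3), 2 are
 * marked lost (lost_out = 2) and 1 has been retransmitted
 * (retrans_out = 1), left_out = 3 + 2 = 5 and
 * in_flight = 10 - 5 + 1 = 6 segments still presumed in the network.
 */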
1447
1448/* This function decides when we should leave the Disorder state
1449 * and enter Recovery phase, reducing congestion window.
1450 *
1451 * Main question: may we further continue forward transmission
1452 * with the same cwnd?
1453 */
1454static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
1455{
1456 __u32 packets_out;
1457
1458 /* Trick#1: The loss is proven. */
1459 if (tp->lost_out)
1460 return 1;
1461
1462 /* Not-A-Trick#2 : Classic rule... */
1463 if (tcp_fackets_out(tp) > tp->reordering)
1464 return 1;
1465
1466 /* Trick#3 : when we use RFC2988 timer restart, fast
1467 * retransmit can be triggered by timeout of queue head.
1468 */
1469 if (tcp_head_timedout(sk, tp))
1470 return 1;
1471
1472 /* Trick#4: It is still not OK... But will it be useful to delay
1473 * recovery more?
1474 */
1475 packets_out = tp->packets_out;
1476 if (packets_out <= tp->reordering &&
1477 tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
1478 !tcp_may_send_now(sk, tp)) {
1479 /* We have nothing to send. This connection is limited
1480 * either by receiver window or by application.
1481 */
1482 return 1;
1483 }
1484
1485 return 0;
1486}
1487
1488/* If we receive more dupacks than we expected while counting segments
1489 * under the assumption of no reordering, interpret this as reordering.
1490 * The only other reason could be a bug in the receiver TCP.
1491 */
1492static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
1493{
1494 u32 holes;
1495
1496 holes = max(tp->lost_out, 1U);
1497 holes = min(holes, tp->packets_out);
1498
1499 if ((tp->sacked_out + holes) > tp->packets_out) {
1500 tp->sacked_out = tp->packets_out - holes;
1501 tcp_update_reordering(tp, tp->packets_out+addend, 0);
1502 }
1503}
1504
1505/* Emulate SACKs for SACKless connection: account for a new dupack. */
1506
1507static void tcp_add_reno_sack(struct tcp_sock *tp)
1508{
1509 tp->sacked_out++;
1510 tcp_check_reno_reordering(tp, 0);
1511 tcp_sync_left_out(tp);
1512}
1513
1514/* Account for ACK, ACKing some data in Reno Recovery phase. */
1515
1516static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked)
1517{
1518 if (acked > 0) {
1519 /* One ACK acked hole. The rest eat duplicate ACKs. */
1520 if (acked-1 >= tp->sacked_out)
1521 tp->sacked_out = 0;
1522 else
1523 tp->sacked_out -= acked-1;
1524 }
1525 tcp_check_reno_reordering(tp, acked);
1526 tcp_sync_left_out(tp);
1527}
1528
1529static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
1530{
1531 tp->sacked_out = 0;
1532 tp->left_out = tp->lost_out;
1533}
1534
1535/* Mark head of queue up as lost. */
1536static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
1537 int packets, u32 high_seq)
1538{
1539 struct sk_buff *skb;
1540 int cnt = packets;
1541
1542 BUG_TRAP(cnt <= tp->packets_out);
1543
1544 sk_stream_for_retrans_queue(skb, sk) {
1545 cnt -= tcp_skb_pcount(skb);
1546 if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
1547 break;
1548 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
1549 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1550 tp->lost_out += tcp_skb_pcount(skb);
1551 }
1552 }
1553 tcp_sync_left_out(tp);
1554}
1555
1556/* Account newly detected lost packet(s) */
1557
1558static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
1559{
1560 if (IsFack(tp)) {
1561 int lost = tp->fackets_out - tp->reordering;
1562 if (lost <= 0)
1563 lost = 1;
1564 tcp_mark_head_lost(sk, tp, lost, tp->high_seq);
1565 } else {
1566 tcp_mark_head_lost(sk, tp, 1, tp->high_seq);
1567 }
1568
1569	/* New heuristic: it became possible only after we switched
1570	 * to restarting the timer each time something is ACKed.
1571	 * Hence, we can detect timed-out packets during fast
1572	 * retransmit without falling back to slow start.
1573 */
1574 if (tcp_head_timedout(sk, tp)) {
1575 struct sk_buff *skb;
1576
1577 sk_stream_for_retrans_queue(skb, sk) {
1578 if (tcp_skb_timedout(tp, skb) &&
1579 !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
1580 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1581 tp->lost_out += tcp_skb_pcount(skb);
1582 }
1583 }
1584 tcp_sync_left_out(tp);
1585 }
1586}
1587
1588/* CWND moderation, preventing bursts due to too big ACKs
1589 * in dubious situations.
1590 */
1591static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1592{
1593 tp->snd_cwnd = min(tp->snd_cwnd,
1594 tcp_packets_in_flight(tp)+tcp_max_burst(tp));
1595 tp->snd_cwnd_stamp = tcp_time_stamp;
1596}
1597
1598/* Decrease cwnd each second ack. */
1599
1600static void tcp_cwnd_down(struct tcp_sock *tp)
1601{
1602 int decr = tp->snd_cwnd_cnt + 1;
1603 __u32 limit;
1604
1605 /*
1606 * TCP Westwood
1607	 * Here the limit is evaluated as BWestimation*RTTmin (to obtain it
1608	 * in packets we use mss_cache). If sysctl_tcp_westwood is off,
1609	 * tcp_westwood_bw_rttmin() returns 0. In that case snd_ssthresh is
1610 * still used as usual. It prevents other strange cases in which
1611 * BWE*RTTmin could assume value 0. It should not happen but...
1612 */
1613
1614 if (!(limit = tcp_westwood_bw_rttmin(tp)))
1615 limit = tp->snd_ssthresh/2;
1616
1617 tp->snd_cwnd_cnt = decr&1;
1618 decr >>= 1;
1619
1620 if (decr && tp->snd_cwnd > limit)
1621 tp->snd_cwnd -= decr;
1622
1623 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
1624 tp->snd_cwnd_stamp = tcp_time_stamp;
1625}
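/* Rough example of the rate halving above (the numbers are assumed): suppose
 * we enter CWR/Recovery with snd_cwnd = 20 and snd_ssthresh = 10, so
 * limit = 5 (or BWE*RTTmin for Westwood). Every second ACK then takes one
 * segment off snd_cwnd while it stays above the limit, and snd_cwnd is also
 * clamped to packets_in_flight + 1, so the sending rate is roughly halved
 * over the course of one RTT.
 */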
1626
1627/* Nothing was retransmitted, or the returned timestamp is less
1628 * than the timestamp of the first retransmission.
1629 */
1630static inline int tcp_packet_delayed(struct tcp_sock *tp)
1631{
1632 return !tp->retrans_stamp ||
1633 (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
1634 (__s32)(tp->rx_opt.rcv_tsecr - tp->retrans_stamp) < 0);
1635}
1636
1637/* Undo procedures. */
1638
1639#if FASTRETRANS_DEBUG > 1
1640static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
1641{
1642 struct inet_sock *inet = inet_sk(sk);
1643 printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
1644 msg,
1645 NIPQUAD(inet->daddr), ntohs(inet->dport),
1646 tp->snd_cwnd, tp->left_out,
1647 tp->snd_ssthresh, tp->prior_ssthresh,
1648 tp->packets_out);
1649}
1650#else
1651#define DBGUNDO(x...) do { } while (0)
1652#endif
1653
1654static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
1655{
1656 if (tp->prior_ssthresh) {
1657 if (tcp_is_bic(tp))
1658 tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd);
1659 else
1660 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
1661
1662 if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
1663 tp->snd_ssthresh = tp->prior_ssthresh;
1664 TCP_ECN_withdraw_cwr(tp);
1665 }
1666 } else {
1667 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
1668 }
1669 tcp_moderate_cwnd(tp);
1670 tp->snd_cwnd_stamp = tcp_time_stamp;
1671}
1672
1673static inline int tcp_may_undo(struct tcp_sock *tp)
1674{
1675 return tp->undo_marker &&
1676 (!tp->undo_retrans || tcp_packet_delayed(tp));
1677}
1678
1679/* People celebrate: "We love our President!" */
1680static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
1681{
1682 if (tcp_may_undo(tp)) {
1683 /* Happy end! We did not retransmit anything
1684 * or our original transmission succeeded.
1685 */
1686 DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans");
1687 tcp_undo_cwr(tp, 1);
1688 if (tp->ca_state == TCP_CA_Loss)
1689 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
1690 else
1691 NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
1692 tp->undo_marker = 0;
1693 }
1694 if (tp->snd_una == tp->high_seq && IsReno(tp)) {
1695 /* Hold old state until something *above* high_seq
1696		 * is ACKed. For Reno it is a MUST to prevent false
1697 * fast retransmits (RFC2582). SACK TCP is safe. */
1698 tcp_moderate_cwnd(tp);
1699 return 1;
1700 }
1701 tcp_set_ca_state(tp, TCP_CA_Open);
1702 return 0;
1703}
1704
1705/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
1706static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
1707{
1708 if (tp->undo_marker && !tp->undo_retrans) {
1709 DBGUNDO(sk, tp, "D-SACK");
1710 tcp_undo_cwr(tp, 1);
1711 tp->undo_marker = 0;
1712 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
1713 }
1714}
1715
1716/* Undo during fast recovery after partial ACK. */
1717
1718static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
1719 int acked)
1720{
1721 /* Partial ACK arrived. Force Hoe's retransmit. */
1722 int failed = IsReno(tp) || tp->fackets_out>tp->reordering;
1723
1724 if (tcp_may_undo(tp)) {
1725		/* Plain luck! The hole is filled with a delayed
1726		 * packet, rather than with a retransmit.
1727 */
1728 if (tp->retrans_out == 0)
1729 tp->retrans_stamp = 0;
1730
1731 tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1);
1732
1733 DBGUNDO(sk, tp, "Hoe");
1734 tcp_undo_cwr(tp, 0);
1735 NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
1736
1737 /* So... Do not make Hoe's retransmit yet.
1738		 * If the first packet was delayed, the rest
1739		 * are most probably delayed as well.
1740 */
1741 failed = 0;
1742 }
1743 return failed;
1744}
1745
1746/* Undo during loss recovery after partial ACK. */
1747static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1748{
1749 if (tcp_may_undo(tp)) {
1750 struct sk_buff *skb;
1751 sk_stream_for_retrans_queue(skb, sk) {
1752 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1753 }
1754 DBGUNDO(sk, tp, "partial loss");
1755 tp->lost_out = 0;
1756 tp->left_out = tp->sacked_out;
1757 tcp_undo_cwr(tp, 1);
1758 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
1759 tp->retransmits = 0;
1760 tp->undo_marker = 0;
1761 if (!IsReno(tp))
1762 tcp_set_ca_state(tp, TCP_CA_Open);
1763 return 1;
1764 }
1765 return 0;
1766}
1767
1768static inline void tcp_complete_cwr(struct tcp_sock *tp)
1769{
1770 if (tcp_westwood_cwnd(tp))
1771 tp->snd_ssthresh = tp->snd_cwnd;
1772 else
1773 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1774 tp->snd_cwnd_stamp = tcp_time_stamp;
1775}
1776
1777static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
1778{
1779 tp->left_out = tp->sacked_out;
1780
1781 if (tp->retrans_out == 0)
1782 tp->retrans_stamp = 0;
1783
1784 if (flag&FLAG_ECE)
1785 tcp_enter_cwr(tp);
1786
1787 if (tp->ca_state != TCP_CA_CWR) {
1788 int state = TCP_CA_Open;
1789
1790 if (tp->left_out || tp->retrans_out || tp->undo_marker)
1791 state = TCP_CA_Disorder;
1792
1793 if (tp->ca_state != state) {
1794 tcp_set_ca_state(tp, state);
1795 tp->high_seq = tp->snd_nxt;
1796 }
1797 tcp_moderate_cwnd(tp);
1798 } else {
1799 tcp_cwnd_down(tp);
1800 }
1801}
1802
1803/* Process an event, which can update packets-in-flight not trivially.
1804 * Main goal of this function is to calculate new estimate for left_out,
1805 * taking into account both packets sitting in receiver's buffer and
1806 * packets lost by network.
1807 *
1808 * Besides that it does CWND reduction, when packet loss is detected
1809 * and changes state of machine.
1810 *
1811 * It does _not_ decide what to send, it is made in function
1812 * tcp_xmit_retransmit_queue().
1813 */
1814static void
1815tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1816 int prior_packets, int flag)
1817{
1818 struct tcp_sock *tp = tcp_sk(sk);
1819 int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
1820
1821 /* Some technical things:
1822 * 1. Reno does not count dupacks (sacked_out) automatically. */
1823 if (!tp->packets_out)
1824 tp->sacked_out = 0;
1825 /* 2. SACK counts snd_fack in packets inaccurately. */
1826 if (tp->sacked_out == 0)
1827 tp->fackets_out = 0;
1828
1829 /* Now state machine starts.
1830 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
1831 if (flag&FLAG_ECE)
1832 tp->prior_ssthresh = 0;
1833
1834 /* B. In all the states check for reneging SACKs. */
1835 if (tp->sacked_out && tcp_check_sack_reneging(sk, tp))
1836 return;
1837
1838 /* C. Process data loss notification, provided it is valid. */
1839 if ((flag&FLAG_DATA_LOST) &&
1840 before(tp->snd_una, tp->high_seq) &&
1841 tp->ca_state != TCP_CA_Open &&
1842 tp->fackets_out > tp->reordering) {
1843 tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
1844 NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
1845 }
1846
1847 /* D. Synchronize left_out to current state. */
1848 tcp_sync_left_out(tp);
1849
1850 /* E. Check state exit conditions. State can be terminated
1851 * when high_seq is ACKed. */
1852 if (tp->ca_state == TCP_CA_Open) {
1853 if (!sysctl_tcp_frto)
1854 BUG_TRAP(tp->retrans_out == 0);
1855 tp->retrans_stamp = 0;
1856 } else if (!before(tp->snd_una, tp->high_seq)) {
1857 switch (tp->ca_state) {
1858 case TCP_CA_Loss:
1859 tp->retransmits = 0;
1860 if (tcp_try_undo_recovery(sk, tp))
1861 return;
1862 break;
1863
1864 case TCP_CA_CWR:
1865			/* CWR is to be held until something *above* high_seq
1866			 * is ACKed for the CWR bit to reach the receiver. */
1867 if (tp->snd_una != tp->high_seq) {
1868 tcp_complete_cwr(tp);
1869 tcp_set_ca_state(tp, TCP_CA_Open);
1870 }
1871 break;
1872
1873 case TCP_CA_Disorder:
1874 tcp_try_undo_dsack(sk, tp);
1875 if (!tp->undo_marker ||
1876			    /* In the SACK case do not enter Open, so that undo
1877			     * keeps catching all duplicate ACKs. */
1878 IsReno(tp) || tp->snd_una != tp->high_seq) {
1879 tp->undo_marker = 0;
1880 tcp_set_ca_state(tp, TCP_CA_Open);
1881 }
1882 break;
1883
1884 case TCP_CA_Recovery:
1885 if (IsReno(tp))
1886 tcp_reset_reno_sack(tp);
1887 if (tcp_try_undo_recovery(sk, tp))
1888 return;
1889 tcp_complete_cwr(tp);
1890 break;
1891 }
1892 }
1893
1894 /* F. Process state. */
1895 switch (tp->ca_state) {
1896 case TCP_CA_Recovery:
1897 if (prior_snd_una == tp->snd_una) {
1898 if (IsReno(tp) && is_dupack)
1899 tcp_add_reno_sack(tp);
1900 } else {
1901 int acked = prior_packets - tp->packets_out;
1902 if (IsReno(tp))
1903 tcp_remove_reno_sacks(sk, tp, acked);
1904 is_dupack = tcp_try_undo_partial(sk, tp, acked);
1905 }
1906 break;
1907 case TCP_CA_Loss:
1908 if (flag&FLAG_DATA_ACKED)
1909 tp->retransmits = 0;
1910 if (!tcp_try_undo_loss(sk, tp)) {
1911 tcp_moderate_cwnd(tp);
1912 tcp_xmit_retransmit_queue(sk);
1913 return;
1914 }
1915 if (tp->ca_state != TCP_CA_Open)
1916 return;
1917 /* Loss is undone; fall through to processing in Open state. */
1918 default:
1919 if (IsReno(tp)) {
1920 if (tp->snd_una != prior_snd_una)
1921 tcp_reset_reno_sack(tp);
1922 if (is_dupack)
1923 tcp_add_reno_sack(tp);
1924 }
1925
1926 if (tp->ca_state == TCP_CA_Disorder)
1927 tcp_try_undo_dsack(sk, tp);
1928
1929 if (!tcp_time_to_recover(sk, tp)) {
1930 tcp_try_to_open(sk, tp, flag);
1931 return;
1932 }
1933
1934 /* Otherwise enter Recovery state */
1935
1936 if (IsReno(tp))
1937 NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY);
1938 else
1939 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY);
1940
1941 tp->high_seq = tp->snd_nxt;
1942 tp->prior_ssthresh = 0;
1943 tp->undo_marker = tp->snd_una;
1944 tp->undo_retrans = tp->retrans_out;
1945
1946 if (tp->ca_state < TCP_CA_CWR) {
1947 if (!(flag&FLAG_ECE))
1948 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1949 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
1950 TCP_ECN_queue_cwr(tp);
1951 }
1952
1953 tp->snd_cwnd_cnt = 0;
1954 tcp_set_ca_state(tp, TCP_CA_Recovery);
1955 }
1956
1957 if (is_dupack || tcp_head_timedout(sk, tp))
1958 tcp_update_scoreboard(sk, tp);
1959 tcp_cwnd_down(tp);
1960 tcp_xmit_retransmit_queue(sk);
1961}
1962
1963/* Read draft-ietf-tcplw-high-performance before mucking
1964 * with this code. (Supersedes RFC1323)
1965 */
1966static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
1967{
1968 __u32 seq_rtt;
1969
1970 /* RTTM Rule: A TSecr value received in a segment is used to
1971 * update the averaged RTT measurement only if the segment
1972 * acknowledges some new data, i.e., only if it advances the
1973 * left edge of the send window.
1974 *
1975 * See draft-ietf-tcplw-high-performance-00, section 3.3.
1976 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
1977 *
1978 * Changed: reset backoff as soon as we see the first valid sample.
1979	 * If we do not, we get a strongly overestimated rto. With timestamps
1980	 * samples are accepted even from very old segments: f.e., when rtt=1
1981	 * increases to 8, we retransmit 5 times and after the 8-second delayed
1982	 * answer arrives, rto becomes 120 seconds! If at least one of the segments
1983 * in window is lost... Voila. --ANK (010210)
1984 */
1985 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1986 tcp_rtt_estimator(tp, seq_rtt);
1987 tcp_set_rto(tp);
1988 tp->backoff = 0;
1989 tcp_bound_rto(tp);
1990}
1991
1992static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
1993{
1994 /* We don't have a timestamp. Can only use
1995 * packets that are not retransmitted to determine
1996 * rtt estimates. Also, we must not reset the
1997 * backoff for rto until we get a non-retransmitted
1998 * packet. This allows us to deal with a situation
1999 * where the network delay has increased suddenly.
2000 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
2001 */
2002
2003 if (flag & FLAG_RETRANS_DATA_ACKED)
2004 return;
2005
2006 tcp_rtt_estimator(tp, seq_rtt);
2007 tcp_set_rto(tp);
2008 tp->backoff = 0;
2009 tcp_bound_rto(tp);
2010}
2011
2012static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
2013 int flag, s32 seq_rtt)
2014{
2015 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
2016 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2017 tcp_ack_saw_tstamp(tp, flag);
2018 else if (seq_rtt >= 0)
2019 tcp_ack_no_tstamp(tp, seq_rtt, flag);
2020}
2021
2022/*
2023 * Compute congestion window to use.
2024 *
2025 * This is from the implementation of BICTCP in
2026 * Lisong Xu, Khaled Harfoush, and Injong Rhee,
2027 * "Binary Increase Congestion Control for Fast, Long Distance
2028 *  Networks" in IEEE INFOCOM 2004.
2029 * Available from:
2030 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
2031 *
2032 * Unless BIC is enabled and the congestion window is large,
2033 * this behaves the same as the original Reno.
2034 */
2035static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
2036{
2037	/* original Reno behaviour */
2038 if (!tcp_is_bic(tp))
2039 return tp->snd_cwnd;
2040
2041 if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
2042 (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
2043 return tp->bictcp.cnt;
2044
2045 tp->bictcp.last_cwnd = tp->snd_cwnd;
2046 tp->bictcp.last_stamp = tcp_time_stamp;
2047
2048 /* start off normal */
2049 if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
2050 tp->bictcp.cnt = tp->snd_cwnd;
2051
2052 /* binary increase */
2053 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
2054 __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
2055 / BICTCP_B;
2056
2057 if (dist > BICTCP_MAX_INCREMENT)
2058 /* linear increase */
2059 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2060 else if (dist <= 1U)
2061 /* binary search increase */
2062 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2063 / BICTCP_B;
2064 else
2065 /* binary search increase */
2066 tp->bictcp.cnt = tp->snd_cwnd / dist;
2067 } else {
2068		/* slow start and linear increase */
2069 if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
2070 /* slow start */
2071 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2072 / BICTCP_B;
2073 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
2074 + BICTCP_MAX_INCREMENT*(BICTCP_B-1))
2075 /* slow start */
2076 tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
2077 / (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
2078 else
2079 /* linear increase */
2080 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2081 }
2082 return tp->bictcp.cnt;
2083}
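/* Worked example of the binary increase above (the constants and numbers are
 * assumptions for illustration): the value returned here is used by
 * reno_cong_avoid() as the number of ACKs needed per one-segment cwnd
 * increase. If last_max_cwnd = 100, snd_cwnd = 80 and BICTCP_B were 4,
 * then dist = (100 - 80) / 4 = 5 and cnt = 80 / 5 = 16, i.e. one extra
 * segment per 16 ACKs; as snd_cwnd closes in on the old maximum, dist
 * shrinks and growth slows, which is the binary search converging.
 */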
2084
2085/* This is Jacobson's slow start and congestion avoidance.
2086 * SIGCOMM '88, p. 328.
2087 */
2088static inline void reno_cong_avoid(struct tcp_sock *tp)
2089{
2090 if (tp->snd_cwnd <= tp->snd_ssthresh) {
2091 /* In "safe" area, increase. */
2092 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2093 tp->snd_cwnd++;
2094 } else {
2095 /* In dangerous area, increase slowly.
2096 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
2097 */
2098 if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
2099 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2100 tp->snd_cwnd++;
2101 tp->snd_cwnd_cnt=0;
2102 } else
2103 tp->snd_cwnd_cnt++;
2104 }
2105 tp->snd_cwnd_stamp = tcp_time_stamp;
2106}
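/* For example (non-BIC case, where bictcp_cwnd() just returns snd_cwnd):
 * in slow start every ACK adds a segment, so cwnd roughly doubles per RTT;
 * in congestion avoidance with snd_cwnd = 10, ten ACKs must accumulate in
 * snd_cwnd_cnt before cwnd becomes 11, i.e. about one extra segment per RTT,
 * the classic additive increase.
 */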
2107
2108/* This is based on the congestion detection/avoidance scheme described in
2109 * Lawrence S. Brakmo and Larry L. Peterson.
2110 * "TCP Vegas: End to end congestion avoidance on a global internet."
2111 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
2112 * October 1995. Available from:
2113 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
2114 *
2115 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
2116 * The main aspects that distinguish this implementation from the
2117 * Arizona Vegas implementation are:
2118 * o We do not change the loss detection or recovery mechanisms of
2119 * Linux in any way. Linux already recovers from losses quite well,
2120 * using fine-grained timers, NewReno, and FACK.
2121 * o To avoid the performance penalty imposed by increasing cwnd
2122 * only every-other RTT during slow start, we increase during
2123 * every RTT during slow start, just like Reno.
2124 * o Largely to allow continuous cwnd growth during slow start,
2125 * we use the rate at which ACKs come back as the "actual"
2126 * rate, rather than the rate at which data is sent.
2127 * o To speed convergence to the right rate, we set the cwnd
2128 * to achieve the right ("actual") rate when we exit slow start.
2129 * o To filter out the noise caused by delayed ACKs, we use the
2130 * minimum RTT sample observed during the last RTT to calculate
2131 * the actual rate.
2132 * o When the sender re-starts from idle, it waits until it has
2133 * received ACKs for an entire flight of new data before making
2134 * a cwnd adjustment decision. The original Vegas implementation
2135 * assumed senders never went idle.
2136 */
2137static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2138{
2139 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
2140 *
2141 * These are so named because they represent the approximate values
2142 * of snd_una and snd_nxt at the beginning of the current RTT. More
2143 * precisely, they represent the amount of data sent during the RTT.
2144 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
2145 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
2146 * bytes of data have been ACKed during the course of the RTT, giving
2147 * an "actual" rate of:
2148 *
2149 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
2150 *
2151 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
2152 * because delayed ACKs can cover more than one segment, so they
2153 * don't line up nicely with the boundaries of RTTs.
2154 *
2155 * Another unfortunate fact of life is that delayed ACKs delay the
2156 * advance of the left edge of our send window, so that the number
2157 * of bytes we send in an RTT is often less than our cwnd will allow.
2158 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
2159 */
2160
2161 if (after(ack, tp->vegas.beg_snd_nxt)) {
2162 /* Do the Vegas once-per-RTT cwnd adjustment. */
2163 u32 old_wnd, old_snd_cwnd;
2164
2165
2166 /* Here old_wnd is essentially the window of data that was
2167 * sent during the previous RTT, and has all
2168 * been acknowledged in the course of the RTT that ended
2169 * with the ACK we just received. Likewise, old_snd_cwnd
2170 * is the cwnd during the previous RTT.
2171 */
2172 old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
2173 tp->mss_cache_std;
2174 old_snd_cwnd = tp->vegas.beg_snd_cwnd;
2175
2176 /* Save the extent of the current window so we can use this
2177 * at the end of the next RTT.
2178 */
2179 tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
2180 tp->vegas.beg_snd_nxt = tp->snd_nxt;
2181 tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
2182
2183 /* Take into account the current RTT sample too, to
2184 * decrease the impact of delayed acks. This double counts
2185 * this sample since we count it for the next window as well,
2186 * but that's not too awful, since we're taking the min,
2187 * rather than averaging.
2188 */
2189 vegas_rtt_calc(tp, seq_rtt);
2190
2191 /* We do the Vegas calculations only if we got enough RTT
2192 * samples that we can be reasonably sure that we got
2193 * at least one RTT sample that wasn't from a delayed ACK.
2194 * If we only had 2 samples total,
2195 * then that means we're getting only 1 ACK per RTT, which
2196 * means they're almost certainly delayed ACKs.
2197 * If we have 3 samples, we should be OK.
2198 */
2199
2200 if (tp->vegas.cntRTT <= 2) {
2201 /* We don't have enough RTT samples to do the Vegas
2202 * calculation, so we'll behave like Reno.
2203 */
2204 if (tp->snd_cwnd > tp->snd_ssthresh)
2205 tp->snd_cwnd++;
2206 } else {
2207 u32 rtt, target_cwnd, diff;
2208
2209 /* We have enough RTT samples, so, using the Vegas
2210 * algorithm, we determine if we should increase or
2211 * decrease cwnd, and by how much.
2212 */
2213
2214 /* Pluck out the RTT we are using for the Vegas
2215 * calculations. This is the min RTT seen during the
2216 * last RTT. Taking the min filters out the effects
2217 * of delayed ACKs, at the cost of noticing congestion
2218 * a bit later.
2219 */
2220 rtt = tp->vegas.minRTT;
2221
2222 /* Calculate the cwnd we should have, if we weren't
2223 * going too fast.
2224 *
2225 * This is:
2226 * (actual rate in segments) * baseRTT
2227 * We keep it as a fixed point number with
2228 * V_PARAM_SHIFT bits to the right of the binary point.
2229 */
2230 target_cwnd = ((old_wnd * tp->vegas.baseRTT)
2231 << V_PARAM_SHIFT) / rtt;
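 /* Worked example with hypothetical numbers: old_wnd = 10 segments,
  * baseRTT = 100 ms and rtt = 125 ms give a target of 10 * 100 / 125
  * = 8 segments, stored here as 8 << V_PARAM_SHIFT fixed-point units.
  */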
2232
2233 /* Calculate the difference between the window we had,
2234 * and the window we would like to have. This quantity
2235 * is the "Diff" from the Arizona Vegas papers.
2236 *
2237 * Again, this is a fixed point number with
2238 * V_PARAM_SHIFT bits to the right of the binary
2239 * point.
2240 */
2241 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
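 /* Continuing the hypothetical example above: diff = (10 - 8) segments
  * in fixed point, i.e. 2 << V_PARAM_SHIFT, which is then compared
  * against the alpha/beta/gamma thresholds below.
  */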
2242
2243 if (tp->snd_cwnd < tp->snd_ssthresh) {
2244 /* Slow start. */
2245 if (diff > sysctl_tcp_vegas_gamma) {
2246 /* Going too fast. Time to slow down
2247 * and switch to congestion avoidance.
2248 */
2249 tp->snd_ssthresh = 2;
2250
2251 /* Set cwnd to match the actual rate
2252 * exactly:
2253 * cwnd = (actual rate) * baseRTT
2254 * Then we add 1 because the integer
2255 * truncation robs us of full link
2256 * utilization.
2257 */
2258 tp->snd_cwnd = min(tp->snd_cwnd,
2259 (target_cwnd >>
2260 V_PARAM_SHIFT)+1);
2261
2262 }
2263 } else {
2264 /* Congestion avoidance. */
2265 u32 next_snd_cwnd;
2266
2267 /* Figure out where we would like cwnd
2268 * to be.
2269 */
2270 if (diff > sysctl_tcp_vegas_beta) {
2271 /* The old window was too fast, so
2272 * we slow down.
2273 */
2274 next_snd_cwnd = old_snd_cwnd - 1;
2275 } else if (diff < sysctl_tcp_vegas_alpha) {
2276 /* We don't have enough extra packets
2277 * in the network, so speed up.
2278 */
2279 next_snd_cwnd = old_snd_cwnd + 1;
2280 } else {
2281 /* Sending just as fast as we
2282 * should be.
2283 */
2284 next_snd_cwnd = old_snd_cwnd;
2285 }
2286
2287 /* Adjust cwnd upward or downward, toward the
2288 * desired value.
2289 */
2290 if (next_snd_cwnd > tp->snd_cwnd)
2291 tp->snd_cwnd++;
2292 else if (next_snd_cwnd < tp->snd_cwnd)
2293 tp->snd_cwnd--;
2294 }
2295 }
2296
2297 /* Wipe the slate clean for the next RTT. */
2298 tp->vegas.cntRTT = 0;
2299 tp->vegas.minRTT = 0x7fffffff;
2300 }
2301
2302 /* The following code is executed for every ack we receive,
2303 * except for conditions checked in should_advance_cwnd()
2304 * before the call to tcp_cong_avoid(). Mainly this means that
2305 * we only execute this code if the ack actually acked some
2306 * data.
2307 */
2308
2309 /* If we are in slow start, increase our cwnd in response to this ACK.
2310 * (If we are not in slow start then we are in congestion avoidance,
2311 * and adjust our congestion window only once per RTT. See the code
2312 * above.)
2313 */
2314 if (tp->snd_cwnd <= tp->snd_ssthresh)
2315 tp->snd_cwnd++;
2316
2317 /* to keep cwnd from growing without bound */
2318 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
2319
2320 /* Make sure that we are never so timid as to reduce our cwnd below
2321 * 2 MSS.
2322 *
2323 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
2324 */
2325 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
2326
2327 tp->snd_cwnd_stamp = tcp_time_stamp;
2328}
2329
2330static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2331{
2332 if (tcp_vegas_enabled(tp))
2333 vegas_cong_avoid(tp, ack, seq_rtt);
2334 else
2335 reno_cong_avoid(tp);
2336}
2337
2338/* Restart the timer after forward progress on the connection.
2339 * RFC2988 recommends restarting the timer to now+rto.
2340 */
2341
2342static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
2343{
2344 if (!tp->packets_out) {
2345 tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
2346 } else {
2347 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
2348 }
2349}
2350
2351/* There is one downside to this scheme. Although we keep the
2352 * ACK clock ticking, adjusting packet counters and advancing
2353 * congestion window, we do not liberate socket send buffer
2354 * space.
2355 *
2356 * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
2357 * then making a write space wakeup callback is a possible
2358 * future enhancement. WARNING: it is not trivial to make.
2359 */
2360static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2361 __u32 now, __s32 *seq_rtt)
2362{
2363 struct tcp_sock *tp = tcp_sk(sk);
2364 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
2365 __u32 seq = tp->snd_una;
2366 __u32 packets_acked;
2367 int acked = 0;
2368
2369 /* If we get here, the whole TSO packet has not been
2370 * acked.
2371 */
2372 BUG_ON(!after(scb->end_seq, seq));
2373
2374 packets_acked = tcp_skb_pcount(skb);
2375 if (tcp_trim_head(sk, skb, seq - scb->seq))
2376 return 0;
2377 packets_acked -= tcp_skb_pcount(skb);
2378
2379 if (packets_acked) {
2380 __u8 sacked = scb->sacked;
2381
2382 acked |= FLAG_DATA_ACKED;
2383 if (sacked) {
2384 if (sacked & TCPCB_RETRANS) {
2385 if (sacked & TCPCB_SACKED_RETRANS)
2386 tp->retrans_out -= packets_acked;
2387 acked |= FLAG_RETRANS_DATA_ACKED;
2388 *seq_rtt = -1;
2389 } else if (*seq_rtt < 0)
2390 *seq_rtt = now - scb->when;
2391 if (sacked & TCPCB_SACKED_ACKED)
2392 tp->sacked_out -= packets_acked;
2393 if (sacked & TCPCB_LOST)
2394 tp->lost_out -= packets_acked;
2395 if (sacked & TCPCB_URG) {
2396 if (tp->urg_mode &&
2397 !before(seq, tp->snd_up))
2398 tp->urg_mode = 0;
2399 }
2400 } else if (*seq_rtt < 0)
2401 *seq_rtt = now - scb->when;
2402
2403 if (tp->fackets_out) {
2404 __u32 dval = min(tp->fackets_out, packets_acked);
2405 tp->fackets_out -= dval;
2406 }
2407 tp->packets_out -= packets_acked;
2408
2409 BUG_ON(tcp_skb_pcount(skb) == 0);
2410 BUG_ON(!before(scb->seq, scb->end_seq));
2411 }
2412
2413 return acked;
2414}
2415
2416
2417/* Remove acknowledged frames from the retransmission queue. */
2418static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2419{
2420 struct tcp_sock *tp = tcp_sk(sk);
2421 struct sk_buff *skb;
2422 __u32 now = tcp_time_stamp;
2423 int acked = 0;
2424 __s32 seq_rtt = -1;
2425
2426 while ((skb = skb_peek(&sk->sk_write_queue)) &&
2427 skb != sk->sk_send_head) {
2428 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
2429 __u8 sacked = scb->sacked;
2430
2431 /* If our packet is before the ack sequence we can
2432 * discard it as it's confirmed to have arrived at
2433 * the other end.
2434 */
2435 if (after(scb->end_seq, tp->snd_una)) {
2436 if (tcp_skb_pcount(skb) > 1)
2437 acked |= tcp_tso_acked(sk, skb,
2438 now, &seq_rtt);
2439 break;
2440 }
2441
2442 /* Initial outgoing SYNs get put onto the write_queue
2443 * just like anything else we transmit. They are not
2444 * true data, and if we misinform our callers that
2445 * this ACK acks real data, we will erroneously exit
2446 * connection startup slow start one packet too
2447 * quickly. This is severely frowned upon behavior.
2448 */
2449 if (!(scb->flags & TCPCB_FLAG_SYN)) {
2450 acked |= FLAG_DATA_ACKED;
2451 } else {
2452 acked |= FLAG_SYN_ACKED;
2453 tp->retrans_stamp = 0;
2454 }
2455
2456 if (sacked) {
2457 if (sacked & TCPCB_RETRANS) {
2458 if(sacked & TCPCB_SACKED_RETRANS)
2459 tp->retrans_out -= tcp_skb_pcount(skb);
2460 acked |= FLAG_RETRANS_DATA_ACKED;
2461 seq_rtt = -1;
2462 } else if (seq_rtt < 0)
2463 seq_rtt = now - scb->when;
2464 if (sacked & TCPCB_SACKED_ACKED)
2465 tp->sacked_out -= tcp_skb_pcount(skb);
2466 if (sacked & TCPCB_LOST)
2467 tp->lost_out -= tcp_skb_pcount(skb);
2468 if (sacked & TCPCB_URG) {
2469 if (tp->urg_mode &&
2470 !before(scb->end_seq, tp->snd_up))
2471 tp->urg_mode = 0;
2472 }
2473 } else if (seq_rtt < 0)
2474 seq_rtt = now - scb->when;
2475 tcp_dec_pcount_approx(&tp->fackets_out, skb);
2476 tcp_packets_out_dec(tp, skb);
2477 __skb_unlink(skb, skb->list);
2478 sk_stream_free_skb(sk, skb);
2479 }
2480
2481 if (acked&FLAG_ACKED) {
2482 tcp_ack_update_rtt(tp, acked, seq_rtt);
2483 tcp_ack_packets_out(sk, tp);
2484 }
2485
2486#if FASTRETRANS_DEBUG > 0
2487 BUG_TRAP((int)tp->sacked_out >= 0);
2488 BUG_TRAP((int)tp->lost_out >= 0);
2489 BUG_TRAP((int)tp->retrans_out >= 0);
2490 if (!tp->packets_out && tp->rx_opt.sack_ok) {
2491 if (tp->lost_out) {
2492 printk(KERN_DEBUG "Leak l=%u %d\n",
2493 tp->lost_out, tp->ca_state);
2494 tp->lost_out = 0;
2495 }
2496 if (tp->sacked_out) {
2497 printk(KERN_DEBUG "Leak s=%u %d\n",
2498 tp->sacked_out, tp->ca_state);
2499 tp->sacked_out = 0;
2500 }
2501 if (tp->retrans_out) {
2502 printk(KERN_DEBUG "Leak r=%u %d\n",
2503 tp->retrans_out, tp->ca_state);
2504 tp->retrans_out = 0;
2505 }
2506 }
2507#endif
2508 *seq_rtt_p = seq_rtt;
2509 return acked;
2510}
2511
2512static void tcp_ack_probe(struct sock *sk)
2513{
2514 struct tcp_sock *tp = tcp_sk(sk);
2515
2516 /* Did this ACK open up a usable window? */
2517
2518 if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
2519 tp->snd_una + tp->snd_wnd)) {
2520 tp->backoff = 0;
2521 tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0);
2522 /* The socket must be woken up by a subsequent tcp_data_snd_check().
2523 * This function is not meant to be called at random!
2524 */
2525 } else {
2526 tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
2527 min(tp->rto << tp->backoff, TCP_RTO_MAX));
2528 }
2529}
2530
2531static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag)
2532{
2533 return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
2534 tp->ca_state != TCP_CA_Open);
2535}
2536
2537static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag)
2538{
2539 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
2540 !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR));
2541}
2542
2543/* Check that window update is acceptable.
2544 * The function assumes that snd_una<=ack<=snd_next.
2545 */
2546static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack,
2547 u32 ack_seq, u32 nwin)
2548{
2549 return (after(ack, tp->snd_una) ||
2550 after(ack_seq, tp->snd_wl1) ||
2551 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
2552}
2553
2554/* Update our send window.
2555 *
2556 * The window update algorithm described in RFC793/RFC1122 (and used in
2557 * linux-2.2 and in FreeBSD; NetBSD's is even worse) is wrong.
2558 */
2559static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
2560 struct sk_buff *skb, u32 ack, u32 ack_seq)
2561{
2562 int flag = 0;
2563 u32 nwin = ntohs(skb->h.th->window);
2564
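 /* The advertised window is a raw 16-bit value; except on SYN segments
  * it is scaled up by the window-scale factor negotiated at connection
  * setup (RFC 1323). E.g. (illustrative numbers) a raw window of 5840
  * with snd_wscale 7 covers 5840 << 7 = 747520 bytes.
  */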
2565 if (likely(!skb->h.th->syn))
2566 nwin <<= tp->rx_opt.snd_wscale;
2567
2568 if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
2569 flag |= FLAG_WIN_UPDATE;
2570 tcp_update_wl(tp, ack, ack_seq);
2571
2572 if (tp->snd_wnd != nwin) {
2573 tp->snd_wnd = nwin;
2574
2575 /* Note, it is the only place, where
2576 * fast path is recovered for sending TCP.
2577 */
2578 tcp_fast_path_check(sk, tp);
2579
2580 if (nwin > tp->max_window) {
2581 tp->max_window = nwin;
2582 tcp_sync_mss(sk, tp->pmtu_cookie);
2583 }
2584 }
2585 }
2586
2587 tp->snd_una = ack;
2588
2589 return flag;
2590}
2591
2592static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2593{
2594 struct tcp_sock *tp = tcp_sk(sk);
2595
2596 tcp_sync_left_out(tp);
2597
2598 if (tp->snd_una == prior_snd_una ||
2599 !before(tp->snd_una, tp->frto_highmark)) {
2600 /* RTO was caused by loss, start retransmitting in
2601 * go-back-N slow start
2602 */
2603 tcp_enter_frto_loss(sk);
2604 return;
2605 }
2606
2607 if (tp->frto_counter == 1) {
2608 /* First ACK after RTO advances the window: allow two new
2609 * segments out.
2610 */
2611 tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
2612 } else {
2613 /* Also the second ACK after RTO advances the window.
2614 * The RTO was likely spurious. Reduce cwnd and continue
2615 * in congestion avoidance
2616 */
2617 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
2618 tcp_moderate_cwnd(tp);
2619 }
2620
2621 /* F-RTO affects the two new ACKs following the RTO.
2622 * By the third ACK at the latest, TCP behavior is back to normal.
2623 */
2624 tp->frto_counter = (tp->frto_counter + 1) % 3;
2625}
2626
2627/*
2628 * TCP Westwood+
2629 */
2630
2631/*
2632 * @init_westwood
2633 * This function initializes fields used in TCP Westwood+. We have
2634 * no information about RTTmin at this time, so we simply set it to
2635 * TCP_WESTWOOD_INIT_RTT. This value was deliberately chosen to be
2636 * overly conservative, since that way we are sure it will be updated
2637 * in a consistent way as soon as possible. This should happen within
2638 * the first RTT period of the connection lifetime.
2639 */
2640
2641static void init_westwood(struct sock *sk)
2642{
2643 struct tcp_sock *tp = tcp_sk(sk);
2644
2645 tp->westwood.bw_ns_est = 0;
2646 tp->westwood.bw_est = 0;
2647 tp->westwood.accounted = 0;
2648 tp->westwood.cumul_ack = 0;
2649 tp->westwood.rtt_win_sx = tcp_time_stamp;
2650 tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
2651 tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
2652 tp->westwood.snd_una = tp->snd_una;
2653}
2654
2655/*
2656 * @westwood_do_filter
2657 * Low-pass filter. Implemented using constant coefficients.
2658 */
2659
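/* The filter below computes (7*a + b) / 8, i.e. an exponentially
 * weighted moving average that moves the old estimate 'a' one eighth
 * of the way toward the new sample 'b' on every update.
 */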
2660static inline __u32 westwood_do_filter(__u32 a, __u32 b)
2661{
2662 return (((7 * a) + b) >> 3);
2663}
2664
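/* bk is the number of bytes ACKed since the current window started and
 * delta is the elapsed time, so bk / delta is a raw bandwidth sample;
 * it is run through the low-pass filter twice (bw_ns_est, then bw_est)
 * to smooth it.
 */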
2665static void westwood_filter(struct sock *sk, __u32 delta)
2666{
2667 struct tcp_sock *tp = tcp_sk(sk);
2668
2669 tp->westwood.bw_ns_est =
2670 westwood_do_filter(tp->westwood.bw_ns_est,
2671 tp->westwood.bk / delta);
2672 tp->westwood.bw_est =
2673 westwood_do_filter(tp->westwood.bw_est,
2674 tp->westwood.bw_ns_est);
2675}
2676
2677/*
2678 * @westwood_update_rttmin
2679 * It is used to update RTTmin. In this case we MUST NOT use
2680 * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
2681 */
2682
2683static inline __u32 westwood_update_rttmin(const struct sock *sk)
2684{
2685 const struct tcp_sock *tp = tcp_sk(sk);
2686 __u32 rttmin = tp->westwood.rtt_min;
2687
2688 if (tp->westwood.rtt != 0 &&
2689 (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
2690 rttmin = tp->westwood.rtt;
2691
2692 return rttmin;
2693}
2694
2695/*
2696 * @westwood_acked
2697 * Evaluate increases for dk.
2698 */
2699
2700static inline __u32 westwood_acked(const struct sock *sk)
2701{
2702 const struct tcp_sock *tp = tcp_sk(sk);
2703
2704 return tp->snd_una - tp->westwood.snd_una;
2705}
2706
2707/*
2708 * @westwood_new_window
2709 * It evaluates if we are receiving data inside the same RTT window as
2710 * when we started.
2711 * Return value:
2712 * It returns 0 if we are still evaluating samples in the same RTT
2713 * window, 1 if the sample has to be considered in the next window.
2714 */
2715
2716static int westwood_new_window(const struct sock *sk)
2717{
2718 const struct tcp_sock *tp = tcp_sk(sk);
2719 __u32 left_bound;
2720 __u32 rtt;
2721 int ret = 0;
2722
2723 left_bound = tp->westwood.rtt_win_sx;
2724 rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
2725
2726 /*
2727 * An RTT window has passed. Be careful: if the RTT is less than
2728 * 50ms we don't filter but we continue 'building the sample'.
2729 * This minimum limit was chosen because estimating over such
2730 * small time intervals is best avoided.
2731 * Obviously, on a LAN we will reasonably always have
2732 * right_bound = left_bound + WESTWOOD_RTT_MIN
2733 */
2734
2735 if ((left_bound + rtt) < tcp_time_stamp)
2736 ret = 1;
2737
2738 return ret;
2739}
2740
2741/*
2742 * @westwood_update_window
2743 * It updates the RTT evaluation window if it is the right moment to
2744 * do so, and in that case calls the filter to evaluate the bandwidth.
2745 */
2746
2747static void __westwood_update_window(struct sock *sk, __u32 now)
2748{
2749 struct tcp_sock *tp = tcp_sk(sk);
2750 __u32 delta = now - tp->westwood.rtt_win_sx;
2751
2752 if (delta) {
2753 if (tp->westwood.rtt)
2754 westwood_filter(sk, delta);
2755
2756 tp->westwood.bk = 0;
2757 tp->westwood.rtt_win_sx = tcp_time_stamp;
2758 }
2759}
2760
2761
2762static void westwood_update_window(struct sock *sk, __u32 now)
2763{
2764 if (westwood_new_window(sk))
2765 __westwood_update_window(sk, now);
2766}
2767
2768/*
2769 * @__tcp_westwood_fast_bw
2770 * It is called when we are in the fast path. In particular, it is called
2771 * when header prediction is successful. In that case the update is
2772 * straightforward and doesn't need any particular care.
2773 */
2774
2775static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2776{
2777 struct tcp_sock *tp = tcp_sk(sk);
2778
2779 westwood_update_window(sk, tcp_time_stamp);
2780
2781 tp->westwood.bk += westwood_acked(sk);
2782 tp->westwood.snd_una = tp->snd_una;
2783 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2784}
2785
2786static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2787{
2788 if (tcp_is_westwood(tcp_sk(sk)))
2789 __tcp_westwood_fast_bw(sk, skb);
2790}
2791
2792
2793/*
2794 * @westwood_dupack_update
2795 * It updates accounted and cumul_ack when receiving a dupack.
2796 */
2797
2798static void westwood_dupack_update(struct sock *sk)
2799{
2800 struct tcp_sock *tp = tcp_sk(sk);
2801
2802 tp->westwood.accounted += tp->mss_cache_std;
2803 tp->westwood.cumul_ack = tp->mss_cache_std;
2804}
2805
2806static inline int westwood_may_change_cumul(struct tcp_sock *tp)
2807{
2808 return (tp->westwood.cumul_ack > tp->mss_cache_std);
2809}
2810
2811static inline void westwood_partial_update(struct tcp_sock *tp)
2812{
2813 tp->westwood.accounted -= tp->westwood.cumul_ack;
2814 tp->westwood.cumul_ack = tp->mss_cache_std;
2815}
2816
2817static inline void westwood_complete_update(struct tcp_sock *tp)
2818{
2819 tp->westwood.cumul_ack -= tp->westwood.accounted;
2820 tp->westwood.accounted = 0;
2821}
2822
2823/*
2824 * @westwood_acked_count
2825 * This function evaluates cumul_ack for evaluating dk in case of
2826 * delayed or partial acks.
2827 */
2828
2829static inline __u32 westwood_acked_count(struct sock *sk)
2830{
2831 struct tcp_sock *tp = tcp_sk(sk);
2832
2833 tp->westwood.cumul_ack = westwood_acked(sk);
2834
2835 /* If cumul_ack is 0 this is a dupack since it's not moving
2836 * tp->snd_una.
2837 */
2838 if (!(tp->westwood.cumul_ack))
2839 westwood_dupack_update(sk);
2840
2841 if (westwood_may_change_cumul(tp)) {
2842 /* Partial or delayed ack */
2843 if (tp->westwood.accounted >= tp->westwood.cumul_ack)
2844 westwood_partial_update(tp);
2845 else
2846 westwood_complete_update(tp);
2847 }
2848
2849 tp->westwood.snd_una = tp->snd_una;
2850
2851 return tp->westwood.cumul_ack;
2852}
2853
2854
2855/*
2856 * @__tcp_westwood_slow_bw
2857 * It is called when something may be going wrong, even if there is
2858 * no actual problem! In fact, a simple delayed packet may trigger a
2859 * dupack. But we need to be careful in such cases.
2860 */
2861
2862static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2863{
2864 struct tcp_sock *tp = tcp_sk(sk);
2865
2866 westwood_update_window(sk, tcp_time_stamp);
2867
2868 tp->westwood.bk += westwood_acked_count(sk);
2869 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2870}
2871
2872static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2873{
2874 if (tcp_is_westwood(tcp_sk(sk)))
2875 __tcp_westwood_slow_bw(sk, skb);
2876}
2877
2878/* This routine deals with incoming acks, but not outgoing ones. */
2879static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2880{
2881 struct tcp_sock *tp = tcp_sk(sk);
2882 u32 prior_snd_una = tp->snd_una;
2883 u32 ack_seq = TCP_SKB_CB(skb)->seq;
2884 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2885 u32 prior_in_flight;
2886 s32 seq_rtt;
2887 int prior_packets;
2888
2889 /* If the ack is newer than sent or older than previous acks
2890 * then we can probably ignore it.
2891 */
2892 if (after(ack, tp->snd_nxt))
2893 goto uninteresting_ack;
2894
2895 if (before(ack, prior_snd_una))
2896 goto old_ack;
2897
2898 if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
2899 /* Window is constant, pure forward advance.
2900 * No more checks are required.
2901 * Note, we use the fact that SND.UNA>=SND.WL2.
2902 */
2903 tcp_update_wl(tp, ack, ack_seq);
2904 tp->snd_una = ack;
2905 tcp_westwood_fast_bw(sk, skb);
2906 flag |= FLAG_WIN_UPDATE;
2907
2908 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
2909 } else {
2910 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
2911 flag |= FLAG_DATA;
2912 else
2913 NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS);
2914
2915 flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq);
2916
2917 if (TCP_SKB_CB(skb)->sacked)
2918 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
2919
2920 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
2921 flag |= FLAG_ECE;
2922
2923 tcp_westwood_slow_bw(sk, skb);
2924 }
2925
2926 /* We passed data and got it acked, remove any soft error
2927 * log. Something worked...
2928 */
2929 sk->sk_err_soft = 0;
2930 tp->rcv_tstamp = tcp_time_stamp;
2931 prior_packets = tp->packets_out;
2932 if (!prior_packets)
2933 goto no_queue;
2934
2935 prior_in_flight = tcp_packets_in_flight(tp);
2936
2937 /* See if we can take anything off of the retransmit queue. */
2938 flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
2939
2940 if (tp->frto_counter)
2941 tcp_process_frto(sk, prior_snd_una);
2942
2943 if (tcp_ack_is_dubious(tp, flag)) {
2944 /* Advance CWND, if state allows this. */
2945 if ((flag & FLAG_DATA_ACKED) &&
2946 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) &&
2947 tcp_may_raise_cwnd(tp, flag))
2948 tcp_cong_avoid(tp, ack, seq_rtt);
2949 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2950 } else {
2951 if ((flag & FLAG_DATA_ACKED) &&
2952 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd))
2953 tcp_cong_avoid(tp, ack, seq_rtt);
2954 }
2955
2956 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
2957 dst_confirm(sk->sk_dst_cache);
2958
2959 return 1;
2960
2961no_queue:
2962 tp->probes_out = 0;
2963
2964 /* If this ack opened up a previously zero window, clear the backoff. It was
2965 * being used to time the probes, and is probably far higher than
2966 * it needs to be for normal retransmission.
2967 */
2968 if (sk->sk_send_head)
2969 tcp_ack_probe(sk);
2970 return 1;
2971
2972old_ack:
2973 if (TCP_SKB_CB(skb)->sacked)
2974 tcp_sacktag_write_queue(sk, skb, prior_snd_una);
2975
2976uninteresting_ack:
2977 SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
2978 return 0;
2979}
2980
2981
2982/* Look for tcp options. Normally only called on SYN and SYNACK packets.
2983 * But, this can also be called on packets in the established flow when
2984 * the fast version below fails.
2985 */
2986void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab)
2987{
2988 unsigned char *ptr;
2989 struct tcphdr *th = skb->h.th;
2990 int length=(th->doff*4)-sizeof(struct tcphdr);
2991
2992 ptr = (unsigned char *)(th + 1);
2993 opt_rx->saw_tstamp = 0;
2994
2995 while(length>0) {
2996 int opcode=*ptr++;
2997 int opsize;
2998
2999 switch (opcode) {
3000 case TCPOPT_EOL:
3001 return;
3002 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
3003 length--;
3004 continue;
3005 default:
3006 opsize=*ptr++;
3007 if (opsize < 2) /* "silly options" */
3008 return;
3009 if (opsize > length)
3010 return; /* don't parse partial options */
3011 switch(opcode) {
3012 case TCPOPT_MSS:
3013 if(opsize==TCPOLEN_MSS && th->syn && !estab) {
3014 u16 in_mss = ntohs(get_unaligned((__u16 *)ptr));
3015 if (in_mss) {
3016 if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
3017 in_mss = opt_rx->user_mss;
3018 opt_rx->mss_clamp = in_mss;
3019 }
3020 }
3021 break;
3022 case TCPOPT_WINDOW:
3023 if(opsize==TCPOLEN_WINDOW && th->syn && !estab)
3024 if (sysctl_tcp_window_scaling) {
3025 __u8 snd_wscale = *(__u8 *) ptr;
3026 opt_rx->wscale_ok = 1;
3027 if (snd_wscale > 14) {
3028 if(net_ratelimit())
3029 printk(KERN_INFO "tcp_parse_options: Illegal window "
3030 "scaling value %d >14 received.\n",
3031 snd_wscale);
3032 snd_wscale = 14;
3033 }
3034 opt_rx->snd_wscale = snd_wscale;
3035 }
3036 break;
3037 case TCPOPT_TIMESTAMP:
3038 if(opsize==TCPOLEN_TIMESTAMP) {
3039 if ((estab && opt_rx->tstamp_ok) ||
3040 (!estab && sysctl_tcp_timestamps)) {
3041 opt_rx->saw_tstamp = 1;
3042 opt_rx->rcv_tsval = ntohl(get_unaligned((__u32 *)ptr));
3043 opt_rx->rcv_tsecr = ntohl(get_unaligned((__u32 *)(ptr+4)));
3044 }
3045 }
3046 break;
3047 case TCPOPT_SACK_PERM:
3048 if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) {
3049 if (sysctl_tcp_sack) {
3050 opt_rx->sack_ok = 1;
3051 tcp_sack_reset(opt_rx);
3052 }
3053 }
3054 break;
3055
3056 case TCPOPT_SACK:
3057 if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
3058 !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
3059 opt_rx->sack_ok) {
3060 TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
3061 }
3062 };
3063 ptr+=opsize-2;
3064 length-=opsize;
3065 };
3066 }
3067}
3068
3069/* Fast parse options. This hopes to only see timestamps.
3070 * If it is wrong it falls back on tcp_parse_options().
3071 */
3072static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
3073 struct tcp_sock *tp)
3074{
3075 if (th->doff == sizeof(struct tcphdr)>>2) {
3076 tp->rx_opt.saw_tstamp = 0;
3077 return 0;
3078 } else if (tp->rx_opt.tstamp_ok &&
3079 th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
3080 __u32 *ptr = (__u32 *)(th + 1);
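  /* The single 32-bit compare below matches the well-aligned timestamp
   * layout suggested by RFC 1323 Appendix A:
   * NOP, NOP, TIMESTAMP kind, TIMESTAMP length.
   */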
3081 if (*ptr == ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3082 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
3083 tp->rx_opt.saw_tstamp = 1;
3084 ++ptr;
3085 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3086 ++ptr;
3087 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
3088 return 1;
3089 }
3090 }
3091 tcp_parse_options(skb, &tp->rx_opt, 1);
3092 return 1;
3093}
3094
3095static inline void tcp_store_ts_recent(struct tcp_sock *tp)
3096{
3097 tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
3098 tp->rx_opt.ts_recent_stamp = xtime.tv_sec;
3099}
3100
3101static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3102{
3103 if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
3104 /* PAWS bug workaround wrt. ACK frames; the PAWS discard
3105 * extra check below makes sure this can only happen
3106 * for pure ACK frames. -DaveM
3107 *
3108 * Not only that: it also occurs for expired timestamps.
3109 */
3110
3111 if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
3112 xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
3113 tcp_store_ts_recent(tp);
3114 }
3115}
3116
3117/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
3118 *
3119 * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
3120 * it can pass through the stack. So, the following predicate verifies that
3121 * this segment is not used for anything but congestion avoidance or
3122 * fast retransmit. Moreover, we are even able to eliminate most of such
3123 * second order effects, if we apply some small "replay" window (~RTO)
3124 * to timestamp space.
3125 *
3126 * All these measures still do not guarantee that we reject wrapped ACKs
3127 * on networks with high bandwidth, where sequence space is recycled quickly,
3128 * but they guarantee that such events will be very rare and will not
3129 * seriously affect the connection. This doesn't look nice, but alas, PAWS
3130 * is a really buggy extension.
3131 *
3132 * [ Later note. Even worse! It is buggy for segments _with_ data. The RFC
3133 * states that events where a retransmit arrives after the original data are
3134 * rare. That is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
3135 * the biggest problem on large power networks even with minor reordering.
3136 * OK, let's give it a small replay window. If the peer clock is even 1 Hz,
3137 * it is safe up to a bandwidth of 18 Gbit/sec. 8) ]
3138 */
3139
3140static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
3141{
3142 struct tcphdr *th = skb->h.th;
3143 u32 seq = TCP_SKB_CB(skb)->seq;
3144 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3145
3146 return (/* 1. Pure ACK with correct sequence number. */
3147 (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
3148
3149 /* 2. ... and duplicate ACK. */
3150 ack == tp->snd_una &&
3151
3152 /* 3. ... and does not update window. */
3153 !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
3154
3155 /* 4. ... and sits in replay window. */
3156 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ);
3157}
3158
3159static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb)
3160{
3161 return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
3162 xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
3163 !tcp_disordered_ack(tp, skb));
3164}
3165
3166/* Check segment sequence number for validity.
3167 *
3168 * Segment controls are considered valid, if the segment
3169 * fits to the window after truncation to the window. Acceptability
3170 * of data (and SYN, FIN, of course) is checked separately.
3171 * See tcp_data_queue(), for example.
3172 *
3173 * Also, controls (RST is the main one) are accepted using RCV.WUP instead
3174 * of RCV.NXT. The peer may still not have advanced its SND.UNA when we
3175 * delayed the ACK, so that his SND.UNA <= our RCV.WUP.
3176 * (borrowed from freebsd)
3177 */
3178
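/* In other words: accept the segment if it does not end before RCV.WUP
 * and does not start beyond RCV.NXT plus the current receive window.
 */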
3179static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
3180{
3181 return !before(end_seq, tp->rcv_wup) &&
3182 !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
3183}
3184
3185/* When we get a reset we do this. */
3186static void tcp_reset(struct sock *sk)
3187{
3188 /* We want the right error as BSD sees it (and indeed as we do). */
3189 switch (sk->sk_state) {
3190 case TCP_SYN_SENT:
3191 sk->sk_err = ECONNREFUSED;
3192 break;
3193 case TCP_CLOSE_WAIT:
3194 sk->sk_err = EPIPE;
3195 break;
3196 case TCP_CLOSE:
3197 return;
3198 default:
3199 sk->sk_err = ECONNRESET;
3200 }
3201
3202 if (!sock_flag(sk, SOCK_DEAD))
3203 sk->sk_error_report(sk);
3204
3205 tcp_done(sk);
3206}
3207
3208/*
3209 * Process the FIN bit. This now behaves as it is supposed to:
3210 * the FIN takes effect when it is validly part of the sequence
3211 * space, not before, while we still have holes.
3212 *
3213 * If we are ESTABLISHED, a received FIN moves us to CLOSE-WAIT
3214 * (and thence onto LAST-ACK and finally CLOSE; we never enter
3215 * TIME-WAIT)
3216 *
3217 * If we are in FINWAIT-1, a received FIN indicates simultaneous
3218 * close and we go into CLOSING (and later onto TIME-WAIT)
3219 *
3220 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3221 */
3222static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
3223{
3224 struct tcp_sock *tp = tcp_sk(sk);
3225
3226 tcp_schedule_ack(tp);
3227
3228 sk->sk_shutdown |= RCV_SHUTDOWN;
3229 sock_set_flag(sk, SOCK_DONE);
3230
3231 switch (sk->sk_state) {
3232 case TCP_SYN_RECV:
3233 case TCP_ESTABLISHED:
3234 /* Move to CLOSE_WAIT */
3235 tcp_set_state(sk, TCP_CLOSE_WAIT);
3236 tp->ack.pingpong = 1;
3237 break;
3238
3239 case TCP_CLOSE_WAIT:
3240 case TCP_CLOSING:
3241 /* Received a retransmission of the FIN, do
3242 * nothing.
3243 */
3244 break;
3245 case TCP_LAST_ACK:
3246 /* RFC793: Remain in the LAST-ACK state. */
3247 break;
3248
3249 case TCP_FIN_WAIT1:
3250 /* This case occurs when a simultaneous close
3251 * happens, we must ack the received FIN and
3252 * enter the CLOSING state.
3253 */
3254 tcp_send_ack(sk);
3255 tcp_set_state(sk, TCP_CLOSING);
3256 break;
3257 case TCP_FIN_WAIT2:
3258 /* Received a FIN -- send ACK and enter TIME_WAIT. */
3259 tcp_send_ack(sk);
3260 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
3261 break;
3262 default:
3263 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
3264 * cases we should never reach this piece of code.
3265 */
3266 printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
3267 __FUNCTION__, sk->sk_state);
3268 break;
3269 };
3270
3271 /* It _is_ possible that we have something out-of-order _after_ the FIN.
3272 * Probably we should reset in this case. For now, drop it.
3273 */
3274 __skb_queue_purge(&tp->out_of_order_queue);
3275 if (tp->rx_opt.sack_ok)
3276 tcp_sack_reset(&tp->rx_opt);
3277 sk_stream_mem_reclaim(sk);
3278
3279 if (!sock_flag(sk, SOCK_DEAD)) {
3280 sk->sk_state_change(sk);
3281
3282 /* Do not send POLL_HUP for half duplex close. */
3283 if (sk->sk_shutdown == SHUTDOWN_MASK ||
3284 sk->sk_state == TCP_CLOSE)
3285 sk_wake_async(sk, 1, POLL_HUP);
3286 else
3287 sk_wake_async(sk, 1, POLL_IN);
3288 }
3289}
3290
3291static __inline__ int
3292tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
3293{
3294 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
3295 if (before(seq, sp->start_seq))
3296 sp->start_seq = seq;
3297 if (after(end_seq, sp->end_seq))
3298 sp->end_seq = end_seq;
3299 return 1;
3300 }
3301 return 0;
3302}
3303
3304static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
3305{
3306 if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
3307 if (before(seq, tp->rcv_nxt))
3308 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT);
3309 else
3310 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT);
3311
3312 tp->rx_opt.dsack = 1;
3313 tp->duplicate_sack[0].start_seq = seq;
3314 tp->duplicate_sack[0].end_seq = end_seq;
3315 tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok);
3316 }
3317}
3318
3319static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
3320{
3321 if (!tp->rx_opt.dsack)
3322 tcp_dsack_set(tp, seq, end_seq);
3323 else
3324 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
3325}
3326
3327static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
3328{
3329 struct tcp_sock *tp = tcp_sk(sk);
3330
3331 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
3332 before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
3333 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
3334 tcp_enter_quickack_mode(tp);
3335
3336 if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
3337 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
3338
3339 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
3340 end_seq = tp->rcv_nxt;
3341 tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq);
3342 }
3343 }
3344
3345 tcp_send_ack(sk);
3346}
3347
3348/* These routines update the SACK block as out-of-order packets arrive or
3349 * in-order packets close up the sequence space.
3350 */
3351static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
3352{
3353 int this_sack;
3354 struct tcp_sack_block *sp = &tp->selective_acks[0];
3355 struct tcp_sack_block *swalk = sp+1;
3356
3357 /* See if the recent change to the first SACK eats into
3358 * or hits the sequence space of other SACK blocks; if so, coalesce.
3359 */
3360 for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ) {
3361 if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
3362 int i;
3363
3364 /* Zap SWALK, by moving every further SACK up by one slot.
3365 * Decrease num_sacks.
3366 */
3367 tp->rx_opt.num_sacks--;
3368 tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
3369 for(i=this_sack; i < tp->rx_opt.num_sacks; i++)
3370 sp[i] = sp[i+1];
3371 continue;
3372 }
3373 this_sack++, swalk++;
3374 }
3375}
3376
3377static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
3378{
3379 __u32 tmp;
3380
3381 tmp = sack1->start_seq;
3382 sack1->start_seq = sack2->start_seq;
3383 sack2->start_seq = tmp;
3384
3385 tmp = sack1->end_seq;
3386 sack1->end_seq = sack2->end_seq;
3387 sack2->end_seq = tmp;
3388}
3389
3390static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
3391{
3392 struct tcp_sock *tp = tcp_sk(sk);
3393 struct tcp_sack_block *sp = &tp->selective_acks[0];
3394 int cur_sacks = tp->rx_opt.num_sacks;
3395 int this_sack;
3396
3397 if (!cur_sacks)
3398 goto new_sack;
3399
3400 for (this_sack=0; this_sack<cur_sacks; this_sack++, sp++) {
3401 if (tcp_sack_extend(sp, seq, end_seq)) {
3402 /* Rotate this_sack to the first one. */
3403 for (; this_sack>0; this_sack--, sp--)
3404 tcp_sack_swap(sp, sp-1);
3405 if (cur_sacks > 1)
3406 tcp_sack_maybe_coalesce(tp);
3407 return;
3408 }
3409 }
3410
3411 /* Could not find an adjacent existing SACK, build a new one,
3412 * put it at the front, and shift everyone else down. We
3413 * always know there is at least one SACK present already here.
3414 *
3415 * If the sack array is full, forget about the last one.
3416 */
3417 if (this_sack >= 4) {
3418 this_sack--;
3419 tp->rx_opt.num_sacks--;
3420 sp--;
3421 }
3422 for(; this_sack > 0; this_sack--, sp--)
3423 *sp = *(sp-1);
3424
3425new_sack:
3426 /* Build the new head SACK, and we're done. */
3427 sp->start_seq = seq;
3428 sp->end_seq = end_seq;
3429 tp->rx_opt.num_sacks++;
3430 tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
3431}
3432
3433/* RCV.NXT advances, some SACKs should be eaten. */
3434
3435static void tcp_sack_remove(struct tcp_sock *tp)
3436{
3437 struct tcp_sack_block *sp = &tp->selective_acks[0];
3438 int num_sacks = tp->rx_opt.num_sacks;
3439 int this_sack;
3440
3441 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
3442 if (skb_queue_len(&tp->out_of_order_queue) == 0) {
3443 tp->rx_opt.num_sacks = 0;
3444 tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
3445 return;
3446 }
3447
3448 for(this_sack = 0; this_sack < num_sacks; ) {
3449 /* Check if the start of the sack is covered by RCV.NXT. */
3450 if (!before(tp->rcv_nxt, sp->start_seq)) {
3451 int i;
3452
3453 /* RCV.NXT must cover the whole block! */
3454 BUG_TRAP(!before(tp->rcv_nxt, sp->end_seq));
3455
3456 /* Zap this SACK, by moving forward any other SACKS. */
3457 for (i=this_sack+1; i < num_sacks; i++)
3458 tp->selective_acks[i-1] = tp->selective_acks[i];
3459 num_sacks--;
3460 continue;
3461 }
3462 this_sack++;
3463 sp++;
3464 }
3465 if (num_sacks != tp->rx_opt.num_sacks) {
3466 tp->rx_opt.num_sacks = num_sacks;
3467 tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
3468 }
3469}
3470
3471/* This one checks to see if we can put data from the
3472 * out_of_order queue into the receive_queue.
3473 */
3474static void tcp_ofo_queue(struct sock *sk)
3475{
3476 struct tcp_sock *tp = tcp_sk(sk);
3477 __u32 dsack_high = tp->rcv_nxt;
3478 struct sk_buff *skb;
3479
3480 while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
3481 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
3482 break;
3483
3484 if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
3485 __u32 dsack = dsack_high;
3486 if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
3487 dsack_high = TCP_SKB_CB(skb)->end_seq;
3488 tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack);
3489 }
3490
3491 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
3492 SOCK_DEBUG(sk, "ofo packet was already received\n");
3493 __skb_unlink(skb, skb->list);
3494 __kfree_skb(skb);
3495 continue;
3496 }
3497 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
3498 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
3499 TCP_SKB_CB(skb)->end_seq);
3500
3501 __skb_unlink(skb, skb->list);
3502 __skb_queue_tail(&sk->sk_receive_queue, skb);
3503 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3504 if(skb->h.th->fin)
3505 tcp_fin(skb, sk, skb->h.th);
3506 }
3507}
3508
3509static int tcp_prune_queue(struct sock *sk);
3510
3511static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
3512{
3513 struct tcphdr *th = skb->h.th;
3514 struct tcp_sock *tp = tcp_sk(sk);
3515 int eaten = -1;
3516
3517 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
3518 goto drop;
3519
3520 th = skb->h.th;
3521 __skb_pull(skb, th->doff*4);
3522
3523 TCP_ECN_accept_cwr(tp, skb);
3524
3525 if (tp->rx_opt.dsack) {
3526 tp->rx_opt.dsack = 0;
3527 tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks,
3528 4 - tp->rx_opt.tstamp_ok);
3529 }
3530
3531 /* Queue data for delivery to the user.
3532 * Packets in sequence go to the receive queue.
3533 * Out of sequence packets to the out_of_order_queue.
3534 */
3535 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
3536 if (tcp_receive_window(tp) == 0)
3537 goto out_of_window;
3538
3539 /* Ok. In sequence. In window. */
3540 if (tp->ucopy.task == current &&
3541 tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
3542 sock_owned_by_user(sk) && !tp->urg_data) {
3543 int chunk = min_t(unsigned int, skb->len,
3544 tp->ucopy.len);
3545
3546 __set_current_state(TASK_RUNNING);
3547
3548 local_bh_enable();
3549 if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
3550 tp->ucopy.len -= chunk;
3551 tp->copied_seq += chunk;
3552 eaten = (chunk == skb->len && !th->fin);
3553 tcp_rcv_space_adjust(sk);
3554 }
3555 local_bh_disable();
3556 }
3557
3558 if (eaten <= 0) {
3559queue_and_out:
3560 if (eaten < 0 &&
3561 (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
3562 !sk_stream_rmem_schedule(sk, skb))) {
3563 if (tcp_prune_queue(sk) < 0 ||
3564 !sk_stream_rmem_schedule(sk, skb))
3565 goto drop;
3566 }
3567 sk_stream_set_owner_r(skb, sk);
3568 __skb_queue_tail(&sk->sk_receive_queue, skb);
3569 }
3570 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3571 if(skb->len)
3572 tcp_event_data_recv(sk, tp, skb);
3573 if(th->fin)
3574 tcp_fin(skb, sk, th);
3575
3576 if (skb_queue_len(&tp->out_of_order_queue)) {
3577 tcp_ofo_queue(sk);
3578
3579 /* RFC2581 4.2: we SHOULD send an immediate ACK when a
3580 * gap in the queue is filled.
3581 */
3582 if (!skb_queue_len(&tp->out_of_order_queue))
3583 tp->ack.pingpong = 0;
3584 }
3585
3586 if (tp->rx_opt.num_sacks)
3587 tcp_sack_remove(tp);
3588
3589 tcp_fast_path_check(sk, tp);
3590
3591 if (eaten > 0)
3592 __kfree_skb(skb);
3593 else if (!sock_flag(sk, SOCK_DEAD))
3594 sk->sk_data_ready(sk, 0);
3595 return;
3596 }
3597
3598 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
3599 /* A retransmit, 2nd most common case. Force an immediate ack. */
3600 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
3601 tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
3602
3603out_of_window:
3604 tcp_enter_quickack_mode(tp);
3605 tcp_schedule_ack(tp);
3606drop:
3607 __kfree_skb(skb);
3608 return;
3609 }
3610
3611 /* Out of window. F.e. zero window probe. */
3612 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
3613 goto out_of_window;
3614
3615 tcp_enter_quickack_mode(tp);
3616
3617 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
3618 /* Partial packet, seq < rcv_next < end_seq */
3619 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
3620 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
3621 TCP_SKB_CB(skb)->end_seq);
3622
3623 tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
3624
3625 /* If window is closed, drop tail of packet. But after
3626 * remembering D-SACK for its head made in previous line.
3627 */
3628 if (!tcp_receive_window(tp))
3629 goto out_of_window;
3630 goto queue_and_out;
3631 }
3632
3633 TCP_ECN_check_ce(tp, skb);
3634
3635 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
3636 !sk_stream_rmem_schedule(sk, skb)) {
3637 if (tcp_prune_queue(sk) < 0 ||
3638 !sk_stream_rmem_schedule(sk, skb))
3639 goto drop;
3640 }
3641
3642 /* Disable header prediction. */
3643 tp->pred_flags = 0;
3644 tcp_schedule_ack(tp);
3645
3646 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
3647 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
3648
3649 sk_stream_set_owner_r(skb, sk);
3650
3651 if (!skb_peek(&tp->out_of_order_queue)) {
3652 /* Initial out of order segment, build 1 SACK. */
3653 if (tp->rx_opt.sack_ok) {
3654 tp->rx_opt.num_sacks = 1;
3655 tp->rx_opt.dsack = 0;
3656 tp->rx_opt.eff_sacks = 1;
3657 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
3658 tp->selective_acks[0].end_seq =
3659 TCP_SKB_CB(skb)->end_seq;
3660 }
3661 __skb_queue_head(&tp->out_of_order_queue,skb);
3662 } else {
3663 struct sk_buff *skb1 = tp->out_of_order_queue.prev;
3664 u32 seq = TCP_SKB_CB(skb)->seq;
3665 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
3666
3667 if (seq == TCP_SKB_CB(skb1)->end_seq) {
3668 __skb_append(skb1, skb);
3669
3670 if (!tp->rx_opt.num_sacks ||
3671 tp->selective_acks[0].end_seq != seq)
3672 goto add_sack;
3673
3674 /* Common case: data arrive in order after hole. */
3675 tp->selective_acks[0].end_seq = end_seq;
3676 return;
3677 }
3678
3679 /* Find place to insert this segment. */
3680 do {
3681 if (!after(TCP_SKB_CB(skb1)->seq, seq))
3682 break;
3683 } while ((skb1 = skb1->prev) !=
3684 (struct sk_buff*)&tp->out_of_order_queue);
3685
3686 /* Does the skb overlap the previous one? */
3687 if (skb1 != (struct sk_buff*)&tp->out_of_order_queue &&
3688 before(seq, TCP_SKB_CB(skb1)->end_seq)) {
3689 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
3690 /* All the bits are present. Drop. */
3691 __kfree_skb(skb);
3692 tcp_dsack_set(tp, seq, end_seq);
3693 goto add_sack;
3694 }
3695 if (after(seq, TCP_SKB_CB(skb1)->seq)) {
3696 /* Partial overlap. */
3697 tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq);
3698 } else {
3699 skb1 = skb1->prev;
3700 }
3701 }
3702 __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue);
3703
3704 /* And clean away segments wholly covered by the new one. */
3705 while ((skb1 = skb->next) !=
3706 (struct sk_buff*)&tp->out_of_order_queue &&
3707 after(end_seq, TCP_SKB_CB(skb1)->seq)) {
3708 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
3709 tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
3710 break;
3711 }
3712 __skb_unlink(skb1, skb1->list);
3713 tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
3714 __kfree_skb(skb1);
3715 }
3716
3717add_sack:
3718 if (tp->rx_opt.sack_ok)
3719 tcp_sack_new_ofo_skb(sk, seq, end_seq);
3720 }
3721}
3722
3723/* Collapse contiguous sequence of skbs head..tail with
3724 * sequence numbers start..end.
3725 * Segments with FIN/SYN are not collapsed (only because this
3726 * simplifies code)
3727 */
3728static void
3729tcp_collapse(struct sock *sk, struct sk_buff *head,
3730 struct sk_buff *tail, u32 start, u32 end)
3731{
3732 struct sk_buff *skb;
3733
3734 /* First, check that the queue is collapsible and find
3735 * the point where collapsing can be useful. */
3736 for (skb = head; skb != tail; ) {
3737 /* No new bits? It is possible on ofo queue. */
3738 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
3739 struct sk_buff *next = skb->next;
3740 __skb_unlink(skb, skb->list);
3741 __kfree_skb(skb);
3742 NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
3743 skb = next;
3744 continue;
3745 }
3746
3747 /* The first skb to collapse is:
3748 * - not SYN/FIN and
3749 * - bloated or contains data before "start" or
3750 * overlaps the next one.
3751 */
3752 if (!skb->h.th->syn && !skb->h.th->fin &&
3753 (tcp_win_from_space(skb->truesize) > skb->len ||
3754 before(TCP_SKB_CB(skb)->seq, start) ||
3755 (skb->next != tail &&
3756 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq)))
3757 break;
3758
3759 /* Decided to skip this, advance start seq. */
3760 start = TCP_SKB_CB(skb)->end_seq;
3761 skb = skb->next;
3762 }
3763 if (skb == tail || skb->h.th->syn || skb->h.th->fin)
3764 return;
3765
3766 while (before(start, end)) {
3767 struct sk_buff *nskb;
3768 int header = skb_headroom(skb);
3769 int copy = SKB_MAX_ORDER(header, 0);
3770
3771 /* Too big header? This can happen with IPv6. */
3772 if (copy < 0)
3773 return;
3774 if (end-start < copy)
3775 copy = end-start;
3776 nskb = alloc_skb(copy+header, GFP_ATOMIC);
3777 if (!nskb)
3778 return;
3779 skb_reserve(nskb, header);
3780 memcpy(nskb->head, skb->head, header);
3781 nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head);
3782 nskb->h.raw = nskb->head + (skb->h.raw-skb->head);
3783 nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
3784 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
3785 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
3786 __skb_insert(nskb, skb->prev, skb, skb->list);
3787 sk_stream_set_owner_r(nskb, sk);
3788
3789 /* Copy data, releasing collapsed skbs. */
3790 while (copy > 0) {
3791 int offset = start - TCP_SKB_CB(skb)->seq;
3792 int size = TCP_SKB_CB(skb)->end_seq - start;
3793
3794 if (offset < 0) BUG();
3795 if (size > 0) {
3796 size = min(copy, size);
3797 if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
3798 BUG();
3799 TCP_SKB_CB(nskb)->end_seq += size;
3800 copy -= size;
3801 start += size;
3802 }
3803 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
3804 struct sk_buff *next = skb->next;
3805 __skb_unlink(skb, skb->list);
3806 __kfree_skb(skb);
3807 NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
3808 skb = next;
3809 if (skb == tail || skb->h.th->syn || skb->h.th->fin)
3810 return;
3811 }
3812 }
3813 }
3814}
3815
3816/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
3817 * and tcp_collapse() them until all the queue is collapsed.
3818 */
3819static void tcp_collapse_ofo_queue(struct sock *sk)
3820{
3821 struct tcp_sock *tp = tcp_sk(sk);
3822 struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
3823 struct sk_buff *head;
3824 u32 start, end;
3825
3826 if (skb == NULL)
3827 return;
3828
3829 start = TCP_SKB_CB(skb)->seq;
3830 end = TCP_SKB_CB(skb)->end_seq;
3831 head = skb;
3832
3833 for (;;) {
3834 skb = skb->next;
3835
3836 /* The segment is terminated when we see a gap or when
3837 * we are at the end of the queue. */
3838 if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
3839 after(TCP_SKB_CB(skb)->seq, end) ||
3840 before(TCP_SKB_CB(skb)->end_seq, start)) {
3841 tcp_collapse(sk, head, skb, start, end);
3842 head = skb;
3843 if (skb == (struct sk_buff *)&tp->out_of_order_queue)
3844 break;
3845 /* Start new segment */
3846 start = TCP_SKB_CB(skb)->seq;
3847 end = TCP_SKB_CB(skb)->end_seq;
3848 } else {
3849 if (before(TCP_SKB_CB(skb)->seq, start))
3850 start = TCP_SKB_CB(skb)->seq;
3851 if (after(TCP_SKB_CB(skb)->end_seq, end))
3852 end = TCP_SKB_CB(skb)->end_seq;
3853 }
3854 }
3855}
3856
3857/* Reduce allocated memory if we can, trying to get
3858 * the socket within its memory limits again.
3859 *
3860 * Return less than zero if we should start dropping frames
3861 * until the socket owning process reads some of the data
3862 * to stabilize the situation.
3863 */
3864static int tcp_prune_queue(struct sock *sk)
3865{
3866 struct tcp_sock *tp = tcp_sk(sk);
3867
3868 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
3869
3870 NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED);
3871
3872 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
3873 tcp_clamp_window(sk, tp);
3874 else if (tcp_memory_pressure)
3875 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
3876
3877 tcp_collapse_ofo_queue(sk);
3878 tcp_collapse(sk, sk->sk_receive_queue.next,
3879 (struct sk_buff*)&sk->sk_receive_queue,
3880 tp->copied_seq, tp->rcv_nxt);
3881 sk_stream_mem_reclaim(sk);
3882
3883 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
3884 return 0;
3885
3886 /* Collapsing did not help, destructive actions follow.
3887 * This must not ever occur. */
3888
3889 /* First, purge the out_of_order queue. */
3890 if (skb_queue_len(&tp->out_of_order_queue)) {
3891 NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED,
3892 skb_queue_len(&tp->out_of_order_queue));
3893 __skb_queue_purge(&tp->out_of_order_queue);
3894
3895 /* Reset SACK state. A conforming SACK implementation will
3896 * do the same at a timeout based retransmit. When a connection
3897 * is in a sad state like this, we care only about integrity
3898 * of the connection not performance.
3899 */
3900 if (tp->rx_opt.sack_ok)
3901 tcp_sack_reset(&tp->rx_opt);
3902 sk_stream_mem_reclaim(sk);
3903 }
3904
3905 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
3906 return 0;
3907
3908 /* If we are really being abused, tell the caller to silently
3909 * drop receive data on the floor. It will get retransmitted
3910 * and hopefully then we'll have sufficient space.
3911 */
3912 NET_INC_STATS_BH(LINUX_MIB_RCVPRUNED);
3913
3914 /* Massive buffer overcommit. */
3915 tp->pred_flags = 0;
3916 return -1;
3917}
3918
3919
3920/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
3921 * As additional protections, we do not touch cwnd in retransmission phases,
3922 * and if application hit its sndbuf limit recently.
3923 */
3924void tcp_cwnd_application_limited(struct sock *sk)
3925{
3926 struct tcp_sock *tp = tcp_sk(sk);
3927
3928 if (tp->ca_state == TCP_CA_Open &&
3929 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
3930 /* Limited by application or receiver window. */
3931 u32 win_used = max(tp->snd_cwnd_used, 2U);
3932 if (win_used < tp->snd_cwnd) {
3933 tp->snd_ssthresh = tcp_current_ssthresh(tp);
3934 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
3935 }
3936 tp->snd_cwnd_used = 0;
3937 }
3938 tp->snd_cwnd_stamp = tcp_time_stamp;
3939}
3940
3941
3942/* When an incoming ACK has allowed us to free some skb from the write_queue,
3943 * we remember this event in the flag SOCK_QUEUE_SHRUNK and wake up the socket
3944 * on exit from the tcp input handler.
3945 *
3946 * PROBLEM: sndbuf expansion does not work well with largesend.
3947 */
3948static void tcp_new_space(struct sock *sk)
3949{
3950 struct tcp_sock *tp = tcp_sk(sk);
3951
3952 if (tp->packets_out < tp->snd_cwnd &&
3953 !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
3954 !tcp_memory_pressure &&
3955 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
3956 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
3957 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3958 demanded = max_t(unsigned int, tp->snd_cwnd,
3959 tp->reordering + 1);
3960 sndmem *= 2*demanded;
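 /* i.e. budget roughly 2 * demanded full-sized segments, each including
  * header and struct sk_buff overhead.
  */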
3961 if (sndmem > sk->sk_sndbuf)
3962 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
3963 tp->snd_cwnd_stamp = tcp_time_stamp;
3964 }
3965
3966 sk->sk_write_space(sk);
3967}
3968
3969static inline void tcp_check_space(struct sock *sk)
3970{
3971 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
3972 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
3973 if (sk->sk_socket &&
3974 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
3975 tcp_new_space(sk);
3976 }
3977}
3978
3979static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
3980{
3981 struct tcp_sock *tp = tcp_sk(sk);
3982
3983 if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
3984 tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
3985 tcp_write_xmit(sk, tp->nonagle))
3986 tcp_check_probe_timer(sk, tp);
3987}
3988
3989static __inline__ void tcp_data_snd_check(struct sock *sk)
3990{
3991 struct sk_buff *skb = sk->sk_send_head;
3992
3993 if (skb != NULL)
3994 __tcp_data_snd_check(sk, skb);
3995 tcp_check_space(sk);
3996}
3997
3998/*
3999 * Check if sending an ack is needed.
4000 */
4001static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
4002{
4003 struct tcp_sock *tp = tcp_sk(sk);
4004
4005 /* More than one full frame received... */
4006 if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
4007 /* ... and right edge of window advances far enough.
4008 * (tcp_recvmsg() will send ACK otherwise). Or...
4009 */
4010 && __tcp_select_window(sk) >= tp->rcv_wnd) ||
4011 /* We ACK each frame or... */
4012 tcp_in_quickack_mode(tp) ||
4013 /* We have out of order data. */
4014 (ofo_possible &&
4015 skb_peek(&tp->out_of_order_queue))) {
4016 /* Then ack it now */
4017 tcp_send_ack(sk);
4018 } else {
4019 /* Else, send delayed ack. */
4020 tcp_send_delayed_ack(sk);
4021 }
4022}
4023
4024static __inline__ void tcp_ack_snd_check(struct sock *sk)
4025{
4026 struct tcp_sock *tp = tcp_sk(sk);
4027 if (!tcp_ack_scheduled(tp)) {
4028 /* We sent a data segment already. */
4029 return;
4030 }
4031 __tcp_ack_snd_check(sk, 1);
4032}
4033
4034/*
4035 * This routine is only called when we have urgent data
4036 * signalled. It's the 'slow' part of tcp_urg. It could be
4037 * moved inline now, as tcp_urg is only called from one
4038 * place. We handle URGent data wrong. We have to - as
4039 * BSD still doesn't use the correction from RFC961.
4040 * For 1003.1g we should support a new option TCP_STDURG to permit
4041 * either form (or just set the sysctl tcp_stdurg).
4042 */
4043
4044static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
4045{
4046 struct tcp_sock *tp = tcp_sk(sk);
4047 u32 ptr = ntohs(th->urg_ptr);
4048
4049 if (ptr && !sysctl_tcp_stdurg)
4050 ptr--;
4051 ptr += ntohl(th->seq);
4052
4053 /* Ignore urgent data that we've already seen and read. */
4054 if (after(tp->copied_seq, ptr))
4055 return;
4056
4057 /* Do not replay urg ptr.
4058 *
4059 * NOTE: interesting situation not covered by specs.
4060 * A misbehaving sender may send an urg ptr pointing into a segment
4061 * which we already have in the ofo queue. We are not able to fetch
4062 * such data and will stay in TCP_URG_NOTYET until it is eaten
4063 * by recvmsg(). It seems we are not obliged to handle such wicked
4064 * situations, but it is worth thinking about the possibility of
4065 * DoSes using some hypothetical application level deadlock.
4066 */
4067 if (before(ptr, tp->rcv_nxt))
4068 return;
4069
4070 /* Do we already have a newer (or duplicate) urgent pointer? */
4071 if (tp->urg_data && !after(ptr, tp->urg_seq))
4072 return;
4073
4074 /* Tell the world about our new urgent pointer. */
4075 sk_send_sigurg(sk);
4076
4077 /* We may be adding urgent data when the last byte read was
4078 * urgent. To do this requires some care. We cannot just ignore
4079 * tp->copied_seq since we would read the last urgent byte again
4080 * as data, nor can we alter copied_seq until this data arrives
4081	 * or we break the semantics of SIOCATMARK (and thus sockatmark())
4082 *
4083	 * NOTE. Double Dutch. Rendered into plain English: the author of the
4084	 * comment above did something like	send("A", MSG_OOB); send("B", MSG_OOB);
4085	 * and expected both A and B to disappear from the stream. This is _wrong_.
4086	 * Though this happens in BSD with high probability, it is not guaranteed.
4087	 * Any application relying on it is buggy. Note also that the fix "works"
4088	 * only in this artificial test. Insert some normal data between A and B
4089	 * and we will diverge from BSD again. Verdict: it is better to drop the
4090	 * fix and trap buggy users.
4091 */
4092 if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
4093 !sock_flag(sk, SOCK_URGINLINE) &&
4094 tp->copied_seq != tp->rcv_nxt) {
4095 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
4096 tp->copied_seq++;
4097 if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
4098 __skb_unlink(skb, skb->list);
4099 __kfree_skb(skb);
4100 }
4101 }
4102
4103 tp->urg_data = TCP_URG_NOTYET;
4104 tp->urg_seq = ptr;
4105
4106 /* Disable header prediction. */
4107 tp->pred_flags = 0;
4108}
4109
4110/* This is the 'fast' part of urgent handling. */
4111static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
4112{
4113 struct tcp_sock *tp = tcp_sk(sk);
4114
4115 /* Check if we get a new urgent pointer - normally not. */
4116 if (th->urg)
4117 tcp_check_urg(sk,th);
4118
4119 /* Do we wait for any urgent data? - normally not... */
4120 if (tp->urg_data == TCP_URG_NOTYET) {
4121 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
4122 th->syn;
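		/* Sketch with illustrative numbers: if tp->urg_seq is 10 bytes
		 * past th->seq and the header is 20 bytes (doff == 5, no SYN),
		 * then ptr == 10 + 20 == 30, i.e. the urgent byte sits at
		 * offset 30 of this skb, whose data still starts at the TCP
		 * header here.
		 */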
4123
4124 /* Is the urgent pointer pointing into this packet? */
4125 if (ptr < skb->len) {
4126 u8 tmp;
4127 if (skb_copy_bits(skb, ptr, &tmp, 1))
4128 BUG();
4129 tp->urg_data = TCP_URG_VALID | tmp;
4130 if (!sock_flag(sk, SOCK_DEAD))
4131 sk->sk_data_ready(sk, 0);
4132 }
4133 }
4134}
4135
4136static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
4137{
4138 struct tcp_sock *tp = tcp_sk(sk);
4139 int chunk = skb->len - hlen;
4140 int err;
4141
4142 local_bh_enable();
4143 if (skb->ip_summed==CHECKSUM_UNNECESSARY)
4144 err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
4145 else
4146 err = skb_copy_and_csum_datagram_iovec(skb, hlen,
4147 tp->ucopy.iov);
4148
4149 if (!err) {
4150 tp->ucopy.len -= chunk;
4151 tp->copied_seq += chunk;
4152 tcp_rcv_space_adjust(sk);
4153 }
4154
4155 local_bh_disable();
4156 return err;
4157}
4158
4159static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
4160{
4161 int result;
4162
4163 if (sock_owned_by_user(sk)) {
4164 local_bh_enable();
4165 result = __tcp_checksum_complete(skb);
4166 local_bh_disable();
4167 } else {
4168 result = __tcp_checksum_complete(skb);
4169 }
4170 return result;
4171}
4172
4173static __inline__ int
4174tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
4175{
4176 return skb->ip_summed != CHECKSUM_UNNECESSARY &&
4177 __tcp_checksum_complete_user(sk, skb);
4178}
4179
4180/*
4181 * TCP receive function for the ESTABLISHED state.
4182 *
4183 * It is split into a fast path and a slow path. The fast path is
4184 * disabled when:
4185 * - A zero window was announced from us - zero window probing
4186 * is only handled properly in the slow path.
4187 * - Out of order segments arrived.
4188 * - Urgent data is expected.
4189 * - There is no buffer space left
4190 * - Unexpected TCP flags/window values/header lengths are received
4191 * (detected by checking the TCP header against pred_flags)
4192 * - Data is sent in both directions. Fast path only supports pure senders
4193 * or pure receivers (this means either the sequence number or the ack
4194 * value must stay constant)
4195 * - Unexpected TCP option.
4196 *
4197 * When these conditions are not satisfied it drops into a standard
4198 * receive procedure patterned after RFC793 to handle all cases.
4199 * The first three cases are guaranteed by proper pred_flags setting,
4200 * the rest is checked inline. Fast processing is turned on in
4201 * tcp_data_queue when everything is OK.
4202 */
4203int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4204 struct tcphdr *th, unsigned len)
4205{
4206 struct tcp_sock *tp = tcp_sk(sk);
4207
4208 /*
4209 * Header prediction.
4210 * The code loosely follows the one in the famous
4211 * "30 instruction TCP receive" Van Jacobson mail.
4212 *
4213 * Van's trick is to deposit buffers into socket queue
4214 * on a device interrupt, to call tcp_recv function
4215 * on the receive process context and checksum and copy
4216 * the buffer to user space. smart...
4217 *
4218 * Our current scheme is not silly either but we take the
4219 * extra cost of the net_bh soft interrupt processing...
4220 * We do checksum and copy also but from device to kernel.
4221 */
4222
4223 tp->rx_opt.saw_tstamp = 0;
4224
4225 /* pred_flags is 0xS?10 << 16 + snd_wnd
4226	 * if header prediction is to be made
4227 * 'S' will always be tp->tcp_header_len >> 2
4228 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
4229 * turn it off (when there are holes in the receive
4230 * space for instance)
4231 * PSH flag is ignored.
4232 */
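	/* Concrete instance (assumed values): a bare 20-byte header gives
	 * S == 5, so with only ACK set, no window scaling, and a 5840-byte
	 * window from the peer, the masked flag word below must equal
	 *	htonl(0x5010 << 16 | 5840)
	 * for the fast path to be taken; extra flags, a different doff or a
	 * changed window all push us into the slow path.
	 */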
4233
4234 if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
4235 TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
4236 int tcp_header_len = tp->tcp_header_len;
4237
4238 /* Timestamp header prediction: tcp_header_len
4239 * is automatically equal to th->doff*4 due to pred_flags
4240 * match.
4241 */
4242
4243 /* Check timestamp */
4244 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
4245 __u32 *ptr = (__u32 *)(th + 1);
4246
4247 /* No? Slow path! */
4248 if (*ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
4249 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
4250 goto slow_path;
4251
4252 tp->rx_opt.saw_tstamp = 1;
4253 ++ptr;
4254 tp->rx_opt.rcv_tsval = ntohl(*ptr);
4255 ++ptr;
4256 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
4257
4258 /* If PAWS failed, check it more carefully in slow path */
4259 if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
4260 goto slow_path;
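			/* The signed 32-bit difference makes this check wrap-safe:
			 * e.g. (assumed values) rcv_tsval == 5 just after the peer's
			 * timestamp clock wrapped and ts_recent == 0xfffffff0 give a
			 * small positive difference, so the segment still passes here.
			 */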
4261
4262			 * DO NOT update ts_recent here: if the checksum fails
4263			 * and the timestamp was the corrupted part, it would result
4264			 * in a hung connection since we would drop all
4265			 * future packets due to the PAWS test.
4266 */
4267 }
4268
4269 if (len <= tcp_header_len) {
4270 /* Bulk data transfer: sender */
4271 if (len == tcp_header_len) {
4272 /* Predicted packet is in window by definition.
4273 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
4274 * Hence, check seq<=rcv_wup reduces to:
4275 */
4276 if (tcp_header_len ==
4277 (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
4278 tp->rcv_nxt == tp->rcv_wup)
4279 tcp_store_ts_recent(tp);
4280
4281 tcp_rcv_rtt_measure_ts(tp, skb);
4282
4283 /* We know that such packets are checksummed
4284 * on entry.
4285 */
4286 tcp_ack(sk, skb, 0);
4287 __kfree_skb(skb);
4288 tcp_data_snd_check(sk);
4289 return 0;
4290 } else { /* Header too small */
4291 TCP_INC_STATS_BH(TCP_MIB_INERRS);
4292 goto discard;
4293 }
4294 } else {
4295 int eaten = 0;
4296
4297 if (tp->ucopy.task == current &&
4298 tp->copied_seq == tp->rcv_nxt &&
4299 len - tcp_header_len <= tp->ucopy.len &&
4300 sock_owned_by_user(sk)) {
4301 __set_current_state(TASK_RUNNING);
4302
4303 if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
4304 /* Predicted packet is in window by definition.
4305 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
4306 * Hence, check seq<=rcv_wup reduces to:
4307 */
4308 if (tcp_header_len ==
4309 (sizeof(struct tcphdr) +
4310 TCPOLEN_TSTAMP_ALIGNED) &&
4311 tp->rcv_nxt == tp->rcv_wup)
4312 tcp_store_ts_recent(tp);
4313
4314 tcp_rcv_rtt_measure_ts(tp, skb);
4315
4316 __skb_pull(skb, tcp_header_len);
4317 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4318 NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
4319 eaten = 1;
4320 }
4321 }
4322 if (!eaten) {
4323 if (tcp_checksum_complete_user(sk, skb))
4324 goto csum_error;
4325
4326 /* Predicted packet is in window by definition.
4327 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
4328 * Hence, check seq<=rcv_wup reduces to:
4329 */
4330 if (tcp_header_len ==
4331 (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
4332 tp->rcv_nxt == tp->rcv_wup)
4333 tcp_store_ts_recent(tp);
4334
4335 tcp_rcv_rtt_measure_ts(tp, skb);
4336
4337 if ((int)skb->truesize > sk->sk_forward_alloc)
4338 goto step5;
4339
4340 NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS);
4341
4342 /* Bulk data transfer: receiver */
4343 __skb_pull(skb,tcp_header_len);
4344 __skb_queue_tail(&sk->sk_receive_queue, skb);
4345 sk_stream_set_owner_r(skb, sk);
4346 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4347 }
4348
4349 tcp_event_data_recv(sk, tp, skb);
4350
4351 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
4352 /* Well, only one small jumplet in fast path... */
4353 tcp_ack(sk, skb, FLAG_DATA);
4354 tcp_data_snd_check(sk);
4355 if (!tcp_ack_scheduled(tp))
4356 goto no_ack;
4357 }
4358
4359 if (eaten) {
4360 if (tcp_in_quickack_mode(tp)) {
4361 tcp_send_ack(sk);
4362 } else {
4363 tcp_send_delayed_ack(sk);
4364 }
4365 } else {
4366 __tcp_ack_snd_check(sk, 0);
4367 }
4368
4369no_ack:
4370 if (eaten)
4371 __kfree_skb(skb);
4372 else
4373 sk->sk_data_ready(sk, 0);
4374 return 0;
4375 }
4376 }
4377
4378slow_path:
4379 if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb))
4380 goto csum_error;
4381
4382 /*
4383 * RFC1323: H1. Apply PAWS check first.
4384 */
4385 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4386 tcp_paws_discard(tp, skb)) {
4387 if (!th->rst) {
4388 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
4389 tcp_send_dupack(sk, skb);
4390 goto discard;
4391 }
4392 /* Resets are accepted even if PAWS failed.
4393
4394 ts_recent update must be made after we are sure
4395 that the packet is in window.
4396 */
4397 }
4398
4399 /*
4400 * Standard slow path.
4401 */
4402
4403 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
4404 /* RFC793, page 37: "In all states except SYN-SENT, all reset
4405 * (RST) segments are validated by checking their SEQ-fields."
4406 * And page 69: "If an incoming segment is not acceptable,
4407 * an acknowledgment should be sent in reply (unless the RST bit
4408 * is set, if so drop the segment and return)".
4409 */
4410 if (!th->rst)
4411 tcp_send_dupack(sk, skb);
4412 goto discard;
4413 }
4414
4415 if(th->rst) {
4416 tcp_reset(sk);
4417 goto discard;
4418 }
4419
4420 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
4421
4422 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4423 TCP_INC_STATS_BH(TCP_MIB_INERRS);
4424 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
4425 tcp_reset(sk);
4426 return 1;
4427 }
4428
4429step5:
4430 if(th->ack)
4431 tcp_ack(sk, skb, FLAG_SLOWPATH);
4432
4433 tcp_rcv_rtt_measure_ts(tp, skb);
4434
4435 /* Process urgent data. */
4436 tcp_urg(sk, skb, th);
4437
4438 /* step 7: process the segment text */
4439 tcp_data_queue(sk, skb);
4440
4441 tcp_data_snd_check(sk);
4442 tcp_ack_snd_check(sk);
4443 return 0;
4444
4445csum_error:
4446 TCP_INC_STATS_BH(TCP_MIB_INERRS);
4447
4448discard:
4449 __kfree_skb(skb);
4450 return 0;
4451}
4452
4453static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4454 struct tcphdr *th, unsigned len)
4455{
4456 struct tcp_sock *tp = tcp_sk(sk);
4457 int saved_clamp = tp->rx_opt.mss_clamp;
4458
4459 tcp_parse_options(skb, &tp->rx_opt, 0);
4460
4461 if (th->ack) {
4462 /* rfc793:
4463 * "If the state is SYN-SENT then
4464 * first check the ACK bit
4465 * If the ACK bit is set
4466 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
4467 * a reset (unless the RST bit is set, if so drop
4468 * the segment and return)"
4469 *
4470 * We do not send data with SYN, so that RFC-correct
4471 * test reduces to:
4472 */
4473 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
4474 goto reset_and_undo;
4475
4476 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4477 !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
4478 tcp_time_stamp)) {
4479 NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED);
4480 goto reset_and_undo;
4481 }
4482
4483 /* Now ACK is acceptable.
4484 *
4485 * "If the RST bit is set
4486 * If the ACK was acceptable then signal the user "error:
4487 * connection reset", drop the segment, enter CLOSED state,
4488 * delete TCB, and return."
4489 */
4490
4491 if (th->rst) {
4492 tcp_reset(sk);
4493 goto discard;
4494 }
4495
4496 /* rfc793:
4497 * "fifth, if neither of the SYN or RST bits is set then
4498 * drop the segment and return."
4499 *
4500 * See note below!
4501 * --ANK(990513)
4502 */
4503 if (!th->syn)
4504 goto discard_and_undo;
4505
4506 /* rfc793:
4507 * "If the SYN bit is on ...
4508 * are acceptable then ...
4509 * (our SYN has been ACKed), change the connection
4510 * state to ESTABLISHED..."
4511 */
4512
4513 TCP_ECN_rcv_synack(tp, th);
4514 if (tp->ecn_flags&TCP_ECN_OK)
4515 sock_set_flag(sk, SOCK_NO_LARGESEND);
4516
4517 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
4518 tcp_ack(sk, skb, FLAG_SLOWPATH);
4519
4520 /* Ok.. it's good. Set up sequence numbers and
4521 * move to established.
4522 */
4523 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
4524 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
4525
4526 /* RFC1323: The window in SYN & SYN/ACK segments is
4527 * never scaled.
4528 */
4529 tp->snd_wnd = ntohs(th->window);
4530 tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq);
4531
4532 if (!tp->rx_opt.wscale_ok) {
4533 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
4534 tp->window_clamp = min(tp->window_clamp, 65535U);
4535 }
4536
4537 if (tp->rx_opt.saw_tstamp) {
4538 tp->rx_opt.tstamp_ok = 1;
4539 tp->tcp_header_len =
4540 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
4541 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
4542 tcp_store_ts_recent(tp);
4543 } else {
4544 tp->tcp_header_len = sizeof(struct tcphdr);
4545 }
4546
4547 if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
4548 tp->rx_opt.sack_ok |= 2;
4549
4550 tcp_sync_mss(sk, tp->pmtu_cookie);
4551 tcp_initialize_rcv_mss(sk);
4552
4553 /* Remember, tcp_poll() does not lock socket!
4554 * Change state from SYN-SENT only after copied_seq
4555 * is initialized. */
4556 tp->copied_seq = tp->rcv_nxt;
4557 mb();
4558 tcp_set_state(sk, TCP_ESTABLISHED);
4559
4560 /* Make sure socket is routed, for correct metrics. */
4561 tp->af_specific->rebuild_header(sk);
4562
4563 tcp_init_metrics(sk);
4564
4565 /* Prevent spurious tcp_cwnd_restart() on first data
4566 * packet.
4567 */
4568 tp->lsndtime = tcp_time_stamp;
4569
4570 tcp_init_buffer_space(sk);
4571
4572 if (sock_flag(sk, SOCK_KEEPOPEN))
4573 tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
4574
4575 if (!tp->rx_opt.snd_wscale)
4576 __tcp_fast_path_on(tp, tp->snd_wnd);
4577 else
4578 tp->pred_flags = 0;
4579
4580 if (!sock_flag(sk, SOCK_DEAD)) {
4581 sk->sk_state_change(sk);
4582 sk_wake_async(sk, 0, POLL_OUT);
4583 }
4584
4585 if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) {
4586 /* Save one ACK. Data will be ready after
4587 * several ticks, if write_pending is set.
4588 *
4589 * It may be deleted, but with this feature tcpdumps
4590 * look so _wonderfully_ clever, that I was not able
4591 * to stand against the temptation 8) --ANK
4592 */
4593 tcp_schedule_ack(tp);
4594 tp->ack.lrcvtime = tcp_time_stamp;
4595 tp->ack.ato = TCP_ATO_MIN;
4596 tcp_incr_quickack(tp);
4597 tcp_enter_quickack_mode(tp);
4598 tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
4599
4600discard:
4601 __kfree_skb(skb);
4602 return 0;
4603 } else {
4604 tcp_send_ack(sk);
4605 }
4606 return -1;
4607 }
4608
4609 /* No ACK in the segment */
4610
4611 if (th->rst) {
4612 /* rfc793:
4613 * "If the RST bit is set
4614 *
4615 * Otherwise (no ACK) drop the segment and return."
4616 */
4617
4618 goto discard_and_undo;
4619 }
4620
4621 /* PAWS check. */
4622 if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_check(&tp->rx_opt, 0))
4623 goto discard_and_undo;
4624
4625		/* We see a SYN without an ACK. It is an attempt at a
4626		 * simultaneous connect with crossed SYNs.
4627		 * In particular, it can be a connect to self.
4628 * Particularly, it can be connect to self.
4629 */
4630 tcp_set_state(sk, TCP_SYN_RECV);
4631
4632 if (tp->rx_opt.saw_tstamp) {
4633 tp->rx_opt.tstamp_ok = 1;
4634 tcp_store_ts_recent(tp);
4635 tp->tcp_header_len =
4636 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
4637 } else {
4638 tp->tcp_header_len = sizeof(struct tcphdr);
4639 }
4640
4641 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
4642 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
4643
4644 /* RFC1323: The window in SYN & SYN/ACK segments is
4645 * never scaled.
4646 */
4647 tp->snd_wnd = ntohs(th->window);
4648 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
4649 tp->max_window = tp->snd_wnd;
4650
4651 TCP_ECN_rcv_syn(tp, th);
4652 if (tp->ecn_flags&TCP_ECN_OK)
4653 sock_set_flag(sk, SOCK_NO_LARGESEND);
4654
4655 tcp_sync_mss(sk, tp->pmtu_cookie);
4656 tcp_initialize_rcv_mss(sk);
4657
4658
4659 tcp_send_synack(sk);
4660#if 0
4661 /* Note, we could accept data and URG from this segment.
4662	 * There is no obstacle to doing so.
4663	 *
4664	 * However, since we sometimes ignore data in ACKless segments,
4665	 * we have no reason to accept it at other times.
4666	 * Also, the code doing this in step6 of tcp_rcv_state_process
4667	 * does not seem flawless. So, discard the packet for sanity.
4668 * Uncomment this return to process the data.
4669 */
4670 return -1;
4671#else
4672 goto discard;
4673#endif
4674 }
4675 /* "fifth, if neither of the SYN or RST bits is set then
4676 * drop the segment and return."
4677 */
4678
4679discard_and_undo:
4680 tcp_clear_options(&tp->rx_opt);
4681 tp->rx_opt.mss_clamp = saved_clamp;
4682 goto discard;
4683
4684reset_and_undo:
4685 tcp_clear_options(&tp->rx_opt);
4686 tp->rx_opt.mss_clamp = saved_clamp;
4687 return 1;
4688}
4689
4690
4691/*
4692 * This function implements the receiving procedure of RFC 793 for
4693 * all states except ESTABLISHED and TIME_WAIT.
4694 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
4695 * address independent.
4696 */
4697
4698int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4699 struct tcphdr *th, unsigned len)
4700{
4701 struct tcp_sock *tp = tcp_sk(sk);
4702 int queued = 0;
4703
4704 tp->rx_opt.saw_tstamp = 0;
4705
4706 switch (sk->sk_state) {
4707 case TCP_CLOSE:
4708 goto discard;
4709
4710 case TCP_LISTEN:
4711 if(th->ack)
4712 return 1;
4713
4714 if(th->rst)
4715 goto discard;
4716
4717 if(th->syn) {
4718 if(tp->af_specific->conn_request(sk, skb) < 0)
4719 return 1;
4720
4721 init_westwood(sk);
4722 init_bictcp(tp);
4723
4724 /* Now we have several options: In theory there is
4725 * nothing else in the frame. KA9Q has an option to
4726 * send data with the syn, BSD accepts data with the
4727 * syn up to the [to be] advertised window and
4728 * Solaris 2.1 gives you a protocol error. For now
4729 * we just ignore it, that fits the spec precisely
4730 * and avoids incompatibilities. It would be nice in
4731 * future to drop through and process the data.
4732 *
4733 * Now that TTCP is starting to be used we ought to
4734 * queue this data.
4735 * But, this leaves one open to an easy denial of
4736 * service attack, and SYN cookies can't defend
4737 * against this problem. So, we drop the data
4738 * in the interest of security over speed.
4739 */
4740 goto discard;
4741 }
4742 goto discard;
4743
4744 case TCP_SYN_SENT:
4745 init_westwood(sk);
4746 init_bictcp(tp);
4747
4748 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4749 if (queued >= 0)
4750 return queued;
4751
4752 /* Do step6 onward by hand. */
4753 tcp_urg(sk, skb, th);
4754 __kfree_skb(skb);
4755 tcp_data_snd_check(sk);
4756 return 0;
4757 }
4758
4759 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4760 tcp_paws_discard(tp, skb)) {
4761 if (!th->rst) {
4762 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
4763 tcp_send_dupack(sk, skb);
4764 goto discard;
4765 }
4766 /* Reset is accepted even if it did not pass PAWS. */
4767 }
4768
4769 /* step 1: check sequence number */
4770 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
4771 if (!th->rst)
4772 tcp_send_dupack(sk, skb);
4773 goto discard;
4774 }
4775
4776 /* step 2: check RST bit */
4777 if(th->rst) {
4778 tcp_reset(sk);
4779 goto discard;
4780 }
4781
4782 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
4783
4784 /* step 3: check security and precedence [ignored] */
4785
4786 /* step 4:
4787 *
4788 * Check for a SYN in window.
4789 */
4790 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4791 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
4792 tcp_reset(sk);
4793 return 1;
4794 }
4795
4796 /* step 5: check the ACK field */
4797 if (th->ack) {
4798 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
4799
4800 switch(sk->sk_state) {
4801 case TCP_SYN_RECV:
4802 if (acceptable) {
4803 tp->copied_seq = tp->rcv_nxt;
4804 mb();
4805 tcp_set_state(sk, TCP_ESTABLISHED);
4806 sk->sk_state_change(sk);
4807
4808 /* Note, that this wakeup is only for marginal
4809 * crossed SYN case. Passively open sockets
4810			 * are not woken up, because sk->sk_sleep ==
4811 * NULL and sk->sk_socket == NULL.
4812 */
4813 if (sk->sk_socket) {
4814 sk_wake_async(sk,0,POLL_OUT);
4815 }
4816
4817 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
4818 tp->snd_wnd = ntohs(th->window) <<
4819 tp->rx_opt.snd_wscale;
4820 tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq,
4821 TCP_SKB_CB(skb)->seq);
4822
4823 /* tcp_ack considers this ACK as duplicate
4824 * and does not calculate rtt.
4825 * Fix it at least with timestamps.
4826 */
4827 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4828 !tp->srtt)
4829 tcp_ack_saw_tstamp(tp, 0);
4830
4831 if (tp->rx_opt.tstamp_ok)
4832 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
4833
4834 /* Make sure socket is routed, for
4835 * correct metrics.
4836 */
4837 tp->af_specific->rebuild_header(sk);
4838
4839 tcp_init_metrics(sk);
4840
4841 /* Prevent spurious tcp_cwnd_restart() on
4842 * first data packet.
4843 */
4844 tp->lsndtime = tcp_time_stamp;
4845
4846 tcp_initialize_rcv_mss(sk);
4847 tcp_init_buffer_space(sk);
4848 tcp_fast_path_on(tp);
4849 } else {
4850 return 1;
4851 }
4852 break;
4853
4854 case TCP_FIN_WAIT1:
4855 if (tp->snd_una == tp->write_seq) {
4856 tcp_set_state(sk, TCP_FIN_WAIT2);
4857 sk->sk_shutdown |= SEND_SHUTDOWN;
4858 dst_confirm(sk->sk_dst_cache);
4859
4860 if (!sock_flag(sk, SOCK_DEAD))
4861 /* Wake up lingering close() */
4862 sk->sk_state_change(sk);
4863 else {
4864 int tmo;
4865
4866 if (tp->linger2 < 0 ||
4867 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4868 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
4869 tcp_done(sk);
4870 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA);
4871 return 1;
4872 }
4873
4874 tmo = tcp_fin_time(tp);
4875 if (tmo > TCP_TIMEWAIT_LEN) {
4876 tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
4877 } else if (th->fin || sock_owned_by_user(sk)) {
4878				/* Bad case. We could lose such a FIN otherwise.
4879				 * It is not a big problem, but it looks confusing
4880				 * and is not so rare an event. We can still lose it now,
4881 * if it spins in bh_lock_sock(), but it is really
4882 * marginal case.
4883 */
4884 tcp_reset_keepalive_timer(sk, tmo);
4885 } else {
4886 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
4887 goto discard;
4888 }
4889 }
4890 }
4891 break;
4892
4893 case TCP_CLOSING:
4894 if (tp->snd_una == tp->write_seq) {
4895 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
4896 goto discard;
4897 }
4898 break;
4899
4900 case TCP_LAST_ACK:
4901 if (tp->snd_una == tp->write_seq) {
4902 tcp_update_metrics(sk);
4903 tcp_done(sk);
4904 goto discard;
4905 }
4906 break;
4907 }
4908 } else
4909 goto discard;
4910
4911 /* step 6: check the URG bit */
4912 tcp_urg(sk, skb, th);
4913
4914 /* step 7: process the segment text */
4915 switch (sk->sk_state) {
4916 case TCP_CLOSE_WAIT:
4917 case TCP_CLOSING:
4918 case TCP_LAST_ACK:
4919 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4920 break;
4921 case TCP_FIN_WAIT1:
4922 case TCP_FIN_WAIT2:
4923 /* RFC 793 says to queue data in these states,
4924 * RFC 1122 says we MUST send a reset.
4925 * BSD 4.4 also does reset.
4926 */
4927 if (sk->sk_shutdown & RCV_SHUTDOWN) {
4928 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4929 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
4930 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA);
4931 tcp_reset(sk);
4932 return 1;
4933 }
4934 }
4935 /* Fall through */
4936 case TCP_ESTABLISHED:
4937 tcp_data_queue(sk, skb);
4938 queued = 1;
4939 break;
4940 }
4941
4942 /* tcp_data could move socket to TIME-WAIT */
4943 if (sk->sk_state != TCP_CLOSE) {
4944 tcp_data_snd_check(sk);
4945 tcp_ack_snd_check(sk);
4946 }
4947
4948 if (!queued) {
4949discard:
4950 __kfree_skb(skb);
4951 }
4952 return 0;
4953}
4954
4955EXPORT_SYMBOL(sysctl_tcp_ecn);
4956EXPORT_SYMBOL(sysctl_tcp_reordering);
4957EXPORT_SYMBOL(tcp_parse_options);
4958EXPORT_SYMBOL(tcp_rcv_established);
4959EXPORT_SYMBOL(tcp_rcv_state_process);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
new file mode 100644
index 000000000000..3ac6659869c4
--- /dev/null
+++ b/net/ipv4/tcp_ipv4.c
@@ -0,0 +1,2663 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * open_request handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 *					Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55#include <linux/config.h>
56
57#include <linux/types.h>
58#include <linux/fcntl.h>
59#include <linux/module.h>
60#include <linux/random.h>
61#include <linux/cache.h>
62#include <linux/jhash.h>
63#include <linux/init.h>
64#include <linux/times.h>
65
66#include <net/icmp.h>
67#include <net/tcp.h>
68#include <net/ipv6.h>
69#include <net/inet_common.h>
70#include <net/xfrm.h>
71
72#include <linux/inet.h>
73#include <linux/ipv6.h>
74#include <linux/stddef.h>
75#include <linux/proc_fs.h>
76#include <linux/seq_file.h>
77
78extern int sysctl_ip_dynaddr;
79int sysctl_tcp_tw_reuse;
80int sysctl_tcp_low_latency;
81
82/* Check TCP sequence numbers in ICMP packets. */
83#define ICMP_MIN_LENGTH 8
84
85/* Socket used for sending RSTs */
86static struct socket *tcp_socket;
87
88void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
89 struct sk_buff *skb);
90
91struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
92 .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
93 .__tcp_lhash_users = ATOMIC_INIT(0),
94 .__tcp_lhash_wait
95 = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
96 .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
97};
98
99/*
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
102 * 32768-61000
103 */
104int sysctl_local_port_range[2] = { 1024, 4999 };
105int tcp_port_rover = 1024 - 1;
106
107static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108 __u32 faddr, __u16 fport)
109{
110 int h = (laddr ^ lport) ^ (faddr ^ fport);
111 h ^= h >> 16;
112 h ^= h >> 8;
113 return h & (tcp_ehash_size - 1);
114}
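/* The two shift-and-xor folds above spread entropy from the upper bytes of the
 * address/port mix into the low bits before the mask is applied. Illustrative
 * trace: h == 0x12340000 (all the variation in the top half) becomes
 * 0x12341234 after the first fold and 0x12262626 after the second, so even a
 * small power-of-two table sees the difference.
 */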
115
116static __inline__ int tcp_sk_hashfn(struct sock *sk)
117{
118 struct inet_sock *inet = inet_sk(sk);
119 __u32 laddr = inet->rcv_saddr;
120 __u16 lport = inet->num;
121 __u32 faddr = inet->daddr;
122 __u16 fport = inet->dport;
123
124 return tcp_hashfn(laddr, lport, faddr, fport);
125}
126
127/* Allocate and initialize a new TCP local port bind bucket.
128 * The bindhash mutex for snum's hash chain must be held here.
129 */
130struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
131 unsigned short snum)
132{
133 struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
134 SLAB_ATOMIC);
135 if (tb) {
136 tb->port = snum;
137 tb->fastreuse = 0;
138 INIT_HLIST_HEAD(&tb->owners);
139 hlist_add_head(&tb->node, &head->chain);
140 }
141 return tb;
142}
143
144/* Caller must hold hashbucket lock for this tb with local BH disabled */
145void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
146{
147 if (hlist_empty(&tb->owners)) {
148 __hlist_del(&tb->node);
149 kmem_cache_free(tcp_bucket_cachep, tb);
150 }
151}
152
153/* Caller must disable local BH processing. */
154static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
155{
156 struct tcp_bind_hashbucket *head =
157 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158 struct tcp_bind_bucket *tb;
159
160 spin_lock(&head->lock);
161 tb = tcp_sk(sk)->bind_hash;
162 sk_add_bind_node(child, &tb->owners);
163 tcp_sk(child)->bind_hash = tb;
164 spin_unlock(&head->lock);
165}
166
167inline void tcp_inherit_port(struct sock *sk, struct sock *child)
168{
169 local_bh_disable();
170 __tcp_inherit_port(sk, child);
171 local_bh_enable();
172}
173
174void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
175 unsigned short snum)
176{
177 inet_sk(sk)->num = snum;
178 sk_add_bind_node(sk, &tb->owners);
179 tcp_sk(sk)->bind_hash = tb;
180}
181
182static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
183{
184 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
185 struct sock *sk2;
186 struct hlist_node *node;
187 int reuse = sk->sk_reuse;
188
189 sk_for_each_bound(sk2, node, &tb->owners) {
190 if (sk != sk2 &&
191 !tcp_v6_ipv6only(sk2) &&
192 (!sk->sk_bound_dev_if ||
193 !sk2->sk_bound_dev_if ||
194 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195 if (!reuse || !sk2->sk_reuse ||
196 sk2->sk_state == TCP_LISTEN) {
197 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199 sk2_rcv_saddr == sk_rcv_saddr)
200 break;
201 }
202 }
203 }
204 return node != NULL;
205}
206
207/* Obtain a reference to a local port for the given sock,
208 * if snum is zero it means select any available local port.
209 */
210static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
211{
212 struct tcp_bind_hashbucket *head;
213 struct hlist_node *node;
214 struct tcp_bind_bucket *tb;
215 int ret;
216
217 local_bh_disable();
218 if (!snum) {
219 int low = sysctl_local_port_range[0];
220 int high = sysctl_local_port_range[1];
221 int remaining = (high - low) + 1;
222 int rover;
223
224 spin_lock(&tcp_portalloc_lock);
225 rover = tcp_port_rover;
226 do {
227 rover++;
228 if (rover < low || rover > high)
229 rover = low;
230 head = &tcp_bhash[tcp_bhashfn(rover)];
231 spin_lock(&head->lock);
232 tb_for_each(tb, node, &head->chain)
233 if (tb->port == rover)
234 goto next;
235 break;
236 next:
237 spin_unlock(&head->lock);
238 } while (--remaining > 0);
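		/* E.g. with the default 1024-4999 range, a rover that has
		 * reached 4999 wraps back to 1024 on the next allocation; the
		 * walk gives up only after every port in the range has been
		 * inspected once.
		 */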
239 tcp_port_rover = rover;
240 spin_unlock(&tcp_portalloc_lock);
241
242 /* Exhausted local port range during search? */
243 ret = 1;
244 if (remaining <= 0)
245 goto fail;
246
247 /* OK, here is the one we will use. HEAD is
248 * non-NULL and we hold it's mutex.
249 */
250 snum = rover;
251 } else {
252 head = &tcp_bhash[tcp_bhashfn(snum)];
253 spin_lock(&head->lock);
254 tb_for_each(tb, node, &head->chain)
255 if (tb->port == snum)
256 goto tb_found;
257 }
258 tb = NULL;
259 goto tb_not_found;
260tb_found:
261 if (!hlist_empty(&tb->owners)) {
262 if (sk->sk_reuse > 1)
263 goto success;
264 if (tb->fastreuse > 0 &&
265 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
266 goto success;
267 } else {
268 ret = 1;
269 if (tcp_bind_conflict(sk, tb))
270 goto fail_unlock;
271 }
272 }
273tb_not_found:
274 ret = 1;
275 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
276 goto fail_unlock;
277 if (hlist_empty(&tb->owners)) {
278 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
279 tb->fastreuse = 1;
280 else
281 tb->fastreuse = 0;
282 } else if (tb->fastreuse &&
283 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
284 tb->fastreuse = 0;
285success:
286 if (!tcp_sk(sk)->bind_hash)
287 tcp_bind_hash(sk, tb, snum);
288 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
289 ret = 0;
290
291fail_unlock:
292 spin_unlock(&head->lock);
293fail:
294 local_bh_enable();
295 return ret;
296}
297
298/* Get rid of any references to a local port held by the
299 * given sock.
300 */
301static void __tcp_put_port(struct sock *sk)
302{
303 struct inet_sock *inet = inet_sk(sk);
304 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
305 struct tcp_bind_bucket *tb;
306
307 spin_lock(&head->lock);
308 tb = tcp_sk(sk)->bind_hash;
309 __sk_del_bind_node(sk);
310 tcp_sk(sk)->bind_hash = NULL;
311 inet->num = 0;
312 tcp_bucket_destroy(tb);
313 spin_unlock(&head->lock);
314}
315
316void tcp_put_port(struct sock *sk)
317{
318 local_bh_disable();
319 __tcp_put_port(sk);
320 local_bh_enable();
321}
322
323/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
324 * Look, when several writers sleep and a reader wakes them up, all but one
325 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
326 * this, _but_ remember, it adds useless work on UP machines (a wake up on each
327 * exclusive lock release). It should really be ifdefed.
328 */
329
330void tcp_listen_wlock(void)
331{
332 write_lock(&tcp_lhash_lock);
333
334 if (atomic_read(&tcp_lhash_users)) {
335 DEFINE_WAIT(wait);
336
337 for (;;) {
338 prepare_to_wait_exclusive(&tcp_lhash_wait,
339 &wait, TASK_UNINTERRUPTIBLE);
340 if (!atomic_read(&tcp_lhash_users))
341 break;
342 write_unlock_bh(&tcp_lhash_lock);
343 schedule();
344 write_lock_bh(&tcp_lhash_lock);
345 }
346
347 finish_wait(&tcp_lhash_wait, &wait);
348 }
349}
350
351static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
352{
353 struct hlist_head *list;
354 rwlock_t *lock;
355
356 BUG_TRAP(sk_unhashed(sk));
357 if (listen_possible && sk->sk_state == TCP_LISTEN) {
358 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
359 lock = &tcp_lhash_lock;
360 tcp_listen_wlock();
361 } else {
362 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
363 lock = &tcp_ehash[sk->sk_hashent].lock;
364 write_lock(lock);
365 }
366 __sk_add_node(sk, list);
367 sock_prot_inc_use(sk->sk_prot);
368 write_unlock(lock);
369 if (listen_possible && sk->sk_state == TCP_LISTEN)
370 wake_up(&tcp_lhash_wait);
371}
372
373static void tcp_v4_hash(struct sock *sk)
374{
375 if (sk->sk_state != TCP_CLOSE) {
376 local_bh_disable();
377 __tcp_v4_hash(sk, 1);
378 local_bh_enable();
379 }
380}
381
382void tcp_unhash(struct sock *sk)
383{
384 rwlock_t *lock;
385
386 if (sk_unhashed(sk))
387 goto ende;
388
389 if (sk->sk_state == TCP_LISTEN) {
390 local_bh_disable();
391 tcp_listen_wlock();
392 lock = &tcp_lhash_lock;
393 } else {
394 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
395 lock = &head->lock;
396 write_lock_bh(&head->lock);
397 }
398
399 if (__sk_del_node_init(sk))
400 sock_prot_dec_use(sk->sk_prot);
401 write_unlock_bh(lock);
402
403 ende:
404 if (sk->sk_state == TCP_LISTEN)
405 wake_up(&tcp_lhash_wait);
406}
407
408/* Don't inline this cruft. There are some nice properties to
409 * exploit here. The BSD API does not allow a listening TCP
410 * to specify the remote port nor the remote address for the
411 * connection. So always assume those are both wildcarded
412 * during the search since they can never be otherwise.
413 */
414static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
415 unsigned short hnum, int dif)
416{
417 struct sock *result = NULL, *sk;
418 struct hlist_node *node;
419 int score, hiscore;
420
421 hiscore=-1;
422 sk_for_each(sk, node, head) {
423 struct inet_sock *inet = inet_sk(sk);
424
425 if (inet->num == hnum && !ipv6_only_sock(sk)) {
426 __u32 rcv_saddr = inet->rcv_saddr;
427
428 score = (sk->sk_family == PF_INET ? 1 : 0);
429 if (rcv_saddr) {
430 if (rcv_saddr != daddr)
431 continue;
432 score+=2;
433 }
434 if (sk->sk_bound_dev_if) {
435 if (sk->sk_bound_dev_if != dif)
436 continue;
437 score+=2;
438 }
439 if (score == 5)
440 return sk;
441 if (score > hiscore) {
442 hiscore = score;
443 result = sk;
444 }
445 }
446 }
447 return result;
448}
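/* Scoring sketch: an AF_INET listener bound to the exact destination address
 * and the incoming interface scores 1 + 2 + 2 == 5 and is returned on the
 * spot; a wildcard listener scores only 1 and wins merely when nothing more
 * specific sits on the chain.
 */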
449
450/* Optimize the common listener case. */
451static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
452 unsigned short hnum, int dif)
453{
454 struct sock *sk = NULL;
455 struct hlist_head *head;
456
457 read_lock(&tcp_lhash_lock);
458 head = &tcp_listening_hash[tcp_lhashfn(hnum)];
459 if (!hlist_empty(head)) {
460 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
461
462 if (inet->num == hnum && !sk->sk_node.next &&
463 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
464 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
465 !sk->sk_bound_dev_if)
466 goto sherry_cache;
467 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
468 }
469 if (sk) {
470sherry_cache:
471 sock_hold(sk);
472 }
473 read_unlock(&tcp_lhash_lock);
474 return sk;
475}
476
477/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
478 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
479 *
480 * Local BH must be disabled here.
481 */
482
483static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
484 u32 daddr, u16 hnum,
485 int dif)
486{
487 struct tcp_ehash_bucket *head;
488 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
489 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
490 struct sock *sk;
491 struct hlist_node *node;
492 /* Optimize here for direct hit, only listening connections can
493 * have wildcards anyways.
494 */
495 int hash = tcp_hashfn(daddr, hnum, saddr, sport);
496 head = &tcp_ehash[hash];
497 read_lock(&head->lock);
498 sk_for_each(sk, node, &head->chain) {
499 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
500 goto hit; /* You sunk my battleship! */
501 }
502
503 /* Must check for a TIME_WAIT'er before going to listener hash. */
504 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
505 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
506 goto hit;
507 }
508 sk = NULL;
509out:
510 read_unlock(&head->lock);
511 return sk;
512hit:
513 sock_hold(sk);
514 goto out;
515}
516
517static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
518 u32 daddr, u16 hnum, int dif)
519{
520 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
521 daddr, hnum, dif);
522
523 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
524}
525
526inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
527 u16 dport, int dif)
528{
529 struct sock *sk;
530
531 local_bh_disable();
532 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
533 local_bh_enable();
534
535 return sk;
536}
537
538EXPORT_SYMBOL_GPL(tcp_v4_lookup);
539
540static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
541{
542 return secure_tcp_sequence_number(skb->nh.iph->daddr,
543 skb->nh.iph->saddr,
544 skb->h.th->dest,
545 skb->h.th->source);
546}
547
548/* called with local bh disabled */
549static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
550 struct tcp_tw_bucket **twp)
551{
552 struct inet_sock *inet = inet_sk(sk);
553 u32 daddr = inet->rcv_saddr;
554 u32 saddr = inet->daddr;
555 int dif = sk->sk_bound_dev_if;
556 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
557 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
558 int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
559 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
560 struct sock *sk2;
561 struct hlist_node *node;
562 struct tcp_tw_bucket *tw;
563
564 write_lock(&head->lock);
565
566 /* Check TIME-WAIT sockets first. */
567 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
568 tw = (struct tcp_tw_bucket *)sk2;
569
570 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
571 struct tcp_sock *tp = tcp_sk(sk);
572
573 /* With PAWS, it is safe from the viewpoint
574 of data integrity. Even without PAWS it
575 is safe provided sequence spaces do not
576 overlap i.e. at data rates <= 80Mbit/sec.
577
578 Actually, the idea is close to VJ's one,
579 only timestamp cache is held not per host,
580 but per port pair and TW bucket is used
581 as state holder.
582
583 If TW bucket has been already destroyed we
584 fall back to VJ's scheme and use initial
585 timestamp retrieved from peer table.
586 */
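			/* Illustration: the new initial send sequence below is pushed
			 * past anything the old incarnation could still have in flight,
			 * its final snd_nxt plus a maximal 65535-byte window plus 2, so
			 * old duplicates cannot be mistaken for new data.
			 */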
587 if (tw->tw_ts_recent_stamp &&
588 (!twp || (sysctl_tcp_tw_reuse &&
589 xtime.tv_sec -
590 tw->tw_ts_recent_stamp > 1))) {
591 if ((tp->write_seq =
592 tw->tw_snd_nxt + 65535 + 2) == 0)
593 tp->write_seq = 1;
594 tp->rx_opt.ts_recent = tw->tw_ts_recent;
595 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
596 sock_hold(sk2);
597 goto unique;
598 } else
599 goto not_unique;
600 }
601 }
602 tw = NULL;
603
604 /* And established part... */
605 sk_for_each(sk2, node, &head->chain) {
606 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
607 goto not_unique;
608 }
609
610unique:
611	/* Must record num and sport now. Otherwise we will see
612	 * a socket with a funny identity in the hash table. */
613 inet->num = lport;
614 inet->sport = htons(lport);
615 sk->sk_hashent = hash;
616 BUG_TRAP(sk_unhashed(sk));
617 __sk_add_node(sk, &head->chain);
618 sock_prot_inc_use(sk->sk_prot);
619 write_unlock(&head->lock);
620
621 if (twp) {
622 *twp = tw;
623 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
624 } else if (tw) {
625 /* Silly. Should hash-dance instead... */
626 tcp_tw_deschedule(tw);
627 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
628
629 tcp_tw_put(tw);
630 }
631
632 return 0;
633
634not_unique:
635 write_unlock(&head->lock);
636 return -EADDRNOTAVAIL;
637}
638
639static inline u32 connect_port_offset(const struct sock *sk)
640{
641 const struct inet_sock *inet = inet_sk(sk);
642
643 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
644 inet->dport);
645}
646
647/*
648 * Bind a port for a connect operation and hash it.
649 */
650static inline int tcp_v4_hash_connect(struct sock *sk)
651{
652 unsigned short snum = inet_sk(sk)->num;
653 struct tcp_bind_hashbucket *head;
654 struct tcp_bind_bucket *tb;
655 int ret;
656
657 if (!snum) {
658 int low = sysctl_local_port_range[0];
659 int high = sysctl_local_port_range[1];
660 int range = high - low;
661 int i;
662 int port;
663 static u32 hint;
664 u32 offset = hint + connect_port_offset(sk);
665 struct hlist_node *node;
666 struct tcp_tw_bucket *tw = NULL;
667
668 local_bh_disable();
669 for (i = 1; i <= range; i++) {
670 port = low + (i + offset) % range;
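			/* The per-destination offset from connect_port_offset()
			 * spreads different peers across the local range; e.g.
			 * (assumed values) low == 1024, range == 3975 and
			 * offset == 37041 make the first candidate
			 * 1024 + 37042 % 3975 == 2291, and i then walks the
			 * remaining candidates from there.
			 */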
671 head = &tcp_bhash[tcp_bhashfn(port)];
672 spin_lock(&head->lock);
673
674 /* Does not bother with rcv_saddr checks,
675 * because the established check is already
676 * unique enough.
677 */
678 tb_for_each(tb, node, &head->chain) {
679 if (tb->port == port) {
680 BUG_TRAP(!hlist_empty(&tb->owners));
681 if (tb->fastreuse >= 0)
682 goto next_port;
683 if (!__tcp_v4_check_established(sk,
684 port,
685 &tw))
686 goto ok;
687 goto next_port;
688 }
689 }
690
691 tb = tcp_bucket_create(head, port);
692 if (!tb) {
693 spin_unlock(&head->lock);
694 break;
695 }
696 tb->fastreuse = -1;
697 goto ok;
698
699 next_port:
700 spin_unlock(&head->lock);
701 }
702 local_bh_enable();
703
704 return -EADDRNOTAVAIL;
705
706ok:
707 hint += i;
708
709 /* Head lock still held and bh's disabled */
710 tcp_bind_hash(sk, tb, port);
711 if (sk_unhashed(sk)) {
712 inet_sk(sk)->sport = htons(port);
713 __tcp_v4_hash(sk, 0);
714 }
715 spin_unlock(&head->lock);
716
717 if (tw) {
718 tcp_tw_deschedule(tw);
719 tcp_tw_put(tw);
720 }
721
722 ret = 0;
723 goto out;
724 }
725
726 head = &tcp_bhash[tcp_bhashfn(snum)];
727 tb = tcp_sk(sk)->bind_hash;
728 spin_lock_bh(&head->lock);
729 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
730 __tcp_v4_hash(sk, 0);
731 spin_unlock_bh(&head->lock);
732 return 0;
733 } else {
734 spin_unlock(&head->lock);
735 /* No definite answer... Walk to established hash table */
736 ret = __tcp_v4_check_established(sk, snum, NULL);
737out:
738 local_bh_enable();
739 return ret;
740 }
741}
742
743/* This will initiate an outgoing connection. */
744int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
745{
746 struct inet_sock *inet = inet_sk(sk);
747 struct tcp_sock *tp = tcp_sk(sk);
748 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
749 struct rtable *rt;
750 u32 daddr, nexthop;
751 int tmp;
752 int err;
753
754 if (addr_len < sizeof(struct sockaddr_in))
755 return -EINVAL;
756
757 if (usin->sin_family != AF_INET)
758 return -EAFNOSUPPORT;
759
760 nexthop = daddr = usin->sin_addr.s_addr;
761 if (inet->opt && inet->opt->srr) {
762 if (!daddr)
763 return -EINVAL;
764 nexthop = inet->opt->faddr;
765 }
766
767 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
768 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
769 IPPROTO_TCP,
770 inet->sport, usin->sin_port, sk);
771 if (tmp < 0)
772 return tmp;
773
774 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
775 ip_rt_put(rt);
776 return -ENETUNREACH;
777 }
778
779 if (!inet->opt || !inet->opt->srr)
780 daddr = rt->rt_dst;
781
782 if (!inet->saddr)
783 inet->saddr = rt->rt_src;
784 inet->rcv_saddr = inet->saddr;
785
786 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
787 /* Reset inherited state */
788 tp->rx_opt.ts_recent = 0;
789 tp->rx_opt.ts_recent_stamp = 0;
790 tp->write_seq = 0;
791 }
792
793 if (sysctl_tcp_tw_recycle &&
794 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
795 struct inet_peer *peer = rt_get_peer(rt);
796
797		/* VJ's idea. We save the last timestamp seen from
798		 * the destination in the peer table when entering TIME-WAIT
799		 * and initialize rx_opt.ts_recent from it when trying a new connection.
800 */
801
802 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
803 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
804 tp->rx_opt.ts_recent = peer->tcp_ts;
805 }
806 }
807
808 inet->dport = usin->sin_port;
809 inet->daddr = daddr;
810
811 tp->ext_header_len = 0;
812 if (inet->opt)
813 tp->ext_header_len = inet->opt->optlen;
814
815 tp->rx_opt.mss_clamp = 536;
816
817	/* Socket identity is still unknown (sport may be zero).
818	 * However we set the state to SYN-SENT and, without releasing the
819	 * socket lock, select a source port, enter ourselves into the hash
820	 * tables and complete initialization after this.
821 */
822 tcp_set_state(sk, TCP_SYN_SENT);
823 err = tcp_v4_hash_connect(sk);
824 if (err)
825 goto failure;
826
827 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
828 if (err)
829 goto failure;
830
831 /* OK, now commit destination to socket. */
832 __sk_dst_set(sk, &rt->u.dst);
833 tcp_v4_setup_caps(sk, &rt->u.dst);
834
835 if (!tp->write_seq)
836 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
837 inet->daddr,
838 inet->sport,
839 usin->sin_port);
840
841 inet->id = tp->write_seq ^ jiffies;
842
843 err = tcp_connect(sk);
844 rt = NULL;
845 if (err)
846 goto failure;
847
848 return 0;
849
850failure:
851 /* This unhashes the socket and releases the local port, if necessary. */
852 tcp_set_state(sk, TCP_CLOSE);
853 ip_rt_put(rt);
854 sk->sk_route_caps = 0;
855 inet->dport = 0;
856 return err;
857}
858
859static __inline__ int tcp_v4_iif(struct sk_buff *skb)
860{
861 return ((struct rtable *)skb->dst)->rt_iif;
862}
863
864static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
865{
866 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
867}
868
869static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
870 struct open_request ***prevp,
871 __u16 rport,
872 __u32 raddr, __u32 laddr)
873{
874 struct tcp_listen_opt *lopt = tp->listen_opt;
875 struct open_request *req, **prev;
876
877 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
878 (req = *prev) != NULL;
879 prev = &req->dl_next) {
880 if (req->rmt_port == rport &&
881 req->af.v4_req.rmt_addr == raddr &&
882 req->af.v4_req.loc_addr == laddr &&
883 TCP_INET_FAMILY(req->class->family)) {
884 BUG_TRAP(!req->sk);
885 *prevp = prev;
886 break;
887 }
888 }
889
890 return req;
891}
892
893static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
894{
895 struct tcp_sock *tp = tcp_sk(sk);
896 struct tcp_listen_opt *lopt = tp->listen_opt;
897 u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
898
899 req->expires = jiffies + TCP_TIMEOUT_INIT;
900 req->retrans = 0;
901 req->sk = NULL;
902 req->dl_next = lopt->syn_table[h];
903
904 write_lock(&tp->syn_wait_lock);
905 lopt->syn_table[h] = req;
906 write_unlock(&tp->syn_wait_lock);
907
908 tcp_synq_added(sk);
909}
910
911
912/*
913 * This routine does path mtu discovery as defined in RFC1191.
914 */
915static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
916 u32 mtu)
917{
918 struct dst_entry *dst;
919 struct inet_sock *inet = inet_sk(sk);
920 struct tcp_sock *tp = tcp_sk(sk);
921
922 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
923	 * sent out by Linux are always < 576 bytes, so they should go through
924 * unfragmented).
925 */
926 if (sk->sk_state == TCP_LISTEN)
927 return;
928
929	/* We don't check in the dst entry whether pmtu discovery is forbidden
930	 * on this route. We just assume that no packet-too-big packets
931	 * are sent back when pmtu discovery is not active.
932 * There is a small race when the user changes this flag in the
933 * route, but I think that's acceptable.
934 */
935 if ((dst = __sk_dst_check(sk, 0)) == NULL)
936 return;
937
938 dst->ops->update_pmtu(dst, mtu);
939
940	/* Something is about to go wrong... Remember the soft error
941	 * for the case that this connection is not able to recover.
942 */
943 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
944 sk->sk_err_soft = EMSGSIZE;
945
946 mtu = dst_mtu(dst);
947
948 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
949 tp->pmtu_cookie > mtu) {
950 tcp_sync_mss(sk, mtu);
951
952 /* Resend the TCP packet because it's
953 * clear that the old packet has been
954 * dropped. This is the new "fast" path mtu
955 * discovery.
956 */
957 tcp_simple_retransmit(sk);
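		/* Rough example (assumed numbers): a cached pmtu_cookie of 1500
		 * and an ICMP_FRAG_NEEDED carrying mtu == 1400 re-syncs the MSS
		 * to roughly 1360 for an option-less connection and retransmits
		 * the too-big segment straight away instead of waiting for the
		 * retransmit timer.
		 */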
958 } /* else let the usual retransmit timer handle it */
959}
960
961/*
962 * This routine is called by the ICMP module when it gets some
963 * sort of error condition. If err < 0 then the socket should
964 * be closed and the error returned to the user. If err > 0
965 * it's just the icmp type << 8 | icmp code. After adjustment
966 * header points to the first 8 bytes of the tcp header. We need
967 * to find the appropriate port.
968 *
969 * The locking strategy used here is very "optimistic". When
970 * someone else accesses the socket the ICMP is just dropped
971 * and for some paths there is no check at all.
972 * A more general error queue to queue errors for later handling
973 * is probably better.
974 *
975 */
976
977void tcp_v4_err(struct sk_buff *skb, u32 info)
978{
979 struct iphdr *iph = (struct iphdr *)skb->data;
980 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
981 struct tcp_sock *tp;
982 struct inet_sock *inet;
983 int type = skb->h.icmph->type;
984 int code = skb->h.icmph->code;
985 struct sock *sk;
986 __u32 seq;
987 int err;
988
989 if (skb->len < (iph->ihl << 2) + 8) {
990 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
991 return;
992 }
993
994 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
995 th->source, tcp_v4_iif(skb));
996 if (!sk) {
997 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
998 return;
999 }
1000 if (sk->sk_state == TCP_TIME_WAIT) {
1001 tcp_tw_put((struct tcp_tw_bucket *)sk);
1002 return;
1003 }
1004
1005 bh_lock_sock(sk);
1006 /* If too many ICMPs get dropped on busy
1007 * servers this needs to be solved differently.
1008 */
1009 if (sock_owned_by_user(sk))
1010 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1011
1012 if (sk->sk_state == TCP_CLOSE)
1013 goto out;
1014
1015 tp = tcp_sk(sk);
1016 seq = ntohl(th->seq);
1017 if (sk->sk_state != TCP_LISTEN &&
1018 !between(seq, tp->snd_una, tp->snd_nxt)) {
1019 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1020 goto out;
1021 }
1022
1023 switch (type) {
1024 case ICMP_SOURCE_QUENCH:
1025 /* Just silently ignore these. */
1026 goto out;
1027 case ICMP_PARAMETERPROB:
1028 err = EPROTO;
1029 break;
1030 case ICMP_DEST_UNREACH:
1031 if (code > NR_ICMP_UNREACH)
1032 goto out;
1033
1034 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1035 if (!sock_owned_by_user(sk))
1036 do_pmtu_discovery(sk, iph, info);
1037 goto out;
1038 }
1039
1040 err = icmp_err_convert[code].errno;
1041 break;
1042 case ICMP_TIME_EXCEEDED:
1043 err = EHOSTUNREACH;
1044 break;
1045 default:
1046 goto out;
1047 }
1048
1049 switch (sk->sk_state) {
1050 struct open_request *req, **prev;
1051 case TCP_LISTEN:
1052 if (sock_owned_by_user(sk))
1053 goto out;
1054
1055 req = tcp_v4_search_req(tp, &prev, th->dest,
1056 iph->daddr, iph->saddr);
1057 if (!req)
1058 goto out;
1059
1060 /* ICMPs are not backlogged, hence we cannot get
1061 an established socket here.
1062 */
1063 BUG_TRAP(!req->sk);
1064
1065 if (seq != req->snt_isn) {
1066 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1067 goto out;
1068 }
1069
1070 /*
1071 * Still in SYN_RECV, just remove it silently.
1072 * There is no good way to pass the error to the newly
1073 * created socket, and POSIX does not want network
1074 * errors returned from accept().
1075 */
1076 tcp_synq_drop(sk, req, prev);
1077 goto out;
1078
1079 case TCP_SYN_SENT:
1080	case TCP_SYN_RECV:  /* Cannot happen normally.
1081			       It can, e.g., if SYNs crossed.
1082 */
1083 if (!sock_owned_by_user(sk)) {
1084 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1085 sk->sk_err = err;
1086
1087 sk->sk_error_report(sk);
1088
1089 tcp_done(sk);
1090 } else {
1091 sk->sk_err_soft = err;
1092 }
1093 goto out;
1094 }
1095
1096 /* If we've already connected we will keep trying
1097 * until we time out, or the user gives up.
1098 *
1099 * rfc1122 4.2.3.9 allows to consider as hard errors
1100 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1101 * but it is obsoleted by pmtu discovery).
1102 *
1103	 * Note that in the modern internet, where routing is unreliable
1104	 * and broken firewalls sit in every dark corner sending random
1105	 * errors ordered by their masters, even these two messages finally lose
1106	 * their original sense (even Linux sends invalid PORT_UNREACHs).
1107 *
1108 * Now we are in compliance with RFCs.
1109 * --ANK (980905)
1110 */
1111
1112 inet = inet_sk(sk);
1113 if (!sock_owned_by_user(sk) && inet->recverr) {
1114 sk->sk_err = err;
1115 sk->sk_error_report(sk);
1116 } else { /* Only an error on timeout */
1117 sk->sk_err_soft = err;
1118 }
1119
1120out:
1121 bh_unlock_sock(sk);
1122 sock_put(sk);
1123}
1124
1125/* This routine computes an IPv4 TCP checksum. */
1126void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1127 struct sk_buff *skb)
1128{
1129 struct inet_sock *inet = inet_sk(sk);
1130
1131 if (skb->ip_summed == CHECKSUM_HW) {
1132 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1133 skb->csum = offsetof(struct tcphdr, check);
1134 } else {
1135 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1136 csum_partial((char *)th,
1137 th->doff << 2,
1138 skb->csum));
1139 }
1140}
1141
1142/*
1143 * This routine will send an RST to the other tcp.
1144 *
1145 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1146 * for the reset.
1147 * Answer: if a packet caused the RST, it is not for a socket
1148 * existing in our system; if it is matched to a socket,
1149 * it is just a duplicate segment or a bug in the other side's TCP.
1150 * So we build the reply based only on the parameters that
1151 * arrived with the segment.
1152 * Exception: precedence violation. We do not implement it in any case.
1153 */
1154
1155static void tcp_v4_send_reset(struct sk_buff *skb)
1156{
1157 struct tcphdr *th = skb->h.th;
1158 struct tcphdr rth;
1159 struct ip_reply_arg arg;
1160
1161 /* Never send a reset in response to a reset. */
1162 if (th->rst)
1163 return;
1164
1165 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1166 return;
1167
1168 /* Swap the send and the receive. */
1169 memset(&rth, 0, sizeof(struct tcphdr));
1170 rth.dest = th->source;
1171 rth.source = th->dest;
1172 rth.doff = sizeof(struct tcphdr) / 4;
1173 rth.rst = 1;
1174
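	/* If the offending segment carried an ACK, use its ack_seq as our
	 * sequence number; otherwise ACK everything the segment consumed
	 * (its payload plus one each for SYN and FIN).
	 */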
1175 if (th->ack) {
1176 rth.seq = th->ack_seq;
1177 } else {
1178 rth.ack = 1;
1179 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1180 skb->len - (th->doff << 2));
1181 }
1182
1183 memset(&arg, 0, sizeof arg);
1184 arg.iov[0].iov_base = (unsigned char *)&rth;
1185 arg.iov[0].iov_len = sizeof rth;
1186 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1187 skb->nh.iph->saddr, /*XXX*/
1188 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1189 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1190
1191 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1192
1193 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1194 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1195}
1196
1197 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1198 outside socket context, is certainly ugly. What can I do?
1199 */
1200
1201static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1202 u32 win, u32 ts)
1203{
1204 struct tcphdr *th = skb->h.th;
1205 struct {
1206 struct tcphdr th;
1207 u32 tsopt[3];
1208 } rep;
1209 struct ip_reply_arg arg;
1210
1211 memset(&rep.th, 0, sizeof(struct tcphdr));
1212 memset(&arg, 0, sizeof arg);
1213
1214 arg.iov[0].iov_base = (unsigned char *)&rep;
1215 arg.iov[0].iov_len = sizeof(rep.th);
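	/* If the peer used timestamps, echo them back: two NOPs pad the
	 * option to a 32-bit boundary, then kind/length, our current
	 * timestamp, and the peer's most recent value.
	 */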
1216 if (ts) {
1217 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1218 (TCPOPT_TIMESTAMP << 8) |
1219 TCPOLEN_TIMESTAMP);
1220 rep.tsopt[1] = htonl(tcp_time_stamp);
1221 rep.tsopt[2] = htonl(ts);
1222 arg.iov[0].iov_len = sizeof(rep);
1223 }
1224
1225 /* Swap the send and the receive. */
1226 rep.th.dest = th->source;
1227 rep.th.source = th->dest;
1228 rep.th.doff = arg.iov[0].iov_len / 4;
1229 rep.th.seq = htonl(seq);
1230 rep.th.ack_seq = htonl(ack);
1231 rep.th.ack = 1;
1232 rep.th.window = htons(win);
1233
1234 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1235 skb->nh.iph->saddr, /*XXX*/
1236 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1237 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1238
1239 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1240
1241 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1242}
1243
1244static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1245{
1246 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1247
1248 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1249 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1250
1251 tcp_tw_put(tw);
1252}
1253
1254static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1255{
1256 tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1257 req->ts_recent);
1258}
1259
1260static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1261 struct open_request *req)
1262{
1263 struct rtable *rt;
1264 struct ip_options *opt = req->af.v4_req.opt;
1265 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1266 .nl_u = { .ip4_u =
1267 { .daddr = ((opt && opt->srr) ?
1268 opt->faddr :
1269 req->af.v4_req.rmt_addr),
1270 .saddr = req->af.v4_req.loc_addr,
1271 .tos = RT_CONN_FLAGS(sk) } },
1272 .proto = IPPROTO_TCP,
1273 .uli_u = { .ports =
1274 { .sport = inet_sk(sk)->sport,
1275 .dport = req->rmt_port } } };
1276
1277 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1278 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1279 return NULL;
1280 }
1281 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1282 ip_rt_put(rt);
1283 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1284 return NULL;
1285 }
1286 return &rt->u.dst;
1287}
1288
1289/*
1290 * Send a SYN-ACK in response to a connection request.
1291 * This still operates on an open_request only, not on a big
1292 * socket.
1293 */
1294static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1295 struct dst_entry *dst)
1296{
1297 int err = -1;
1298 struct sk_buff * skb;
1299
1300 /* First, grab a route. */
1301 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1302 goto out;
1303
1304 skb = tcp_make_synack(sk, dst, req);
1305
1306 if (skb) {
1307 struct tcphdr *th = skb->h.th;
1308
1309 th->check = tcp_v4_check(th, skb->len,
1310 req->af.v4_req.loc_addr,
1311 req->af.v4_req.rmt_addr,
1312 csum_partial((char *)th, skb->len,
1313 skb->csum));
1314
1315 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1316 req->af.v4_req.rmt_addr,
1317 req->af.v4_req.opt);
1318 if (err == NET_XMIT_CN)
1319 err = 0;
1320 }
1321
1322out:
1323 dst_release(dst);
1324 return err;
1325}
1326
1327/*
1328 * IPv4 open_request destructor.
1329 */
1330static void tcp_v4_or_free(struct open_request *req)
1331{
1332 if (req->af.v4_req.opt)
1333 kfree(req->af.v4_req.opt);
1334}
1335
1336static inline void syn_flood_warning(struct sk_buff *skb)
1337{
1338 static unsigned long warntime;
1339
1340 if (time_after(jiffies, (warntime + HZ * 60))) {
1341 warntime = jiffies;
1342 printk(KERN_INFO
1343 "possible SYN flooding on port %d. Sending cookies.\n",
1344 ntohs(skb->h.th->dest));
1345 }
1346}
1347
1348/*
1349 * Save and compile IPv4 options into the open_request if needed.
1350 */
1351static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1352 struct sk_buff *skb)
1353{
1354 struct ip_options *opt = &(IPCB(skb)->opt);
1355 struct ip_options *dopt = NULL;
1356
1357 if (opt && opt->optlen) {
1358 int opt_size = optlength(opt);
1359 dopt = kmalloc(opt_size, GFP_ATOMIC);
1360 if (dopt) {
1361 if (ip_options_echo(dopt, skb)) {
1362 kfree(dopt);
1363 dopt = NULL;
1364 }
1365 }
1366 }
1367 return dopt;
1368}
1369
1370/*
1371 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1372 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1373 * It would be better to replace it with a global counter for all sockets,
1374 * but then some measure against one socket starving all other sockets
1375 * would be needed.
1376 *
1377 * It was 128 by default. Experiments with real servers show that
1378 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1379 * of the problems. This value is adjusted to 128 for very small machines
1380 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1381 * Further increases require changing the hash table size.
1382 */
1383int sysctl_max_syn_backlog = 256;
1384
1385struct or_calltable or_ipv4 = {
1386 .family = PF_INET,
1387 .rtx_syn_ack = tcp_v4_send_synack,
1388 .send_ack = tcp_v4_or_send_ack,
1389 .destructor = tcp_v4_or_free,
1390 .send_reset = tcp_v4_send_reset,
1391};
1392
1393int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1394{
1395 struct tcp_options_received tmp_opt;
1396 struct open_request *req;
1397 __u32 saddr = skb->nh.iph->saddr;
1398 __u32 daddr = skb->nh.iph->daddr;
1399 __u32 isn = TCP_SKB_CB(skb)->when;
1400 struct dst_entry *dst = NULL;
1401#ifdef CONFIG_SYN_COOKIES
1402 int want_cookie = 0;
1403#else
1404#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1405#endif
1406
1407 /* Never answer SYNs sent to broadcast or multicast addresses */
1408 if (((struct rtable *)skb->dst)->rt_flags &
1409 (RTCF_BROADCAST | RTCF_MULTICAST))
1410 goto drop;
1411
1412 /* TW buckets are converted to open requests without
1413 * limitation; they conserve resources and the peer is
1414 * evidently a real one.
1415 */
1416 if (tcp_synq_is_full(sk) && !isn) {
1417#ifdef CONFIG_SYN_COOKIES
1418 if (sysctl_tcp_syncookies) {
1419 want_cookie = 1;
1420 } else
1421#endif
1422 goto drop;
1423 }
1424
1425 /* The accept backlog is full. If we have already queued enough
1426 * warm entries in the syn queue, drop the request. It is better than
1427 * clogging the syn queue with openreqs with exponentially increasing
1428 * timeouts.
1429 */
1430 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1431 goto drop;
1432
1433 req = tcp_openreq_alloc();
1434 if (!req)
1435 goto drop;
1436
1437 tcp_clear_options(&tmp_opt);
1438 tmp_opt.mss_clamp = 536;
1439 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1440
1441 tcp_parse_options(skb, &tmp_opt, 0);
1442
1443 if (want_cookie) {
1444 tcp_clear_options(&tmp_opt);
1445 tmp_opt.saw_tstamp = 0;
1446 }
1447
1448 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1449 /* Some OSes (unknown ones, but I see them on a web server which
1450 * contains information interesting only to Windows
1451 * users) do not send their timestamp in the SYN. It is an easy case:
1452 * we simply do not advertise TS support.
1453 */
1454 tmp_opt.saw_tstamp = 0;
1455 tmp_opt.tstamp_ok = 0;
1456 }
1457 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1458
1459 tcp_openreq_init(req, &tmp_opt, skb);
1460
1461 req->af.v4_req.loc_addr = daddr;
1462 req->af.v4_req.rmt_addr = saddr;
1463 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1464 req->class = &or_ipv4;
1465 if (!want_cookie)
1466 TCP_ECN_create_request(req, skb->h.th);
1467
1468 if (want_cookie) {
1469#ifdef CONFIG_SYN_COOKIES
1470 syn_flood_warning(skb);
1471#endif
1472 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1473 } else if (!isn) {
1474 struct inet_peer *peer = NULL;
1475
1476 /* VJ's idea. We save the last timestamp seen
1477 * from the destination in the peer table when entering
1478 * TIME-WAIT state, and check against it before
1479 * accepting a new connection request.
1480 *
1481 * If "isn" is not zero, this request hit an alive
1482 * timewait bucket, so all the necessary checks
1483 * are made in the function processing the timewait state.
1484 */
1485 if (tmp_opt.saw_tstamp &&
1486 sysctl_tcp_tw_recycle &&
1487 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1488 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1489 peer->v4daddr == saddr) {
1490 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1491 (s32)(peer->tcp_ts - req->ts_recent) >
1492 TCP_PAWS_WINDOW) {
1493 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1494 dst_release(dst);
1495 goto drop_and_free;
1496 }
1497 }
1498 /* Kill the following clause if you dislike this approach. */
1499 else if (!sysctl_tcp_syncookies &&
1500 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1501 (sysctl_max_syn_backlog >> 2)) &&
1502 (!peer || !peer->tcp_ts_stamp) &&
1503 (!dst || !dst_metric(dst, RTAX_RTT))) {
1504 /* Without syncookies the last quarter of the
1505 * backlog is filled with destinations
1506 * proven to be alive.
1507 * It means that we continue to communicate
1508 * with destinations already remembered
1509 * at the moment of the synflood.
1510 */
1511 NETDEBUG(if (net_ratelimit()) \
1512 printk(KERN_DEBUG "TCP: drop open "
1513 "request from %u.%u."
1514 "%u.%u/%u\n", \
1515 NIPQUAD(saddr),
1516 ntohs(skb->h.th->source)));
1517 dst_release(dst);
1518 goto drop_and_free;
1519 }
1520
1521 isn = tcp_v4_init_sequence(sk, skb);
1522 }
1523 req->snt_isn = isn;
1524
1525 if (tcp_v4_send_synack(sk, req, dst))
1526 goto drop_and_free;
1527
1528 if (want_cookie) {
1529 tcp_openreq_free(req);
1530 } else {
1531 tcp_v4_synq_add(sk, req);
1532 }
1533 return 0;
1534
1535drop_and_free:
1536 tcp_openreq_free(req);
1537drop:
1538 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1539 return 0;
1540}
1541
1542
1543/*
1544 * The three-way handshake has completed - we got a valid ACK -
1545 * now create the new socket.
1546 */
1547struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1548 struct open_request *req,
1549 struct dst_entry *dst)
1550{
1551 struct inet_sock *newinet;
1552 struct tcp_sock *newtp;
1553 struct sock *newsk;
1554
1555 if (sk_acceptq_is_full(sk))
1556 goto exit_overflow;
1557
1558 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1559 goto exit;
1560
1561 newsk = tcp_create_openreq_child(sk, req, skb);
1562 if (!newsk)
1563 goto exit;
1564
1565 newsk->sk_dst_cache = dst;
1566 tcp_v4_setup_caps(newsk, dst);
1567
1568 newtp = tcp_sk(newsk);
1569 newinet = inet_sk(newsk);
1570 newinet->daddr = req->af.v4_req.rmt_addr;
1571 newinet->rcv_saddr = req->af.v4_req.loc_addr;
1572 newinet->saddr = req->af.v4_req.loc_addr;
1573 newinet->opt = req->af.v4_req.opt;
1574 req->af.v4_req.opt = NULL;
1575 newinet->mc_index = tcp_v4_iif(skb);
1576 newinet->mc_ttl = skb->nh.iph->ttl;
1577 newtp->ext_header_len = 0;
1578 if (newinet->opt)
1579 newtp->ext_header_len = newinet->opt->optlen;
1580 newinet->id = newtp->write_seq ^ jiffies;
1581
1582 tcp_sync_mss(newsk, dst_mtu(dst));
1583 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1584 tcp_initialize_rcv_mss(newsk);
1585
1586 __tcp_v4_hash(newsk, 0);
1587 __tcp_inherit_port(sk, newsk);
1588
1589 return newsk;
1590
1591exit_overflow:
1592 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1593exit:
1594 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1595 dst_release(dst);
1596 return NULL;
1597}
1598
1599static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1600{
1601 struct tcphdr *th = skb->h.th;
1602 struct iphdr *iph = skb->nh.iph;
1603 struct tcp_sock *tp = tcp_sk(sk);
1604 struct sock *nsk;
1605 struct open_request **prev;
1606 /* Find possible connection requests. */
1607 struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1608 iph->saddr, iph->daddr);
1609 if (req)
1610 return tcp_check_req(sk, skb, req, prev);
1611
1612 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1613 th->source,
1614 skb->nh.iph->daddr,
1615 ntohs(th->dest),
1616 tcp_v4_iif(skb));
1617
1618 if (nsk) {
1619 if (nsk->sk_state != TCP_TIME_WAIT) {
1620 bh_lock_sock(nsk);
1621 return nsk;
1622 }
1623 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1624 return NULL;
1625 }
1626
1627#ifdef CONFIG_SYN_COOKIES
1628 if (!th->rst && !th->syn && th->ack)
1629 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1630#endif
1631 return sk;
1632}
1633
1634static int tcp_v4_checksum_init(struct sk_buff *skb)
1635{
1636 if (skb->ip_summed == CHECKSUM_HW) {
1637 skb->ip_summed = CHECKSUM_UNNECESSARY;
1638 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1639 skb->nh.iph->daddr, skb->csum))
1640 return 0;
1641
1642 NETDEBUG(if (net_ratelimit())
1643 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1644 skb->ip_summed = CHECKSUM_NONE;
1645 }
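	/* Short packets are cheap to verify immediately; for longer ones we
	 * only seed skb->csum with the pseudo-header sum and let the full
	 * checksum be completed later (e.g. during the copy to user space).
	 */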
1646 if (skb->len <= 76) {
1647 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1648 skb->nh.iph->daddr,
1649 skb_checksum(skb, 0, skb->len, 0)))
1650 return -1;
1651 skb->ip_summed = CHECKSUM_UNNECESSARY;
1652 } else {
1653 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1654 skb->nh.iph->saddr,
1655 skb->nh.iph->daddr, 0);
1656 }
1657 return 0;
1658}
1659
1660
1661 /* The socket must have its spinlock held when we get
1662 * here.
1663 *
1664 * We have a potential double-lock case here, so even when
1665 * doing backlog processing we use the BH locking scheme.
1666 * This is because we cannot sleep with the original spinlock
1667 * held.
1668 */
1669int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1670{
1671 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1672 TCP_CHECK_TIMER(sk);
1673 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1674 goto reset;
1675 TCP_CHECK_TIMER(sk);
1676 return 0;
1677 }
1678
1679 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1680 goto csum_err;
1681
1682 if (sk->sk_state == TCP_LISTEN) {
1683 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1684 if (!nsk)
1685 goto discard;
1686
1687 if (nsk != sk) {
1688 if (tcp_child_process(sk, nsk, skb))
1689 goto reset;
1690 return 0;
1691 }
1692 }
1693
1694 TCP_CHECK_TIMER(sk);
1695 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1696 goto reset;
1697 TCP_CHECK_TIMER(sk);
1698 return 0;
1699
1700reset:
1701 tcp_v4_send_reset(skb);
1702discard:
1703 kfree_skb(skb);
1704 /* Be careful here. If this function gets more complicated and
1705 * gcc suffers from register pressure on the x86, sk (in %ebx)
1706 * might be destroyed here. This current version compiles correctly,
1707 * but you have been warned.
1708 */
1709 return 0;
1710
1711csum_err:
1712 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1713 goto discard;
1714}
1715
1716/*
1717 * From tcp_input.c
1718 */
1719
1720int tcp_v4_rcv(struct sk_buff *skb)
1721{
1722 struct tcphdr *th;
1723 struct sock *sk;
1724 int ret;
1725
1726 if (skb->pkt_type != PACKET_HOST)
1727 goto discard_it;
1728
1729 /* Count it even if it's bad */
1730 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1731
1732 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1733 goto discard_it;
1734
1735 th = skb->h.th;
1736
1737 if (th->doff < sizeof(struct tcphdr) / 4)
1738 goto bad_packet;
1739 if (!pskb_may_pull(skb, th->doff * 4))
1740 goto discard_it;
1741
1742 /* An explanation is required here, I think.
1743 * Packet length and doff are validated by header prediction,
1744 * provided the case of th->doff == 0 is eliminated.
1745 * So, we defer the checks. */
1746 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1747 tcp_v4_checksum_init(skb) < 0))
1748 goto bad_packet;
1749
1750 th = skb->h.th;
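	/* end_seq covers the payload plus one unit each for SYN and FIN,
	 * since both flags consume sequence space.
	 */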
1751 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1752 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1753 skb->len - th->doff * 4);
1754 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1755 TCP_SKB_CB(skb)->when = 0;
1756 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1757 TCP_SKB_CB(skb)->sacked = 0;
1758
1759 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1760 skb->nh.iph->daddr, ntohs(th->dest),
1761 tcp_v4_iif(skb));
1762
1763 if (!sk)
1764 goto no_tcp_socket;
1765
1766process:
1767 if (sk->sk_state == TCP_TIME_WAIT)
1768 goto do_time_wait;
1769
1770 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1771 goto discard_and_relse;
1772
1773 if (sk_filter(sk, skb, 0))
1774 goto discard_and_relse;
1775
1776 skb->dev = NULL;
1777
1778 bh_lock_sock(sk);
1779 ret = 0;
1780 if (!sock_owned_by_user(sk)) {
1781 if (!tcp_prequeue(sk, skb))
1782 ret = tcp_v4_do_rcv(sk, skb);
1783 } else
1784 sk_add_backlog(sk, skb);
1785 bh_unlock_sock(sk);
1786
1787 sock_put(sk);
1788
1789 return ret;
1790
1791no_tcp_socket:
1792 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1793 goto discard_it;
1794
1795 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1796bad_packet:
1797 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1798 } else {
1799 tcp_v4_send_reset(skb);
1800 }
1801
1802discard_it:
1803 /* Discard frame. */
1804 kfree_skb(skb);
1805 return 0;
1806
1807discard_and_relse:
1808 sock_put(sk);
1809 goto discard_it;
1810
1811do_time_wait:
1812 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1813 tcp_tw_put((struct tcp_tw_bucket *) sk);
1814 goto discard_it;
1815 }
1816
1817 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1818 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1819 tcp_tw_put((struct tcp_tw_bucket *) sk);
1820 goto discard_it;
1821 }
1822 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1823 skb, th, skb->len)) {
1824 case TCP_TW_SYN: {
1825 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1826 ntohs(th->dest),
1827 tcp_v4_iif(skb));
1828 if (sk2) {
1829 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1830 tcp_tw_put((struct tcp_tw_bucket *)sk);
1831 sk = sk2;
1832 goto process;
1833 }
1834 /* Fall through to ACK */
1835 }
1836 case TCP_TW_ACK:
1837 tcp_v4_timewait_ack(sk, skb);
1838 break;
1839 case TCP_TW_RST:
1840 goto no_tcp_socket;
1841 case TCP_TW_SUCCESS:;
1842 }
1843 goto discard_it;
1844}
1845
1846 /* With per-bucket locks this operation is not atomic, so this
1847 * version is no worse.
1848 */
1849static void __tcp_v4_rehash(struct sock *sk)
1850{
1851 sk->sk_prot->unhash(sk);
1852 sk->sk_prot->hash(sk);
1853}
1854
1855static int tcp_v4_reselect_saddr(struct sock *sk)
1856{
1857 struct inet_sock *inet = inet_sk(sk);
1858 int err;
1859 struct rtable *rt;
1860 __u32 old_saddr = inet->saddr;
1861 __u32 new_saddr;
1862 __u32 daddr = inet->daddr;
1863
1864 if (inet->opt && inet->opt->srr)
1865 daddr = inet->opt->faddr;
1866
1867 /* Query new route. */
1868 err = ip_route_connect(&rt, daddr, 0,
1869 RT_CONN_FLAGS(sk),
1870 sk->sk_bound_dev_if,
1871 IPPROTO_TCP,
1872 inet->sport, inet->dport, sk);
1873 if (err)
1874 return err;
1875
1876 __sk_dst_set(sk, &rt->u.dst);
1877 tcp_v4_setup_caps(sk, &rt->u.dst);
1878
1879 new_saddr = rt->rt_src;
1880
1881 if (new_saddr == old_saddr)
1882 return 0;
1883
1884 if (sysctl_ip_dynaddr > 1) {
1885 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1886 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1887 NIPQUAD(old_saddr),
1888 NIPQUAD(new_saddr));
1889 }
1890
1891 inet->saddr = new_saddr;
1892 inet->rcv_saddr = new_saddr;
1893
1894 /* XXX The only ugly spot where we need to
1895 * XXX really change the socket's identity after
1896 * XXX it has entered the hashes. -DaveM
1897 *
1898 * Besides that, it does not check for connection
1899 * uniqueness. Expect trouble.
1900 */
1901 __tcp_v4_rehash(sk);
1902 return 0;
1903}
1904
1905int tcp_v4_rebuild_header(struct sock *sk)
1906{
1907 struct inet_sock *inet = inet_sk(sk);
1908 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1909 u32 daddr;
1910 int err;
1911
1912 /* Route is OK, nothing to do. */
1913 if (rt)
1914 return 0;
1915
1916 /* Reroute. */
1917 daddr = inet->daddr;
1918 if (inet->opt && inet->opt->srr)
1919 daddr = inet->opt->faddr;
1920
1921 {
1922 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1923 .nl_u = { .ip4_u =
1924 { .daddr = daddr,
1925 .saddr = inet->saddr,
1926 .tos = RT_CONN_FLAGS(sk) } },
1927 .proto = IPPROTO_TCP,
1928 .uli_u = { .ports =
1929 { .sport = inet->sport,
1930 .dport = inet->dport } } };
1931
1932 err = ip_route_output_flow(&rt, &fl, sk, 0);
1933 }
1934 if (!err) {
1935 __sk_dst_set(sk, &rt->u.dst);
1936 tcp_v4_setup_caps(sk, &rt->u.dst);
1937 return 0;
1938 }
1939
1940 /* Routing failed... */
1941 sk->sk_route_caps = 0;
1942
1943 if (!sysctl_ip_dynaddr ||
1944 sk->sk_state != TCP_SYN_SENT ||
1945 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1946 (err = tcp_v4_reselect_saddr(sk)) != 0)
1947 sk->sk_err_soft = -err;
1948
1949 return err;
1950}
1951
1952static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1953{
1954 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1955 struct inet_sock *inet = inet_sk(sk);
1956
1957 sin->sin_family = AF_INET;
1958 sin->sin_addr.s_addr = inet->daddr;
1959 sin->sin_port = inet->dport;
1960}
1961
1962 /* VJ's idea. Save the last timestamp seen from this destination
1963 * and hold it at least for the normal timewait interval, to use for
1964 * duplicate segment detection in subsequent connections before they
1965 * enter the synchronized state.
1966 */
1967
1968int tcp_v4_remember_stamp(struct sock *sk)
1969{
1970 struct inet_sock *inet = inet_sk(sk);
1971 struct tcp_sock *tp = tcp_sk(sk);
1972 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1973 struct inet_peer *peer = NULL;
1974 int release_it = 0;
1975
1976 if (!rt || rt->rt_dst != inet->daddr) {
1977 peer = inet_getpeer(inet->daddr, 1);
1978 release_it = 1;
1979 } else {
1980 if (!rt->peer)
1981 rt_bind_peer(rt, 1);
1982 peer = rt->peer;
1983 }
1984
1985 if (peer) {
1986 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1987 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1988 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1989 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1990 peer->tcp_ts = tp->rx_opt.ts_recent;
1991 }
1992 if (release_it)
1993 inet_putpeer(peer);
1994 return 1;
1995 }
1996
1997 return 0;
1998}
1999
2000int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2001{
2002 struct inet_peer *peer = NULL;
2003
2004 peer = inet_getpeer(tw->tw_daddr, 1);
2005
2006 if (peer) {
2007 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2008 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2009 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2010 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2011 peer->tcp_ts = tw->tw_ts_recent;
2012 }
2013 inet_putpeer(peer);
2014 return 1;
2015 }
2016
2017 return 0;
2018}
2019
2020struct tcp_func ipv4_specific = {
2021 .queue_xmit = ip_queue_xmit,
2022 .send_check = tcp_v4_send_check,
2023 .rebuild_header = tcp_v4_rebuild_header,
2024 .conn_request = tcp_v4_conn_request,
2025 .syn_recv_sock = tcp_v4_syn_recv_sock,
2026 .remember_stamp = tcp_v4_remember_stamp,
2027 .net_header_len = sizeof(struct iphdr),
2028 .setsockopt = ip_setsockopt,
2029 .getsockopt = ip_getsockopt,
2030 .addr2sockaddr = v4_addr2sockaddr,
2031 .sockaddr_len = sizeof(struct sockaddr_in),
2032};
2033
2034 /* NOTE: A lot of things are set to zero explicitly by the call to
2035 * sk_alloc(), so they need not be done here.
2036 */
2037static int tcp_v4_init_sock(struct sock *sk)
2038{
2039 struct tcp_sock *tp = tcp_sk(sk);
2040
2041 skb_queue_head_init(&tp->out_of_order_queue);
2042 tcp_init_xmit_timers(sk);
2043 tcp_prequeue_init(tp);
2044
2045 tp->rto = TCP_TIMEOUT_INIT;
2046 tp->mdev = TCP_TIMEOUT_INIT;
2047
2048 /* So many TCP implementations out there (incorrectly) count the
2049 * initial SYN frame in their delayed-ACK and congestion control
2050 * algorithms that we must have the following bandaid to talk
2051 * efficiently to them. -DaveM
2052 */
2053 tp->snd_cwnd = 2;
2054
2055 /* See draft-stevens-tcpca-spec-01 for discussion of the
2056 * initialization of these values.
2057 */
2058 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2059 tp->snd_cwnd_clamp = ~0;
2060 tp->mss_cache_std = tp->mss_cache = 536;
2061
2062 tp->reordering = sysctl_tcp_reordering;
2063
2064 sk->sk_state = TCP_CLOSE;
2065
2066 sk->sk_write_space = sk_stream_write_space;
2067 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2068
2069 tp->af_specific = &ipv4_specific;
2070
2071 sk->sk_sndbuf = sysctl_tcp_wmem[1];
2072 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2073
2074 atomic_inc(&tcp_sockets_allocated);
2075
2076 return 0;
2077}
2078
2079int tcp_v4_destroy_sock(struct sock *sk)
2080{
2081 struct tcp_sock *tp = tcp_sk(sk);
2082
2083 tcp_clear_xmit_timers(sk);
2084
2085 /* Clean up the write buffer. */
2086 sk_stream_writequeue_purge(sk);
2087
2088 /* Cleans up our, hopefully empty, out_of_order_queue. */
2089 __skb_queue_purge(&tp->out_of_order_queue);
2090
2091 /* Clean up the prequeue; it really must be empty. */
2092 __skb_queue_purge(&tp->ucopy.prequeue);
2093
2094 /* Clean up a referenced TCP bind bucket. */
2095 if (tp->bind_hash)
2096 tcp_put_port(sk);
2097
2098 /*
2099 * If a cached sendmsg page exists, toss it.
2100 */
2101 if (sk->sk_sndmsg_page) {
2102 __free_page(sk->sk_sndmsg_page);
2103 sk->sk_sndmsg_page = NULL;
2104 }
2105
2106 atomic_dec(&tcp_sockets_allocated);
2107
2108 return 0;
2109}
2110
2111EXPORT_SYMBOL(tcp_v4_destroy_sock);
2112
2113#ifdef CONFIG_PROC_FS
2114/* Proc filesystem TCP sock list dumping. */
2115
2116static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2117{
2118 return hlist_empty(head) ? NULL :
2119 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2120}
2121
2122static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2123{
2124 return tw->tw_node.next ?
2125 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2126}
2127
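/* Iterate the listening hash; for each listener that has pending open
 * requests we also walk its per-socket SYN table (OPENREQ state).
 */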
2128static void *listening_get_next(struct seq_file *seq, void *cur)
2129{
2130 struct tcp_sock *tp;
2131 struct hlist_node *node;
2132 struct sock *sk = cur;
2133 struct tcp_iter_state* st = seq->private;
2134
2135 if (!sk) {
2136 st->bucket = 0;
2137 sk = sk_head(&tcp_listening_hash[0]);
2138 goto get_sk;
2139 }
2140
2141 ++st->num;
2142
2143 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2144 struct open_request *req = cur;
2145
2146 tp = tcp_sk(st->syn_wait_sk);
2147 req = req->dl_next;
2148 while (1) {
2149 while (req) {
2150 if (req->class->family == st->family) {
2151 cur = req;
2152 goto out;
2153 }
2154 req = req->dl_next;
2155 }
2156 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2157 break;
2158get_req:
2159 req = tp->listen_opt->syn_table[st->sbucket];
2160 }
2161 sk = sk_next(st->syn_wait_sk);
2162 st->state = TCP_SEQ_STATE_LISTENING;
2163 read_unlock_bh(&tp->syn_wait_lock);
2164 } else {
2165 tp = tcp_sk(sk);
2166 read_lock_bh(&tp->syn_wait_lock);
2167 if (tp->listen_opt && tp->listen_opt->qlen)
2168 goto start_req;
2169 read_unlock_bh(&tp->syn_wait_lock);
2170 sk = sk_next(sk);
2171 }
2172get_sk:
2173 sk_for_each_from(sk, node) {
2174 if (sk->sk_family == st->family) {
2175 cur = sk;
2176 goto out;
2177 }
2178 tp = tcp_sk(sk);
2179 read_lock_bh(&tp->syn_wait_lock);
2180 if (tp->listen_opt && tp->listen_opt->qlen) {
2181start_req:
2182 st->uid = sock_i_uid(sk);
2183 st->syn_wait_sk = sk;
2184 st->state = TCP_SEQ_STATE_OPENREQ;
2185 st->sbucket = 0;
2186 goto get_req;
2187 }
2188 read_unlock_bh(&tp->syn_wait_lock);
2189 }
2190 if (++st->bucket < TCP_LHTABLE_SIZE) {
2191 sk = sk_head(&tcp_listening_hash[st->bucket]);
2192 goto get_sk;
2193 }
2194 cur = NULL;
2195out:
2196 return cur;
2197}
2198
2199static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2200{
2201 void *rc = listening_get_next(seq, NULL);
2202
2203 while (rc && *pos) {
2204 rc = listening_get_next(seq, rc);
2205 --*pos;
2206 }
2207 return rc;
2208}
2209
2210static void *established_get_first(struct seq_file *seq)
2211{
2212 struct tcp_iter_state* st = seq->private;
2213 void *rc = NULL;
2214
2215 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2216 struct sock *sk;
2217 struct hlist_node *node;
2218 struct tcp_tw_bucket *tw;
2219
2220 /* We can reschedule _before_ having picked the target: */
2221 cond_resched_softirq();
2222
2223 read_lock(&tcp_ehash[st->bucket].lock);
2224 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2225 if (sk->sk_family != st->family) {
2226 continue;
2227 }
2228 rc = sk;
2229 goto out;
2230 }
2231 st->state = TCP_SEQ_STATE_TIME_WAIT;
2232 tw_for_each(tw, node,
2233 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2234 if (tw->tw_family != st->family) {
2235 continue;
2236 }
2237 rc = tw;
2238 goto out;
2239 }
2240 read_unlock(&tcp_ehash[st->bucket].lock);
2241 st->state = TCP_SEQ_STATE_ESTABLISHED;
2242 }
2243out:
2244 return rc;
2245}
2246
2247static void *established_get_next(struct seq_file *seq, void *cur)
2248{
2249 struct sock *sk = cur;
2250 struct tcp_tw_bucket *tw;
2251 struct hlist_node *node;
2252 struct tcp_iter_state* st = seq->private;
2253
2254 ++st->num;
2255
2256 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2257 tw = cur;
2258 tw = tw_next(tw);
2259get_tw:
2260 while (tw && tw->tw_family != st->family) {
2261 tw = tw_next(tw);
2262 }
2263 if (tw) {
2264 cur = tw;
2265 goto out;
2266 }
2267 read_unlock(&tcp_ehash[st->bucket].lock);
2268 st->state = TCP_SEQ_STATE_ESTABLISHED;
2269
2270 /* We can reschedule between buckets: */
2271 cond_resched_softirq();
2272
2273 if (++st->bucket < tcp_ehash_size) {
2274 read_lock(&tcp_ehash[st->bucket].lock);
2275 sk = sk_head(&tcp_ehash[st->bucket].chain);
2276 } else {
2277 cur = NULL;
2278 goto out;
2279 }
2280 } else
2281 sk = sk_next(sk);
2282
2283 sk_for_each_from(sk, node) {
2284 if (sk->sk_family == st->family)
2285 goto found;
2286 }
2287
2288 st->state = TCP_SEQ_STATE_TIME_WAIT;
2289 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2290 goto get_tw;
2291found:
2292 cur = sk;
2293out:
2294 return cur;
2295}
2296
2297static void *established_get_idx(struct seq_file *seq, loff_t pos)
2298{
2299 void *rc = established_get_first(seq);
2300
2301 while (rc && pos) {
2302 rc = established_get_next(seq, rc);
2303 --pos;
2304 }
2305 return rc;
2306}
2307
2308static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2309{
2310 void *rc;
2311 struct tcp_iter_state* st = seq->private;
2312
2313 tcp_listen_lock();
2314 st->state = TCP_SEQ_STATE_LISTENING;
2315 rc = listening_get_idx(seq, &pos);
2316
2317 if (!rc) {
2318 tcp_listen_unlock();
2319 local_bh_disable();
2320 st->state = TCP_SEQ_STATE_ESTABLISHED;
2321 rc = established_get_idx(seq, pos);
2322 }
2323
2324 return rc;
2325}
2326
2327static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2328{
2329 struct tcp_iter_state* st = seq->private;
2330 st->state = TCP_SEQ_STATE_LISTENING;
2331 st->num = 0;
2332 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2333}
2334
2335static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2336{
2337 void *rc = NULL;
2338 struct tcp_iter_state* st;
2339
2340 if (v == SEQ_START_TOKEN) {
2341 rc = tcp_get_idx(seq, 0);
2342 goto out;
2343 }
2344 st = seq->private;
2345
2346 switch (st->state) {
2347 case TCP_SEQ_STATE_OPENREQ:
2348 case TCP_SEQ_STATE_LISTENING:
2349 rc = listening_get_next(seq, v);
2350 if (!rc) {
2351 tcp_listen_unlock();
2352 local_bh_disable();
2353 st->state = TCP_SEQ_STATE_ESTABLISHED;
2354 rc = established_get_first(seq);
2355 }
2356 break;
2357 case TCP_SEQ_STATE_ESTABLISHED:
2358 case TCP_SEQ_STATE_TIME_WAIT:
2359 rc = established_get_next(seq, v);
2360 break;
2361 }
2362out:
2363 ++*pos;
2364 return rc;
2365}
2366
2367static void tcp_seq_stop(struct seq_file *seq, void *v)
2368{
2369 struct tcp_iter_state* st = seq->private;
2370
2371 switch (st->state) {
2372 case TCP_SEQ_STATE_OPENREQ:
2373 if (v) {
2374 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2375 read_unlock_bh(&tp->syn_wait_lock);
2376 }
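		/* Fall through: the listen lock is also held in OPENREQ state. */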
2377 case TCP_SEQ_STATE_LISTENING:
2378 if (v != SEQ_START_TOKEN)
2379 tcp_listen_unlock();
2380 break;
2381 case TCP_SEQ_STATE_TIME_WAIT:
2382 case TCP_SEQ_STATE_ESTABLISHED:
2383 if (v)
2384 read_unlock(&tcp_ehash[st->bucket].lock);
2385 local_bh_enable();
2386 break;
2387 }
2388}
2389
2390static int tcp_seq_open(struct inode *inode, struct file *file)
2391{
2392 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2393 struct seq_file *seq;
2394 struct tcp_iter_state *s;
2395 int rc;
2396
2397 if (unlikely(afinfo == NULL))
2398 return -EINVAL;
2399
2400 s = kmalloc(sizeof(*s), GFP_KERNEL);
2401 if (!s)
2402 return -ENOMEM;
2403 memset(s, 0, sizeof(*s));
2404 s->family = afinfo->family;
2405 s->seq_ops.start = tcp_seq_start;
2406 s->seq_ops.next = tcp_seq_next;
2407 s->seq_ops.show = afinfo->seq_show;
2408 s->seq_ops.stop = tcp_seq_stop;
2409
2410 rc = seq_open(file, &s->seq_ops);
2411 if (rc)
2412 goto out_kfree;
2413 seq = file->private_data;
2414 seq->private = s;
2415out:
2416 return rc;
2417out_kfree:
2418 kfree(s);
2419 goto out;
2420}
2421
2422int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2423{
2424 int rc = 0;
2425 struct proc_dir_entry *p;
2426
2427 if (!afinfo)
2428 return -EINVAL;
2429 afinfo->seq_fops->owner = afinfo->owner;
2430 afinfo->seq_fops->open = tcp_seq_open;
2431 afinfo->seq_fops->read = seq_read;
2432 afinfo->seq_fops->llseek = seq_lseek;
2433 afinfo->seq_fops->release = seq_release_private;
2434
2435 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2436 if (p)
2437 p->data = afinfo;
2438 else
2439 rc = -ENOMEM;
2440 return rc;
2441}
2442
2443void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2444{
2445 if (!afinfo)
2446 return;
2447 proc_net_remove(afinfo->name);
2448 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2449}
2450
2451static void get_openreq4(struct sock *sk, struct open_request *req,
2452 char *tmpbuf, int i, int uid)
2453{
2454 int ttd = req->expires - jiffies;
2455
2456 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2457 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2458 i,
2459 req->af.v4_req.loc_addr,
2460 ntohs(inet_sk(sk)->sport),
2461 req->af.v4_req.rmt_addr,
2462 ntohs(req->rmt_port),
2463 TCP_SYN_RECV,
2464 0, 0, /* could print option size, but that is af dependent. */
2465 1, /* timers active (only the expire timer) */
2466 jiffies_to_clock_t(ttd),
2467 req->retrans,
2468 uid,
2469 0, /* non standard timer */
2470 0, /* open_requests have no inode */
2471 atomic_read(&sk->sk_refcnt),
2472 req);
2473}
2474
2475static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2476{
2477 int timer_active;
2478 unsigned long timer_expires;
2479 struct tcp_sock *tp = tcp_sk(sp);
2480 struct inet_sock *inet = inet_sk(sp);
2481 unsigned int dest = inet->daddr;
2482 unsigned int src = inet->rcv_saddr;
2483 __u16 destp = ntohs(inet->dport);
2484 __u16 srcp = ntohs(inet->sport);
2485
2486 if (tp->pending == TCP_TIME_RETRANS) {
2487 timer_active = 1;
2488 timer_expires = tp->timeout;
2489 } else if (tp->pending == TCP_TIME_PROBE0) {
2490 timer_active = 4;
2491 timer_expires = tp->timeout;
2492 } else if (timer_pending(&sp->sk_timer)) {
2493 timer_active = 2;
2494 timer_expires = sp->sk_timer.expires;
2495 } else {
2496 timer_active = 0;
2497 timer_expires = jiffies;
2498 }
2499
2500 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2501 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2502 i, src, srcp, dest, destp, sp->sk_state,
2503 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2504 timer_active,
2505 jiffies_to_clock_t(timer_expires - jiffies),
2506 tp->retransmits,
2507 sock_i_uid(sp),
2508 tp->probes_out,
2509 sock_i_ino(sp),
2510 atomic_read(&sp->sk_refcnt), sp,
2511 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2512 tp->snd_cwnd,
2513 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2514}
2515
2516static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2517{
2518 unsigned int dest, src;
2519 __u16 destp, srcp;
2520 int ttd = tw->tw_ttd - jiffies;
2521
2522 if (ttd < 0)
2523 ttd = 0;
2524
2525 dest = tw->tw_daddr;
2526 src = tw->tw_rcv_saddr;
2527 destp = ntohs(tw->tw_dport);
2528 srcp = ntohs(tw->tw_sport);
2529
2530 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2531 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2532 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2533 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2534 atomic_read(&tw->tw_refcnt), tw);
2535}
2536
2537#define TMPSZ 150
2538
2539static int tcp4_seq_show(struct seq_file *seq, void *v)
2540{
2541 struct tcp_iter_state* st;
2542 char tmpbuf[TMPSZ + 1];
2543
2544 if (v == SEQ_START_TOKEN) {
2545 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2546 " sl local_address rem_address st tx_queue "
2547 "rx_queue tr tm->when retrnsmt uid timeout "
2548 "inode");
2549 goto out;
2550 }
2551 st = seq->private;
2552
2553 switch (st->state) {
2554 case TCP_SEQ_STATE_LISTENING:
2555 case TCP_SEQ_STATE_ESTABLISHED:
2556 get_tcp4_sock(v, tmpbuf, st->num);
2557 break;
2558 case TCP_SEQ_STATE_OPENREQ:
2559 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2560 break;
2561 case TCP_SEQ_STATE_TIME_WAIT:
2562 get_timewait4_sock(v, tmpbuf, st->num);
2563 break;
2564 }
2565 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2566out:
2567 return 0;
2568}
2569
2570static struct file_operations tcp4_seq_fops;
2571static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2572 .owner = THIS_MODULE,
2573 .name = "tcp",
2574 .family = AF_INET,
2575 .seq_show = tcp4_seq_show,
2576 .seq_fops = &tcp4_seq_fops,
2577};
2578
2579int __init tcp4_proc_init(void)
2580{
2581 return tcp_proc_register(&tcp4_seq_afinfo);
2582}
2583
2584void tcp4_proc_exit(void)
2585{
2586 tcp_proc_unregister(&tcp4_seq_afinfo);
2587}
2588#endif /* CONFIG_PROC_FS */
2589
2590struct proto tcp_prot = {
2591 .name = "TCP",
2592 .owner = THIS_MODULE,
2593 .close = tcp_close,
2594 .connect = tcp_v4_connect,
2595 .disconnect = tcp_disconnect,
2596 .accept = tcp_accept,
2597 .ioctl = tcp_ioctl,
2598 .init = tcp_v4_init_sock,
2599 .destroy = tcp_v4_destroy_sock,
2600 .shutdown = tcp_shutdown,
2601 .setsockopt = tcp_setsockopt,
2602 .getsockopt = tcp_getsockopt,
2603 .sendmsg = tcp_sendmsg,
2604 .recvmsg = tcp_recvmsg,
2605 .backlog_rcv = tcp_v4_do_rcv,
2606 .hash = tcp_v4_hash,
2607 .unhash = tcp_unhash,
2608 .get_port = tcp_v4_get_port,
2609 .enter_memory_pressure = tcp_enter_memory_pressure,
2610 .sockets_allocated = &tcp_sockets_allocated,
2611 .memory_allocated = &tcp_memory_allocated,
2612 .memory_pressure = &tcp_memory_pressure,
2613 .sysctl_mem = sysctl_tcp_mem,
2614 .sysctl_wmem = sysctl_tcp_wmem,
2615 .sysctl_rmem = sysctl_tcp_rmem,
2616 .max_header = MAX_TCP_HEADER,
2617 .obj_size = sizeof(struct tcp_sock),
2618};
2619
2620
2621
2622void __init tcp_v4_init(struct net_proto_family *ops)
2623{
2624 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2625 if (err < 0)
2626 panic("Failed to create the TCP control socket.\n");
2627 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2628 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2629
2630 /* Unhash it so that IP input processing does not even
2631 * see it, we do not wish this socket to see incoming
2632 * packets.
2633 */
2634 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2635}
2636
2637EXPORT_SYMBOL(ipv4_specific);
2638EXPORT_SYMBOL(tcp_bind_hash);
2639EXPORT_SYMBOL(tcp_bucket_create);
2640EXPORT_SYMBOL(tcp_hashinfo);
2641EXPORT_SYMBOL(tcp_inherit_port);
2642EXPORT_SYMBOL(tcp_listen_wlock);
2643EXPORT_SYMBOL(tcp_port_rover);
2644EXPORT_SYMBOL(tcp_prot);
2645EXPORT_SYMBOL(tcp_put_port);
2646EXPORT_SYMBOL(tcp_unhash);
2647EXPORT_SYMBOL(tcp_v4_conn_request);
2648EXPORT_SYMBOL(tcp_v4_connect);
2649EXPORT_SYMBOL(tcp_v4_do_rcv);
2650EXPORT_SYMBOL(tcp_v4_rebuild_header);
2651EXPORT_SYMBOL(tcp_v4_remember_stamp);
2652EXPORT_SYMBOL(tcp_v4_send_check);
2653EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2654
2655#ifdef CONFIG_PROC_FS
2656EXPORT_SYMBOL(tcp_proc_register);
2657EXPORT_SYMBOL(tcp_proc_unregister);
2658#endif
2659EXPORT_SYMBOL(sysctl_local_port_range);
2660EXPORT_SYMBOL(sysctl_max_syn_backlog);
2661EXPORT_SYMBOL(sysctl_tcp_low_latency);
2662EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2663
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
new file mode 100644
index 000000000000..fd70509f0d53
--- /dev/null
+++ b/net/ipv4/tcp_minisocks.c
@@ -0,0 +1,1077 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 */
22
23#include <linux/config.h>
24#include <linux/mm.h>
25#include <linux/module.h>
26#include <linux/sysctl.h>
27#include <linux/workqueue.h>
28#include <net/tcp.h>
29#include <net/inet_common.h>
30#include <net/xfrm.h>
31
32#ifdef CONFIG_SYSCTL
33#define SYNC_INIT 0 /* let the user enable it */
34#else
35#define SYNC_INIT 1
36#endif
37
38int sysctl_tcp_tw_recycle;
39int sysctl_tcp_max_tw_buckets = NR_FILE*2;
40
41int sysctl_tcp_syncookies = SYNC_INIT;
42int sysctl_tcp_abort_on_overflow;
43
44static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
45
46static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
47{
48 if (seq == s_win)
49 return 1;
50 if (after(end_seq, s_win) && before(seq, e_win))
51 return 1;
52 return (seq == e_win && seq == end_seq);
53}
54
55/* New-style handling of TIME_WAIT sockets. */
56
57int tcp_tw_count;
58
59
60/* Must be called with locally disabled BHs. */
61static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
62{
63 struct tcp_ehash_bucket *ehead;
64 struct tcp_bind_hashbucket *bhead;
65 struct tcp_bind_bucket *tb;
66
67 /* Unlink from established hashes. */
68 ehead = &tcp_ehash[tw->tw_hashent];
69 write_lock(&ehead->lock);
70 if (hlist_unhashed(&tw->tw_node)) {
71 write_unlock(&ehead->lock);
72 return;
73 }
74 __hlist_del(&tw->tw_node);
75 sk_node_init(&tw->tw_node);
76 write_unlock(&ehead->lock);
77
78 /* Disassociate with bind bucket. */
79 bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
80 spin_lock(&bhead->lock);
81 tb = tw->tw_tb;
82 __hlist_del(&tw->tw_bind_node);
83 tw->tw_tb = NULL;
84 tcp_bucket_destroy(tb);
85 spin_unlock(&bhead->lock);
86
87#ifdef INET_REFCNT_DEBUG
88 if (atomic_read(&tw->tw_refcnt) != 1) {
89 printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
90 atomic_read(&tw->tw_refcnt));
91 }
92#endif
93 tcp_tw_put(tw);
94}
95
96/*
97 * * The main purpose of TIME-WAIT state is to close the connection gracefully
98 * when one of the ends sits in LAST-ACK or CLOSING retransmitting FIN
99 * (and, probably, a tail of data) and one or more of our ACKs are lost.
100 * * What is the TIME-WAIT timeout? It is associated with the maximal packet
101 * lifetime in the internet, which leads to the wrong conclusion that
102 * it is set to catch "old duplicate segments" wandering out of their path.
103 * It is not quite correct. This timeout is calculated so that it exceeds the
104 * maximal retransmission timeout by enough to allow losing one (or more)
105 * segments sent by the peer and our ACKs. This time may be calculated from the RTO.
106 * * When a TIME-WAIT socket receives an RST, it means that the other end
107 * finally closed and we are allowed to kill TIME-WAIT too.
108 * * The second purpose of TIME-WAIT is catching old duplicate segments.
109 * Well, certainly it is pure paranoia, but if we load TIME-WAIT
110 * with these semantics, we MUST NOT kill TIME-WAIT state with RSTs.
111 * * If we invented some more clever way to catch duplicates
112 * (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
113 *
114 * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
115 * When you compare it to the RFCs, please read section SEGMENT ARRIVES
116 * from the very beginning.
117 *
118 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
119 * is _not_ stateless. It means that, strictly speaking, we must
120 * spinlock it. I do not want to! Well, the probability of misbehaviour
121 * is ridiculously low and, it seems, we could use some mb() tricks
122 * to avoid misreading sequence numbers, states etc. --ANK
123 */
124enum tcp_tw_status
125tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
126 struct tcphdr *th, unsigned len)
127{
128 struct tcp_options_received tmp_opt;
129 int paws_reject = 0;
130
131 tmp_opt.saw_tstamp = 0;
132 if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
133 tcp_parse_options(skb, &tmp_opt, 0);
134
135 if (tmp_opt.saw_tstamp) {
136 tmp_opt.ts_recent = tw->tw_ts_recent;
137 tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
138 paws_reject = tcp_paws_check(&tmp_opt, th->rst);
139 }
140 }
141
142 if (tw->tw_substate == TCP_FIN_WAIT2) {
143 /* Just repeat all the checks of tcp_rcv_state_process() */
144
145 /* Out of window, send ACK */
146 if (paws_reject ||
147 !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
148 tw->tw_rcv_nxt,
149 tw->tw_rcv_nxt + tw->tw_rcv_wnd))
150 return TCP_TW_ACK;
151
152 if (th->rst)
153 goto kill;
154
155 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
156 goto kill_with_rst;
157
158 /* Dup ACK? */
159 if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
160 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
161 tcp_tw_put(tw);
162 return TCP_TW_SUCCESS;
163 }
164
165 /* New data or FIN. If new data arrives after a half-duplex close,
166 * reset.
167 */
168 if (!th->fin ||
169 TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
170kill_with_rst:
171 tcp_tw_deschedule(tw);
172 tcp_tw_put(tw);
173 return TCP_TW_RST;
174 }
175
176 /* FIN arrived, enter true time-wait state. */
177 tw->tw_substate = TCP_TIME_WAIT;
178 tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
179 if (tmp_opt.saw_tstamp) {
180 tw->tw_ts_recent_stamp = xtime.tv_sec;
181 tw->tw_ts_recent = tmp_opt.rcv_tsval;
182 }
183
184 /* I am ashamed, but I failed to make it more elegant.
185 * Yes, it is a direct reference to IP, which is impossible
186 * to generalize to IPv6. Taking into account that IPv6
187 * does not understand recycling in any case, it is not
188 * a big problem in practice. --ANK */
189 if (tw->tw_family == AF_INET &&
190 sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
191 tcp_v4_tw_remember_stamp(tw))
192 tcp_tw_schedule(tw, tw->tw_timeout);
193 else
194 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
195 return TCP_TW_ACK;
196 }
197
198 /*
199 * Now real TIME-WAIT state.
200 *
201 * RFC 1122:
202 * "When a connection is [...] on TIME-WAIT state [...]
203 * [a TCP] MAY accept a new SYN from the remote TCP to
204 * reopen the connection directly, if it:
205 *
206 * (1) assigns its initial sequence number for the new
207 * connection to be larger than the largest sequence
208 * number it used on the previous connection incarnation,
209 * and
210 *
211 * (2) returns to TIME-WAIT state if the SYN turns out
212 * to be an old duplicate".
213 */
214
215 if (!paws_reject &&
216 (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
217 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
218 /* In-window segment; it may only be a reset or a bare ack. */
219
220 if (th->rst) {
221 /* This is TIME_WAIT assassination, in two flavors.
222 * Oh well... nobody has a sufficient solution to this
223 * protocol bug yet.
224 */
225 if (sysctl_tcp_rfc1337 == 0) {
226kill:
227 tcp_tw_deschedule(tw);
228 tcp_tw_put(tw);
229 return TCP_TW_SUCCESS;
230 }
231 }
232 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
233
234 if (tmp_opt.saw_tstamp) {
235 tw->tw_ts_recent = tmp_opt.rcv_tsval;
236 tw->tw_ts_recent_stamp = xtime.tv_sec;
237 }
238
239 tcp_tw_put(tw);
240 return TCP_TW_SUCCESS;
241 }
242
243 /* Out-of-window segment.
244
245 All such segments are ACKed immediately.
246
247 The only exception is a new SYN. We accept it if it is
248 not an old duplicate and we are not in danger of being killed
249 by delayed old duplicates. The RFC check, that it has a
250 newer sequence number, works at rates <40Mbit/sec.
251 However, if PAWS works, it is reliable AND, even more,
252 we may even relax the silly seq space cutoff.
253
254 RED-PEN: we violate the main RFC requirement; if this SYN turns out
255 to be an old duplicate (i.e. we receive an RST in reply to the SYN-ACK),
256 we must return the socket to time-wait state. It is not good,
257 but not fatal yet.
258 */
259
260 if (th->syn && !th->rst && !th->ack && !paws_reject &&
261 (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
262 (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
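		/* Pick an ISN well above anything used by the old incarnation;
		 * avoid 0, since 0 means "no override" to the caller.
		 */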
263 u32 isn = tw->tw_snd_nxt + 65535 + 2;
264 if (isn == 0)
265 isn++;
266 TCP_SKB_CB(skb)->when = isn;
267 return TCP_TW_SYN;
268 }
269
270 if (paws_reject)
271 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
272
273 if(!th->rst) {
274 /* In this case we must reset the TIMEWAIT timer.
275 *
276 * If it is an ACKless SYN, it may be either an old duplicate
277 * or a new good SYN with a random sequence number < rcv_nxt.
278 * Do not reschedule in the latter case.
279 */
280 if (paws_reject || th->ack)
281 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
282
283 /* Send ACK. Note that we do not put the bucket;
284 * it will be released by the caller.
285 */
286 return TCP_TW_ACK;
287 }
288 tcp_tw_put(tw);
289 return TCP_TW_SUCCESS;
290}
291
292/* Enter the time wait state. This is called with locally disabled BH.
293 * Essentially we whip up a timewait bucket, copy the
294 * relevant info into it from the SK, and mess with hash chains
295 * and list linkage.
296 */
297static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
298{
299 struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
300 struct tcp_bind_hashbucket *bhead;
301
302 /* Step 1: Put TW into the bind hash. The original socket stays there too.
303 Note that any socket with inet_sk(sk)->num != 0 MUST be bound in the
304 binding cache, even if it is closed.
305 */
306 bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
307 spin_lock(&bhead->lock);
308 tw->tw_tb = tcp_sk(sk)->bind_hash;
309 BUG_TRAP(tcp_sk(sk)->bind_hash);
310 tw_add_bind_node(tw, &tw->tw_tb->owners);
311 spin_unlock(&bhead->lock);
312
313 write_lock(&ehead->lock);
314
315 /* Step 2: Remove SK from established hash. */
316 if (__sk_del_node_init(sk))
317 sock_prot_dec_use(sk->sk_prot);
318
319 /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
320 tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
321 atomic_inc(&tw->tw_refcnt);
322
323 write_unlock(&ehead->lock);
324}
325
326/*
327 * Move a socket to time-wait or dead fin-wait-2 state.
328 */
329void tcp_time_wait(struct sock *sk, int state, int timeo)
330{
331 struct tcp_tw_bucket *tw = NULL;
332 struct tcp_sock *tp = tcp_sk(sk);
333 int recycle_ok = 0;
334
335 if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
336 recycle_ok = tp->af_specific->remember_stamp(sk);
337
338 if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
339 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
340
341 if(tw != NULL) {
342 struct inet_sock *inet = inet_sk(sk);
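		/* 3.5 * RTO, computed with shifts: (RTO << 2) - (RTO >> 1). */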
343 int rto = (tp->rto<<2) - (tp->rto>>1);
344
345 /* Give us an identity. */
346 tw->tw_daddr = inet->daddr;
347 tw->tw_rcv_saddr = inet->rcv_saddr;
348 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
349 tw->tw_num = inet->num;
350 tw->tw_state = TCP_TIME_WAIT;
351 tw->tw_substate = state;
352 tw->tw_sport = inet->sport;
353 tw->tw_dport = inet->dport;
354 tw->tw_family = sk->sk_family;
355 tw->tw_reuse = sk->sk_reuse;
356 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
357 atomic_set(&tw->tw_refcnt, 1);
358
359 tw->tw_hashent = sk->sk_hashent;
360 tw->tw_rcv_nxt = tp->rcv_nxt;
361 tw->tw_snd_nxt = tp->snd_nxt;
362 tw->tw_rcv_wnd = tcp_receive_window(tp);
363 tw->tw_ts_recent = tp->rx_opt.ts_recent;
364 tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
365 tw_dead_node_init(tw);
366
367#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
368 if (tw->tw_family == PF_INET6) {
369 struct ipv6_pinfo *np = inet6_sk(sk);
370
371 ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
372 ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
373 tw->tw_v6_ipv6only = np->ipv6only;
374 } else {
375 memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
376 memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
377 tw->tw_v6_ipv6only = 0;
378 }
379#endif
380 /* Linkage updates. */
381 __tcp_tw_hashdance(sk, tw);
382
383 /* Get the TIME_WAIT timeout firing. */
384 if (timeo < rto)
385 timeo = rto;
386
387 if (recycle_ok) {
388 tw->tw_timeout = rto;
389 } else {
390 tw->tw_timeout = TCP_TIMEWAIT_LEN;
391 if (state == TCP_TIME_WAIT)
392 timeo = TCP_TIMEWAIT_LEN;
393 }
394
395 tcp_tw_schedule(tw, timeo);
396 tcp_tw_put(tw);
397 } else {
398 /* Sorry, if we're out of memory, just CLOSE this
399 * socket up. We've got bigger problems than
400 * non-graceful socket closings.
401 */
402 if (net_ratelimit())
403 printk(KERN_INFO "TCP: time wait bucket table overflow\n");
404 }
405
406 tcp_update_metrics(sk);
407 tcp_done(sk);
408}
409
410/* Kill off TIME_WAIT sockets once their lifetime has expired. */
411static int tcp_tw_death_row_slot;
412
413static void tcp_twkill(unsigned long);
414
415/* TIME_WAIT reaping mechanism. */
416#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
417#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
418
419#define TCP_TWKILL_QUOTA 100
420
421static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
422static DEFINE_SPINLOCK(tw_death_lock);
423static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
424static void twkill_work(void *);
425static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
426static u32 twkill_thread_slots;
427
428/* Returns non-zero if quota exceeded. */
429static int tcp_do_twkill_work(int slot, unsigned int quota)
430{
431 struct tcp_tw_bucket *tw;
432 struct hlist_node *node;
433 unsigned int killed;
434 int ret;
435
436 /* NOTE: compare this to the previous version, where the lock
437 * was released after detaching the chain. It was racy,
438 * because tw buckets are scheduled in a non-serialized context
439 * in 2.3 (with netfilter), and with softnet it is common, because
440 * soft irqs are not sequenced.
441 */
441 */
442 killed = 0;
443 ret = 0;
444rescan:
445 tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
446 __tw_del_dead_node(tw);
447 spin_unlock(&tw_death_lock);
448 tcp_timewait_kill(tw);
449 tcp_tw_put(tw);
450 killed++;
451 spin_lock(&tw_death_lock);
452 if (killed > quota) {
453 ret = 1;
454 break;
455 }
456
457 /* While we dropped tw_death_lock, another cpu may have
458 * killed off the next TW bucket in the list, therefore
459 * do a fresh re-read of the hlist head node with the
460 * lock reacquired. We still use the hlist traversal
461 * macro in order to get the prefetches.
462 */
463 goto rescan;
464 }
465
466 tcp_tw_count -= killed;
467 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
468
469 return ret;
470}
471
472static void tcp_twkill(unsigned long dummy)
473{
474 int need_timer, ret;
475
476 spin_lock(&tw_death_lock);
477
478 if (tcp_tw_count == 0)
479 goto out;
480
481 need_timer = 0;
482 ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
483 if (ret) {
484 twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
485 mb();
486 schedule_work(&tcp_twkill_work);
487 need_timer = 1;
488 } else {
489 /* We purged the entire slot, anything left? */
490 if (tcp_tw_count)
491 need_timer = 1;
492 }
493 tcp_tw_death_row_slot =
494 ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
495 if (need_timer)
496 mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
497out:
498 spin_unlock(&tw_death_lock);
499}
500
501extern void twkill_slots_invalid(void);
502
503static void twkill_work(void *dummy)
504{
505 int i;
506
507 if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
508 twkill_slots_invalid();
509
510 while (twkill_thread_slots) {
511 spin_lock_bh(&tw_death_lock);
512 for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
513 if (!(twkill_thread_slots & (1 << i)))
514 continue;
515
516 while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
517 if (need_resched()) {
518 spin_unlock_bh(&tw_death_lock);
519 schedule();
520 spin_lock_bh(&tw_death_lock);
521 }
522 }
523
524 twkill_thread_slots &= ~(1 << i);
525 }
526 spin_unlock_bh(&tw_death_lock);
527 }
528}
529
530/* These are always called from BH context. See callers in
531 * tcp_input.c to verify this.
532 */
533
534/* This is for handling early-kills of TIME_WAIT sockets. */
535void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
536{
537 spin_lock(&tw_death_lock);
538 if (tw_del_dead_node(tw)) {
539 tcp_tw_put(tw);
540 if (--tcp_tw_count == 0)
541 del_timer(&tcp_tw_timer);
542 }
543 spin_unlock(&tw_death_lock);
544 tcp_timewait_kill(tw);
545}
546
547/* Short-time timewait calendar */
548
549static int tcp_twcal_hand = -1;
550static int tcp_twcal_jiffie;
551static void tcp_twcal_tick(unsigned long);
552static struct timer_list tcp_twcal_timer =
553 TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
554static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
555
556static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
557{
558 struct hlist_head *list;
559 int slot;
560
561 /* timeout := RTO * 3.5
562 *
563 * 3.5 = 1+2+0.5 to wait for two retransmits.
564 *
565 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
566	 * our ACK acking that FIN can be lost. N subsequent retransmitted
567	 * FINs (or earlier segments) are lost with probability p^(N+1),
568	 * where p is the probability of losing a single packet, and the
569	 * time to detect the loss is about RTO*(2^N - 1) with exponential
570	 * backoff. The normal timewait length is chosen so that we wait
571	 * at least for one retransmitted FIN (maximal RTO is 120sec).
572	 * [ BTW Linux, following BSD, violates this requirement, waiting
573	 * only for 60sec; we should wait at least 240 secs.
574	 * Well, 240 consumes too many resources 8)
575	 * ]
576	 * This interval is not reduced to catch old duplicates and
577	 * responses to our wandering segments living for two MSLs.
578	 * However, if we use PAWS to detect
579	 * old duplicates, we can reduce the interval to the bounds required
580	 * by RTO, rather than MSL. So, if the peer understands PAWS, we
581	 * kill the tw bucket after 3.5*RTO (it is important that this number
582	 * is greater than the TS tick!) and detect old duplicates with the
583	 * help of PAWS.
584 */
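	/* The shift below is a ceiling division: slot becomes
	 * ceil(timeo / (1 << TCP_TW_RECYCLE_TICK)), i.e. the timeout
	 * rounded up to whole recycle ticks (a timeout of 5 ticks plus
	 * any fraction maps to slot 6).
	 */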
585 slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
586
587 spin_lock(&tw_death_lock);
588
589 /* Unlink it, if it was scheduled */
590 if (tw_del_dead_node(tw))
591 tcp_tw_count--;
592 else
593 atomic_inc(&tw->tw_refcnt);
594
595 if (slot >= TCP_TW_RECYCLE_SLOTS) {
596 /* Schedule to slow timer */
597 if (timeo >= TCP_TIMEWAIT_LEN) {
598 slot = TCP_TWKILL_SLOTS-1;
599 } else {
600 slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
601 if (slot >= TCP_TWKILL_SLOTS)
602 slot = TCP_TWKILL_SLOTS-1;
603 }
604 tw->tw_ttd = jiffies + timeo;
605 slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
606 list = &tcp_tw_death_row[slot];
607 } else {
608 tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
609
610 if (tcp_twcal_hand < 0) {
611 tcp_twcal_hand = 0;
612 tcp_twcal_jiffie = jiffies;
613 tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
614 add_timer(&tcp_twcal_timer);
615 } else {
616 if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
617 mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
618 slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
619 }
620 list = &tcp_twcal_row[slot];
621 }
622
623 hlist_add_head(&tw->tw_death_node, list);
624
625 if (tcp_tw_count++ == 0)
626 mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
627 spin_unlock(&tw_death_lock);
628}
629
630void tcp_twcal_tick(unsigned long dummy)
631{
632 int n, slot;
633 unsigned long j;
634 unsigned long now = jiffies;
635 int killed = 0;
636 int adv = 0;
637
638 spin_lock(&tw_death_lock);
639 if (tcp_twcal_hand < 0)
640 goto out;
641
642 slot = tcp_twcal_hand;
643 j = tcp_twcal_jiffie;
644
645 for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
646 if (time_before_eq(j, now)) {
647 struct hlist_node *node, *safe;
648 struct tcp_tw_bucket *tw;
649
650 tw_for_each_inmate_safe(tw, node, safe,
651 &tcp_twcal_row[slot]) {
652 __tw_del_dead_node(tw);
653 tcp_timewait_kill(tw);
654 tcp_tw_put(tw);
655 killed++;
656 }
657 } else {
658 if (!adv) {
659 adv = 1;
660 tcp_twcal_jiffie = j;
661 tcp_twcal_hand = slot;
662 }
663
664 if (!hlist_empty(&tcp_twcal_row[slot])) {
665 mod_timer(&tcp_twcal_timer, j);
666 goto out;
667 }
668 }
669 j += (1<<TCP_TW_RECYCLE_TICK);
670 slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
671 }
672 tcp_twcal_hand = -1;
673
674out:
675 if ((tcp_tw_count -= killed) == 0)
676 del_timer(&tcp_tw_timer);
677 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
678 spin_unlock(&tw_death_lock);
679}
680
681/* This is not only more efficient than what we used to do, it eliminates
682 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
683 *
684	 * Actually, we could avoid lots of memory writes here. tp of the listening
685	 * socket contains all necessary default parameters.
686 */
687struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
688{
689	/* allocate the newsk from the same slab as the master sock; if not,
690	 * at sk_free time we'd try to free it from the wrong slabcache
691	 * (i.e. is it TCPv4 or v6?). This is handled thru sk->sk_prot -acme */
692 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
693
694 if(newsk != NULL) {
695 struct tcp_sock *newtp;
696 struct sk_filter *filter;
697
698 memcpy(newsk, sk, sizeof(struct tcp_sock));
699 newsk->sk_state = TCP_SYN_RECV;
700
701 /* SANITY */
702 sk_node_init(&newsk->sk_node);
703 tcp_sk(newsk)->bind_hash = NULL;
704
705 /* Clone the TCP header template */
706 inet_sk(newsk)->dport = req->rmt_port;
707
708 sock_lock_init(newsk);
709 bh_lock_sock(newsk);
710
711 rwlock_init(&newsk->sk_dst_lock);
712 atomic_set(&newsk->sk_rmem_alloc, 0);
713 skb_queue_head_init(&newsk->sk_receive_queue);
714 atomic_set(&newsk->sk_wmem_alloc, 0);
715 skb_queue_head_init(&newsk->sk_write_queue);
716 atomic_set(&newsk->sk_omem_alloc, 0);
717 newsk->sk_wmem_queued = 0;
718 newsk->sk_forward_alloc = 0;
719
720 sock_reset_flag(newsk, SOCK_DONE);
721 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
722 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
723 newsk->sk_send_head = NULL;
724 rwlock_init(&newsk->sk_callback_lock);
725 skb_queue_head_init(&newsk->sk_error_queue);
726 newsk->sk_write_space = sk_stream_write_space;
727
728 if ((filter = newsk->sk_filter) != NULL)
729 sk_filter_charge(newsk, filter);
730
731 if (unlikely(xfrm_sk_clone_policy(newsk))) {
732 /* It is still raw copy of parent, so invalidate
733 * destructor and make plain sk_free() */
734 newsk->sk_destruct = NULL;
735 sk_free(newsk);
736 return NULL;
737 }
738
739 /* Now setup tcp_sock */
740 newtp = tcp_sk(newsk);
741 newtp->pred_flags = 0;
742 newtp->rcv_nxt = req->rcv_isn + 1;
743 newtp->snd_nxt = req->snt_isn + 1;
744 newtp->snd_una = req->snt_isn + 1;
745 newtp->snd_sml = req->snt_isn + 1;
746
747 tcp_prequeue_init(newtp);
748
749 tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);
750
751 newtp->retransmits = 0;
752 newtp->backoff = 0;
753 newtp->srtt = 0;
754 newtp->mdev = TCP_TIMEOUT_INIT;
755 newtp->rto = TCP_TIMEOUT_INIT;
756
757 newtp->packets_out = 0;
758 newtp->left_out = 0;
759 newtp->retrans_out = 0;
760 newtp->sacked_out = 0;
761 newtp->fackets_out = 0;
762 newtp->snd_ssthresh = 0x7fffffff;
763
764 /* So many TCP implementations out there (incorrectly) count the
765 * initial SYN frame in their delayed-ACK and congestion control
766 * algorithms that we must have the following bandaid to talk
767 * efficiently to them. -DaveM
768 */
769 newtp->snd_cwnd = 2;
770 newtp->snd_cwnd_cnt = 0;
771
772 newtp->frto_counter = 0;
773 newtp->frto_highmark = 0;
774
775 tcp_set_ca_state(newtp, TCP_CA_Open);
776 tcp_init_xmit_timers(newsk);
777 skb_queue_head_init(&newtp->out_of_order_queue);
778 newtp->rcv_wup = req->rcv_isn + 1;
779 newtp->write_seq = req->snt_isn + 1;
780 newtp->pushed_seq = newtp->write_seq;
781 newtp->copied_seq = req->rcv_isn + 1;
782
783 newtp->rx_opt.saw_tstamp = 0;
784
785 newtp->rx_opt.dsack = 0;
786 newtp->rx_opt.eff_sacks = 0;
787
788 newtp->probes_out = 0;
789 newtp->rx_opt.num_sacks = 0;
790 newtp->urg_data = 0;
791 newtp->listen_opt = NULL;
792 newtp->accept_queue = newtp->accept_queue_tail = NULL;
793 /* Deinitialize syn_wait_lock to trap illegal accesses. */
794 memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
795
796 /* Back to base struct sock members. */
797 newsk->sk_err = 0;
798 newsk->sk_priority = 0;
799 atomic_set(&newsk->sk_refcnt, 2);
800#ifdef INET_REFCNT_DEBUG
801 atomic_inc(&inet_sock_nr);
802#endif
803 atomic_inc(&tcp_sockets_allocated);
804
805 if (sock_flag(newsk, SOCK_KEEPOPEN))
806 tcp_reset_keepalive_timer(newsk,
807 keepalive_time_when(newtp));
808 newsk->sk_socket = NULL;
809 newsk->sk_sleep = NULL;
810
811 newtp->rx_opt.tstamp_ok = req->tstamp_ok;
812 if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) {
813 if (sysctl_tcp_fack)
814 newtp->rx_opt.sack_ok |= 2;
815 }
816 newtp->window_clamp = req->window_clamp;
817 newtp->rcv_ssthresh = req->rcv_wnd;
818 newtp->rcv_wnd = req->rcv_wnd;
819 newtp->rx_opt.wscale_ok = req->wscale_ok;
820 if (newtp->rx_opt.wscale_ok) {
821 newtp->rx_opt.snd_wscale = req->snd_wscale;
822 newtp->rx_opt.rcv_wscale = req->rcv_wscale;
823 } else {
824 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
825 newtp->window_clamp = min(newtp->window_clamp, 65535U);
826 }
827 newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
828 newtp->max_window = newtp->snd_wnd;
829
830 if (newtp->rx_opt.tstamp_ok) {
831 newtp->rx_opt.ts_recent = req->ts_recent;
832 newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
833 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
834 } else {
835 newtp->rx_opt.ts_recent_stamp = 0;
836 newtp->tcp_header_len = sizeof(struct tcphdr);
837 }
838 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
839 newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
840 newtp->rx_opt.mss_clamp = req->mss;
841 TCP_ECN_openreq_child(newtp, req);
842 if (newtp->ecn_flags&TCP_ECN_OK)
843 sock_set_flag(newsk, SOCK_NO_LARGESEND);
844
845 tcp_ca_init(newtp);
846
847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
848 }
849 return newsk;
850}
851
852/*
853 * Process an incoming packet for SYN_RECV sockets represented
854 * as an open_request.
855 */
856
857struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
858 struct open_request *req,
859 struct open_request **prev)
860{
861 struct tcphdr *th = skb->h.th;
862 struct tcp_sock *tp = tcp_sk(sk);
863 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
864 int paws_reject = 0;
865 struct tcp_options_received tmp_opt;
866 struct sock *child;
867
868 tmp_opt.saw_tstamp = 0;
869 if (th->doff > (sizeof(struct tcphdr)>>2)) {
870 tcp_parse_options(skb, &tmp_opt, 0);
871
872 if (tmp_opt.saw_tstamp) {
873 tmp_opt.ts_recent = req->ts_recent;
874 /* We do not store true stamp, but it is not required,
875 * it can be estimated (approximately)
876			 * from other data.
877 */
878 tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
879 paws_reject = tcp_paws_check(&tmp_opt, th->rst);
880 }
881 }
882
883 /* Check for pure retransmitted SYN. */
884 if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
885 flg == TCP_FLAG_SYN &&
886 !paws_reject) {
887 /*
888 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
889 * this case on figure 6 and figure 8, but formal
890 * protocol description says NOTHING.
891 * To be more exact, it says that we should send ACK,
892 * because this segment (at least, if it has no data)
893 * is out of window.
894 *
895 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
896 * describe SYN-RECV state. All the description
897	 * is wrong; we cannot trust it and should
898 * rely only on common sense and implementation
899 * experience.
900 *
901 * Enforce "SYN-ACK" according to figure 8, figure 6
902 * of RFC793, fixed by RFC1122.
903 */
904 req->class->rtx_syn_ack(sk, req, NULL);
905 return NULL;
906 }
907
908 /* Further reproduces section "SEGMENT ARRIVES"
909 for state SYN-RECEIVED of RFC793.
910	   It is broken; however, it fails only
911	   when SYNs are crossed.
912
913 You would think that SYN crossing is impossible here, since
914 we should have a SYN_SENT socket (from connect()) on our end,
915 but this is not true if the crossed SYNs were sent to both
916 ends by a malicious third party. We must defend against this,
917 and to do that we first verify the ACK (as per RFC793, page
918 36) and reset if it is invalid. Is this a true full defense?
919 To convince ourselves, let us consider a way in which the ACK
920 test can still pass in this 'malicious crossed SYNs' case.
921 Malicious sender sends identical SYNs (and thus identical sequence
922 numbers) to both A and B:
923
924 A: gets SYN, seq=7
925 B: gets SYN, seq=7
926
927 By our good fortune, both A and B select the same initial
928 send sequence number of seven :-)
929
930 A: sends SYN|ACK, seq=7, ack_seq=8
931 B: sends SYN|ACK, seq=7, ack_seq=8
932
933 So we are now A eating this SYN|ACK, ACK test passes. So
934 does sequence test, SYN is truncated, and thus we consider
935 it a bare ACK.
936
937 If tp->defer_accept, we silently drop this bare ACK. Otherwise,
938 we create an established connection. Both ends (listening sockets)
939 accept the new incoming connection and try to talk to each other. 8-)
940
941	   Note: This case is both harmless and rare. The possibility is about the
942	   same as us discovering intelligent life on another planet tomorrow.
943
944	   But generally, we should (RFC lies!) accept the ACK
945	   of our SYNACK both here and in tcp_rcv_state_process().
946	   tcp_rcv_state_process() does not, hence we do not either.
947
948 Note that the case is absolutely generic:
949 we cannot optimize anything here without
950 violating protocol. All the checks must be made
951 before attempt to create socket.
952 */
953
954 /* RFC793 page 36: "If the connection is in any non-synchronized state ...
955 * and the incoming segment acknowledges something not yet
956	 * sent (the segment carries an unacceptable ACK) ...
957 * a reset is sent."
958 *
959 * Invalid ACK: reset will be sent by listening socket
960 */
961 if ((flg & TCP_FLAG_ACK) &&
962 (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1))
963 return sk;
964
965	/* Also, it would not be a bad idea to check rcv_tsecr, which
966 * is essentially ACK extension and too early or too late values
967 * should cause reset in unsynchronized states.
968 */
969
970 /* RFC793: "first check sequence number". */
971
972 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
973 req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
974 /* Out of window: send ACK and drop. */
975 if (!(flg & TCP_FLAG_RST))
976 req->class->send_ack(skb, req);
977 if (paws_reject)
978 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
979 return NULL;
980 }
981
982 /* In sequence, PAWS is OK. */
983
984 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
985 req->ts_recent = tmp_opt.rcv_tsval;
986
987 if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
988 /* Truncate SYN, it is out of window starting
989 at req->rcv_isn+1. */
990 flg &= ~TCP_FLAG_SYN;
991 }
992
993 /* RFC793: "second check the RST bit" and
994 * "fourth, check the SYN bit"
995 */
996 if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
997 goto embryonic_reset;
998
999 /* ACK sequence verified above, just make sure ACK is
1000 * set. If ACK not set, just silently drop the packet.
1001 */
1002 if (!(flg & TCP_FLAG_ACK))
1003 return NULL;
1004
1005 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
1006 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
1007 req->acked = 1;
1008 return NULL;
1009 }
1010
1011 /* OK, ACK is valid, create big socket and
1012 * feed this segment to it. It will repeat all
1013 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
1014 * ESTABLISHED STATE. If it will be dropped after
1015 * socket is created, wait for troubles.
1016 */
1017 child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
1018 if (child == NULL)
1019 goto listen_overflow;
1020
1021 tcp_synq_unlink(tp, req, prev);
1022 tcp_synq_removed(sk, req);
1023
1024 tcp_acceptq_queue(sk, req, child);
1025 return child;
1026
1027 listen_overflow:
1028 if (!sysctl_tcp_abort_on_overflow) {
1029 req->acked = 1;
1030 return NULL;
1031 }
1032
1033 embryonic_reset:
1034 NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
1035 if (!(flg & TCP_FLAG_RST))
1036 req->class->send_reset(skb);
1037
1038 tcp_synq_drop(sk, req, prev);
1039 return NULL;
1040}
1041
1042/*
1043 * Queue segment on the new socket if the new socket is active,
1044	 * otherwise we just short-circuit this and continue with
1045 * the new socket.
1046 */
1047
1048int tcp_child_process(struct sock *parent, struct sock *child,
1049 struct sk_buff *skb)
1050{
1051 int ret = 0;
1052 int state = child->sk_state;
1053
1054 if (!sock_owned_by_user(child)) {
1055 ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
1056
1057 /* Wakeup parent, send SIGIO */
1058 if (state == TCP_SYN_RECV && child->sk_state != state)
1059 parent->sk_data_ready(parent, 0);
1060 } else {
1061		/* Alas, it is possible again, because we do the lookup
1062		 * in the main socket hash table and the lock on the listening
1063		 * socket no longer protects us.
1064 */
1065 sk_add_backlog(child, skb);
1066 }
1067
1068 bh_unlock_sock(child);
1069 sock_put(child);
1070 return ret;
1071}
1072
1073EXPORT_SYMBOL(tcp_check_req);
1074EXPORT_SYMBOL(tcp_child_process);
1075EXPORT_SYMBOL(tcp_create_openreq_child);
1076EXPORT_SYMBOL(tcp_timewait_state_process);
1077EXPORT_SYMBOL(tcp_tw_deschedule);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
new file mode 100644
index 000000000000..13c14cb6dee4
--- /dev/null
+++ b/net/ipv4/tcp_output.c
@@ -0,0 +1,1739 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 */
22
23/*
24 * Changes: Pedro Roque : Retransmit queue handled by TCP.
25 * : Fragmentation on mtu decrease
26 * : Segment collapse on retransmit
27 * : AF independence
28 *
29 * Linus Torvalds : send_delayed_ack
30 * David S. Miller : Charge memory using the right skb
31 * during syn/ack processing.
32 * David S. Miller : Output engine completely rewritten.
33 * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
34 * Cacophonix Gaul : draft-minshall-nagle-01
35 * J Hadi Salim : ECN support
36 *
37 */
38
39#include <net/tcp.h>
40
41#include <linux/compiler.h>
42#include <linux/module.h>
43#include <linux/smp_lock.h>
44
45/* People can turn this off for buggy TCPs found in printers etc. */
46int sysctl_tcp_retrans_collapse = 1;
47
48/* This limits the percentage of the congestion window which we
49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty.
51 */
52int sysctl_tcp_tso_win_divisor = 8;
53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb)
56{
57 sk->sk_send_head = skb->next;
58 if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
59 sk->sk_send_head = NULL;
60 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
61 tcp_packets_out_inc(sk, tp, skb);
62}
63
64/* SND.NXT, if window was not shrunk.
65 * If the window has been shrunk, what should we do? It is not clear at all.
66 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
67 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
68 * invalid. OK, let's make this for now:
69 */
70static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp)
71{
72 if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
73 return tp->snd_nxt;
74 else
75 return tp->snd_una+tp->snd_wnd;
76}
77
78/* Calculate mss to advertise in SYN segment.
79 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
80 *
81 * 1. It is independent of path mtu.
82 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
83 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
84 * attached devices, because some buggy hosts are confused by
85 * large MSS.
86 *    4. We do not do 3; we advertise an MSS calculated from the first
87 *       hop device mtu, but allow it to be raised to ip_rt_min_advmss.
88 * This may be overridden via information stored in routing table.
89 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
90 * probably even Jumbo".
91 */
92static __u16 tcp_advertise_mss(struct sock *sk)
93{
94 struct tcp_sock *tp = tcp_sk(sk);
95 struct dst_entry *dst = __sk_dst_get(sk);
96 int mss = tp->advmss;
97
98 if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
99 mss = dst_metric(dst, RTAX_ADVMSS);
100 tp->advmss = mss;
101 }
102
103 return (__u16)mss;
104}
105
106/* RFC2861. Reset CWND after an idle period longer than RTO to the "restart window".
107 * This is the first part of cwnd validation mechanism. */
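/* Rough illustration of the decay below: after sitting idle for a bit
 * more than three RTOs with snd_cwnd == 32 and a restart window of 4,
 * the loop halves cwnd three times (32 -> 16 -> 8 -> 4); the result is
 * never pushed below restart_cwnd.
 */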
108static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
109{
110 s32 delta = tcp_time_stamp - tp->lsndtime;
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd;
113
114 if (tcp_is_vegas(tp))
115 tcp_vegas_enable(tp);
116
117 tp->snd_ssthresh = tcp_current_ssthresh(tp);
118 restart_cwnd = min(restart_cwnd, cwnd);
119
120 while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
121 cwnd >>= 1;
122 tp->snd_cwnd = max(cwnd, restart_cwnd);
123 tp->snd_cwnd_stamp = tcp_time_stamp;
124 tp->snd_cwnd_used = 0;
125}
126
127static inline void tcp_event_data_sent(struct tcp_sock *tp,
128 struct sk_buff *skb, struct sock *sk)
129{
130 u32 now = tcp_time_stamp;
131
132 if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
133 tcp_cwnd_restart(tp, __sk_dst_get(sk));
134
135 tp->lsndtime = now;
136
137	/* If this is a reply sent within ATO of the last received
138	 * packet, enter pingpong mode.
139 */
140 if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
141 tp->ack.pingpong = 1;
142}
143
144static __inline__ void tcp_event_ack_sent(struct sock *sk)
145{
146 struct tcp_sock *tp = tcp_sk(sk);
147
148 tcp_dec_quickack_mode(tp);
149 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
150}
151
152/* Determine a window scaling and initial window to offer.
153 * Based on the assumption that the given amount of space
154 * will be offered. Store the results in the tp structure.
155 * NOTE: for smooth operation initial space offering should
156 * be a multiple of mss if possible. We assume here that mss >= 1.
157 * This MUST be enforced by all callers.
158 */
159void tcp_select_initial_window(int __space, __u32 mss,
160 __u32 *rcv_wnd, __u32 *window_clamp,
161 int wscale_ok, __u8 *rcv_wscale)
162{
163 unsigned int space = (__space < 0 ? 0 : __space);
164
165 /* If no clamp set the clamp to the max possible scaled window */
166 if (*window_clamp == 0)
167 (*window_clamp) = (65535 << 14);
168 space = min(*window_clamp, space);
169
170 /* Quantize space offering to a multiple of mss if possible. */
171 if (space > mss)
172 space = (space / mss) * mss;
173
174 /* NOTE: offering an initial window larger than 32767
175 * will break some buggy TCP stacks. We try to be nice.
176 * If we are not window scaling, then this truncates
177 * our initial window offering to 32k. There should also
178 * be a sysctl option to stop being nice.
179 */
180 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
181 (*rcv_wscale) = 0;
182 if (wscale_ok) {
183 /* Set window scaling on max possible window
184 * See RFC1323 for an explanation of the limit to 14
185 */
186 space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
187 while (space > 65535 && (*rcv_wscale) < 14) {
188 space >>= 1;
189 (*rcv_wscale)++;
190 }
191 }
192
193	/* Set the initial window to a value large enough for senders
194	 * following RFC2414 (increased initial windows); senders not
195	 * following this RFC will be satisfied with 2.
196 */
197 if (mss > (1<<*rcv_wscale)) {
198 int init_cwnd = 4;
199 if (mss > 1460*3)
200 init_cwnd = 2;
201 else if (mss > 1460)
202 init_cwnd = 3;
203 if (*rcv_wnd > init_cwnd*mss)
204 *rcv_wnd = init_cwnd*mss;
205 }
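	/* e.g. a 1460-byte mss caps the initial offer at 4*1460 = 5840
	 * bytes, while an mss above 3*1460 gets only 2*mss.
	 */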
206
207 /* Set the clamp no higher than max representable value */
208 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
209}
210
211/* Chose a new window to advertise, update state in tcp_sock for the
212 * socket, and return result with RFC1323 scaling applied. The return
213 * value can be stuffed directly into th->window for an outgoing
214 * frame.
215 */
216static __inline__ u16 tcp_select_window(struct sock *sk)
217{
218 struct tcp_sock *tp = tcp_sk(sk);
219 u32 cur_win = tcp_receive_window(tp);
220 u32 new_win = __tcp_select_window(sk);
221
222 /* Never shrink the offered window */
223 if(new_win < cur_win) {
224 /* Danger Will Robinson!
225 * Don't update rcv_wup/rcv_wnd here or else
226 * we will not be able to advertise a zero
227 * window in time. --DaveM
228 *
229 * Relax Will Robinson.
230 */
231 new_win = cur_win;
232 }
233 tp->rcv_wnd = new_win;
234 tp->rcv_wup = tp->rcv_nxt;
235
236 /* Make sure we do not exceed the maximum possible
237 * scaled window.
238 */
239 if (!tp->rx_opt.rcv_wscale)
240 new_win = min(new_win, MAX_TCP_WINDOW);
241 else
242 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
243
244 /* RFC1323 scaling applied */
245 new_win >>= tp->rx_opt.rcv_wscale;
246
247 /* If we advertise zero window, disable fast path. */
248 if (new_win == 0)
249 tp->pred_flags = 0;
250
251 return new_win;
252}
253
254
255/* This routine actually transmits TCP packets queued in by
256 * tcp_do_sendmsg(). This is used by both the initial
257 * transmission and possible later retransmissions.
258 * All SKB's seen here are completely headerless. It is our
259 * job to build the TCP header, and pass the packet down to
260 * IP so it can do the same plus pass the packet off to the
261 * device.
262 *
263 * We are working here with either a clone of the original
264 * SKB, or a fresh unique copy made by the retransmit engine.
265 */
266static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
267{
268 if (skb != NULL) {
269 struct inet_sock *inet = inet_sk(sk);
270 struct tcp_sock *tp = tcp_sk(sk);
271 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
272 int tcp_header_size = tp->tcp_header_len;
273 struct tcphdr *th;
274 int sysctl_flags;
275 int err;
276
277 BUG_ON(!tcp_skb_pcount(skb));
278
279#define SYSCTL_FLAG_TSTAMPS 0x1
280#define SYSCTL_FLAG_WSCALE 0x2
281#define SYSCTL_FLAG_SACK 0x4
282
283 sysctl_flags = 0;
284 if (tcb->flags & TCPCB_FLAG_SYN) {
285 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
286 if(sysctl_tcp_timestamps) {
287 tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
288 sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
289 }
290 if(sysctl_tcp_window_scaling) {
291 tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
292 sysctl_flags |= SYSCTL_FLAG_WSCALE;
293 }
294 if(sysctl_tcp_sack) {
295 sysctl_flags |= SYSCTL_FLAG_SACK;
296 if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
297 tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
298 }
299 } else if (tp->rx_opt.eff_sacks) {
300 /* A SACK is 2 pad bytes, a 2 byte header, plus
301 * 2 32-bit sequence numbers for each SACK block.
302 */
303 tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
304 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
305 }
306
307 /*
308 * If the connection is idle and we are restarting,
309 * then we don't want to do any Vegas calculations
310 * until we get fresh RTT samples. So when we
311 * restart, we reset our Vegas state to a clean
312 * slate. After we get acks for this flight of
313 * packets, _then_ we can make Vegas calculations
314 * again.
315 */
316 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
317 tcp_vegas_enable(tp);
318
319 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
320 skb->h.th = th;
321 skb_set_owner_w(skb, sk);
322
323 /* Build TCP header and checksum it. */
324 th->source = inet->sport;
325 th->dest = inet->dport;
326 th->seq = htonl(tcb->seq);
327 th->ack_seq = htonl(tp->rcv_nxt);
328 *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
329 if (tcb->flags & TCPCB_FLAG_SYN) {
330 /* RFC1323: The window in SYN & SYN/ACK segments
331 * is never scaled.
332 */
333 th->window = htons(tp->rcv_wnd);
334 } else {
335 th->window = htons(tcp_select_window(sk));
336 }
337 th->check = 0;
338 th->urg_ptr = 0;
339
340 if (tp->urg_mode &&
341 between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
342 th->urg_ptr = htons(tp->snd_up-tcb->seq);
343 th->urg = 1;
344 }
345
346 if (tcb->flags & TCPCB_FLAG_SYN) {
347 tcp_syn_build_options((__u32 *)(th + 1),
348 tcp_advertise_mss(sk),
349 (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
350 (sysctl_flags & SYSCTL_FLAG_SACK),
351 (sysctl_flags & SYSCTL_FLAG_WSCALE),
352 tp->rx_opt.rcv_wscale,
353 tcb->when,
354 tp->rx_opt.ts_recent);
355 } else {
356 tcp_build_and_update_options((__u32 *)(th + 1),
357 tp, tcb->when);
358
359 TCP_ECN_send(sk, tp, skb, tcp_header_size);
360 }
361 tp->af_specific->send_check(sk, th, skb->len, skb);
362
363 if (tcb->flags & TCPCB_FLAG_ACK)
364 tcp_event_ack_sent(sk);
365
366 if (skb->len != tcp_header_size)
367 tcp_event_data_sent(tp, skb, sk);
368
369 TCP_INC_STATS(TCP_MIB_OUTSEGS);
370
371 err = tp->af_specific->queue_xmit(skb, 0);
372 if (err <= 0)
373 return err;
374
375 tcp_enter_cwr(tp);
376
377 /* NET_XMIT_CN is special. It does not guarantee,
378 * that this packet is lost. It tells that device
379 * is about to start to drop packets or already
380 * drops some packets of the same priority and
381 * invokes us to send less aggressively.
382 */
383 return err == NET_XMIT_CN ? 0 : err;
384 }
385 return -ENOBUFS;
386#undef SYSCTL_FLAG_TSTAMPS
387#undef SYSCTL_FLAG_WSCALE
388#undef SYSCTL_FLAG_SACK
389}
390
391
392/* This routine just queues the buffer
393 *
394 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
395 * otherwise socket can stall.
396 */
397static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
398{
399 struct tcp_sock *tp = tcp_sk(sk);
400
401 /* Advance write_seq and place onto the write_queue. */
402 tp->write_seq = TCP_SKB_CB(skb)->end_seq;
403 skb_header_release(skb);
404 __skb_queue_tail(&sk->sk_write_queue, skb);
405 sk_charge_skb(sk, skb);
406
407 /* Queue it, remembering where we must start sending. */
408 if (sk->sk_send_head == NULL)
409 sk->sk_send_head = skb;
410}
411
412static inline void tcp_tso_set_push(struct sk_buff *skb)
413{
414 /* Force push to be on for any TSO frames to workaround
415 * problems with busted implementations like Mac OS-X that
416 * hold off socket receive wakeups until push is seen.
417 */
418 if (tcp_skb_pcount(skb) > 1)
419 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
420}
421
422/* Send _single_ skb sitting at the send head. This function requires
423 * true push pending frames to setup probe timer etc.
424 */
425void tcp_push_one(struct sock *sk, unsigned cur_mss)
426{
427 struct tcp_sock *tp = tcp_sk(sk);
428 struct sk_buff *skb = sk->sk_send_head;
429
430 if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) {
431 /* Send it out now. */
432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
433 tcp_tso_set_push(skb);
434 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
435 sk->sk_send_head = NULL;
436 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
437 tcp_packets_out_inc(sk, tp, skb);
438 return;
439 }
440 }
441}
442
443void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std)
444{
445 if (skb->len <= mss_std) {
446 /* Avoid the costly divide in the normal
447 * non-TSO case.
448 */
449 skb_shinfo(skb)->tso_segs = 1;
450 skb_shinfo(skb)->tso_size = 0;
451 } else {
452 unsigned int factor;
453
454 factor = skb->len + (mss_std - 1);
455 factor /= mss_std;
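		/* e.g. a 4096-byte skb with a 1460-byte mss_std yields
		 * factor = (4096 + 1459) / 1460 = 3 segments.
		 */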
456 skb_shinfo(skb)->tso_segs = factor;
457 skb_shinfo(skb)->tso_size = mss_std;
458 }
459}
460
461/* Function to create two new TCP segments. Shrinks the given segment
462 * to the specified size and appends a new segment with the rest of the
463 * packet to the list. This won't be called frequently, I hope.
464 * Remember, these are still headerless SKBs at this point.
465 */
466static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
467{
468 struct tcp_sock *tp = tcp_sk(sk);
469 struct sk_buff *buff;
470 int nsize;
471 u16 flags;
472
473 nsize = skb_headlen(skb) - len;
474 if (nsize < 0)
475 nsize = 0;
476
477 if (skb_cloned(skb) &&
478 skb_is_nonlinear(skb) &&
479 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
480 return -ENOMEM;
481
482 /* Get a new skb... force flag on. */
483 buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
484 if (buff == NULL)
485 return -ENOMEM; /* We'll just try again later. */
486 sk_charge_skb(sk, buff);
487
488 /* Correct the sequence numbers. */
489 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
490 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
491 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
492
493 /* PSH and FIN should only be set in the second packet. */
494 flags = TCP_SKB_CB(skb)->flags;
495 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
496 TCP_SKB_CB(buff)->flags = flags;
497 TCP_SKB_CB(buff)->sacked =
498 (TCP_SKB_CB(skb)->sacked &
499 (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
500 TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
501
502 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
503 /* Copy and checksum data tail into the new buffer. */
504 buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
505 nsize, 0);
506
507 skb_trim(skb, len);
508
509 skb->csum = csum_block_sub(skb->csum, buff->csum, len);
510 } else {
511 skb->ip_summed = CHECKSUM_HW;
512 skb_split(skb, buff, len);
513 }
514
515 buff->ip_summed = skb->ip_summed;
516
517	/* Looks stupid, but our code really uses the 'when' field of
518	 * skbs which it has never sent before. --ANK
519 */
520 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
521
522 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
523 tp->lost_out -= tcp_skb_pcount(skb);
524 tp->left_out -= tcp_skb_pcount(skb);
525 }
526
527 /* Fix up tso_factor for both original and new SKB. */
528 tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
529 tcp_set_skb_tso_segs(buff, tp->mss_cache_std);
530
531 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
532 tp->lost_out += tcp_skb_pcount(skb);
533 tp->left_out += tcp_skb_pcount(skb);
534 }
535
536 if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
537 tp->lost_out += tcp_skb_pcount(buff);
538 tp->left_out += tcp_skb_pcount(buff);
539 }
540
541 /* Link BUFF into the send queue. */
542 __skb_append(skb, buff);
543
544 return 0;
545}
546
547/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
548 * eventually). The difference is that the pulled data is not copied, but
549 * immediately discarded.
550 */
551static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
552{
553 int i, k, eat;
554
555 eat = len;
556 k = 0;
557 for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
558 if (skb_shinfo(skb)->frags[i].size <= eat) {
559 put_page(skb_shinfo(skb)->frags[i].page);
560 eat -= skb_shinfo(skb)->frags[i].size;
561 } else {
562 skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
563 if (eat) {
564 skb_shinfo(skb)->frags[k].page_offset += eat;
565 skb_shinfo(skb)->frags[k].size -= eat;
566 eat = 0;
567 }
568 k++;
569 }
570 }
571 skb_shinfo(skb)->nr_frags = k;
572
573 skb->tail = skb->data;
574 skb->data_len -= len;
575 skb->len = skb->data_len;
576 return skb->tail;
577}
578
579int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
580{
581 if (skb_cloned(skb) &&
582 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
583 return -ENOMEM;
584
585 if (len <= skb_headlen(skb)) {
586 __skb_pull(skb, len);
587 } else {
588 if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
589 return -ENOMEM;
590 }
591
592 TCP_SKB_CB(skb)->seq += len;
593 skb->ip_summed = CHECKSUM_HW;
594
595 skb->truesize -= len;
596 sk->sk_wmem_queued -= len;
597 sk->sk_forward_alloc += len;
598 sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
599
600 /* Any change of skb->len requires recalculation of tso
601 * factor and mss.
602 */
603 if (tcp_skb_pcount(skb) > 1)
604 tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
605
606 return 0;
607}
608
609/* This function synchronizes the snd mss to the current pmtu/exthdr set.
610
611   tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does
612   NOT account for TCP options, but includes only the bare TCP header.
613
614   tp->rx_opt.mss_clamp is the mss negotiated at connection setup.
615   It is the minimum of user_mss and the mss received with the SYN.
616 It also does not include TCP options.
617
618 tp->pmtu_cookie is last pmtu, seen by this function.
619
620 tp->mss_cache is current effective sending mss, including
621 all tcp options except for SACKs. It is evaluated,
622 taking into account current pmtu, but never exceeds
623 tp->rx_opt.mss_clamp.
624
625 NOTE1. rfc1122 clearly states that advertised MSS
626 DOES NOT include either tcp or ip options.
627
628 NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
629 this function. --ANK (980731)
630 */
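/* A worked example, assuming plain IPv4 over Ethernet with timestamps
 * enabled: pmtu 1500 - 20 (IP) - 20 (TCP) = 1460; after subtracting the
 * 12 bytes of aligned timestamp option space already counted in
 * tcp_header_len, the effective sending mss becomes 1448.
 */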
631
632unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
633{
634 struct tcp_sock *tp = tcp_sk(sk);
635 int mss_now;
636
637 /* Calculate base mss without TCP options:
638 It is MMS_S - sizeof(tcphdr) of rfc1122
639 */
640 mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
641
642 /* Clamp it (mss_clamp does not include tcp options) */
643 if (mss_now > tp->rx_opt.mss_clamp)
644 mss_now = tp->rx_opt.mss_clamp;
645
646 /* Now subtract optional transport overhead */
647 mss_now -= tp->ext_header_len;
648
649 /* Then reserve room for full set of TCP options and 8 bytes of data */
650 if (mss_now < 48)
651 mss_now = 48;
652
653 /* Now subtract TCP options size, not including SACKs */
654 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
655
656 /* Bound mss with half of window */
657 if (tp->max_window && mss_now > (tp->max_window>>1))
658 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
659
660 /* And store cached results */
661 tp->pmtu_cookie = pmtu;
662 tp->mss_cache = tp->mss_cache_std = mss_now;
663
664 return mss_now;
665}
666
667/* Compute the current effective MSS, taking SACKs and IP options,
668 * and even PMTU discovery events into account.
669 *
670 * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
671 * cannot be large. However, taking into account rare use of URG, this
672 * is not a big flaw.
673 */
674
675unsigned int tcp_current_mss(struct sock *sk, int large)
676{
677 struct tcp_sock *tp = tcp_sk(sk);
678 struct dst_entry *dst = __sk_dst_get(sk);
679 unsigned int do_large, mss_now;
680
681 mss_now = tp->mss_cache_std;
682 if (dst) {
683 u32 mtu = dst_mtu(dst);
684 if (mtu != tp->pmtu_cookie)
685 mss_now = tcp_sync_mss(sk, mtu);
686 }
687
688 do_large = (large &&
689 (sk->sk_route_caps & NETIF_F_TSO) &&
690 !tp->urg_mode);
691
692 if (do_large) {
693 unsigned int large_mss, factor, limit;
694
695 large_mss = 65535 - tp->af_specific->net_header_len -
696 tp->ext_header_len - tp->tcp_header_len;
697
698 if (tp->max_window && large_mss > (tp->max_window>>1))
699 large_mss = max((tp->max_window>>1),
700 68U - tp->tcp_header_len);
701
702 factor = large_mss / mss_now;
703
704 /* Always keep large mss multiple of real mss, but
705 * do not exceed 1/tso_win_divisor of the congestion window
706 * so we can keep the ACK clock ticking and minimize
707 * bursting.
708 */
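		/* e.g. with snd_cwnd == 20 and the default divisor of 8,
		 * limit becomes 20/8 = 2, so the TSO super-frame is capped
		 * at two real segments.
		 */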
709 limit = tp->snd_cwnd;
710 if (sysctl_tcp_tso_win_divisor)
711 limit /= sysctl_tcp_tso_win_divisor;
712 limit = max(1U, limit);
713 if (factor > limit)
714 factor = limit;
715
716 tp->mss_cache = mss_now * factor;
717
718 mss_now = tp->mss_cache;
719 }
720
721 if (tp->rx_opt.eff_sacks)
722 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
723 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
724 return mss_now;
725}
726
727/* This routine writes packets to the network. It advances the
728 * send_head. This happens as incoming acks open up the remote
729 * window for us.
730 *
731 * Returns 1, if no segments are in flight and we have queued segments, but
732 * cannot send anything now because of SWS or another problem.
733 */
734int tcp_write_xmit(struct sock *sk, int nonagle)
735{
736 struct tcp_sock *tp = tcp_sk(sk);
737 unsigned int mss_now;
738
739 /* If we are closed, the bytes will have to remain here.
740 * In time closedown will finish, we empty the write queue and all
741 * will be happy.
742 */
743 if (sk->sk_state != TCP_CLOSE) {
744 struct sk_buff *skb;
745 int sent_pkts = 0;
746
747		/* Account for SACKs; we may need to fragment due to this.
748 * It is just like the real MSS changing on us midstream.
749 * We also handle things correctly when the user adds some
750 * IP options mid-stream. Silly to do, but cover it.
751 */
752 mss_now = tcp_current_mss(sk, 1);
753
754 while ((skb = sk->sk_send_head) &&
755 tcp_snd_test(tp, skb, mss_now,
756 tcp_skb_is_last(sk, skb) ? nonagle :
757 TCP_NAGLE_PUSH)) {
758 if (skb->len > mss_now) {
759 if (tcp_fragment(sk, skb, mss_now))
760 break;
761 }
762
763 TCP_SKB_CB(skb)->when = tcp_time_stamp;
764 tcp_tso_set_push(skb);
765 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
766 break;
767
768 /* Advance the send_head. This one is sent out.
769 * This call will increment packets_out.
770 */
771 update_send_head(sk, tp, skb);
772
773 tcp_minshall_update(tp, mss_now, skb);
774 sent_pkts = 1;
775 }
776
777 if (sent_pkts) {
778 tcp_cwnd_validate(sk, tp);
779 return 0;
780 }
781
782 return !tp->packets_out && sk->sk_send_head;
783 }
784 return 0;
785}
786
787/* This function returns the amount that we can raise the
788 * usable window based on the following constraints
789 *
790 * 1. The window can never be shrunk once it is offered (RFC 793)
791 * 2. We limit memory per socket
792 *
793 * RFC 1122:
794 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
795 * RECV.NEXT + RCV.WIN fixed until:
796 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
797 *
798 * i.e. don't raise the right edge of the window until you can raise
799 * it at least MSS bytes.
800 *
801 * Unfortunately, the recommended algorithm breaks header prediction,
802 * since header prediction assumes th->window stays fixed.
803 *
804 * Strictly speaking, keeping th->window fixed violates the receiver
805 * side SWS prevention criteria. The problem is that under this rule
806 * a stream of single byte packets will cause the right side of the
807 * window to always advance by a single byte.
808 *
809 * Of course, if the sender implements sender side SWS prevention
810 * then this will not be a problem.
811 *
812 * BSD seems to make the following compromise:
813 *
814 * If the free space is less than the 1/4 of the maximum
815 * space available and the free space is less than 1/2 mss,
816 * then set the window to 0.
817 * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
818 * Otherwise, just prevent the window from shrinking
819 * and from being larger than the largest representable value.
820 *
821 * This prevents incremental opening of the window in the regime
822 * where TCP is limited by the speed of the reader side taking
823 * data out of the TCP receive queue. It does nothing about
824 * those cases where the window is constrained on the sender side
825 * because the pipeline is full.
826 *
827 * BSD also seems to "accidentally" limit itself to windows that are a
828 * multiple of MSS, at least until the free space gets quite small.
829 * This would appear to be a side effect of the mbuf implementation.
830 * Combining these two algorithms results in the observed behavior
831 * of having a fixed window size at almost all times.
832 *
833 * Below we obtain similar behavior by forcing the offered window to
834 * a multiple of the mss when it is feasible to do so.
835 *
836 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
837 * Regular options like TIMESTAMP are taken into account.
838 */
839u32 __tcp_select_window(struct sock *sk)
840{
841 struct tcp_sock *tp = tcp_sk(sk);
842	/* MSS for the peer's data. Previous versions used mss_clamp
843	 * here. I don't know if the value based on our guesses
844	 * of the peer's MSS is better for performance. It's more correct
845	 * but may be worse for performance because of rcv_mss
846 * fluctuations. --SAW 1998/11/1
847 */
848 int mss = tp->ack.rcv_mss;
849 int free_space = tcp_space(sk);
850 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
851 int window;
852
853 if (mss > full_space)
854 mss = full_space;
855
856 if (free_space < full_space/2) {
857 tp->ack.quick = 0;
858
859 if (tcp_memory_pressure)
860 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
861
862 if (free_space < mss)
863 return 0;
864 }
865
866 if (free_space > tp->rcv_ssthresh)
867 free_space = tp->rcv_ssthresh;
868
869 /* Don't do rounding if we are using window scaling, since the
870 * scaled window will not line up with the MSS boundary anyway.
871 */
872 window = tp->rcv_wnd;
873 if (tp->rx_opt.rcv_wscale) {
874 window = free_space;
875
876 /* Advertise enough space so that it won't get scaled away.
877		 * Important case: prevent zero window announcement if
878 * 1<<rcv_wscale > mss.
879 */
880 if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
881 window = (((window >> tp->rx_opt.rcv_wscale) + 1)
882 << tp->rx_opt.rcv_wscale);
883 } else {
884 /* Get the largest window that is a nice multiple of mss.
885 * Window clamp already applied above.
886 * If our current window offering is within 1 mss of the
887 * free space we just keep it. This prevents the divide
888 * and multiply from happening most of the time.
889 * We also don't do any window rounding when the free space
890 * is too small.
891 */
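		/* e.g. when recomputed below, a free_space of 10000 bytes
		 * with a 1460-byte mss rounds down to 6*1460 = 8760 bytes.
		 */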
892 if (window <= free_space - mss || window > free_space)
893 window = (free_space/mss)*mss;
894 }
895
896 return window;
897}
898
899/* Attempt to collapse two adjacent SKB's during retransmission. */
900static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
901{
902 struct tcp_sock *tp = tcp_sk(sk);
903 struct sk_buff *next_skb = skb->next;
904
905 /* The first test we must make is that neither of these two
906 * SKB's are still referenced by someone else.
907 */
908 if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
909 int skb_size = skb->len, next_skb_size = next_skb->len;
910 u16 flags = TCP_SKB_CB(skb)->flags;
911
912 /* Also punt if next skb has been SACK'd. */
913 if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
914 return;
915
916 /* Next skb is out of window. */
917 if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
918 return;
919
920 /* Punt if not enough space exists in the first SKB for
921 * the data in the second, or the total combined payload
922 * would exceed the MSS.
923 */
924 if ((next_skb_size > skb_tailroom(skb)) ||
925 ((skb_size + next_skb_size) > mss_now))
926 return;
927
928 BUG_ON(tcp_skb_pcount(skb) != 1 ||
929 tcp_skb_pcount(next_skb) != 1);
930
931 /* Ok. We will be able to collapse the packet. */
932 __skb_unlink(next_skb, next_skb->list);
933
934 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
935
936 if (next_skb->ip_summed == CHECKSUM_HW)
937 skb->ip_summed = CHECKSUM_HW;
938
939 if (skb->ip_summed != CHECKSUM_HW)
940 skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
941
942 /* Update sequence range on original skb. */
943 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
944
945 /* Merge over control information. */
946 flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
947 TCP_SKB_CB(skb)->flags = flags;
948
949 /* All done, get rid of second SKB and account for it so
950 * packet counting does not break.
951 */
952 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
953 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
954 tp->retrans_out -= tcp_skb_pcount(next_skb);
955 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
956 tp->lost_out -= tcp_skb_pcount(next_skb);
957 tp->left_out -= tcp_skb_pcount(next_skb);
958 }
959 /* Reno case is special. Sigh... */
960 if (!tp->rx_opt.sack_ok && tp->sacked_out) {
961 tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
962 tp->left_out -= tcp_skb_pcount(next_skb);
963 }
964
965 /* Not quite right: it can be > snd.fack, but
966 * it is better to underestimate fackets.
967 */
968 tcp_dec_pcount_approx(&tp->fackets_out, next_skb);
969 tcp_packets_out_dec(tp, next_skb);
970 sk_stream_free_skb(sk, next_skb);
971 }
972}
973
974/* Do a simple retransmit without using the backoff mechanisms in
975 * tcp_timer. This is used for path mtu discovery.
976 * The socket is already locked here.
977 */
978void tcp_simple_retransmit(struct sock *sk)
979{
980 struct tcp_sock *tp = tcp_sk(sk);
981 struct sk_buff *skb;
982 unsigned int mss = tcp_current_mss(sk, 0);
983 int lost = 0;
984
985 sk_stream_for_retrans_queue(skb, sk) {
986 if (skb->len > mss &&
987 !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
988 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
989 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
990 tp->retrans_out -= tcp_skb_pcount(skb);
991 }
992 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
993 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
994 tp->lost_out += tcp_skb_pcount(skb);
995 lost = 1;
996 }
997 }
998 }
999
1000 if (!lost)
1001 return;
1002
1003 tcp_sync_left_out(tp);
1004
1005 /* Don't muck with the congestion window here.
1006 * Reason is that we do not increase amount of _data_
1007 * in network, but units changed and effective
1008 * cwnd/ssthresh really reduced now.
1009 */
1010 if (tp->ca_state != TCP_CA_Loss) {
1011 tp->high_seq = tp->snd_nxt;
1012 tp->snd_ssthresh = tcp_current_ssthresh(tp);
1013 tp->prior_ssthresh = 0;
1014 tp->undo_marker = 0;
1015 tcp_set_ca_state(tp, TCP_CA_Loss);
1016 }
1017 tcp_xmit_retransmit_queue(sk);
1018}
1019
1020/* This retransmits one SKB. Policy decisions and retransmit queue
1021 * state updates are done by the caller. Returns non-zero if an
1022 * error occurred which prevented the send.
1023 */
1024int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1025{
1026 struct tcp_sock *tp = tcp_sk(sk);
1027 unsigned int cur_mss = tcp_current_mss(sk, 0);
1028 int err;
1029
1030	/* Do not send more than we queued. 1/4 is reserved for possible
1031	 * copying overhead: fragmentation, tunneling, mangling etc.
1032 */
1033 if (atomic_read(&sk->sk_wmem_alloc) >
1034 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
1035 return -EAGAIN;
1036
1037 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
1038 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1039 BUG();
1040
1041 if (sk->sk_route_caps & NETIF_F_TSO) {
1042 sk->sk_route_caps &= ~NETIF_F_TSO;
1043 sock_set_flag(sk, SOCK_NO_LARGESEND);
1044 tp->mss_cache = tp->mss_cache_std;
1045 }
1046
1047 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
1048 return -ENOMEM;
1049 }
1050
1051 /* If receiver has shrunk his window, and skb is out of
1052 * new window, do not retransmit it. The exception is the
1053 * case, when window is shrunk to zero. In this case
1054 * our retransmit serves as a zero window probe.
1055 */
1056 if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
1057 && TCP_SKB_CB(skb)->seq != tp->snd_una)
1058 return -EAGAIN;
1059
1060 if (skb->len > cur_mss) {
1061 int old_factor = tcp_skb_pcount(skb);
1062 int new_factor;
1063
1064 if (tcp_fragment(sk, skb, cur_mss))
1065 return -ENOMEM; /* We'll try again later. */
1066
1067 /* New SKB created, account for it. */
1068 new_factor = tcp_skb_pcount(skb);
1069 tp->packets_out -= old_factor - new_factor;
1070 tp->packets_out += tcp_skb_pcount(skb->next);
1071 }
1072
1073 /* Collapse two adjacent packets if worthwhile and we can. */
1074 if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
1075 (skb->len < (cur_mss >> 1)) &&
1076 (skb->next != sk->sk_send_head) &&
1077 (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
1078 (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
1079 (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) &&
1080 (sysctl_tcp_retrans_collapse != 0))
1081 tcp_retrans_try_collapse(sk, skb, cur_mss);
1082
1083 if(tp->af_specific->rebuild_header(sk))
1084 return -EHOSTUNREACH; /* Routing failure or similar. */
1085
1086 /* Some Solaris stacks overoptimize and ignore the FIN on a
1087 * retransmit when old data is attached. So strip it off
1088 * since it is cheap to do so and saves bytes on the network.
1089 */
1090 if(skb->len > 0 &&
1091 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
1092 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
1093 if (!pskb_trim(skb, 0)) {
1094 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
1095 skb_shinfo(skb)->tso_segs = 1;
1096 skb_shinfo(skb)->tso_size = 0;
1097 skb->ip_summed = CHECKSUM_NONE;
1098 skb->csum = 0;
1099 }
1100 }
1101
1102 /* Make a copy, if the first transmission SKB clone we made
1103 * is still in somebody's hands, else make a clone.
1104 */
1105 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1106 tcp_tso_set_push(skb);
1107
1108 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1109 pskb_copy(skb, GFP_ATOMIC):
1110 skb_clone(skb, GFP_ATOMIC)));
1111
1112 if (err == 0) {
1113 /* Update global TCP statistics. */
1114 TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
1115
1116 tp->total_retrans++;
1117
1118#if FASTRETRANS_DEBUG > 0
1119 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
1120 if (net_ratelimit())
1121 printk(KERN_DEBUG "retrans_out leaked.\n");
1122 }
1123#endif
1124 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
1125 tp->retrans_out += tcp_skb_pcount(skb);
1126
1127 /* Save stamp of the first retransmit. */
1128 if (!tp->retrans_stamp)
1129 tp->retrans_stamp = TCP_SKB_CB(skb)->when;
1130
1131 tp->undo_retrans++;
1132
1133 /* snd_nxt is stored to detect loss of retransmitted segment,
1134 * see tcp_input.c tcp_sacktag_write_queue().
1135 */
1136 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
1137 }
1138 return err;
1139}
1140
1141/* This gets called after a retransmit timeout, and the initially
1142 * retransmitted data is acknowledged. It tries to continue
1143 * resending the rest of the retransmit queue, until either
1144 * we've sent it all or the congestion window limit is reached.
1145 * If doing SACK, the first ACK which comes back for a timeout
1146 * based retransmit packet might feed us FACK information again.
1147 * If so, we use it to avoid unnecessary retransmissions.
1148 */
1149void tcp_xmit_retransmit_queue(struct sock *sk)
1150{
1151 struct tcp_sock *tp = tcp_sk(sk);
1152 struct sk_buff *skb;
1153 int packet_cnt = tp->lost_out;
1154
1155 /* First pass: retransmit lost packets. */
1156 if (packet_cnt) {
1157 sk_stream_for_retrans_queue(skb, sk) {
1158 __u8 sacked = TCP_SKB_CB(skb)->sacked;
1159
1160 /* Assume this retransmit will generate
1161 * only one packet for congestion window
1162 * calculation purposes. This works because
1163 * tcp_retransmit_skb() will chop up the
1164 * packet to be MSS sized and all the
1165 * packet counting works out.
1166 */
1167 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1168 return;
1169
1170 if (sacked&TCPCB_LOST) {
1171 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
1172 if (tcp_retransmit_skb(sk, skb))
1173 return;
1174 if (tp->ca_state != TCP_CA_Loss)
1175 NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
1176 else
1177 NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
1178
1179 if (skb ==
1180 skb_peek(&sk->sk_write_queue))
1181 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1182 }
1183
1184 packet_cnt -= tcp_skb_pcount(skb);
1185 if (packet_cnt <= 0)
1186 break;
1187 }
1188 }
1189 }
1190
1191 /* OK, demanded retransmission is finished. */
1192
1193 /* Forward retransmissions are possible only during Recovery. */
1194 if (tp->ca_state != TCP_CA_Recovery)
1195 return;
1196
1197 /* No forward retransmissions in Reno are possible. */
1198 if (!tp->rx_opt.sack_ok)
1199 return;
1200
1201	/* Yeah, we have to make a difficult choice between forward transmission
1202 * and retransmission... Both ways have their merits...
1203 *
1204 * For now we do not retransmit anything, while we have some new
1205 * segments to send.
1206 */
1207
1208 if (tcp_may_send_now(sk, tp))
1209 return;
1210
1211 packet_cnt = 0;
1212
1213 sk_stream_for_retrans_queue(skb, sk) {
1214 /* Similar to the retransmit loop above we
1215 * can pretend that the retransmitted SKB
1216 * we send out here will be composed of one
1217 * real MSS sized packet because tcp_retransmit_skb()
1218 * will fragment it if necessary.
1219 */
1220 if (++packet_cnt > tp->fackets_out)
1221 break;
1222
1223 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1224 break;
1225
1226 if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
1227 continue;
1228
1229 /* Ok, retransmit it. */
1230 if (tcp_retransmit_skb(sk, skb))
1231 break;
1232
1233 if (skb == skb_peek(&sk->sk_write_queue))
1234 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1235
1236 NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
1237 }
1238}
1239
1240
1241/* Send a fin. The caller locks the socket for us. This cannot be
1242 * allowed to fail queueing a FIN frame under any circumstances.
1243 */
1244void tcp_send_fin(struct sock *sk)
1245{
1246 struct tcp_sock *tp = tcp_sk(sk);
1247 struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
1248 int mss_now;
1249
1250 /* Optimization, tack on the FIN if we have a queue of
1251 * unsent frames. But be careful about outgoing SACKS
1252 * and IP options.
1253 */
1254 mss_now = tcp_current_mss(sk, 1);
1255
1256 if (sk->sk_send_head != NULL) {
1257 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
1258 TCP_SKB_CB(skb)->end_seq++;
1259 tp->write_seq++;
1260 } else {
1261 /* Socket is locked, keep trying until memory is available. */
1262 for (;;) {
1263 skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
1264 if (skb)
1265 break;
1266 yield();
1267 }
1268
1269 /* Reserve space for headers and prepare control bits. */
1270 skb_reserve(skb, MAX_TCP_HEADER);
1271 skb->csum = 0;
1272 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
1273 TCP_SKB_CB(skb)->sacked = 0;
1274 skb_shinfo(skb)->tso_segs = 1;
1275 skb_shinfo(skb)->tso_size = 0;
1276
1277 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
1278 TCP_SKB_CB(skb)->seq = tp->write_seq;
1279 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1280 tcp_queue_skb(sk, skb);
1281 }
1282 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
1283}
1284
1285/* We get here when a process closes a file descriptor (either due to
1286 * an explicit close() or as a byproduct of exit()'ing) and there
1287 * was unread data in the receive queue. This behavior is recommended
1288 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1289 */
1290void tcp_send_active_reset(struct sock *sk, int priority)
1291{
1292 struct tcp_sock *tp = tcp_sk(sk);
1293 struct sk_buff *skb;
1294
1295 /* NOTE: No TCP options attached and we never retransmit this. */
1296 skb = alloc_skb(MAX_TCP_HEADER, priority);
1297 if (!skb) {
1298 NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
1299 return;
1300 }
1301
1302 /* Reserve space for headers and prepare control bits. */
1303 skb_reserve(skb, MAX_TCP_HEADER);
1304 skb->csum = 0;
1305 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
1306 TCP_SKB_CB(skb)->sacked = 0;
1307 skb_shinfo(skb)->tso_segs = 1;
1308 skb_shinfo(skb)->tso_size = 0;
1309
1310 /* Send it off. */
1311 TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
1312 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1313 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1314 if (tcp_transmit_skb(sk, skb))
1315 NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
1316}
1317
1318/* WARNING: This routine must only be called when we have already sent
1319 * a SYN packet that crossed the incoming SYN that caused this routine
1320 * to get called. If this assumption fails then the initial rcv_wnd
1321 * and rcv_wscale values will not be correct.
1322 */
1323int tcp_send_synack(struct sock *sk)
1324{
1325 struct sk_buff* skb;
1326
1327 skb = skb_peek(&sk->sk_write_queue);
1328 if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
1329 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
1330 return -EFAULT;
1331 }
1332 if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
1333 if (skb_cloned(skb)) {
1334 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
1335 if (nskb == NULL)
1336 return -ENOMEM;
1337 __skb_unlink(skb, &sk->sk_write_queue);
1338 skb_header_release(nskb);
1339 __skb_queue_head(&sk->sk_write_queue, nskb);
1340 sk_stream_free_skb(sk, skb);
1341 sk_charge_skb(sk, nskb);
1342 skb = nskb;
1343 }
1344
1345 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
1346 TCP_ECN_send_synack(tcp_sk(sk), skb);
1347 }
1348 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1349 return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1350}
1351
1352/*
1353 * Prepare a SYN-ACK.
1354 */
1355struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1356 struct open_request *req)
1357{
1358 struct tcp_sock *tp = tcp_sk(sk);
1359 struct tcphdr *th;
1360 int tcp_header_size;
1361 struct sk_buff *skb;
1362
1363 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
1364 if (skb == NULL)
1365 return NULL;
1366
1367 /* Reserve space for headers. */
1368 skb_reserve(skb, MAX_TCP_HEADER);
1369
1370 skb->dst = dst_clone(dst);
1371
1372 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1373 (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1374 (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1375 /* SACK_PERM is in the place of NOP NOP of TS */
1376 ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1377 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1378
1379 memset(th, 0, sizeof(struct tcphdr));
1380 th->syn = 1;
1381 th->ack = 1;
1382 if (dst->dev->features&NETIF_F_TSO)
1383 req->ecn_ok = 0;
1384 TCP_ECN_make_synack(req, th);
1385 th->source = inet_sk(sk)->sport;
1386 th->dest = req->rmt_port;
1387 TCP_SKB_CB(skb)->seq = req->snt_isn;
1388 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1389 TCP_SKB_CB(skb)->sacked = 0;
1390 skb_shinfo(skb)->tso_segs = 1;
1391 skb_shinfo(skb)->tso_size = 0;
1392 th->seq = htonl(TCP_SKB_CB(skb)->seq);
1393 th->ack_seq = htonl(req->rcv_isn + 1);
1394 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1395 __u8 rcv_wscale;
1396 /* Set this up on the first call only */
1397 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
1398 /* tcp_full_space because it is guaranteed to be the first packet */
1399 tcp_select_initial_window(tcp_full_space(sk),
1400 dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1401 &req->rcv_wnd,
1402 &req->window_clamp,
1403 req->wscale_ok,
1404 &rcv_wscale);
1405 req->rcv_wscale = rcv_wscale;
1406 }
1407
1408 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1409 th->window = htons(req->rcv_wnd);
1410
1411 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1412 tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
1413 req->sack_ok, req->wscale_ok, req->rcv_wscale,
1414 TCP_SKB_CB(skb)->when,
1415 req->ts_recent);
1416
1417 skb->csum = 0;
1418 th->doff = (tcp_header_size >> 2);
1419 TCP_INC_STATS(TCP_MIB_OUTSEGS);
1420 return skb;
1421}
1422
1423/*
1424 * Do all connect socket setups that can be done AF independent.
1425 */
1426static inline void tcp_connect_init(struct sock *sk)
1427{
1428 struct dst_entry *dst = __sk_dst_get(sk);
1429 struct tcp_sock *tp = tcp_sk(sk);
1430 __u8 rcv_wscale;
1431
1432 /* We'll fix this up when we get a response from the other end.
1433 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
1434 */
1435 tp->tcp_header_len = sizeof(struct tcphdr) +
1436 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
1437
1438 /* If user gave his TCP_MAXSEG, record it to clamp */
1439 if (tp->rx_opt.user_mss)
1440 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
1441 tp->max_window = 0;
1442 tcp_sync_mss(sk, dst_mtu(dst));
1443
1444 if (!tp->window_clamp)
1445 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1446 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1447 tcp_initialize_rcv_mss(sk);
1448 tcp_ca_init(tp);
1449
1450 tcp_select_initial_window(tcp_full_space(sk),
1451 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
1452 &tp->rcv_wnd,
1453 &tp->window_clamp,
1454 sysctl_tcp_window_scaling,
1455 &rcv_wscale);
1456
1457 tp->rx_opt.rcv_wscale = rcv_wscale;
1458 tp->rcv_ssthresh = tp->rcv_wnd;
1459
1460 sk->sk_err = 0;
1461 sock_reset_flag(sk, SOCK_DONE);
1462 tp->snd_wnd = 0;
1463 tcp_init_wl(tp, tp->write_seq, 0);
1464 tp->snd_una = tp->write_seq;
1465 tp->snd_sml = tp->write_seq;
1466 tp->rcv_nxt = 0;
1467 tp->rcv_wup = 0;
1468 tp->copied_seq = 0;
1469
1470 tp->rto = TCP_TIMEOUT_INIT;
1471 tp->retransmits = 0;
1472 tcp_clear_retrans(tp);
1473}
1474
1475/*
1476 * Build a SYN and send it off.
1477 */
1478int tcp_connect(struct sock *sk)
1479{
1480 struct tcp_sock *tp = tcp_sk(sk);
1481 struct sk_buff *buff;
1482
1483 tcp_connect_init(sk);
1484
1485 buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
1486 if (unlikely(buff == NULL))
1487 return -ENOBUFS;
1488
1489 /* Reserve space for headers. */
1490 skb_reserve(buff, MAX_TCP_HEADER);
1491
1492 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
1493 TCP_ECN_send_syn(sk, tp, buff);
1494 TCP_SKB_CB(buff)->sacked = 0;
1495 skb_shinfo(buff)->tso_segs = 1;
1496 skb_shinfo(buff)->tso_size = 0;
1497 buff->csum = 0;
1498 TCP_SKB_CB(buff)->seq = tp->write_seq++;
1499 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1500 tp->snd_nxt = tp->write_seq;
1501 tp->pushed_seq = tp->write_seq;
1502 tcp_ca_init(tp);
1503
1504 /* Send it off. */
1505 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1506 tp->retrans_stamp = TCP_SKB_CB(buff)->when;
1507 skb_header_release(buff);
1508 __skb_queue_tail(&sk->sk_write_queue, buff);
1509 sk_charge_skb(sk, buff);
1510 tp->packets_out += tcp_skb_pcount(buff);
1511 tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
1512 TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
1513
1514 /* Timer for repeating the SYN until an answer. */
1515 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1516 return 0;
1517}
1518
1519/* Send out a delayed ack, the caller does the policy checking
1520 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
1521 * for details.
1522 */
1523void tcp_send_delayed_ack(struct sock *sk)
1524{
1525 struct tcp_sock *tp = tcp_sk(sk);
1526 int ato = tp->ack.ato;
1527 unsigned long timeout;
1528
1529 if (ato > TCP_DELACK_MIN) {
1530 int max_ato = HZ/2;
1531
1532 if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
1533 max_ato = TCP_DELACK_MAX;
1534
1535 /* Slow path, intersegment interval is "high". */
1536
1537 /* If some rtt estimate is known, use it to bound delayed ack.
1538 * Do not use tp->rto here, use results of rtt measurements
1539 * directly.
1540 */
1541 if (tp->srtt) {
1542 int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
1543
1544 if (rtt < max_ato)
1545 max_ato = rtt;
1546 }
1547
1548 ato = min(ato, max_ato);
1549 }
1550
1551 /* Stay within the limit we were given */
1552 timeout = jiffies + ato;
1553
1554	/* Use the new timeout only if there wasn't an older one already. */
1555 if (tp->ack.pending&TCP_ACK_TIMER) {
1556 /* If delack timer was blocked or is about to expire,
1557 * send ACK now.
1558 */
1559 if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
1560 tcp_send_ack(sk);
1561 return;
1562 }
1563
1564 if (!time_before(timeout, tp->ack.timeout))
1565 timeout = tp->ack.timeout;
1566 }
1567 tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
1568 tp->ack.timeout = timeout;
1569 sk_reset_timer(sk, &tp->delack_timer, timeout);
1570}
1571
1572/* This routine sends an ack and also updates the window. */
1573void tcp_send_ack(struct sock *sk)
1574{
1575 /* If we have been reset, we may not send again. */
1576 if (sk->sk_state != TCP_CLOSE) {
1577 struct tcp_sock *tp = tcp_sk(sk);
1578 struct sk_buff *buff;
1579
1580 /* We are not putting this on the write queue, so
1581 * tcp_transmit_skb() will set the ownership to this
1582 * sock.
1583 */
1584 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1585 if (buff == NULL) {
1586 tcp_schedule_ack(tp);
1587 tp->ack.ato = TCP_ATO_MIN;
1588 tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
1589 return;
1590 }
1591
1592 /* Reserve space for headers and prepare control bits. */
1593 skb_reserve(buff, MAX_TCP_HEADER);
1594 buff->csum = 0;
1595 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
1596 TCP_SKB_CB(buff)->sacked = 0;
1597 skb_shinfo(buff)->tso_segs = 1;
1598 skb_shinfo(buff)->tso_size = 0;
1599
1600 /* Send it off, this clears delayed acks for us. */
1601 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
1602 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1603 tcp_transmit_skb(sk, buff);
1604 }
1605}
1606
1607/* This routine sends a packet with an out of date sequence
1608 * number. It assumes the other end will try to ack it.
1609 *
1610 * Question: what should we do while in urgent mode?
1611 * 4.4BSD forces sending a single byte of data. We cannot send
1612 * out-of-window data, because we have SND.NXT==SND.MAX...
1613 *
1614 * Current solution: send TWO zero-length segments in urgent mode:
1615 * one with SEG.SEQ=SND.UNA to deliver the urgent pointer, and another,
1616 * out-of-date one with SND.UNA-1 to probe the window.
1617 */
1618static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
1619{
1620 struct tcp_sock *tp = tcp_sk(sk);
1621 struct sk_buff *skb;
1622
1623 /* We don't queue it, tcp_transmit_skb() sets ownership. */
1624 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1625 if (skb == NULL)
1626 return -1;
1627
1628 /* Reserve space for headers and set control bits. */
1629 skb_reserve(skb, MAX_TCP_HEADER);
1630 skb->csum = 0;
1631 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1632 TCP_SKB_CB(skb)->sacked = urgent;
1633 skb_shinfo(skb)->tso_segs = 1;
1634 skb_shinfo(skb)->tso_size = 0;
1635
1636 /* Use a previous sequence. This should cause the other
1637 * end to send an ack. Don't queue or clone SKB, just
1638 * send it.
1639 */
1640 TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
1641 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1642 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1643 return tcp_transmit_skb(sk, skb);
1644}
1645
1646int tcp_write_wakeup(struct sock *sk)
1647{
1648 if (sk->sk_state != TCP_CLOSE) {
1649 struct tcp_sock *tp = tcp_sk(sk);
1650 struct sk_buff *skb;
1651
1652 if ((skb = sk->sk_send_head) != NULL &&
1653 before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
1654 int err;
1655 unsigned int mss = tcp_current_mss(sk, 0);
1656 unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
1657
1658 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
1659 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
1660
1661			/* We are probing the opening of a window
1662			 * but the window size is != 0; this must
1663			 * have been the result of sender-side SWS avoidance.
1664			 */
1665 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
1666 skb->len > mss) {
1667 seg_size = min(seg_size, mss);
1668 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1669 if (tcp_fragment(sk, skb, seg_size))
1670 return -1;
1671 /* SWS override triggered forced fragmentation.
1672 * Disable TSO, the connection is too sick. */
1673 if (sk->sk_route_caps & NETIF_F_TSO) {
1674 sock_set_flag(sk, SOCK_NO_LARGESEND);
1675 sk->sk_route_caps &= ~NETIF_F_TSO;
1676 tp->mss_cache = tp->mss_cache_std;
1677 }
1678 } else if (!tcp_skb_pcount(skb))
1679 tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
1680
1681 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1682 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1683 tcp_tso_set_push(skb);
1684 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1685 if (!err) {
1686 update_send_head(sk, tp, skb);
1687 }
1688 return err;
1689 } else {
1690 if (tp->urg_mode &&
1691 between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
1692 tcp_xmit_probe_skb(sk, TCPCB_URG);
1693 return tcp_xmit_probe_skb(sk, 0);
1694 }
1695 }
1696 return -1;
1697}
1698
1699/* A window probe timeout has occurred. If the window is not closed,
1700 * send a partial packet, else send a zero-window probe.
1701 */
1702void tcp_send_probe0(struct sock *sk)
1703{
1704 struct tcp_sock *tp = tcp_sk(sk);
1705 int err;
1706
1707 err = tcp_write_wakeup(sk);
1708
1709 if (tp->packets_out || !sk->sk_send_head) {
1710 /* Cancel probe timer, if it is not required. */
1711 tp->probes_out = 0;
1712 tp->backoff = 0;
1713 return;
1714 }
1715
1716 if (err <= 0) {
1717 if (tp->backoff < sysctl_tcp_retries2)
1718 tp->backoff++;
1719 tp->probes_out++;
1720 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1721 min(tp->rto << tp->backoff, TCP_RTO_MAX));
1722 } else {
1723 /* If packet was not sent due to local congestion,
1724 * do not backoff and do not remember probes_out.
1725		 * Let local senders fight for local resources.
1726		 *
1727		 * Still use the accumulated backoff, though.
1728 */
1729 if (!tp->probes_out)
1730 tp->probes_out=1;
1731 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1732 min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
1733 }
1734}
1735
1736EXPORT_SYMBOL(tcp_connect);
1737EXPORT_SYMBOL(tcp_make_synack);
1738EXPORT_SYMBOL(tcp_simple_retransmit);
1739EXPORT_SYMBOL(tcp_sync_mss);
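/* Illustrative sketch, not part of tcp_output.c: the zero-window probe
 * schedule armed by tcp_send_probe0() above is min(rto << backoff, TCP_RTO_MAX),
 * with backoff capped by sysctl_tcp_retries2. The constants below (HZ = 1000,
 * TCP_RTO_MAX = 120 s, a 3 s starting RTO, a retries2 cap of 15) are assumed
 * typical values, not taken from this file.
 */
#include <stdio.h>

#define HZ          1000
#define TCP_RTO_MAX (120 * HZ)

int main(void)
{
	unsigned long rto = 3 * HZ;	/* assumed current smoothed RTO */
	int backoff;

	for (backoff = 0; backoff <= 15; backoff++) {
		unsigned long timeout = rto << backoff;

		if (timeout > TCP_RTO_MAX)
			timeout = TCP_RTO_MAX;	/* same clamp as tcp_send_probe0() */
		printf("backoff %2d: next zero-window probe in %lu.%03lu s\n",
		       backoff, timeout / HZ, timeout % HZ);
	}
	return 0;
}
/* With these assumptions the interval doubles from 3 s and saturates at the
 * 120 s ceiling once backoff reaches 6.
 */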
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
new file mode 100644
index 000000000000..85b279f1e935
--- /dev/null
+++ b/net/ipv4/tcp_timer.c
@@ -0,0 +1,656 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 */
22
23#include <linux/module.h>
24#include <net/tcp.h>
25
26int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
27int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
28int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
29int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
30int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
31int sysctl_tcp_retries1 = TCP_RETR1;
32int sysctl_tcp_retries2 = TCP_RETR2;
33int sysctl_tcp_orphan_retries;
34
35static void tcp_write_timer(unsigned long);
36static void tcp_delack_timer(unsigned long);
37static void tcp_keepalive_timer (unsigned long data);
38
39#ifdef TCP_DEBUG
40const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
41EXPORT_SYMBOL(tcp_timer_bug_msg);
42#endif
43
44/*
45 * Using different timers for retransmit, delayed acks and probes
46 * We may wish to use just one timer maintaining a list of expire jiffies
47 * to optimize.
48 */
49
50void tcp_init_xmit_timers(struct sock *sk)
51{
52 struct tcp_sock *tp = tcp_sk(sk);
53
54 init_timer(&tp->retransmit_timer);
55 tp->retransmit_timer.function=&tcp_write_timer;
56 tp->retransmit_timer.data = (unsigned long) sk;
57 tp->pending = 0;
58
59 init_timer(&tp->delack_timer);
60 tp->delack_timer.function=&tcp_delack_timer;
61 tp->delack_timer.data = (unsigned long) sk;
62 tp->ack.pending = 0;
63
64 init_timer(&sk->sk_timer);
65 sk->sk_timer.function = &tcp_keepalive_timer;
66 sk->sk_timer.data = (unsigned long)sk;
67}
68
69void tcp_clear_xmit_timers(struct sock *sk)
70{
71 struct tcp_sock *tp = tcp_sk(sk);
72
73 tp->pending = 0;
74 sk_stop_timer(sk, &tp->retransmit_timer);
75
76 tp->ack.pending = 0;
77 tp->ack.blocked = 0;
78 sk_stop_timer(sk, &tp->delack_timer);
79
80 sk_stop_timer(sk, &sk->sk_timer);
81}
82
83static void tcp_write_err(struct sock *sk)
84{
85 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
86 sk->sk_error_report(sk);
87
88 tcp_done(sk);
89 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
90}
91
92/* Do not allow orphaned sockets to eat all our resources.
93 * This is a direct violation of the TCP specs, but it is required
94 * to prevent DoS attacks. It is called when a retransmission timeout
95 * or zero window probe timeout occurs on an orphaned socket.
96 *
97 * The criterion is still not confirmed experimentally and may change.
98 * We kill the socket if:
99 * 1. the number of orphaned sockets exceeds an administratively
100 *    configured limit, or
101 * 2. we are under strong memory pressure.
102 */
103static int tcp_out_of_resources(struct sock *sk, int do_reset)
104{
105 struct tcp_sock *tp = tcp_sk(sk);
106 int orphans = atomic_read(&tcp_orphan_count);
107
108	/* If the peer does not open its window for a long time, or has not
109	 * transmitted anything for a long time, penalize it. */
110 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
111 orphans <<= 1;
112
113 /* If some dubious ICMP arrived, penalize even more. */
114 if (sk->sk_err_soft)
115 orphans <<= 1;
116
117 if (orphans >= sysctl_tcp_max_orphans ||
118 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
119 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
120 if (net_ratelimit())
121 printk(KERN_INFO "Out of socket memory\n");
122
123 /* Catch exceptional cases, when connection requires reset.
124 * 1. Last segment was sent recently. */
125 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
126 /* 2. Window is closed. */
127 (!tp->snd_wnd && !tp->packets_out))
128 do_reset = 1;
129 if (do_reset)
130 tcp_send_active_reset(sk, GFP_ATOMIC);
131 tcp_done(sk);
132 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
133 return 1;
134 }
135 return 0;
136}
137
138/* Calculate the maximal number of retries on an orphaned socket. */
139static int tcp_orphan_retries(struct sock *sk, int alive)
140{
141 int retries = sysctl_tcp_orphan_retries; /* May be zero. */
142
143 /* We know from an ICMP that something is wrong. */
144 if (sk->sk_err_soft && !alive)
145 retries = 0;
146
147	/* However, if the socket sent something recently, select some safe
148	 * number of retries. 8 corresponds to >100 seconds with a minimal
149	 * RTO of 200 msec. */
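	/* Assuming plain exponential doubling from that 200 ms floor, the
	 * initial timeout plus 8 backed-off retries sum to
	 * 0.2 + 0.4 + ... + 51.2 = 0.2 * (2^9 - 1), about 102 seconds,
	 * which is where the ">100 seconds" figure above comes from.
	 */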
150 if (retries == 0 && alive)
151 retries = 8;
152 return retries;
153}
154
155/* A write timeout has occurred. Process the after effects. */
156static int tcp_write_timeout(struct sock *sk)
157{
158 struct tcp_sock *tp = tcp_sk(sk);
159 int retry_until;
160
161 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
162 if (tp->retransmits)
163 dst_negative_advice(&sk->sk_dst_cache);
164 retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
165 } else {
166 if (tp->retransmits >= sysctl_tcp_retries1) {
167 /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
168 hole detection. :-(
169
170		   This is the place to implement it. It is not implemented. I do not
171		   want to implement it. It is disgusting. It does not work in any
172		   case. Let me cite the same draft, which requires us
173		   to implement this:
174
175 "The one security concern raised by this memo is that ICMP black holes
176 are often caused by over-zealous security administrators who block
177 all ICMP messages. It is vitally important that those who design and
178 deploy security systems understand the impact of strict filtering on
179 upper-layer protocols. The safest web site in the world is worthless
180 if most TCP implementations cannot transfer data from it. It would
181 be far nicer to have all of the black holes fixed rather than fixing
182 all of the TCP implementations."
183
184 Golden words :-).
185 */
186
187 dst_negative_advice(&sk->sk_dst_cache);
188 }
189
190 retry_until = sysctl_tcp_retries2;
191 if (sock_flag(sk, SOCK_DEAD)) {
192 int alive = (tp->rto < TCP_RTO_MAX);
193
194 retry_until = tcp_orphan_retries(sk, alive);
195
196 if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
197 return 1;
198 }
199 }
200
201 if (tp->retransmits >= retry_until) {
202 /* Has it gone just too far? */
203 tcp_write_err(sk);
204 return 1;
205 }
206 return 0;
207}
208
209static void tcp_delack_timer(unsigned long data)
210{
211 struct sock *sk = (struct sock*)data;
212 struct tcp_sock *tp = tcp_sk(sk);
213
214 bh_lock_sock(sk);
215 if (sock_owned_by_user(sk)) {
216 /* Try again later. */
217 tp->ack.blocked = 1;
218 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
219 sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
220 goto out_unlock;
221 }
222
223 sk_stream_mem_reclaim(sk);
224
225 if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
226 goto out;
227
228 if (time_after(tp->ack.timeout, jiffies)) {
229 sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
230 goto out;
231 }
232 tp->ack.pending &= ~TCP_ACK_TIMER;
233
234 if (skb_queue_len(&tp->ucopy.prequeue)) {
235 struct sk_buff *skb;
236
237 NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED,
238 skb_queue_len(&tp->ucopy.prequeue));
239
240 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
241 sk->sk_backlog_rcv(sk, skb);
242
243 tp->ucopy.memory = 0;
244 }
245
246 if (tcp_ack_scheduled(tp)) {
247 if (!tp->ack.pingpong) {
248 /* Delayed ACK missed: inflate ATO. */
249 tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
250 } else {
251 /* Delayed ACK missed: leave pingpong mode and
252 * deflate ATO.
253 */
254 tp->ack.pingpong = 0;
255 tp->ack.ato = TCP_ATO_MIN;
256 }
257 tcp_send_ack(sk);
258 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
259 }
260 TCP_CHECK_TIMER(sk);
261
262out:
263 if (tcp_memory_pressure)
264 sk_stream_mem_reclaim(sk);
265out_unlock:
266 bh_unlock_sock(sk);
267 sock_put(sk);
268}
269
270static void tcp_probe_timer(struct sock *sk)
271{
272 struct tcp_sock *tp = tcp_sk(sk);
273 int max_probes;
274
275 if (tp->packets_out || !sk->sk_send_head) {
276 tp->probes_out = 0;
277 return;
278 }
279
280 /* *WARNING* RFC 1122 forbids this
281 *
282 * It doesn't AFAIK, because we kill the retransmit timer -AK
283 *
284 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
285 * this behaviour in Solaris down as a bug fix. [AC]
286 *
287	 * Let me explain. probes_out is zeroed by incoming ACKs
288	 * even if they advertise a zero window. Hence, the connection is killed
289	 * only if we received no ACKs for the normal connection timeout. It is
290	 * not killed merely because the window stays zero for some time; the
291	 * window may stay zero until armageddon and even later. We are in full
292	 * accordance with the RFCs; only our probe timer combines both the
293	 * retransmission timeout and the probe timeout in one bottle. --ANK
294 */
295 max_probes = sysctl_tcp_retries2;
296
297 if (sock_flag(sk, SOCK_DEAD)) {
298 int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
299
300 max_probes = tcp_orphan_retries(sk, alive);
301
302 if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
303 return;
304 }
305
306 if (tp->probes_out > max_probes) {
307 tcp_write_err(sk);
308 } else {
309 /* Only send another probe if we didn't close things up. */
310 tcp_send_probe0(sk);
311 }
312}
313
314/*
315 * The TCP retransmit timer.
316 */
317
318static void tcp_retransmit_timer(struct sock *sk)
319{
320 struct tcp_sock *tp = tcp_sk(sk);
321
322 if (!tp->packets_out)
323 goto out;
324
325 BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
326
327 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
328 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
329		/* The receiver dastardly shrinks its window. Our retransmits
330		 * become zero-window probes, but we should not time out this
331		 * connection. If the socket is an orphan, though, time it out;
332		 * we cannot allow such beasts to hang around indefinitely.
333 */
334#ifdef TCP_DEBUG
335 if (net_ratelimit()) {
336 struct inet_sock *inet = inet_sk(sk);
337 printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
338 NIPQUAD(inet->daddr), htons(inet->dport),
339 inet->num, tp->snd_una, tp->snd_nxt);
340 }
341#endif
342 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
343 tcp_write_err(sk);
344 goto out;
345 }
346 tcp_enter_loss(sk, 0);
347 tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
348 __sk_dst_reset(sk);
349 goto out_reset_timer;
350 }
351
352 if (tcp_write_timeout(sk))
353 goto out;
354
355 if (tp->retransmits == 0) {
356 if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
357 if (tp->rx_opt.sack_ok) {
358 if (tp->ca_state == TCP_CA_Recovery)
359 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
360 else
361 NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
362 } else {
363 if (tp->ca_state == TCP_CA_Recovery)
364 NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
365 else
366 NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
367 }
368 } else if (tp->ca_state == TCP_CA_Loss) {
369 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
370 } else {
371 NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
372 }
373 }
374
375 if (tcp_use_frto(sk)) {
376 tcp_enter_frto(sk);
377 } else {
378 tcp_enter_loss(sk, 0);
379 }
380
381 if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
382 /* Retransmission failed because of local congestion,
383 * do not backoff.
384 */
385 if (!tp->retransmits)
386 tp->retransmits=1;
387 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
388 min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
389 goto out;
390 }
391
392 /* Increase the timeout each time we retransmit. Note that
393 * we do not increase the rtt estimate. rto is initialized
394 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
395 * that doubling rto each time is the least we can get away with.
396 * In KA9Q, Karn uses this for the first few times, and then
397 * goes to quadratic. netBSD doubles, but only goes up to *64,
398 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
399 * defined in the protocol as the maximum possible RTT. I guess
400 * we'll have to use something other than TCP to talk to the
401 * University of Mars.
402 *
403 * PAWS allows us longer timeouts and large windows, so once
404 * implemented ftp to mars will work nicely. We will have to fix
405 * the 120 second clamps though!
406 */
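	/* Concretely, assuming TCP_TIMEOUT_INIT of 3 seconds and the 120
	 * second TCP_RTO_MAX clamp, successive retransmissions of the same
	 * segment fire roughly 3, 6, 12, 24, 48, 96, 120, 120, ... seconds
	 * apart (ignoring RTT updates in between).
	 */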
407 tp->backoff++;
408 tp->retransmits++;
409
410out_reset_timer:
411 tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
412 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
413 if (tp->retransmits > sysctl_tcp_retries1)
414 __sk_dst_reset(sk);
415
416out:;
417}
418
419static void tcp_write_timer(unsigned long data)
420{
421 struct sock *sk = (struct sock*)data;
422 struct tcp_sock *tp = tcp_sk(sk);
423 int event;
424
425 bh_lock_sock(sk);
426 if (sock_owned_by_user(sk)) {
427 /* Try again later */
428 sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
429 goto out_unlock;
430 }
431
432 if (sk->sk_state == TCP_CLOSE || !tp->pending)
433 goto out;
434
435 if (time_after(tp->timeout, jiffies)) {
436 sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
437 goto out;
438 }
439
440 event = tp->pending;
441 tp->pending = 0;
442
443 switch (event) {
444 case TCP_TIME_RETRANS:
445 tcp_retransmit_timer(sk);
446 break;
447 case TCP_TIME_PROBE0:
448 tcp_probe_timer(sk);
449 break;
450 }
451 TCP_CHECK_TIMER(sk);
452
453out:
454 sk_stream_mem_reclaim(sk);
455out_unlock:
456 bh_unlock_sock(sk);
457 sock_put(sk);
458}
459
460/*
461 * Timer for listening sockets
462 */
463
464static void tcp_synack_timer(struct sock *sk)
465{
466 struct tcp_sock *tp = tcp_sk(sk);
467 struct tcp_listen_opt *lopt = tp->listen_opt;
468 int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
469 int thresh = max_retries;
470 unsigned long now = jiffies;
471 struct open_request **reqp, *req;
472 int i, budget;
473
474 if (lopt == NULL || lopt->qlen == 0)
475 return;
476
477	/* Normally all the openreqs are young and become mature
478	 * (i.e. converted to an established socket) by the first timeout.
479	 * If the synack was not acknowledged within 3 seconds, it means
480	 * one of the following things: the synack was lost, the ack was lost,
481	 * the rtt is high, or nobody planned to ack (i.e. a synflood).
482	 * When the server is a bit loaded, the queue is populated with old
483	 * open requests, reducing the effective size of the queue.
484	 * When the server is heavily loaded, the queue size drops to zero
485	 * after several minutes of work. That is not a synflood,
486	 * it is normal operation. The solution is to prune entries that
487	 * are too old, overriding the normal timeout, when the
488	 * situation becomes dangerous.
489	 *
490	 * Essentially, we reserve half of the room for young
491	 * embryos and abort old ones without pity, if the old
492	 * ones are about to clog our table.
493	 */
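	/* Concretely, assuming the queue is more than half full with
	 * qlen = 256 and qlen_young = 64, and thresh starts at the default
	 * of 5 SYN-ACK retries, the loop below lowers thresh to 3, so
	 * unanswered requests are dropped after three SYN-ACK
	 * retransmissions instead of five, freeing room for new ones.
	 */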
494 if (lopt->qlen>>(lopt->max_qlen_log-1)) {
495 int young = (lopt->qlen_young<<1);
496
497 while (thresh > 2) {
498 if (lopt->qlen < young)
499 break;
500 thresh--;
501 young <<= 1;
502 }
503 }
504
505 if (tp->defer_accept)
506 max_retries = tp->defer_accept;
507
508 budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
509 i = lopt->clock_hand;
510
511 do {
512 reqp=&lopt->syn_table[i];
513 while ((req = *reqp) != NULL) {
514 if (time_after_eq(now, req->expires)) {
515 if ((req->retrans < thresh ||
516 (req->acked && req->retrans < max_retries))
517 && !req->class->rtx_syn_ack(sk, req, NULL)) {
518 unsigned long timeo;
519
520 if (req->retrans++ == 0)
521 lopt->qlen_young--;
522 timeo = min((TCP_TIMEOUT_INIT << req->retrans),
523 TCP_RTO_MAX);
524 req->expires = now + timeo;
525 reqp = &req->dl_next;
526 continue;
527 }
528
529 /* Drop this request */
530 write_lock(&tp->syn_wait_lock);
531 *reqp = req->dl_next;
532 write_unlock(&tp->syn_wait_lock);
533 lopt->qlen--;
534 if (req->retrans == 0)
535 lopt->qlen_young--;
536 tcp_openreq_free(req);
537 continue;
538 }
539 reqp = &req->dl_next;
540 }
541
542 i = (i+1)&(TCP_SYNQ_HSIZE-1);
543
544 } while (--budget > 0);
545
546 lopt->clock_hand = i;
547
548 if (lopt->qlen)
549 tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
550}
551
552void tcp_delete_keepalive_timer (struct sock *sk)
553{
554 sk_stop_timer(sk, &sk->sk_timer);
555}
556
557void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
558{
559 sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
560}
561
562void tcp_set_keepalive(struct sock *sk, int val)
563{
564 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
565 return;
566
567 if (val && !sock_flag(sk, SOCK_KEEPOPEN))
568 tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
569 else if (!val)
570 tcp_delete_keepalive_timer(sk);
571}
572
573
574static void tcp_keepalive_timer (unsigned long data)
575{
576 struct sock *sk = (struct sock *) data;
577 struct tcp_sock *tp = tcp_sk(sk);
578 __u32 elapsed;
579
580 /* Only process if socket is not in use. */
581 bh_lock_sock(sk);
582 if (sock_owned_by_user(sk)) {
583 /* Try again later. */
584 tcp_reset_keepalive_timer (sk, HZ/20);
585 goto out;
586 }
587
588 if (sk->sk_state == TCP_LISTEN) {
589 tcp_synack_timer(sk);
590 goto out;
591 }
592
593 if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
594 if (tp->linger2 >= 0) {
595 int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
596
597 if (tmo > 0) {
598 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
599 goto out;
600 }
601 }
602 tcp_send_active_reset(sk, GFP_ATOMIC);
603 goto death;
604 }
605
606 if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
607 goto out;
608
609 elapsed = keepalive_time_when(tp);
610
611 /* It is alive without keepalive 8) */
612 if (tp->packets_out || sk->sk_send_head)
613 goto resched;
614
615 elapsed = tcp_time_stamp - tp->rcv_tstamp;
616
617 if (elapsed >= keepalive_time_when(tp)) {
618 if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
619 (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
620 tcp_send_active_reset(sk, GFP_ATOMIC);
621 tcp_write_err(sk);
622 goto out;
623 }
624 if (tcp_write_wakeup(sk) <= 0) {
625 tp->probes_out++;
626 elapsed = keepalive_intvl_when(tp);
627 } else {
628 /* If keepalive was lost due to local congestion,
629 * try harder.
630 */
631 elapsed = TCP_RESOURCE_PROBE_INTERVAL;
632 }
633 } else {
634 /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
635 elapsed = keepalive_time_when(tp) - elapsed;
636 }
637
638 TCP_CHECK_TIMER(sk);
639 sk_stream_mem_reclaim(sk);
640
641resched:
642 tcp_reset_keepalive_timer (sk, elapsed);
643 goto out;
644
645death:
646 tcp_done(sk);
647
648out:
649 bh_unlock_sock(sk);
650 sock_put(sk);
651}
652
653EXPORT_SYMBOL(tcp_clear_xmit_timers);
654EXPORT_SYMBOL(tcp_delete_keepalive_timer);
655EXPORT_SYMBOL(tcp_init_xmit_timers);
656EXPORT_SYMBOL(tcp_reset_keepalive_timer);
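/* Illustrative sketch, not part of tcp_timer.c: the keepalive path above
 * probes after keepalive_time_when() of idleness, then every
 * keepalive_intvl_when(), and gives up after the configured number of
 * unanswered probes. The defaults assumed below (2 hours, 75 seconds,
 * 9 probes) are the usual values behind the sysctl_tcp_keepalive_* knobs
 * declared at the top of this file.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int keepalive_time   = 2 * 60 * 60; /* idle seconds before the first probe */
	const unsigned int keepalive_intvl  = 75;           /* seconds between probes */
	const unsigned int keepalive_probes = 9;             /* unanswered probes tolerated */
	unsigned int dead_after = keepalive_time + keepalive_probes * keepalive_intvl;

	printf("a silent peer is declared dead roughly %u seconds (%u minutes) after its last ACK\n",
	       dead_after, dead_after / 60);
	return 0;
}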
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
new file mode 100644
index 000000000000..6baddfbedca3
--- /dev/null
+++ b/net/ipv4/udp.c
@@ -0,0 +1,1575 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The User Datagram Protocol (UDP).
7 *
8 * Version: $Id: udp.c,v 1.102 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
13 * Alan Cox, <Alan.Cox@linux.org>
14 * Hirokazu Takahashi, <taka@valinux.co.jp>
15 *
16 * Fixes:
17 * Alan Cox : verify_area() calls
18 * Alan Cox : stopped close while in use off icmp
19 * messages. Not a fix but a botch that
20 * for udp at least is 'valid'.
21 * Alan Cox : Fixed icmp handling properly
22 * Alan Cox : Correct error for oversized datagrams
23 * Alan Cox : Tidied select() semantics.
24 * Alan Cox : udp_err() fixed properly, also now
25 * select and read wake correctly on errors
26 * Alan Cox : udp_send verify_area moved to avoid mem leak
27 * Alan Cox : UDP can count its memory
28 * Alan Cox : send to an unknown connection causes
29 * an ECONNREFUSED off the icmp, but
30 * does NOT close.
31 * Alan Cox : Switched to new sk_buff handlers. No more backlog!
32 * Alan Cox : Using generic datagram code. Even smaller and the PEEK
33 * bug no longer crashes it.
34 * Fred Van Kempen : Net2e support for sk->broadcast.
35 * Alan Cox : Uses skb_free_datagram
36 * Alan Cox : Added get/set sockopt support.
37 * Alan Cox : Broadcasting without option set returns EACCES.
38 * Alan Cox : No wakeup calls. Instead we now use the callbacks.
39 * Alan Cox : Use ip_tos and ip_ttl
40 * Alan Cox : SNMP Mibs
41 * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support.
42 * Matt Dillon : UDP length checks.
43 * Alan Cox : Smarter af_inet used properly.
44 * Alan Cox : Use new kernel side addressing.
45 * Alan Cox : Incorrect return on truncated datagram receive.
46 * Arnt Gulbrandsen : New udp_send and stuff
47 * Alan Cox : Cache last socket
48 * Alan Cox : Route cache
49 * Jon Peatfield : Minor efficiency fix to sendto().
50 * Mike Shaver : RFC1122 checks.
51 * Alan Cox : Nonblocking error fix.
52 * Willy Konynenberg : Transparent proxying support.
53 * Mike McLagan : Routing by source
54 * David S. Miller : New socket lookup architecture.
55 * Last socket cache retained as it
56 * does have a high hit rate.
57 * Olaf Kirch : Don't linearise iovec on sendmsg.
58 * Andi Kleen : Some cleanups, cache destination entry
59 * for connect.
60 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
61 * Melvin Smith : Check msg_name not msg_namelen in sendto(),
62 * return ENOTCONN for unconnected sockets (POSIX)
63 * Janos Farkas : don't deliver multi/broadcasts to a different
64 * bound-to-device socket
65 * Hirokazu Takahashi : HW checksumming for outgoing UDP
66 * datagrams.
67 * Hirokazu Takahashi : sendfile() on UDP works now.
68 * Arnaldo C. Melo : convert /proc/net/udp to seq_file
69 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
70 * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
71 * a single port at the same time.
72 *	Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
73 *
74 *
75 * This program is free software; you can redistribute it and/or
76 * modify it under the terms of the GNU General Public License
77 * as published by the Free Software Foundation; either version
78 * 2 of the License, or (at your option) any later version.
79 */
80
81#include <asm/system.h>
82#include <asm/uaccess.h>
83#include <asm/ioctls.h>
84#include <linux/types.h>
85#include <linux/fcntl.h>
86#include <linux/module.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/in.h>
90#include <linux/errno.h>
91#include <linux/timer.h>
92#include <linux/mm.h>
93#include <linux/config.h>
94#include <linux/inet.h>
95#include <linux/ipv6.h>
96#include <linux/netdevice.h>
97#include <net/snmp.h>
98#include <net/tcp.h>
99#include <net/protocol.h>
100#include <linux/skbuff.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <net/sock.h>
104#include <net/udp.h>
105#include <net/icmp.h>
106#include <net/route.h>
107#include <net/inet_common.h>
108#include <net/checksum.h>
109#include <net/xfrm.h>
110
111/*
112 * Snmp MIB for the UDP layer
113 */
114
115DEFINE_SNMP_STAT(struct udp_mib, udp_statistics);
116
117struct hlist_head udp_hash[UDP_HTABLE_SIZE];
118DEFINE_RWLOCK(udp_hash_lock);
119
120/* Shared by v4/v6 udp. */
121int udp_port_rover;
122
123static int udp_v4_get_port(struct sock *sk, unsigned short snum)
124{
125 struct hlist_node *node;
126 struct sock *sk2;
127 struct inet_sock *inet = inet_sk(sk);
128
129 write_lock_bh(&udp_hash_lock);
130 if (snum == 0) {
131 int best_size_so_far, best, result, i;
132
133 if (udp_port_rover > sysctl_local_port_range[1] ||
134 udp_port_rover < sysctl_local_port_range[0])
135 udp_port_rover = sysctl_local_port_range[0];
136 best_size_so_far = 32767;
137 best = result = udp_port_rover;
138 for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
139 struct hlist_head *list;
140 int size;
141
142 list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
143 if (hlist_empty(list)) {
144 if (result > sysctl_local_port_range[1])
145 result = sysctl_local_port_range[0] +
146 ((result - sysctl_local_port_range[0]) &
147 (UDP_HTABLE_SIZE - 1));
148 goto gotit;
149 }
150 size = 0;
151 sk_for_each(sk2, node, list)
152 if (++size >= best_size_so_far)
153 goto next;
154 best_size_so_far = size;
155 best = result;
156 next:;
157 }
158 result = best;
159 for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
160 if (result > sysctl_local_port_range[1])
161 result = sysctl_local_port_range[0]
162 + ((result - sysctl_local_port_range[0]) &
163 (UDP_HTABLE_SIZE - 1));
164 if (!udp_lport_inuse(result))
165 break;
166 }
167 if (i >= (1 << 16) / UDP_HTABLE_SIZE)
168 goto fail;
169gotit:
170 udp_port_rover = snum = result;
171 } else {
172 sk_for_each(sk2, node,
173 &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) {
174 struct inet_sock *inet2 = inet_sk(sk2);
175
176 if (inet2->num == snum &&
177 sk2 != sk &&
178 !ipv6_only_sock(sk2) &&
179 (!sk2->sk_bound_dev_if ||
180 !sk->sk_bound_dev_if ||
181 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
182 (!inet2->rcv_saddr ||
183 !inet->rcv_saddr ||
184 inet2->rcv_saddr == inet->rcv_saddr) &&
185 (!sk2->sk_reuse || !sk->sk_reuse))
186 goto fail;
187 }
188 }
189 inet->num = snum;
190 if (sk_unhashed(sk)) {
191 struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)];
192
193 sk_add_node(sk, h);
194 sock_prot_inc_use(sk->sk_prot);
195 }
196 write_unlock_bh(&udp_hash_lock);
197 return 0;
198
199fail:
200 write_unlock_bh(&udp_hash_lock);
201 return 1;
202}
203
204static void udp_v4_hash(struct sock *sk)
205{
206 BUG();
207}
208
209static void udp_v4_unhash(struct sock *sk)
210{
211 write_lock_bh(&udp_hash_lock);
212 if (sk_del_node_init(sk)) {
213 inet_sk(sk)->num = 0;
214 sock_prot_dec_use(sk->sk_prot);
215 }
216 write_unlock_bh(&udp_hash_lock);
217}
218
219/* UDP nearly always has wildcards out the wazoo, so it makes no sense to
220 * try harder than this. -DaveM
221 */
222static struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport,
223 u32 daddr, u16 dport, int dif)
224{
225 struct sock *sk, *result = NULL;
226 struct hlist_node *node;
227 unsigned short hnum = ntohs(dport);
228 int badness = -1;
229
230 sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) {
231 struct inet_sock *inet = inet_sk(sk);
232
233 if (inet->num == hnum && !ipv6_only_sock(sk)) {
234 int score = (sk->sk_family == PF_INET ? 1 : 0);
235 if (inet->rcv_saddr) {
236 if (inet->rcv_saddr != daddr)
237 continue;
238 score+=2;
239 }
240 if (inet->daddr) {
241 if (inet->daddr != saddr)
242 continue;
243 score+=2;
244 }
245 if (inet->dport) {
246 if (inet->dport != sport)
247 continue;
248 score+=2;
249 }
250 if (sk->sk_bound_dev_if) {
251 if (sk->sk_bound_dev_if != dif)
252 continue;
253 score+=2;
254 }
255 if(score == 9) {
256 result = sk;
257 break;
258 } else if(score > badness) {
259 result = sk;
260 badness = score;
261 }
262 }
263 }
264 return result;
265}
266
267static __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport,
268 u32 daddr, u16 dport, int dif)
269{
270 struct sock *sk;
271
272 read_lock(&udp_hash_lock);
273 sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif);
274 if (sk)
275 sock_hold(sk);
276 read_unlock(&udp_hash_lock);
277 return sk;
278}
279
280static inline struct sock *udp_v4_mcast_next(struct sock *sk,
281 u16 loc_port, u32 loc_addr,
282 u16 rmt_port, u32 rmt_addr,
283 int dif)
284{
285 struct hlist_node *node;
286 struct sock *s = sk;
287 unsigned short hnum = ntohs(loc_port);
288
289 sk_for_each_from(s, node) {
290 struct inet_sock *inet = inet_sk(s);
291
292 if (inet->num != hnum ||
293 (inet->daddr && inet->daddr != rmt_addr) ||
294 (inet->dport != rmt_port && inet->dport) ||
295 (inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
296 ipv6_only_sock(s) ||
297 (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
298 continue;
299 if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
300 continue;
301 goto found;
302 }
303 s = NULL;
304found:
305 return s;
306}
307
308/*
309 * This routine is called by the ICMP module when it gets some
310 * sort of error condition. If err < 0 then the socket should
311 * be closed and the error returned to the user. If err > 0
312 * it's just the icmp type << 8 | icmp code.
313 * Header points to the ip header of the error packet. We move
314 * on past this. Then (as it used to claim before adjustment)
315 * header points to the first 8 bytes of the udp header. We need
316 * to find the appropriate port.
317 */
318
319void udp_err(struct sk_buff *skb, u32 info)
320{
321 struct inet_sock *inet;
322 struct iphdr *iph = (struct iphdr*)skb->data;
323 struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2));
324 int type = skb->h.icmph->type;
325 int code = skb->h.icmph->code;
326 struct sock *sk;
327 int harderr;
328 int err;
329
330 sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex);
331 if (sk == NULL) {
332 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
333 return; /* No socket for error */
334 }
335
336 err = 0;
337 harderr = 0;
338 inet = inet_sk(sk);
339
340 switch (type) {
341 default:
342 case ICMP_TIME_EXCEEDED:
343 err = EHOSTUNREACH;
344 break;
345 case ICMP_SOURCE_QUENCH:
346 goto out;
347 case ICMP_PARAMETERPROB:
348 err = EPROTO;
349 harderr = 1;
350 break;
351 case ICMP_DEST_UNREACH:
352 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
353 if (inet->pmtudisc != IP_PMTUDISC_DONT) {
354 err = EMSGSIZE;
355 harderr = 1;
356 break;
357 }
358 goto out;
359 }
360 err = EHOSTUNREACH;
361 if (code <= NR_ICMP_UNREACH) {
362 harderr = icmp_err_convert[code].fatal;
363 err = icmp_err_convert[code].errno;
364 }
365 break;
366 }
367
368 /*
369 * RFC1122: OK. Passes ICMP errors back to application, as per
370 * 4.1.3.3.
371 */
372 if (!inet->recverr) {
373 if (!harderr || sk->sk_state != TCP_ESTABLISHED)
374 goto out;
375 } else {
376 ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1));
377 }
378 sk->sk_err = err;
379 sk->sk_error_report(sk);
380out:
381 sock_put(sk);
382}
383
384/*
385 * Throw away all pending data and cancel the corking. Socket is locked.
386 */
387static void udp_flush_pending_frames(struct sock *sk)
388{
389 struct udp_sock *up = udp_sk(sk);
390
391 if (up->pending) {
392 up->len = 0;
393 up->pending = 0;
394 ip_flush_pending_frames(sk);
395 }
396}
397
398/*
399 * Push out all pending data as one UDP datagram. Socket is locked.
400 */
401static int udp_push_pending_frames(struct sock *sk, struct udp_sock *up)
402{
403 struct inet_sock *inet = inet_sk(sk);
404 struct flowi *fl = &inet->cork.fl;
405 struct sk_buff *skb;
406 struct udphdr *uh;
407 int err = 0;
408
409 /* Grab the skbuff where UDP header space exists. */
410 if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
411 goto out;
412
413 /*
414 * Create a UDP header
415 */
416 uh = skb->h.uh;
417 uh->source = fl->fl_ip_sport;
418 uh->dest = fl->fl_ip_dport;
419 uh->len = htons(up->len);
420 uh->check = 0;
421
422 if (sk->sk_no_check == UDP_CSUM_NOXMIT) {
423 skb->ip_summed = CHECKSUM_NONE;
424 goto send;
425 }
426
427 if (skb_queue_len(&sk->sk_write_queue) == 1) {
428 /*
429 * Only one fragment on the socket.
430 */
431 if (skb->ip_summed == CHECKSUM_HW) {
432 skb->csum = offsetof(struct udphdr, check);
433 uh->check = ~csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
434 up->len, IPPROTO_UDP, 0);
435 } else {
436 skb->csum = csum_partial((char *)uh,
437 sizeof(struct udphdr), skb->csum);
438 uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
439 up->len, IPPROTO_UDP, skb->csum);
440 if (uh->check == 0)
441 uh->check = -1;
442 }
443 } else {
444 unsigned int csum = 0;
445 /*
446		 * HW checksumming won't work here: there are two or more
447		 * fragments on the socket, so the csums of all the sk_buffs
448		 * have to be combined by software.
449 */
450 if (skb->ip_summed == CHECKSUM_HW) {
451 int offset = (unsigned char *)uh - skb->data;
452 skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
453
454 skb->ip_summed = CHECKSUM_NONE;
455 } else {
456 skb->csum = csum_partial((char *)uh,
457 sizeof(struct udphdr), skb->csum);
458 }
459
460 skb_queue_walk(&sk->sk_write_queue, skb) {
461 csum = csum_add(csum, skb->csum);
462 }
463 uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
464 up->len, IPPROTO_UDP, csum);
465 if (uh->check == 0)
466 uh->check = -1;
467 }
468send:
469 err = ip_push_pending_frames(sk);
470out:
471 up->len = 0;
472 up->pending = 0;
473 return err;
474}
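/* Sketch of the arithmetic above, with made-up numbers (this is not the
 * kernel csum API itself): each queued fragment contributes a 32-bit
 * partial sum from csum_partial() or the hardware, and csum_add() keeps
 * one running total, e.g. 0x00012345 + 0x0000ffff = 0x00022344.
 * csum_tcpudp_magic() then mixes in the pseudo-header (saddr, daddr,
 * length, IPPROTO_UDP), folds the carries back into 16 bits
 * (0x00022344 -> 0x2344 + 0x2 = 0x2346) and one's-complements the result
 * to produce the value written to uh->check.
 */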
475
476
477static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base)
478{
479 return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base));
480}
481
482int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
483 size_t len)
484{
485 struct inet_sock *inet = inet_sk(sk);
486 struct udp_sock *up = udp_sk(sk);
487 int ulen = len;
488 struct ipcm_cookie ipc;
489 struct rtable *rt = NULL;
490 int free = 0;
491 int connected = 0;
492 u32 daddr, faddr, saddr;
493 u16 dport;
494 u8 tos;
495 int err;
496 int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
497
498 if (len > 0xFFFF)
499 return -EMSGSIZE;
500
501 /*
502 * Check the flags.
503 */
504
505 if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */
506 return -EOPNOTSUPP;
507
508 ipc.opt = NULL;
509
510 if (up->pending) {
511 /*
512 * There are pending frames.
513 * The socket lock must be held while it's corked.
514 */
515 lock_sock(sk);
516 if (likely(up->pending)) {
517 if (unlikely(up->pending != AF_INET)) {
518 release_sock(sk);
519 return -EINVAL;
520 }
521 goto do_append_data;
522 }
523 release_sock(sk);
524 }
525 ulen += sizeof(struct udphdr);
526
527 /*
528 * Get and verify the address.
529 */
530 if (msg->msg_name) {
531 struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name;
532 if (msg->msg_namelen < sizeof(*usin))
533 return -EINVAL;
534 if (usin->sin_family != AF_INET) {
535 if (usin->sin_family != AF_UNSPEC)
536 return -EAFNOSUPPORT;
537 }
538
539 daddr = usin->sin_addr.s_addr;
540 dport = usin->sin_port;
541 if (dport == 0)
542 return -EINVAL;
543 } else {
544 if (sk->sk_state != TCP_ESTABLISHED)
545 return -EDESTADDRREQ;
546 daddr = inet->daddr;
547 dport = inet->dport;
548 /* Open fast path for connected socket.
549 Route will not be used, if at least one option is set.
550 */
551 connected = 1;
552 }
553 ipc.addr = inet->saddr;
554
555 ipc.oif = sk->sk_bound_dev_if;
556 if (msg->msg_controllen) {
557 err = ip_cmsg_send(msg, &ipc);
558 if (err)
559 return err;
560 if (ipc.opt)
561 free = 1;
562 connected = 0;
563 }
564 if (!ipc.opt)
565 ipc.opt = inet->opt;
566
567 saddr = ipc.addr;
568 ipc.addr = faddr = daddr;
569
570 if (ipc.opt && ipc.opt->srr) {
571 if (!daddr)
572 return -EINVAL;
573 faddr = ipc.opt->faddr;
574 connected = 0;
575 }
576 tos = RT_TOS(inet->tos);
577 if (sock_flag(sk, SOCK_LOCALROUTE) ||
578 (msg->msg_flags & MSG_DONTROUTE) ||
579 (ipc.opt && ipc.opt->is_strictroute)) {
580 tos |= RTO_ONLINK;
581 connected = 0;
582 }
583
584 if (MULTICAST(daddr)) {
585 if (!ipc.oif)
586 ipc.oif = inet->mc_index;
587 if (!saddr)
588 saddr = inet->mc_addr;
589 connected = 0;
590 }
591
592 if (connected)
593 rt = (struct rtable*)sk_dst_check(sk, 0);
594
595 if (rt == NULL) {
596 struct flowi fl = { .oif = ipc.oif,
597 .nl_u = { .ip4_u =
598 { .daddr = faddr,
599 .saddr = saddr,
600 .tos = tos } },
601 .proto = IPPROTO_UDP,
602 .uli_u = { .ports =
603 { .sport = inet->sport,
604 .dport = dport } } };
605 err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
606 if (err)
607 goto out;
608
609 err = -EACCES;
610 if ((rt->rt_flags & RTCF_BROADCAST) &&
611 !sock_flag(sk, SOCK_BROADCAST))
612 goto out;
613 if (connected)
614 sk_dst_set(sk, dst_clone(&rt->u.dst));
615 }
616
617 if (msg->msg_flags&MSG_CONFIRM)
618 goto do_confirm;
619back_from_confirm:
620
621 saddr = rt->rt_src;
622 if (!ipc.addr)
623 daddr = ipc.addr = rt->rt_dst;
624
625 lock_sock(sk);
626 if (unlikely(up->pending)) {
627 /* The socket is already corked while preparing it. */
628 /* ... which is an evident application bug. --ANK */
629 release_sock(sk);
630
631 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n"));
632 err = -EINVAL;
633 goto out;
634 }
635 /*
636 * Now cork the socket to pend data.
637 */
638 inet->cork.fl.fl4_dst = daddr;
639 inet->cork.fl.fl_ip_dport = dport;
640 inet->cork.fl.fl4_src = saddr;
641 inet->cork.fl.fl_ip_sport = inet->sport;
642 up->pending = AF_INET;
643
644do_append_data:
645 up->len += ulen;
646 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen,
647 sizeof(struct udphdr), &ipc, rt,
648 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
649 if (err)
650 udp_flush_pending_frames(sk);
651 else if (!corkreq)
652 err = udp_push_pending_frames(sk, up);
653 release_sock(sk);
654
655out:
656 ip_rt_put(rt);
657 if (free)
658 kfree(ipc.opt);
659 if (!err) {
660 UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS);
661 return len;
662 }
663 return err;
664
665do_confirm:
666 dst_confirm(&rt->u.dst);
667 if (!(msg->msg_flags&MSG_PROBE) || len)
668 goto back_from_confirm;
669 err = 0;
670 goto out;
671}
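/* Userspace illustration (not part of this file, and only a sketch): the
 * corked/append path above is reached when the application holds the
 * datagram open, either with the UDP_CORK socket option or, as below,
 * with the MSG_MORE flag. The address and payload are made up.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int udp_cork_example(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9999);		/* example port */
	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);

	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		close(fd);
		return -1;
	}

	/* First write: up->pending is set and the bytes are appended,
	 * but MSG_MORE keeps the datagram from being pushed out. */
	send(fd, "header,", 7, MSG_MORE);

	/* Second write, without MSG_MORE: the data is appended to the
	 * same pending datagram and udp_push_pending_frames() emits a
	 * single UDP packet carrying "header,payload". */
	send(fd, "payload", 7, 0);

	close(fd);
	return 0;
}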
672
673static int udp_sendpage(struct sock *sk, struct page *page, int offset,
674 size_t size, int flags)
675{
676 struct udp_sock *up = udp_sk(sk);
677 int ret;
678
679 if (!up->pending) {
680 struct msghdr msg = { .msg_flags = flags|MSG_MORE };
681
682		/* Call udp_sendmsg to specify the destination address, which
683		 * the sendpage interface cannot pass.
684 * This will succeed only when the socket is connected.
685 */
686 ret = udp_sendmsg(NULL, sk, &msg, 0);
687 if (ret < 0)
688 return ret;
689 }
690
691 lock_sock(sk);
692
693 if (unlikely(!up->pending)) {
694 release_sock(sk);
695
696 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n"));
697 return -EINVAL;
698 }
699
700 ret = ip_append_page(sk, page, offset, size, flags);
701 if (ret == -EOPNOTSUPP) {
702 release_sock(sk);
703 return sock_no_sendpage(sk->sk_socket, page, offset,
704 size, flags);
705 }
706 if (ret < 0) {
707 udp_flush_pending_frames(sk);
708 goto out;
709 }
710
711 up->len += size;
712 if (!(up->corkflag || (flags&MSG_MORE)))
713 ret = udp_push_pending_frames(sk, up);
714 if (!ret)
715 ret = size;
716out:
717 release_sock(sk);
718 return ret;
719}
720
721/*
722 * IOCTL requests applicable to the UDP protocol
723 */
724
725int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
726{
727 switch(cmd)
728 {
729 case SIOCOUTQ:
730 {
731 int amount = atomic_read(&sk->sk_wmem_alloc);
732 return put_user(amount, (int __user *)arg);
733 }
734
735 case SIOCINQ:
736 {
737 struct sk_buff *skb;
738 unsigned long amount;
739
740 amount = 0;
741 spin_lock_irq(&sk->sk_receive_queue.lock);
742 skb = skb_peek(&sk->sk_receive_queue);
743 if (skb != NULL) {
744 /*
745 * We will only return the amount
746 * of this packet since that is all
747 * that will be read.
748 */
749 amount = skb->len - sizeof(struct udphdr);
750 }
751 spin_unlock_irq(&sk->sk_receive_queue.lock);
752 return put_user(amount, (int __user *)arg);
753 }
754
755 default:
756 return -ENOIOCTLCMD;
757 }
758 return(0);
759}
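As the SIOCINQ branch notes, the value returned is the payload size of the first queued datagram only, not the total receive backlog. A small userspace sketch of reading both counters (error handling elided; the constants come from linux/sockios.h):

#include <linux/sockios.h>   /* SIOCINQ, SIOCOUTQ */
#include <stdio.h>
#include <sys/ioctl.h>

static void report_udp_queues(int udp_fd)
{
	int inq = 0, outq = 0;

	ioctl(udp_fd, SIOCINQ, &inq);    /* payload bytes of the next datagram */
	ioctl(udp_fd, SIOCOUTQ, &outq);  /* bytes still charged to the send buffer */
	printf("next datagram: %d bytes, unsent: %d bytes\n", inq, outq);
}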
760
761static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
762{
763 return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
764}
765
766static __inline__ int udp_checksum_complete(struct sk_buff *skb)
767{
768 return skb->ip_summed != CHECKSUM_UNNECESSARY &&
769 __udp_checksum_complete(skb);
770}
771
772/*
773 * This should be easy, if there is something there we
774 * return it, otherwise we block.
775 */
776
777static int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
778 size_t len, int noblock, int flags, int *addr_len)
779{
780 struct inet_sock *inet = inet_sk(sk);
781 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
782 struct sk_buff *skb;
783 int copied, err;
784
785 /*
786 * Check any passed addresses
787 */
788 if (addr_len)
789 *addr_len=sizeof(*sin);
790
791 if (flags & MSG_ERRQUEUE)
792 return ip_recv_error(sk, msg, len);
793
794try_again:
795 skb = skb_recv_datagram(sk, flags, noblock, &err);
796 if (!skb)
797 goto out;
798
799 copied = skb->len - sizeof(struct udphdr);
800 if (copied > len) {
801 copied = len;
802 msg->msg_flags |= MSG_TRUNC;
803 }
804
805 if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
806 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
807 copied);
808 } else if (msg->msg_flags&MSG_TRUNC) {
809 if (__udp_checksum_complete(skb))
810 goto csum_copy_err;
811 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
812 copied);
813 } else {
814 err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
815
816 if (err == -EINVAL)
817 goto csum_copy_err;
818 }
819
820 if (err)
821 goto out_free;
822
823 sock_recv_timestamp(msg, sk, skb);
824
825 /* Copy the address. */
826 if (sin)
827 {
828 sin->sin_family = AF_INET;
829 sin->sin_port = skb->h.uh->source;
830 sin->sin_addr.s_addr = skb->nh.iph->saddr;
831 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
832 }
833 if (inet->cmsg_flags)
834 ip_cmsg_recv(msg, skb);
835
836 err = copied;
837 if (flags & MSG_TRUNC)
838 err = skb->len - sizeof(struct udphdr);
839
840out_free:
841 skb_free_datagram(sk, skb);
842out:
843 return err;
844
845csum_copy_err:
846 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
847
848 /* Clear queue. */
849 if (flags&MSG_PEEK) {
850 int clear = 0;
851 spin_lock_irq(&sk->sk_receive_queue.lock);
852 if (skb == skb_peek(&sk->sk_receive_queue)) {
853 __skb_unlink(skb, &sk->sk_receive_queue);
854 clear = 1;
855 }
856 spin_unlock_irq(&sk->sk_receive_queue.lock);
857 if (clear)
858 kfree_skb(skb);
859 }
860
861 skb_free_datagram(sk, skb);
862
863 if (noblock)
864 return -EAGAIN;
865 goto try_again;
866}
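Because the MSG_TRUNC branch above returns skb->len minus the UDP header rather than the number of bytes copied, userspace can learn the size of the next datagram without consuming it. A hedged sketch:

#include <sys/socket.h>

/* Peek at the next datagram and return its full payload length without
 * dequeueing it; returns -1 (EAGAIN) if nothing is queued. */
static long next_udp_datagram_len(int udp_fd)
{
	char dummy;

	return recv(udp_fd, &dummy, 0, MSG_PEEK | MSG_TRUNC | MSG_DONTWAIT);
}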
867
868
869int udp_disconnect(struct sock *sk, int flags)
870{
871 struct inet_sock *inet = inet_sk(sk);
872 /*
873 * 1003.1g - break association.
874 */
875
876 sk->sk_state = TCP_CLOSE;
877 inet->daddr = 0;
878 inet->dport = 0;
879 sk->sk_bound_dev_if = 0;
880 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
881 inet_reset_saddr(sk);
882
883 if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
884 sk->sk_prot->unhash(sk);
885 inet->sport = 0;
886 }
887 sk_dst_reset(sk);
888 return 0;
889}
890
891static void udp_close(struct sock *sk, long timeout)
892{
893 sk_common_release(sk);
894}
895
896/* return:
 897 * 1 if the UDP system should process it
898 * 0 if we should drop this packet
899 * -1 if it should get processed by xfrm4_rcv_encap
900 */
901static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
902{
903#ifndef CONFIG_XFRM
904 return 1;
905#else
906 struct udp_sock *up = udp_sk(sk);
907 struct udphdr *uh = skb->h.uh;
908 struct iphdr *iph;
909 int iphlen, len;
910
911 __u8 *udpdata = (__u8 *)uh + sizeof(struct udphdr);
912 __u32 *udpdata32 = (__u32 *)udpdata;
913 __u16 encap_type = up->encap_type;
914
 915	/* if the packet is too short, let UDP handle it */
916 if (udpdata > skb->tail)
917 return 1;
918
 919	/* if this is not an encapsulation socket, just return now */
920 if (!encap_type)
921 return 1;
922
923 len = skb->tail - udpdata;
924
925 switch (encap_type) {
926 default:
927 case UDP_ENCAP_ESPINUDP:
928 /* Check if this is a keepalive packet. If so, eat it. */
929 if (len == 1 && udpdata[0] == 0xff) {
930 return 0;
931 } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0 ) {
932 /* ESP Packet without Non-ESP header */
933 len = sizeof(struct udphdr);
934 } else
935 /* Must be an IKE packet.. pass it through */
936 return 1;
937 break;
938 case UDP_ENCAP_ESPINUDP_NON_IKE:
939 /* Check if this is a keepalive packet. If so, eat it. */
940 if (len == 1 && udpdata[0] == 0xff) {
941 return 0;
942 } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
943 udpdata32[0] == 0 && udpdata32[1] == 0) {
944
945 /* ESP Packet with Non-IKE marker */
946 len = sizeof(struct udphdr) + 2 * sizeof(u32);
947 } else
948 /* Must be an IKE packet.. pass it through */
949 return 1;
950 break;
951 }
952
953 /* At this point we are sure that this is an ESPinUDP packet,
954 * so we need to remove 'len' bytes from the packet (the UDP
955 * header and optional ESP marker bytes) and then modify the
956 * protocol to ESP, and then call into the transform receiver.
957 */
958
959 /* Now we can update and verify the packet length... */
960 iph = skb->nh.iph;
961 iphlen = iph->ihl << 2;
962 iph->tot_len = htons(ntohs(iph->tot_len) - len);
963 if (skb->len < iphlen + len) {
964 /* packet is too small!?! */
965 return 0;
966 }
967
968 /* pull the data buffer up to the ESP header and set the
969 * transport header to point to ESP. Keep UDP on the stack
970 * for later.
971 */
972 skb->h.raw = skb_pull(skb, len);
973
974 /* modify the protocol (it's ESP!) */
975 iph->protocol = IPPROTO_ESP;
976
977 /* and let the caller know to send this into the ESP processor... */
978 return -1;
979#endif
980}
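udp_encap_rcv() only runs once userspace has flagged the socket via the UDP_ENCAP option handled further down in udp_setsockopt(). A hedged sketch of how an IKE daemon might mark its NAT-T socket, assuming the constants exported by linux/udp.h:

#include <linux/udp.h>     /* UDP_ENCAP, UDP_ENCAP_ESPINUDP */
#include <netinet/in.h>
#include <sys/socket.h>

static int mark_nat_t_socket(int fd)
{
	int type = UDP_ENCAP_ESPINUDP;

	/* Afterwards the kernel eats one-byte 0xff keepalives, feeds ESP
	 * packets to xfrm4_rcv_encap(), and still delivers IKE to us. */
	return setsockopt(fd, IPPROTO_UDP, UDP_ENCAP, &type, sizeof(type));
}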
981
982/* returns:
983 * -1: error
984 * 0: success
985 * >0: "udp encap" protocol resubmission
986 *
987 * Note that in the success and error cases, the skb is assumed to
988 * have either been requeued or freed.
989 */
990static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
991{
992 struct udp_sock *up = udp_sk(sk);
993
994 /*
995 * Charge it to the socket, dropping if the queue is full.
996 */
997 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
998 kfree_skb(skb);
999 return -1;
1000 }
1001
1002 if (up->encap_type) {
1003 /*
1004 * This is an encapsulation socket, so let's see if this is
1005 * an encapsulated packet.
1006 * If it's a keepalive packet, then just eat it.
1007		 * If it's an encapsulated packet, then pass it to the
1008 * IPsec xfrm input and return the response
1009 * appropriately. Otherwise, just fall through and
1010 * pass this up the UDP socket.
1011 */
1012 int ret;
1013
1014 ret = udp_encap_rcv(sk, skb);
1015 if (ret == 0) {
1016 /* Eat the packet .. */
1017 kfree_skb(skb);
1018 return 0;
1019 }
1020 if (ret < 0) {
1021 /* process the ESP packet */
1022 ret = xfrm4_rcv_encap(skb, up->encap_type);
1023 UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS);
1024 return -ret;
1025 }
1026 /* FALLTHROUGH -- it's a UDP Packet */
1027 }
1028
1029 if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
1030 if (__udp_checksum_complete(skb)) {
1031 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1032 kfree_skb(skb);
1033 return -1;
1034 }
1035 skb->ip_summed = CHECKSUM_UNNECESSARY;
1036 }
1037
1038 if (sock_queue_rcv_skb(sk,skb)<0) {
1039 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1040 kfree_skb(skb);
1041 return -1;
1042 }
1043 UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS);
1044 return 0;
1045}
1046
1047/*
1048 * Multicasts and broadcasts go to each listener.
1049 *
1050 * Note: called only from the BH handler context,
1051 * so we don't need to lock the hashes.
1052 */
1053static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
1054 u32 saddr, u32 daddr)
1055{
1056 struct sock *sk;
1057 int dif;
1058
1059 read_lock(&udp_hash_lock);
1060 sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
1061 dif = skb->dev->ifindex;
1062 sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
1063 if (sk) {
1064 struct sock *sknext = NULL;
1065
1066 do {
1067 struct sk_buff *skb1 = skb;
1068
1069 sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
1070 uh->source, saddr, dif);
1071 if(sknext)
1072 skb1 = skb_clone(skb, GFP_ATOMIC);
1073
1074 if(skb1) {
1075 int ret = udp_queue_rcv_skb(sk, skb1);
1076 if (ret > 0)
1077 /* we should probably re-process instead
1078 * of dropping packets here. */
1079 kfree_skb(skb1);
1080 }
1081 sk = sknext;
1082 } while(sknext);
1083 } else
1084 kfree_skb(skb);
1085 read_unlock(&udp_hash_lock);
1086 return 0;
1087}
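Each matching socket gets its own clone of the datagram, so several independent processes can listen on the same multicast group and port. A hedged userspace sketch of one such listener (group 239.0.0.1 and port 5000 are placeholders; error checks omitted):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int join_group_listener(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int one = 1;
	struct sockaddr_in local;
	struct ip_mreq mreq;

	/* allow several listeners to bind the same port */
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	memset(&local, 0, sizeof(local));
	local.sin_family = AF_INET;
	local.sin_port = htons(5000);
	local.sin_addr.s_addr = htonl(INADDR_ANY);
	bind(fd, (struct sockaddr *)&local, sizeof(local));

	/* join the group; udp_v4_mcast_deliver() will then clone each
	 * matching datagram to this socket as well */
	mreq.imr_multiaddr.s_addr = inet_addr("239.0.0.1");
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);
	setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));

	return fd;
}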
1088
1089/* Initialize the UDP checksum state. If this returns zero (success),
1090 * CHECKSUM_UNNECESSARY means that no more checks are required.
1091 * Otherwise, checksum completion requires checksumming the packet body,
1092 * including the UDP header, and folding it into skb->csum.
1093 */
1094static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1095 unsigned short ulen, u32 saddr, u32 daddr)
1096{
1097 if (uh->check == 0) {
1098 skb->ip_summed = CHECKSUM_UNNECESSARY;
1099 } else if (skb->ip_summed == CHECKSUM_HW) {
1100 skb->ip_summed = CHECKSUM_UNNECESSARY;
1101 if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
1102 return 0;
1103 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
1104 skb->ip_summed = CHECKSUM_NONE;
1105 }
1106 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
1107 skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
1108 /* Probably, we should checksum udp header (it should be in cache
1109 * in any case) and data in tiny packets (< rx copybreak).
1110 */
1111 return 0;
1112}
1113
1114/*
1115 * All we need to do is get the socket, and then do a checksum.
1116 */
1117
1118int udp_rcv(struct sk_buff *skb)
1119{
1120 struct sock *sk;
1121 struct udphdr *uh;
1122 unsigned short ulen;
1123 struct rtable *rt = (struct rtable*)skb->dst;
1124 u32 saddr = skb->nh.iph->saddr;
1125 u32 daddr = skb->nh.iph->daddr;
1126 int len = skb->len;
1127
1128 /*
1129 * Validate the packet and the UDP length.
1130 */
1131 if (!pskb_may_pull(skb, sizeof(struct udphdr)))
1132 goto no_header;
1133
1134 uh = skb->h.uh;
1135
1136 ulen = ntohs(uh->len);
1137
1138 if (ulen > len || ulen < sizeof(*uh))
1139 goto short_packet;
1140
1141 if (pskb_trim(skb, ulen))
1142 goto short_packet;
1143
1144 if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0)
1145 goto csum_error;
1146
1147 if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
1148 return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
1149
1150 sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
1151
1152 if (sk != NULL) {
1153 int ret = udp_queue_rcv_skb(sk, skb);
1154 sock_put(sk);
1155
1156 /* a return value > 0 means to resubmit the input, but
1157	 * it wants the return to be -protocol, or 0
1158 */
1159 if (ret > 0)
1160 return -ret;
1161 return 0;
1162 }
1163
1164 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1165 goto drop;
1166
1167 /* No socket. Drop packet silently, if checksum is wrong */
1168 if (udp_checksum_complete(skb))
1169 goto csum_error;
1170
1171 UDP_INC_STATS_BH(UDP_MIB_NOPORTS);
1172 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
1173
1174 /*
1175	 * Hmm. We got a UDP packet to a port on which we
1176	 * are not listening. Ignore it.
1177 */
1178 kfree_skb(skb);
1179 return(0);
1180
1181short_packet:
1182 NETDEBUG(if (net_ratelimit())
1183 printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
1184 NIPQUAD(saddr),
1185 ntohs(uh->source),
1186 ulen,
1187 len,
1188 NIPQUAD(daddr),
1189 ntohs(uh->dest)));
1190no_header:
1191 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1192 kfree_skb(skb);
1193 return(0);
1194
1195csum_error:
1196 /*
1197 * RFC1122: OK. Discards the bad packet silently (as far as
1198 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
1199 */
1200 NETDEBUG(if (net_ratelimit())
1201 printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
1202 NIPQUAD(saddr),
1203 ntohs(uh->source),
1204 NIPQUAD(daddr),
1205 ntohs(uh->dest),
1206 ulen));
1207drop:
1208 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1209 kfree_skb(skb);
1210 return(0);
1211}
1212
1213static int udp_destroy_sock(struct sock *sk)
1214{
1215 lock_sock(sk);
1216 udp_flush_pending_frames(sk);
1217 release_sock(sk);
1218 return 0;
1219}
1220
1221/*
1222 * Socket option code for UDP
1223 */
1224static int udp_setsockopt(struct sock *sk, int level, int optname,
1225 char __user *optval, int optlen)
1226{
1227 struct udp_sock *up = udp_sk(sk);
1228 int val;
1229 int err = 0;
1230
1231 if (level != SOL_UDP)
1232 return ip_setsockopt(sk, level, optname, optval, optlen);
1233
1234 if(optlen<sizeof(int))
1235 return -EINVAL;
1236
1237 if (get_user(val, (int __user *)optval))
1238 return -EFAULT;
1239
1240 switch(optname) {
1241 case UDP_CORK:
1242 if (val != 0) {
1243 up->corkflag = 1;
1244 } else {
1245 up->corkflag = 0;
1246 lock_sock(sk);
1247 udp_push_pending_frames(sk, up);
1248 release_sock(sk);
1249 }
1250 break;
1251
1252 case UDP_ENCAP:
1253 switch (val) {
1254 case 0:
1255 case UDP_ENCAP_ESPINUDP:
1256 case UDP_ENCAP_ESPINUDP_NON_IKE:
1257 up->encap_type = val;
1258 break;
1259 default:
1260 err = -ENOPROTOOPT;
1261 break;
1262 }
1263 break;
1264
1265 default:
1266 err = -ENOPROTOOPT;
1267 break;
1268 };
1269
1270 return err;
1271}
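The UDP_CORK case mirrors the MSG_MORE handling in udp_sendmsg(): while the cork is set, ip_append_data() keeps accumulating the pieces, and a single datagram goes out when the cork is removed. A minimal userspace sketch, assuming UDP_CORK from linux/udp.h and an already connected UDP socket:

#include <linux/udp.h>     /* UDP_CORK */
#include <netinet/in.h>
#include <sys/socket.h>

static void send_as_one_datagram(int fd, const void *hdr, size_t hlen,
				 const void *body, size_t blen)
{
	int on = 1, off = 0;

	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
	send(fd, hdr, hlen, 0);    /* appended, not yet transmitted */
	send(fd, body, blen, 0);   /* appended to the same pending datagram */
	/* uncorking triggers udp_push_pending_frames() */
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
}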
1272
1273static int udp_getsockopt(struct sock *sk, int level, int optname,
1274 char __user *optval, int __user *optlen)
1275{
1276 struct udp_sock *up = udp_sk(sk);
1277 int val, len;
1278
1279 if (level != SOL_UDP)
1280 return ip_getsockopt(sk, level, optname, optval, optlen);
1281
1282 if(get_user(len,optlen))
1283 return -EFAULT;
1284
1285 len = min_t(unsigned int, len, sizeof(int));
1286
1287 if(len < 0)
1288 return -EINVAL;
1289
1290 switch(optname) {
1291 case UDP_CORK:
1292 val = up->corkflag;
1293 break;
1294
1295 case UDP_ENCAP:
1296 val = up->encap_type;
1297 break;
1298
1299 default:
1300 return -ENOPROTOOPT;
1301 };
1302
1303 if(put_user(len, optlen))
1304 return -EFAULT;
1305 if(copy_to_user(optval, &val,len))
1306 return -EFAULT;
1307 return 0;
1308}
1309
1310/**
1311 * udp_poll - wait for a UDP event.
1312 * @file: file struct
1313 * @sock: socket
1314 * @wait: poll table
1315 *
1316 * This is the same as datagram_poll(), except for the special case of
1317 * blocking sockets. If an application is using a blocking fd
1318 * and a packet with a checksum error is in the queue,
1319 * it could get a return from select() indicating data is available,
1320 * but then block when reading it. Add special-case code
1321 * to work around these arguably broken applications.
1322 */
1323unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
1324{
1325 unsigned int mask = datagram_poll(file, sock, wait);
1326 struct sock *sk = sock->sk;
1327
1328 /* Check for false positives due to checksum errors */
1329 if ( (mask & POLLRDNORM) &&
1330 !(file->f_flags & O_NONBLOCK) &&
1331 !(sk->sk_shutdown & RCV_SHUTDOWN)){
1332 struct sk_buff_head *rcvq = &sk->sk_receive_queue;
1333 struct sk_buff *skb;
1334
1335 spin_lock_irq(&rcvq->lock);
1336 while ((skb = skb_peek(rcvq)) != NULL) {
1337 if (udp_checksum_complete(skb)) {
1338 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1339 __skb_unlink(skb, rcvq);
1340 kfree_skb(skb);
1341 } else {
1342 skb->ip_summed = CHECKSUM_UNNECESSARY;
1343 break;
1344 }
1345 }
1346 spin_unlock_irq(&rcvq->lock);
1347
1348 /* nothing to see, move along */
1349 if (skb == NULL)
1350 mask &= ~(POLLIN | POLLRDNORM);
1351 }
1352
1353 return mask;
1354
1355}
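The workaround above discards bad-checksum datagrams at poll() time precisely because a blocking reader could otherwise sleep after select() reported data ready. The usual defensive pattern on the userspace side is to read non-blocking after a wakeup and treat EAGAIN as a spurious event; a hedged sketch:

#include <errno.h>
#include <poll.h>
#include <sys/socket.h>

/* Returns bytes read, 0 on a spurious wakeup, -1 on error. */
static long read_one_datagram(int fd, void *buf, size_t len)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	long n;

	if (poll(&pfd, 1, -1) <= 0)
		return -1;

	n = recv(fd, buf, len, MSG_DONTWAIT);
	if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
		return 0;   /* readiness was a false positive */
	return n;
}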
1356
1357struct proto udp_prot = {
1358 .name = "UDP",
1359 .owner = THIS_MODULE,
1360 .close = udp_close,
1361 .connect = ip4_datagram_connect,
1362 .disconnect = udp_disconnect,
1363 .ioctl = udp_ioctl,
1364 .destroy = udp_destroy_sock,
1365 .setsockopt = udp_setsockopt,
1366 .getsockopt = udp_getsockopt,
1367 .sendmsg = udp_sendmsg,
1368 .recvmsg = udp_recvmsg,
1369 .sendpage = udp_sendpage,
1370 .backlog_rcv = udp_queue_rcv_skb,
1371 .hash = udp_v4_hash,
1372 .unhash = udp_v4_unhash,
1373 .get_port = udp_v4_get_port,
1374 .obj_size = sizeof(struct udp_sock),
1375};
1376
1377/* ------------------------------------------------------------------------ */
1378#ifdef CONFIG_PROC_FS
1379
1380static struct sock *udp_get_first(struct seq_file *seq)
1381{
1382 struct sock *sk;
1383 struct udp_iter_state *state = seq->private;
1384
1385 for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
1386 struct hlist_node *node;
1387 sk_for_each(sk, node, &udp_hash[state->bucket]) {
1388 if (sk->sk_family == state->family)
1389 goto found;
1390 }
1391 }
1392 sk = NULL;
1393found:
1394 return sk;
1395}
1396
1397static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
1398{
1399 struct udp_iter_state *state = seq->private;
1400
1401 do {
1402 sk = sk_next(sk);
1403try_again:
1404 ;
1405 } while (sk && sk->sk_family != state->family);
1406
1407 if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
1408 sk = sk_head(&udp_hash[state->bucket]);
1409 goto try_again;
1410 }
1411 return sk;
1412}
1413
1414static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
1415{
1416 struct sock *sk = udp_get_first(seq);
1417
1418 if (sk)
1419 while(pos && (sk = udp_get_next(seq, sk)) != NULL)
1420 --pos;
1421 return pos ? NULL : sk;
1422}
1423
1424static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
1425{
1426 read_lock(&udp_hash_lock);
1427 return *pos ? udp_get_idx(seq, *pos-1) : (void *)1;
1428}
1429
1430static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1431{
1432 struct sock *sk;
1433
1434 if (v == (void *)1)
1435 sk = udp_get_idx(seq, 0);
1436 else
1437 sk = udp_get_next(seq, v);
1438
1439 ++*pos;
1440 return sk;
1441}
1442
1443static void udp_seq_stop(struct seq_file *seq, void *v)
1444{
1445 read_unlock(&udp_hash_lock);
1446}
1447
1448static int udp_seq_open(struct inode *inode, struct file *file)
1449{
1450 struct udp_seq_afinfo *afinfo = PDE(inode)->data;
1451 struct seq_file *seq;
1452 int rc = -ENOMEM;
1453 struct udp_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
1454
1455 if (!s)
1456 goto out;
1457 memset(s, 0, sizeof(*s));
1458 s->family = afinfo->family;
1459 s->seq_ops.start = udp_seq_start;
1460 s->seq_ops.next = udp_seq_next;
1461 s->seq_ops.show = afinfo->seq_show;
1462 s->seq_ops.stop = udp_seq_stop;
1463
1464 rc = seq_open(file, &s->seq_ops);
1465 if (rc)
1466 goto out_kfree;
1467
1468 seq = file->private_data;
1469 seq->private = s;
1470out:
1471 return rc;
1472out_kfree:
1473 kfree(s);
1474 goto out;
1475}
1476
1477/* ------------------------------------------------------------------------ */
1478int udp_proc_register(struct udp_seq_afinfo *afinfo)
1479{
1480 struct proc_dir_entry *p;
1481 int rc = 0;
1482
1483 if (!afinfo)
1484 return -EINVAL;
1485 afinfo->seq_fops->owner = afinfo->owner;
1486 afinfo->seq_fops->open = udp_seq_open;
1487 afinfo->seq_fops->read = seq_read;
1488 afinfo->seq_fops->llseek = seq_lseek;
1489 afinfo->seq_fops->release = seq_release_private;
1490
1491 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1492 if (p)
1493 p->data = afinfo;
1494 else
1495 rc = -ENOMEM;
1496 return rc;
1497}
1498
1499void udp_proc_unregister(struct udp_seq_afinfo *afinfo)
1500{
1501 if (!afinfo)
1502 return;
1503 proc_net_remove(afinfo->name);
1504 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1505}
1506
1507/* ------------------------------------------------------------------------ */
1508static void udp4_format_sock(struct sock *sp, char *tmpbuf, int bucket)
1509{
1510 struct inet_sock *inet = inet_sk(sp);
1511 unsigned int dest = inet->daddr;
1512 unsigned int src = inet->rcv_saddr;
1513 __u16 destp = ntohs(inet->dport);
1514 __u16 srcp = ntohs(inet->sport);
1515
1516 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1517 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p",
1518 bucket, src, srcp, dest, destp, sp->sk_state,
1519 atomic_read(&sp->sk_wmem_alloc),
1520 atomic_read(&sp->sk_rmem_alloc),
1521 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
1522 atomic_read(&sp->sk_refcnt), sp);
1523}
1524
1525static int udp4_seq_show(struct seq_file *seq, void *v)
1526{
1527 if (v == SEQ_START_TOKEN)
1528 seq_printf(seq, "%-127s\n",
1529 " sl local_address rem_address st tx_queue "
1530 "rx_queue tr tm->when retrnsmt uid timeout "
1531 "inode");
1532 else {
1533 char tmpbuf[129];
1534 struct udp_iter_state *state = seq->private;
1535
1536 udp4_format_sock(v, tmpbuf, state->bucket);
1537 seq_printf(seq, "%-127s\n", tmpbuf);
1538 }
1539 return 0;
1540}
1541
1542/* ------------------------------------------------------------------------ */
1543static struct file_operations udp4_seq_fops;
1544static struct udp_seq_afinfo udp4_seq_afinfo = {
1545 .owner = THIS_MODULE,
1546 .name = "udp",
1547 .family = AF_INET,
1548 .seq_show = udp4_seq_show,
1549 .seq_fops = &udp4_seq_fops,
1550};
1551
1552int __init udp4_proc_init(void)
1553{
1554 return udp_proc_register(&udp4_seq_afinfo);
1555}
1556
1557void udp4_proc_exit(void)
1558{
1559 udp_proc_unregister(&udp4_seq_afinfo);
1560}
1561#endif /* CONFIG_PROC_FS */
1562
1563EXPORT_SYMBOL(udp_disconnect);
1564EXPORT_SYMBOL(udp_hash);
1565EXPORT_SYMBOL(udp_hash_lock);
1566EXPORT_SYMBOL(udp_ioctl);
1567EXPORT_SYMBOL(udp_port_rover);
1568EXPORT_SYMBOL(udp_prot);
1569EXPORT_SYMBOL(udp_sendmsg);
1570EXPORT_SYMBOL(udp_poll);
1571
1572#ifdef CONFIG_PROC_FS
1573EXPORT_SYMBOL(udp_proc_register);
1574EXPORT_SYMBOL(udp_proc_unregister);
1575#endif
diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c
new file mode 100644
index 000000000000..6aecd7a43534
--- /dev/null
+++ b/net/ipv4/utils.c
@@ -0,0 +1,59 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Various kernel-resident INET utility functions; mainly
7 * for format conversion and debugging output.
8 *
9 * Version: $Id: utils.c,v 1.8 2000/10/03 07:29:01 anton Exp $
10 *
11 * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *
13 * Fixes:
14 * Alan Cox : verify_area check.
15 * Alan Cox : removed old debugging.
16 * Andi Kleen : add net_ratelimit()
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24#include <linux/module.h>
25#include <linux/types.h>
26#include <asm/byteorder.h>
27
28/*
29 * Convert an ASCII string to binary IP.
30 */
31
32__u32 in_aton(const char *str)
33{
34 unsigned long l;
35 unsigned int val;
36 int i;
37
38 l = 0;
39 for (i = 0; i < 4; i++)
40 {
41 l <<= 8;
42 if (*str != '\0')
43 {
44 val = 0;
45 while (*str != '\0' && *str != '.')
46 {
47 val *= 10;
48 val += *str - '0';
49 str++;
50 }
51 l |= val;
52 if (*str != '\0')
53 str++;
54 }
55 }
56 return(htonl(l));
57}
58
59EXPORT_SYMBOL(in_aton);
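in_aton() is the kernel's own minimal dotted-quad parser (used, for example, for the ip= boot parameter) and performs no validation. A hedged userspace restatement of the same conversion, shown only to illustrate what it computes:

#include <arpa/inet.h>   /* htonl() */
#include <stdint.h>

/* Userspace restatement of the conversion above: each dot-separated
 * decimal field becomes one byte, result in network byte order.
 * Like the kernel helper, it does not reject malformed input. */
static uint32_t my_in_aton(const char *str)
{
	unsigned long l = 0;
	int i;

	for (i = 0; i < 4; i++) {
		unsigned int val = 0;

		l <<= 8;
		while (*str != '\0' && *str != '.')
			val = val * 10 + (*str++ - '0');
		l |= val;
		if (*str != '\0')
			str++;
	}
	return htonl(l);
}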
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
new file mode 100644
index 000000000000..2d3849c38a0f
--- /dev/null
+++ b/net/ipv4/xfrm4_input.c
@@ -0,0 +1,160 @@
1/*
2 * xfrm4_input.c
3 *
4 * Changes:
5 * YOSHIFUJI Hideaki @USAGI
6 * Split up af-specific portion
7 * Derek Atkins <derek@ihtfp.com>
8 * Add Encapsulation support
9 *
10 */
11
12#include <linux/module.h>
13#include <linux/string.h>
14#include <net/inet_ecn.h>
15#include <net/ip.h>
16#include <net/xfrm.h>
17
18int xfrm4_rcv(struct sk_buff *skb)
19{
20 return xfrm4_rcv_encap(skb, 0);
21}
22
23EXPORT_SYMBOL(xfrm4_rcv);
24
25static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
26{
27 struct iphdr *outer_iph = skb->nh.iph;
28 struct iphdr *inner_iph = skb->h.ipiph;
29
30 if (INET_ECN_is_ce(outer_iph->tos))
31 IP_ECN_set_ce(inner_iph);
32}
33
34static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
35{
36 switch (nexthdr) {
37 case IPPROTO_IPIP:
38 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
39 return -EINVAL;
40 *spi = skb->nh.iph->saddr;
41 *seq = 0;
42 return 0;
43 }
44
45 return xfrm_parse_spi(skb, nexthdr, spi, seq);
46}
47
48int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
49{
50 int err;
51 u32 spi, seq;
52 struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH];
53 struct xfrm_state *x;
54 int xfrm_nr = 0;
55 int decaps = 0;
56
57 if ((err = xfrm4_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) != 0)
58 goto drop;
59
60 do {
61 struct iphdr *iph = skb->nh.iph;
62
63 if (xfrm_nr == XFRM_MAX_DEPTH)
64 goto drop;
65
66 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, iph->protocol, AF_INET);
67 if (x == NULL)
68 goto drop;
69
70 spin_lock(&x->lock);
71 if (unlikely(x->km.state != XFRM_STATE_VALID))
72 goto drop_unlock;
73
74 if (x->props.replay_window && xfrm_replay_check(x, seq))
75 goto drop_unlock;
76
77 if (xfrm_state_check_expire(x))
78 goto drop_unlock;
79
80 xfrm_vec[xfrm_nr].decap.decap_type = encap_type;
81 if (x->type->input(x, &(xfrm_vec[xfrm_nr].decap), skb))
82 goto drop_unlock;
83
84 /* only the first xfrm gets the encap type */
85 encap_type = 0;
86
87 if (x->props.replay_window)
88 xfrm_replay_advance(x, seq);
89
90 x->curlft.bytes += skb->len;
91 x->curlft.packets++;
92
93 spin_unlock(&x->lock);
94
95 xfrm_vec[xfrm_nr++].xvec = x;
96
97 iph = skb->nh.iph;
98
99 if (x->props.mode) {
100 if (iph->protocol != IPPROTO_IPIP)
101 goto drop;
102 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
103 goto drop;
104 if (skb_cloned(skb) &&
105 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
106 goto drop;
107 if (x->props.flags & XFRM_STATE_DECAP_DSCP)
108 ipv4_copy_dscp(iph, skb->h.ipiph);
109 if (!(x->props.flags & XFRM_STATE_NOECN))
110 ipip_ecn_decapsulate(skb);
111 skb->mac.raw = memmove(skb->data - skb->mac_len,
112 skb->mac.raw, skb->mac_len);
113 skb->nh.raw = skb->data;
114 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
115 decaps = 1;
116 break;
117 }
118
119 if ((err = xfrm_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) < 0)
120 goto drop;
121 } while (!err);
122
123 /* Allocate new secpath or COW existing one. */
124
125 if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
126 struct sec_path *sp;
127 sp = secpath_dup(skb->sp);
128 if (!sp)
129 goto drop;
130 if (skb->sp)
131 secpath_put(skb->sp);
132 skb->sp = sp;
133 }
134 if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH)
135 goto drop;
136
137 memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state));
138 skb->sp->len += xfrm_nr;
139
140 if (decaps) {
141 if (!(skb->dev->flags&IFF_LOOPBACK)) {
142 dst_release(skb->dst);
143 skb->dst = NULL;
144 }
145 netif_rx(skb);
146 return 0;
147 } else {
148 return -skb->nh.iph->protocol;
149 }
150
151drop_unlock:
152 spin_unlock(&x->lock);
153 xfrm_state_put(x);
154drop:
155 while (--xfrm_nr >= 0)
156 xfrm_state_put(xfrm_vec[xfrm_nr].xvec);
157
158 kfree_skb(skb);
159 return 0;
160}
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
new file mode 100644
index 000000000000..af2392ae5769
--- /dev/null
+++ b/net/ipv4/xfrm4_output.c
@@ -0,0 +1,141 @@
1/*
2 * xfrm4_output.c - Common IPsec encapsulation code for IPv4.
3 * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 */
10
11#include <linux/skbuff.h>
12#include <linux/spinlock.h>
13#include <net/inet_ecn.h>
14#include <net/ip.h>
15#include <net/xfrm.h>
16#include <net/icmp.h>
17
18/* Add encapsulation header.
19 *
20 * In transport mode, the IP header will be moved forward to make space
21 * for the encapsulation header.
22 *
23 * In tunnel mode, the top IP header will be constructed per RFC 2401.
24 * The following fields in it shall be filled in by x->type->output:
25 * tot_len
26 * check
27 *
28 * On exit, skb->h will be set to the start of the payload to be processed
29 * by x->type->output and skb->nh will be set to the top IP header.
30 */
31static void xfrm4_encap(struct sk_buff *skb)
32{
33 struct dst_entry *dst = skb->dst;
34 struct xfrm_state *x = dst->xfrm;
35 struct iphdr *iph, *top_iph;
36
37 iph = skb->nh.iph;
38 skb->h.ipiph = iph;
39
40 skb->nh.raw = skb_push(skb, x->props.header_len);
41 top_iph = skb->nh.iph;
42
43 if (!x->props.mode) {
44 skb->h.raw += iph->ihl*4;
45 memmove(top_iph, iph, iph->ihl*4);
46 return;
47 }
48
49 top_iph->ihl = 5;
50 top_iph->version = 4;
51
52 /* DS disclosed */
53 top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
54 if (x->props.flags & XFRM_STATE_NOECN)
55 IP_ECN_clear(top_iph);
56
57 top_iph->frag_off = iph->frag_off & htons(IP_DF);
58 if (!top_iph->frag_off)
59 __ip_select_ident(top_iph, dst, 0);
60
61 top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
62
63 top_iph->saddr = x->props.saddr.a4;
64 top_iph->daddr = x->id.daddr.a4;
65 top_iph->protocol = IPPROTO_IPIP;
66
67 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
68}
69
70static int xfrm4_tunnel_check_size(struct sk_buff *skb)
71{
72 int mtu, ret = 0;
73 struct dst_entry *dst;
74 struct iphdr *iph = skb->nh.iph;
75
76 if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
77 goto out;
78
79 IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
80
81 if (!(iph->frag_off & htons(IP_DF)) || skb->local_df)
82 goto out;
83
84 dst = skb->dst;
85 mtu = dst_mtu(dst);
86 if (skb->len > mtu) {
87 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
88 ret = -EMSGSIZE;
89 }
90out:
91 return ret;
92}
93
94int xfrm4_output(struct sk_buff *skb)
95{
96 struct dst_entry *dst = skb->dst;
97 struct xfrm_state *x = dst->xfrm;
98 int err;
99
100 if (skb->ip_summed == CHECKSUM_HW) {
101 err = skb_checksum_help(skb, 0);
102 if (err)
103 goto error_nolock;
104 }
105
106 if (x->props.mode) {
107 err = xfrm4_tunnel_check_size(skb);
108 if (err)
109 goto error_nolock;
110 }
111
112 spin_lock_bh(&x->lock);
113 err = xfrm_state_check(x, skb);
114 if (err)
115 goto error;
116
117 xfrm4_encap(skb);
118
119 err = x->type->output(x, skb);
120 if (err)
121 goto error;
122
123 x->curlft.bytes += skb->len;
124 x->curlft.packets++;
125
126 spin_unlock_bh(&x->lock);
127
128 if (!(skb->dst = dst_pop(dst))) {
129 err = -EHOSTUNREACH;
130 goto error_nolock;
131 }
132 err = NET_XMIT_BYPASS;
133
134out_exit:
135 return err;
136error:
137 spin_unlock_bh(&x->lock);
138error_nolock:
139 kfree_skb(skb);
140 goto out_exit;
141}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
new file mode 100644
index 000000000000..7fe2afd2e669
--- /dev/null
+++ b/net/ipv4/xfrm4_policy.c
@@ -0,0 +1,281 @@
1/*
2 * xfrm4_policy.c
3 *
4 * Changes:
5 * Kazunori MIYAZAWA @USAGI
6 * YOSHIFUJI Hideaki @USAGI
7 * Split up af-specific portion
8 *
9 */
10
11#include <linux/config.h>
12#include <net/xfrm.h>
13#include <net/ip.h>
14
15static struct dst_ops xfrm4_dst_ops;
16static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
17
18static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED };
19
20static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
21{
22 return __ip_route_output_key((struct rtable**)dst, fl);
23}
24
25static struct dst_entry *
26__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
27{
28 struct dst_entry *dst;
29
30 read_lock_bh(&policy->lock);
31 for (dst = policy->bundles; dst; dst = dst->next) {
32 struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
33 if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/
34 xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
35 xdst->u.rt.fl.fl4_src == fl->fl4_src &&
36 xfrm_bundle_ok(xdst, fl, AF_INET)) {
37 dst_clone(dst);
38 break;
39 }
40 }
41 read_unlock_bh(&policy->lock);
42 return dst;
43}
44
45/* Allocate chain of dst_entry's, attach known xfrm's, calculate
 46 * all the metrics... In short, bundle a bundle.
47 */
48
49static int
50__xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
51 struct flowi *fl, struct dst_entry **dst_p)
52{
53 struct dst_entry *dst, *dst_prev;
54 struct rtable *rt0 = (struct rtable*)(*dst_p);
55 struct rtable *rt = rt0;
56 u32 remote = fl->fl4_dst;
57 u32 local = fl->fl4_src;
58 struct flowi fl_tunnel = {
59 .nl_u = {
60 .ip4_u = {
61 .saddr = local,
62 .daddr = remote
63 }
64 }
65 };
66 int i;
67 int err;
68 int header_len = 0;
69 int trailer_len = 0;
70
71 dst = dst_prev = NULL;
72 dst_hold(&rt->u.dst);
73
74 for (i = 0; i < nx; i++) {
75 struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops);
76 struct xfrm_dst *xdst;
77 int tunnel = 0;
78
79 if (unlikely(dst1 == NULL)) {
80 err = -ENOBUFS;
81 dst_release(&rt->u.dst);
82 goto error;
83 }
84
85 if (!dst)
86 dst = dst1;
87 else {
88 dst_prev->child = dst1;
89 dst1->flags |= DST_NOHASH;
90 dst_clone(dst1);
91 }
92
93 xdst = (struct xfrm_dst *)dst1;
94 xdst->route = &rt->u.dst;
95
96 dst1->next = dst_prev;
97 dst_prev = dst1;
98 if (xfrm[i]->props.mode) {
99 remote = xfrm[i]->id.daddr.a4;
100 local = xfrm[i]->props.saddr.a4;
101 tunnel = 1;
102 }
103 header_len += xfrm[i]->props.header_len;
104 trailer_len += xfrm[i]->props.trailer_len;
105
106 if (tunnel) {
107 fl_tunnel.fl4_src = local;
108 fl_tunnel.fl4_dst = remote;
109 err = xfrm_dst_lookup((struct xfrm_dst **)&rt,
110 &fl_tunnel, AF_INET);
111 if (err)
112 goto error;
113 } else
114 dst_hold(&rt->u.dst);
115 }
116
117 dst_prev->child = &rt->u.dst;
118 dst->path = &rt->u.dst;
119
120 *dst_p = dst;
121 dst = dst_prev;
122
123 dst_prev = *dst_p;
124 i = 0;
125 for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
126 struct xfrm_dst *x = (struct xfrm_dst*)dst_prev;
127 x->u.rt.fl = *fl;
128
129 dst_prev->xfrm = xfrm[i++];
130 dst_prev->dev = rt->u.dst.dev;
131 if (rt->u.dst.dev)
132 dev_hold(rt->u.dst.dev);
133 dst_prev->obsolete = -1;
134 dst_prev->flags |= DST_HOST;
135 dst_prev->lastuse = jiffies;
136 dst_prev->header_len = header_len;
137 dst_prev->trailer_len = trailer_len;
138 memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
139
140		/* Copy neighbour for reachability confirmation */
141 dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour);
142 dst_prev->input = rt->u.dst.input;
143 dst_prev->output = xfrm4_output;
144 if (rt->peer)
145 atomic_inc(&rt->peer->refcnt);
146 x->u.rt.peer = rt->peer;
147 /* Sheit... I remember I did this right. Apparently,
148 * it was magically lost, so this code needs audit */
149 x->u.rt.rt_flags = rt0->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL);
150 x->u.rt.rt_type = rt->rt_type;
151 x->u.rt.rt_src = rt0->rt_src;
152 x->u.rt.rt_dst = rt0->rt_dst;
153 x->u.rt.rt_gateway = rt->rt_gateway;
154 x->u.rt.rt_spec_dst = rt0->rt_spec_dst;
155 header_len -= x->u.dst.xfrm->props.header_len;
156 trailer_len -= x->u.dst.xfrm->props.trailer_len;
157 }
158
159 xfrm_init_pmtu(dst);
160 return 0;
161
162error:
163 if (dst)
164 dst_free(dst);
165 return err;
166}
167
168static void
169_decode_session4(struct sk_buff *skb, struct flowi *fl)
170{
171 struct iphdr *iph = skb->nh.iph;
172 u8 *xprth = skb->nh.raw + iph->ihl*4;
173
174 memset(fl, 0, sizeof(struct flowi));
175 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
176 switch (iph->protocol) {
177 case IPPROTO_UDP:
178 case IPPROTO_TCP:
179 case IPPROTO_SCTP:
180 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
181 u16 *ports = (u16 *)xprth;
182
183 fl->fl_ip_sport = ports[0];
184 fl->fl_ip_dport = ports[1];
185 }
186 break;
187
188 case IPPROTO_ICMP:
189 if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
190 u8 *icmp = xprth;
191
192 fl->fl_icmp_type = icmp[0];
193 fl->fl_icmp_code = icmp[1];
194 }
195 break;
196
197 case IPPROTO_ESP:
198 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
199 u32 *ehdr = (u32 *)xprth;
200
201 fl->fl_ipsec_spi = ehdr[0];
202 }
203 break;
204
205 case IPPROTO_AH:
206 if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
207 u32 *ah_hdr = (u32*)xprth;
208
209 fl->fl_ipsec_spi = ah_hdr[1];
210 }
211 break;
212
213 case IPPROTO_COMP:
214 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
215 u16 *ipcomp_hdr = (u16 *)xprth;
216
217 fl->fl_ipsec_spi = ntohl(ntohs(ipcomp_hdr[1]));
218 }
219 break;
220 default:
221 fl->fl_ipsec_spi = 0;
222 break;
223 };
224 }
225 fl->proto = iph->protocol;
226 fl->fl4_dst = iph->daddr;
227 fl->fl4_src = iph->saddr;
228}
229
230static inline int xfrm4_garbage_collect(void)
231{
232 read_lock(&xfrm4_policy_afinfo.lock);
233 xfrm4_policy_afinfo.garbage_collect();
234 read_unlock(&xfrm4_policy_afinfo.lock);
235 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
236}
237
238static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
239{
240 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
241 struct dst_entry *path = xdst->route;
242
243 path->ops->update_pmtu(path, mtu);
244}
245
246static struct dst_ops xfrm4_dst_ops = {
247 .family = AF_INET,
248 .protocol = __constant_htons(ETH_P_IP),
249 .gc = xfrm4_garbage_collect,
250 .update_pmtu = xfrm4_update_pmtu,
251 .gc_thresh = 1024,
252 .entry_size = sizeof(struct xfrm_dst),
253};
254
255static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
256 .family = AF_INET,
257 .lock = RW_LOCK_UNLOCKED,
258 .type_map = &xfrm4_type_map,
259 .dst_ops = &xfrm4_dst_ops,
260 .dst_lookup = xfrm4_dst_lookup,
261 .find_bundle = __xfrm4_find_bundle,
262 .bundle_create = __xfrm4_bundle_create,
263 .decode_session = _decode_session4,
264};
265
266static void __init xfrm4_policy_init(void)
267{
268 xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
269}
270
271static void __exit xfrm4_policy_fini(void)
272{
273 xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
274}
275
276void __init xfrm4_init(void)
277{
278 xfrm4_state_init();
279 xfrm4_policy_init();
280}
281
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
new file mode 100644
index 000000000000..223a2e83853f
--- /dev/null
+++ b/net/ipv4/xfrm4_state.c
@@ -0,0 +1,126 @@
1/*
2 * xfrm4_state.c
3 *
4 * Changes:
5 * YOSHIFUJI Hideaki @USAGI
6 * Split up af-specific portion
7 *
8 */
9
10#include <net/xfrm.h>
11#include <linux/pfkeyv2.h>
12#include <linux/ipsec.h>
13
14static struct xfrm_state_afinfo xfrm4_state_afinfo;
15
16static void
17__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
18 struct xfrm_tmpl *tmpl,
19 xfrm_address_t *daddr, xfrm_address_t *saddr)
20{
21 x->sel.daddr.a4 = fl->fl4_dst;
22 x->sel.saddr.a4 = fl->fl4_src;
23 x->sel.dport = xfrm_flowi_dport(fl);
24 x->sel.dport_mask = ~0;
25 x->sel.sport = xfrm_flowi_sport(fl);
26 x->sel.sport_mask = ~0;
27 x->sel.prefixlen_d = 32;
28 x->sel.prefixlen_s = 32;
29 x->sel.proto = fl->proto;
30 x->sel.ifindex = fl->oif;
31 x->id = tmpl->id;
32 if (x->id.daddr.a4 == 0)
33 x->id.daddr.a4 = daddr->a4;
34 x->props.saddr = tmpl->saddr;
35 if (x->props.saddr.a4 == 0)
36 x->props.saddr.a4 = saddr->a4;
37 x->props.mode = tmpl->mode;
38 x->props.reqid = tmpl->reqid;
39 x->props.family = AF_INET;
40}
41
42static struct xfrm_state *
43__xfrm4_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto)
44{
45 unsigned h = __xfrm4_spi_hash(daddr, spi, proto);
46 struct xfrm_state *x;
47
48 list_for_each_entry(x, xfrm4_state_afinfo.state_byspi+h, byspi) {
49 if (x->props.family == AF_INET &&
50 spi == x->id.spi &&
51 daddr->a4 == x->id.daddr.a4 &&
52 proto == x->id.proto) {
53 xfrm_state_hold(x);
54 return x;
55 }
56 }
57 return NULL;
58}
59
60static struct xfrm_state *
61__xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
62 xfrm_address_t *daddr, xfrm_address_t *saddr,
63 int create)
64{
65 struct xfrm_state *x, *x0;
66 unsigned h = __xfrm4_dst_hash(daddr);
67
68 x0 = NULL;
69
70 list_for_each_entry(x, xfrm4_state_afinfo.state_bydst+h, bydst) {
71 if (x->props.family == AF_INET &&
72 daddr->a4 == x->id.daddr.a4 &&
73 mode == x->props.mode &&
74 proto == x->id.proto &&
75 saddr->a4 == x->props.saddr.a4 &&
76 reqid == x->props.reqid &&
77 x->km.state == XFRM_STATE_ACQ &&
78 !x->id.spi) {
79 x0 = x;
80 break;
81 }
82 }
83 if (!x0 && create && (x0 = xfrm_state_alloc()) != NULL) {
84 x0->sel.daddr.a4 = daddr->a4;
85 x0->sel.saddr.a4 = saddr->a4;
86 x0->sel.prefixlen_d = 32;
87 x0->sel.prefixlen_s = 32;
88 x0->props.saddr.a4 = saddr->a4;
89 x0->km.state = XFRM_STATE_ACQ;
90 x0->id.daddr.a4 = daddr->a4;
91 x0->id.proto = proto;
92 x0->props.family = AF_INET;
93 x0->props.mode = mode;
94 x0->props.reqid = reqid;
95 x0->props.family = AF_INET;
96 x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
97 xfrm_state_hold(x0);
98 x0->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
99 add_timer(&x0->timer);
100 xfrm_state_hold(x0);
101 list_add_tail(&x0->bydst, xfrm4_state_afinfo.state_bydst+h);
102 wake_up(&km_waitq);
103 }
104 if (x0)
105 xfrm_state_hold(x0);
106 return x0;
107}
108
109static struct xfrm_state_afinfo xfrm4_state_afinfo = {
110 .family = AF_INET,
111 .lock = RW_LOCK_UNLOCKED,
112 .init_tempsel = __xfrm4_init_tempsel,
113 .state_lookup = __xfrm4_state_lookup,
114 .find_acq = __xfrm4_find_acq,
115};
116
117void __init xfrm4_state_init(void)
118{
119 xfrm_state_register_afinfo(&xfrm4_state_afinfo);
120}
121
122void __exit xfrm4_state_fini(void)
123{
124 xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
125}
126
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
new file mode 100644
index 000000000000..413191f585f6
--- /dev/null
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -0,0 +1,144 @@
1/* xfrm4_tunnel.c: Generic IP tunnel transformer.
2 *
3 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
4 */
5
6#include <linux/skbuff.h>
7#include <linux/module.h>
8#include <net/xfrm.h>
9#include <net/ip.h>
10#include <net/protocol.h>
11
12static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
13{
14 struct iphdr *iph;
15
16 iph = skb->nh.iph;
17 iph->tot_len = htons(skb->len);
18 ip_send_check(iph);
19
20 return 0;
21}
22
23static int ipip_xfrm_rcv(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
24{
25 return 0;
26}
27
28static struct xfrm_tunnel *ipip_handler;
29static DECLARE_MUTEX(xfrm4_tunnel_sem);
30
31int xfrm4_tunnel_register(struct xfrm_tunnel *handler)
32{
33 int ret;
34
35 down(&xfrm4_tunnel_sem);
36 ret = 0;
37 if (ipip_handler != NULL)
38 ret = -EINVAL;
39 if (!ret)
40 ipip_handler = handler;
41 up(&xfrm4_tunnel_sem);
42
43 return ret;
44}
45
46EXPORT_SYMBOL(xfrm4_tunnel_register);
47
48int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler)
49{
50 int ret;
51
52 down(&xfrm4_tunnel_sem);
53 ret = 0;
54 if (ipip_handler != handler)
55 ret = -EINVAL;
56 if (!ret)
57 ipip_handler = NULL;
58 up(&xfrm4_tunnel_sem);
59
60 synchronize_net();
61
62 return ret;
63}
64
65EXPORT_SYMBOL(xfrm4_tunnel_deregister);
66
67static int ipip_rcv(struct sk_buff *skb)
68{
69 struct xfrm_tunnel *handler = ipip_handler;
70
71 /* Tunnel devices take precedence. */
72 if (handler && handler->handler(skb) == 0)
73 return 0;
74
75 return xfrm4_rcv(skb);
76}
77
78static void ipip_err(struct sk_buff *skb, u32 info)
79{
80 struct xfrm_tunnel *handler = ipip_handler;
81 u32 arg = info;
82
83 if (handler)
84 handler->err_handler(skb, &arg);
85}
86
87static int ipip_init_state(struct xfrm_state *x, void *args)
88{
89 if (!x->props.mode)
90 return -EINVAL;
91
92 if (x->encap)
93 return -EINVAL;
94
95 x->props.header_len = sizeof(struct iphdr);
96
97 return 0;
98}
99
100static void ipip_destroy(struct xfrm_state *x)
101{
102}
103
104static struct xfrm_type ipip_type = {
105 .description = "IPIP",
106 .owner = THIS_MODULE,
107 .proto = IPPROTO_IPIP,
108 .init_state = ipip_init_state,
109 .destructor = ipip_destroy,
110 .input = ipip_xfrm_rcv,
111 .output = ipip_output
112};
113
114static struct net_protocol ipip_protocol = {
115 .handler = ipip_rcv,
116 .err_handler = ipip_err,
117 .no_policy = 1,
118};
119
120static int __init ipip_init(void)
121{
122 if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
123 printk(KERN_INFO "ipip init: can't add xfrm type\n");
124 return -EAGAIN;
125 }
126 if (inet_add_protocol(&ipip_protocol, IPPROTO_IPIP) < 0) {
127 printk(KERN_INFO "ipip init: can't add protocol\n");
128 xfrm_unregister_type(&ipip_type, AF_INET);
129 return -EAGAIN;
130 }
131 return 0;
132}
133
134static void __exit ipip_fini(void)
135{
136 if (inet_del_protocol(&ipip_protocol, IPPROTO_IPIP) < 0)
137 printk(KERN_INFO "ipip close: can't remove protocol\n");
138 if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
139 printk(KERN_INFO "ipip close: can't remove xfrm type\n");
140}
141
142module_init(ipip_init);
143module_exit(ipip_fini);
144MODULE_LICENSE("GPL");