path: root/net/ipv4/ipvs
author    Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit    1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv4/ipvs
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'net/ipv4/ipvs')
-rw-r--r--  net/ipv4/ipvs/Kconfig              244
-rw-r--r--  net/ipv4/ipvs/Makefile              34
-rw-r--r--  net/ipv4/ipvs/ip_vs_app.c          658
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c         920
-rw-r--r--  net/ipv4/ipvs/ip_vs_core.c        1191
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c         2391
-rw-r--r--  net/ipv4/ipvs/ip_vs_dh.c           258
-rw-r--r--  net/ipv4/ipvs/ip_vs_est.c          200
-rw-r--r--  net/ipv4/ipvs/ip_vs_ftp.c          400
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblc.c         624
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblcr.c        888
-rw-r--r--  net/ipv4/ipvs/ip_vs_lc.c           123
-rw-r--r--  net/ipv4/ipvs/ip_vs_nq.c           161
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto.c        244
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_ah.c     177
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_esp.c    175
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_icmp.c   182
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_tcp.c    640
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_udp.c    427
-rw-r--r--  net/ipv4/ipvs/ip_vs_rr.c           118
-rw-r--r--  net/ipv4/ipvs/ip_vs_sched.c        251
-rw-r--r--  net/ipv4/ipvs/ip_vs_sed.c          163
-rw-r--r--  net/ipv4/ipvs/ip_vs_sh.c           255
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c         892
-rw-r--r--  net/ipv4/ipvs/ip_vs_wlc.c          151
-rw-r--r--  net/ipv4/ipvs/ip_vs_wrr.c          235
-rw-r--r--  net/ipv4/ipvs/ip_vs_xmit.c         563
27 files changed, 12565 insertions, 0 deletions
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
new file mode 100644
index 000000000000..63a82b4b64bb
--- /dev/null
+++ b/net/ipv4/ipvs/Kconfig
@@ -0,0 +1,244 @@
1#
2# IP Virtual Server configuration
3#
4menu "IP: Virtual Server Configuration"
5 depends on INET && NETFILTER
6
7config IP_VS
8 tristate "IP virtual server support (EXPERIMENTAL)"
9 depends on INET && NETFILTER
10 ---help---
11 IP Virtual Server support will let you build a high-performance
 12 virtual server based on a cluster of two or more real servers. This
13 option must be enabled for at least one of the clustered computers
14 that will take care of intercepting incoming connections to a
15 single IP address and scheduling them to real servers.
16
 17 Three request dispatching techniques are implemented: virtual
 18 server via NAT, virtual server via tunneling, and virtual server
 19 via direct routing. Several scheduling algorithms can be used to
 20 choose which server a connection is directed to, so that load is
 21 balanced among the servers. For more
22 information and its administration program, please visit the
23 following URL: <http://www.linuxvirtualserver.org/>.
24
 25 If you want to compile it into the kernel, say Y. To compile it as a
26 module, choose M here. If unsure, say N.
27
28config IP_VS_DEBUG
29 bool "IP virtual server debugging"
30 depends on IP_VS
31 ---help---
32 Say Y here if you want to get additional messages useful in
33 debugging the IP virtual server code. You can change the debug
34 level in /proc/sys/net/ipv4/vs/debug_level
35
36config IP_VS_TAB_BITS
37 int "IPVS connection table size (the Nth power of 2)"
38 depends on IP_VS
39 default "12"
40 ---help---
41 The IPVS connection hash table uses the chaining scheme to handle
42 hash collisions. Using a big IPVS connection hash table will greatly
43 reduce conflicts when there are hundreds of thousands of connections
44 in the hash table.
45
 46 Note that the table size must be a power of 2. The table size will
 47 be 2 raised to the power of the number you enter here. The number to
 48 choose is from 8 to 20; the default is 12, which means a table size
 49 of 4096. Don't choose a number that is too small, or you will lose
 50 performance. You can adapt the table size to your virtual server
 51 application: it is good to set the table size not far below the
 52 number of connections per second multiplied by the average time a
 53 connection stays in the table. For example, if your virtual server
 54 gets 200 connections per second and a connection stays in the table
 55 for 200 seconds on average, the table size should be not far below
 56 200x200, so setting the table size to 32768 (2**15) is a good
 57 choice.
58
 59 Note also that each connection effectively occupies 128 bytes and
 60 each hash table entry uses 8 bytes, so you can estimate how much
 61 memory is needed for your box.
62
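
 To make the sizing rule above concrete (illustrative figures only): a box
 receiving 200 connections per second, with connections staying in the table
 for 200 seconds on average, needs room for roughly 200 x 200 = 40000
 concurrent entries, so a value of 15 (2**15 = 32768 buckets) is a sensible
 choice there. The memory cost can then be estimated as 32768 x 8 bytes for
 the table itself (about 256 Kbytes) plus about 128 bytes per tracked
 connection (around 5 Mbytes for 40000 connections).
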
63comment "IPVS transport protocol load balancing support"
64 depends on IP_VS
65
66config IP_VS_PROTO_TCP
67 bool "TCP load balancing support"
68 depends on IP_VS
69 ---help---
 70 This option enables support for load balancing the TCP transport
 71 protocol. Say Y if unsure.
72
73config IP_VS_PROTO_UDP
74 bool "UDP load balancing support"
75 depends on IP_VS
76 ---help---
 77 This option enables support for load balancing the UDP transport
 78 protocol. Say Y if unsure.
79
80config IP_VS_PROTO_ESP
81 bool "ESP load balancing support"
82 depends on IP_VS
83 ---help---
 84 This option enables support for load balancing the ESP (Encapsulating
 85 Security Payload) transport protocol. Say Y if unsure.
86
87config IP_VS_PROTO_AH
88 bool "AH load balancing support"
89 depends on IP_VS
90 ---help---
 91 This option enables support for load balancing the AH (Authentication
 92 Header) transport protocol. Say Y if unsure.
93
94comment "IPVS scheduler"
95 depends on IP_VS
96
97config IP_VS_RR
98 tristate "round-robin scheduling"
99 depends on IP_VS
100 ---help---
 101 The round-robin scheduling algorithm simply directs network
102 connections to different real servers in a round-robin manner.
103
 104 If you want to compile it into the kernel, say Y. To compile it as a
105 module, choose M here. If unsure, say N.
106
107config IP_VS_WRR
108 tristate "weighted round-robin scheduling"
109 depends on IP_VS
110 ---help---
 111 The weighted round-robin scheduling algorithm directs network
 112 connections to different real servers based on server weights
 113 in a round-robin manner. Servers with higher weights receive
 114 new connections before those with lower weights, servers with
 115 higher weights get more connections than those with lower weights,
 116 and servers with equal weights get an equal share of connections.
117
 118 If you want to compile it into the kernel, say Y. To compile it as a
119 module, choose M here. If unsure, say N.
120
121config IP_VS_LC
122 tristate "least-connection scheduling"
123 depends on IP_VS
124 ---help---
125 The least-connection scheduling algorithm directs network
126 connections to the server with the least number of active
127 connections.
128
 129 If you want to compile it into the kernel, say Y. To compile it as a
130 module, choose M here. If unsure, say N.
131
132config IP_VS_WLC
133 tristate "weighted least-connection scheduling"
134 depends on IP_VS
135 ---help---
136 The weighted least-connection scheduling algorithm directs network
137 connections to the server with the least active connections
138 normalized by the server weight.
139
 140 If you want to compile it into the kernel, say Y. To compile it as a
141 module, choose M here. If unsure, say N.
142
143config IP_VS_LBLC
144 tristate "locality-based least-connection scheduling"
145 depends on IP_VS
146 ---help---
 147 The locality-based least-connection scheduling algorithm is for
 148 destination-IP load balancing. It is usually used in cache clusters.
 149 This algorithm normally directs packets destined for an IP address
 150 to its assigned server if that server is alive and not overloaded.
 151 If the server is overloaded (its number of active connections is
 152 larger than its weight) and there is a server running at half of
 153 its load, the weighted least-connection server is assigned to this IP address.
154
 155 If you want to compile it into the kernel, say Y. To compile it as a
156 module, choose M here. If unsure, say N.
157
158config IP_VS_LBLCR
159 tristate "locality-based least-connection with replication scheduling"
160 depends on IP_VS
161 ---help---
 162 The locality-based least-connection with replication scheduling
 163 algorithm is also for destination-IP load balancing. It is
 164 usually used in cache clusters. It differs from LBLC scheduling
 165 as follows: the load balancer maintains mappings from a target
 166 to a set of server nodes that can serve the target. Requests for
 167 a target are assigned to the least-connection node in the target's
 168 server set. If all the nodes in the server set are overloaded,
 169 it picks the least-connection node in the cluster and adds it
 170 to the server set for the target. If the server set has not been
 171 modified for the specified time, the most loaded node is removed
 172 from the server set, in order to avoid a high degree of replication.
173
 174 If you want to compile it into the kernel, say Y. To compile it as a
175 module, choose M here. If unsure, say N.
176
177config IP_VS_DH
178 tristate "destination hashing scheduling"
179 depends on IP_VS
180 ---help---
 181 The destination hashing scheduling algorithm assigns network
 182 connections to the servers by looking up a statically assigned
 183 hash table by their destination IP addresses.
184
 185 If you want to compile it into the kernel, say Y. To compile it as a
186 module, choose M here. If unsure, say N.
187
188config IP_VS_SH
189 tristate "source hashing scheduling"
190 depends on IP_VS
191 ---help---
 192 The source hashing scheduling algorithm assigns network
 193 connections to the servers by looking up a statically assigned
 194 hash table by their source IP addresses.
195
 196 If you want to compile it into the kernel, say Y. To compile it as a
197 module, choose M here. If unsure, say N.
198
199config IP_VS_SED
200 tristate "shortest expected delay scheduling"
201 depends on IP_VS
202 ---help---
203 The shortest expected delay scheduling algorithm assigns network
204 connections to the server with the shortest expected delay. The
205 expected delay that the job will experience is (Ci + 1) / Ui if
206 sent to the ith server, in which Ci is the number of connections
 207 on the ith server and Ui is the fixed service rate (weight)
208 of the ith server.
209
 210 If you want to compile it into the kernel, say Y. To compile it as a
211 module, choose M here. If unsure, say N.
212
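
 As a small worked example of the SED formula above (hypothetical numbers):
 with two servers where server 1 has C1 = 3 active connections and weight
 U1 = 1, and server 2 has C2 = 7 and U2 = 4, the expected delays are
 (3 + 1)/1 = 4 and (7 + 1)/4 = 2, so SED sends the new connection to
 server 2 even though it currently holds more connections.
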
213config IP_VS_NQ
214 tristate "never queue scheduling"
215 depends on IP_VS
216 ---help---
217 The never queue scheduling algorithm adopts a two-speed model.
218 When there is an idle server available, the job will be sent to
219 the idle server, instead of waiting for a fast one. When there
220 is no idle server available, the job will be sent to the server
 221 that minimizes its expected delay (the Shortest Expected Delay
222 scheduling algorithm).
223
 224 If you want to compile it into the kernel, say Y. To compile it as a
225 module, choose M here. If unsure, say N.
226
 227comment "IPVS application helper"
228 depends on IP_VS
229
230config IP_VS_FTP
231 tristate "FTP protocol helper"
232 depends on IP_VS && IP_VS_PROTO_TCP
233 ---help---
 234 FTP is a protocol that transfers IP addresses and/or port numbers in
 235 its payload. In the virtual server via Network Address Translation,
 236 the IP address and port number of a real server cannot be sent to
 237 clients in FTP connections directly, so an FTP protocol helper is
 238 required to track the connection and mangle it back to that of the
 239 virtual service.
240
 241 If you want to compile it into the kernel, say Y. To compile it as a
242 module, choose M here. If unsure, say N.
243
244endmenu
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
new file mode 100644
index 000000000000..a788461a40c9
--- /dev/null
+++ b/net/ipv4/ipvs/Makefile
@@ -0,0 +1,34 @@
1#
2# Makefile for the IPVS modules on top of IPv4.
3#
4
5# IPVS transport protocol load balancing support
6ip_vs_proto-objs-y :=
7ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
8ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
9ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o
10ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
11
12ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
13 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
14 ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o \
15 $(ip_vs_proto-objs-y)
16
17
18# IPVS core
19obj-$(CONFIG_IP_VS) += ip_vs.o
20
21# IPVS schedulers
22obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
23obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
24obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
25obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
26obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
27obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
28obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
29obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
30obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
31obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
32
33# IPVS application helpers
34obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
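
For orientation, the conditional object lists above compose as follows (assuming, purely for illustration, a configuration with CONFIG_IP_VS=m and only the TCP and UDP protocol options enabled): ip_vs_proto-objs-y expands to "ip_vs_proto_tcp.o ip_vs_proto_udp.o", so the single ip_vs module is linked from the core objects (ip_vs_conn.o, ip_vs_core.o, ip_vs_ctl.o, ip_vs_sched.o, ip_vs_xmit.o, ip_vs_app.o, ip_vs_sync.o, ip_vs_est.o, ip_vs_proto.o, ip_vs_proto_icmp.o) plus those two protocol objects, while each scheduler and the FTP helper is built as a separate module only when its CONFIG_IP_VS_* symbol is set.
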
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
new file mode 100644
index 000000000000..d9212addd193
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -0,0 +1,658 @@
1/*
2 * ip_vs_app.c: Application module support for IPVS
3 *
4 * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
 14 * is that the ip_vs_app module handles the reverse direction (incoming requests
15 * and outgoing responses).
16 *
17 * IP_MASQ_APP application masquerading module
18 *
19 * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25#include <linux/skbuff.h>
26#include <linux/in.h>
27#include <linux/ip.h>
28#include <net/protocol.h>
29#include <asm/system.h>
30#include <linux/stat.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33
34#include <net/ip_vs.h>
35
36EXPORT_SYMBOL(register_ip_vs_app);
37EXPORT_SYMBOL(unregister_ip_vs_app);
38EXPORT_SYMBOL(register_ip_vs_app_inc);
39
40/* ipvs application list head */
41static LIST_HEAD(ip_vs_app_list);
42static DECLARE_MUTEX(__ip_vs_app_mutex);
43
44
45/*
46 * Get an ip_vs_app object
47 */
48static inline int ip_vs_app_get(struct ip_vs_app *app)
49{
50 /* test and get the module atomically */
51 if (app->module)
52 return try_module_get(app->module);
53 else
54 return 1;
55}
56
57
58static inline void ip_vs_app_put(struct ip_vs_app *app)
59{
60 if (app->module)
61 module_put(app->module);
62}
63
64
65/*
66 * Allocate/initialize app incarnation and register it in proto apps.
67 */
68static int
69ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
70{
71 struct ip_vs_protocol *pp;
72 struct ip_vs_app *inc;
73 int ret;
74
75 if (!(pp = ip_vs_proto_get(proto)))
76 return -EPROTONOSUPPORT;
77
78 if (!pp->unregister_app)
79 return -EOPNOTSUPP;
80
81 inc = kmalloc(sizeof(struct ip_vs_app), GFP_KERNEL);
82 if (!inc)
83 return -ENOMEM;
84 memcpy(inc, app, sizeof(*inc));
85 INIT_LIST_HEAD(&inc->p_list);
86 INIT_LIST_HEAD(&inc->incs_list);
87 inc->app = app;
88 inc->port = htons(port);
89 atomic_set(&inc->usecnt, 0);
90
91 if (app->timeouts) {
92 inc->timeout_table =
93 ip_vs_create_timeout_table(app->timeouts,
94 app->timeouts_size);
95 if (!inc->timeout_table) {
96 ret = -ENOMEM;
97 goto out;
98 }
99 }
100
101 ret = pp->register_app(inc);
102 if (ret)
103 goto out;
104
105 list_add(&inc->a_list, &app->incs_list);
106 IP_VS_DBG(9, "%s application %s:%u registered\n",
107 pp->name, inc->name, inc->port);
108
109 return 0;
110
111 out:
112 if (inc->timeout_table)
113 kfree(inc->timeout_table);
114 kfree(inc);
115 return ret;
116}
117
118
119/*
120 * Release app incarnation
121 */
122static void
123ip_vs_app_inc_release(struct ip_vs_app *inc)
124{
125 struct ip_vs_protocol *pp;
126
127 if (!(pp = ip_vs_proto_get(inc->protocol)))
128 return;
129
130 if (pp->unregister_app)
131 pp->unregister_app(inc);
132
133 IP_VS_DBG(9, "%s App %s:%u unregistered\n",
134 pp->name, inc->name, inc->port);
135
136 list_del(&inc->a_list);
137
138 if (inc->timeout_table != NULL)
139 kfree(inc->timeout_table);
140 kfree(inc);
141}
142
143
144/*
145 * Get reference to app inc (only called from softirq)
146 *
147 */
148int ip_vs_app_inc_get(struct ip_vs_app *inc)
149{
150 int result;
151
152 atomic_inc(&inc->usecnt);
153 if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
154 atomic_dec(&inc->usecnt);
155 return result;
156}
157
158
159/*
160 * Put the app inc (only called from timer or net softirq)
161 */
162void ip_vs_app_inc_put(struct ip_vs_app *inc)
163{
164 ip_vs_app_put(inc->app);
165 atomic_dec(&inc->usecnt);
166}
167
168
169/*
170 * Register an application incarnation in protocol applications
171 */
172int
173register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
174{
175 int result;
176
177 down(&__ip_vs_app_mutex);
178
179 result = ip_vs_app_inc_new(app, proto, port);
180
181 up(&__ip_vs_app_mutex);
182
183 return result;
184}
185
186
187/*
188 * ip_vs_app registration routine
189 */
190int register_ip_vs_app(struct ip_vs_app *app)
191{
192 /* increase the module use count */
193 ip_vs_use_count_inc();
194
195 down(&__ip_vs_app_mutex);
196
197 list_add(&app->a_list, &ip_vs_app_list);
198
199 up(&__ip_vs_app_mutex);
200
201 return 0;
202}
203
204
205/*
206 * ip_vs_app unregistration routine
207 * We are sure there are no app incarnations attached to services
208 */
209void unregister_ip_vs_app(struct ip_vs_app *app)
210{
211 struct ip_vs_app *inc, *nxt;
212
213 down(&__ip_vs_app_mutex);
214
215 list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
216 ip_vs_app_inc_release(inc);
217 }
218
219 list_del(&app->a_list);
220
221 up(&__ip_vs_app_mutex);
222
223 /* decrease the module use count */
224 ip_vs_use_count_dec();
225}
226
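
A minimal sketch of how an application helper module might use the registration API above: register one ip_vs_app, then one incarnation per port via register_ip_vs_app_inc() (this is roughly what the FTP helper in ip_vs_ftp.c does). The helper name, the port, and the hook body below are invented for illustration; only the ip_vs_* calls and the struct ip_vs_app fields come from this file and ip_vs.h.

/* hypothetical helper module, not part of the patch above */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <net/ip_vs.h>

static int my_helper_pkt_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
			     struct sk_buff **pskb, int *diff)
{
	/* inspect/mangle the outgoing payload here; report any length
	   change through *diff so the TCP sequence deltas get updated */
	if (diff)
		*diff = 0;
	return 1;		/* non-zero: packet handled, keep going */
}

static struct ip_vs_app my_helper = {
	.name		= "myhelper",
	.protocol	= IPPROTO_TCP,
	.module		= THIS_MODULE,
	.incs_list	= LIST_HEAD_INIT(my_helper.incs_list),
	.pkt_out	= my_helper_pkt_out,
};

static int __init my_helper_init(void)
{
	int ret = register_ip_vs_app(&my_helper);
	if (ret)
		return ret;
	/* one incarnation per port to inspect; the port is passed in host
	   order, ip_vs_app_inc_new() converts it with htons() */
	ret = register_ip_vs_app_inc(&my_helper, IPPROTO_TCP, 12345);
	if (ret)
		unregister_ip_vs_app(&my_helper);
	return ret;
}

static void __exit my_helper_exit(void)
{
	/* releases all registered incarnations as well */
	unregister_ip_vs_app(&my_helper);
}

module_init(my_helper_init);
module_exit(my_helper_exit);
MODULE_LICENSE("GPL");
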
227
228#if 0000
229/*
230 * Get reference to app by name (called from user context)
231 */
232struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
233{
234 struct ip_vs_app *app, *a = NULL;
235
236 down(&__ip_vs_app_mutex);
237
 238 list_for_each_entry(app, &ip_vs_app_list, a_list) {
239 if (strcmp(app->name, appname))
240 continue;
241
242 /* softirq may call ip_vs_app_get too, so the caller
243 must disable softirq on the current CPU */
244 if (ip_vs_app_get(app))
245 a = app;
246 break;
247 }
248
249 up(&__ip_vs_app_mutex);
250
251 return a;
252}
253#endif
254
255
256/*
257 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
258 */
259int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
260{
261 return pp->app_conn_bind(cp);
262}
263
264
265/*
266 * Unbind cp from application incarnation (called by cp destructor)
267 */
268void ip_vs_unbind_app(struct ip_vs_conn *cp)
269{
270 struct ip_vs_app *inc = cp->app;
271
272 if (!inc)
273 return;
274
275 if (inc->unbind_conn)
276 inc->unbind_conn(inc, cp);
277 if (inc->done_conn)
278 inc->done_conn(inc, cp);
279 ip_vs_app_inc_put(inc);
280 cp->app = NULL;
281}
282
283
284/*
285 * Fixes th->seq based on ip_vs_seq info.
286 */
287static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
288{
289 __u32 seq = ntohl(th->seq);
290
291 /*
292 * Adjust seq with delta-offset for all packets after
293 * the most recent resized pkt seq and with previous_delta offset
294 * for all packets before most recent resized pkt seq.
295 */
296 if (vseq->delta || vseq->previous_delta) {
297 if(after(seq, vseq->init_seq)) {
298 th->seq = htonl(seq + vseq->delta);
299 IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
300 vseq->delta);
301 } else {
302 th->seq = htonl(seq + vseq->previous_delta);
303 IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
304 "(%d) to seq\n", vseq->previous_delta);
305 }
306 }
307}
308
309
310/*
311 * Fixes th->ack_seq based on ip_vs_seq info.
312 */
313static inline void
314vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
315{
316 __u32 ack_seq = ntohl(th->ack_seq);
317
318 /*
319 * Adjust ack_seq with delta-offset for
320 * the packets AFTER most recent resized pkt has caused a shift
321 * for packets before most recent resized pkt, use previous_delta
322 */
323 if (vseq->delta || vseq->previous_delta) {
 324 /* since ack_seq is the sequence number of the next octet
 325 expected to be received, compare it with init_seq+delta */
326 if(after(ack_seq, vseq->init_seq+vseq->delta)) {
327 th->ack_seq = htonl(ack_seq - vseq->delta);
328 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
329 "(%d) from ack_seq\n", vseq->delta);
330
331 } else {
332 th->ack_seq = htonl(ack_seq - vseq->previous_delta);
333 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
334 "previous_delta (%d) from ack_seq\n",
335 vseq->previous_delta);
336 }
337 }
338}
339
340
341/*
342 * Updates ip_vs_seq if pkt has been resized
343 * Assumes already checked proto==IPPROTO_TCP and diff!=0.
344 */
345static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
346 unsigned flag, __u32 seq, int diff)
347{
348 /* spinlock is to keep updating cp->flags atomic */
349 spin_lock(&cp->lock);
350 if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
351 vseq->previous_delta = vseq->delta;
352 vseq->delta += diff;
353 vseq->init_seq = seq;
354 cp->flags |= flag;
355 }
356 spin_unlock(&cp->lock);
357}
358
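
A concrete illustration of the delta bookkeeping above (hypothetical numbers): suppose an application helper rewrites the payload of an outgoing segment whose sequence number is 1000 and grows it by 7 bytes. vs_seq_update() then records init_seq = 1000 and delta = +7 in cp->out_seq (previous_delta keeps the older delta, initially 0). From that point on, vs_fix_seq() adds 7 to the sequence number of every later outgoing segment (seq after 1000), while vs_fix_ack_seq() subtracts 7 from the peer's acknowledgement numbers above 1007, so both directions stay consistent with the resized data.
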
359static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb,
360 struct ip_vs_app *app)
361{
362 int diff;
363 unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
364 struct tcphdr *th;
365 __u32 seq;
366
367 if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
368 return 0;
369
370 th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
371
372 /*
373 * Remember seq number in case this pkt gets resized
374 */
375 seq = ntohl(th->seq);
376
377 /*
378 * Fix seq stuff if flagged as so.
379 */
380 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
381 vs_fix_seq(&cp->out_seq, th);
382 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
383 vs_fix_ack_seq(&cp->in_seq, th);
384
385 /*
386 * Call private output hook function
387 */
388 if (app->pkt_out == NULL)
389 return 1;
390
391 if (!app->pkt_out(app, cp, pskb, &diff))
392 return 0;
393
394 /*
395 * Update ip_vs seq stuff if len has changed.
396 */
397 if (diff != 0)
398 vs_seq_update(cp, &cp->out_seq,
399 IP_VS_CONN_F_OUT_SEQ, seq, diff);
400
401 return 1;
402}
403
404/*
405 * Output pkt hook. Will call bound ip_vs_app specific function
406 * called by ipvs packet handler, assumes previously checked cp!=NULL
407 * returns false if it can't handle packet (oom)
408 */
409int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb)
410{
411 struct ip_vs_app *app;
412
413 /*
414 * check if application module is bound to
415 * this ip_vs_conn.
416 */
417 if ((app = cp->app) == NULL)
418 return 1;
419
420 /* TCP is complicated */
421 if (cp->protocol == IPPROTO_TCP)
422 return app_tcp_pkt_out(cp, pskb, app);
423
424 /*
425 * Call private output hook function
426 */
427 if (app->pkt_out == NULL)
428 return 1;
429
430 return app->pkt_out(app, cp, pskb, NULL);
431}
432
433
434static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb,
435 struct ip_vs_app *app)
436{
437 int diff;
438 unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
439 struct tcphdr *th;
440 __u32 seq;
441
442 if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
443 return 0;
444
445 th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
446
447 /*
448 * Remember seq number in case this pkt gets resized
449 */
450 seq = ntohl(th->seq);
451
452 /*
453 * Fix seq stuff if flagged as so.
454 */
455 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
456 vs_fix_seq(&cp->in_seq, th);
457 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
458 vs_fix_ack_seq(&cp->out_seq, th);
459
460 /*
461 * Call private input hook function
462 */
463 if (app->pkt_in == NULL)
464 return 1;
465
466 if (!app->pkt_in(app, cp, pskb, &diff))
467 return 0;
468
469 /*
470 * Update ip_vs seq stuff if len has changed.
471 */
472 if (diff != 0)
473 vs_seq_update(cp, &cp->in_seq,
474 IP_VS_CONN_F_IN_SEQ, seq, diff);
475
476 return 1;
477}
478
479/*
480 * Input pkt hook. Will call bound ip_vs_app specific function
481 * called by ipvs packet handler, assumes previously checked cp!=NULL.
482 * returns false if can't handle packet (oom).
483 */
484int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb)
485{
486 struct ip_vs_app *app;
487
488 /*
489 * check if application module is bound to
490 * this ip_vs_conn.
491 */
492 if ((app = cp->app) == NULL)
493 return 1;
494
495 /* TCP is complicated */
496 if (cp->protocol == IPPROTO_TCP)
497 return app_tcp_pkt_in(cp, pskb, app);
498
499 /*
500 * Call private input hook function
501 */
502 if (app->pkt_in == NULL)
503 return 1;
504
505 return app->pkt_in(app, cp, pskb, NULL);
506}
507
508
509#ifdef CONFIG_PROC_FS
510/*
511 * /proc/net/ip_vs_app entry function
512 */
513
514static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
515{
516 struct ip_vs_app *app, *inc;
517
518 list_for_each_entry(app, &ip_vs_app_list, a_list) {
519 list_for_each_entry(inc, &app->incs_list, a_list) {
520 if (pos-- == 0)
521 return inc;
522 }
523 }
524 return NULL;
525
526}
527
528static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
529{
530 down(&__ip_vs_app_mutex);
531
532 return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
533}
534
535static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
536{
537 struct ip_vs_app *inc, *app;
538 struct list_head *e;
539
540 ++*pos;
541 if (v == SEQ_START_TOKEN)
542 return ip_vs_app_idx(0);
543
544 inc = v;
545 app = inc->app;
546
547 if ((e = inc->a_list.next) != &app->incs_list)
548 return list_entry(e, struct ip_vs_app, a_list);
549
550 /* go on to next application */
551 for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
552 app = list_entry(e, struct ip_vs_app, a_list);
553 list_for_each_entry(inc, &app->incs_list, a_list) {
554 return inc;
555 }
556 }
557 return NULL;
558}
559
560static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
561{
562 up(&__ip_vs_app_mutex);
563}
564
565static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
566{
567 if (v == SEQ_START_TOKEN)
568 seq_puts(seq, "prot port usecnt name\n");
569 else {
570 const struct ip_vs_app *inc = v;
571
572 seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
573 ip_vs_proto_name(inc->protocol),
574 ntohs(inc->port),
575 atomic_read(&inc->usecnt),
576 inc->name);
577 }
578 return 0;
579}
580
581static struct seq_operations ip_vs_app_seq_ops = {
582 .start = ip_vs_app_seq_start,
583 .next = ip_vs_app_seq_next,
584 .stop = ip_vs_app_seq_stop,
585 .show = ip_vs_app_seq_show,
586};
587
588static int ip_vs_app_open(struct inode *inode, struct file *file)
589{
590 return seq_open(file, &ip_vs_app_seq_ops);
591}
592
593static struct file_operations ip_vs_app_fops = {
594 .owner = THIS_MODULE,
595 .open = ip_vs_app_open,
596 .read = seq_read,
597 .llseek = seq_lseek,
598 .release = seq_release,
599};
600#endif
601
602
603/*
604 * Replace a segment of data with a new segment
605 */
606int ip_vs_skb_replace(struct sk_buff *skb, int pri,
607 char *o_buf, int o_len, char *n_buf, int n_len)
608{
609 struct iphdr *iph;
610 int diff;
611 int o_offset;
612 int o_left;
613
614 EnterFunction(9);
615
616 diff = n_len - o_len;
617 o_offset = o_buf - (char *)skb->data;
618 /* The length of left data after o_buf+o_len in the skb data */
619 o_left = skb->len - (o_offset + o_len);
620
621 if (diff <= 0) {
622 memmove(o_buf + n_len, o_buf + o_len, o_left);
623 memcpy(o_buf, n_buf, n_len);
624 skb_trim(skb, skb->len + diff);
625 } else if (diff <= skb_tailroom(skb)) {
626 skb_put(skb, diff);
627 memmove(o_buf + n_len, o_buf + o_len, o_left);
628 memcpy(o_buf, n_buf, n_len);
629 } else {
630 if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
631 return -ENOMEM;
632 skb_put(skb, diff);
633 memmove(skb->data + o_offset + n_len,
634 skb->data + o_offset + o_len, o_left);
635 memcpy(skb->data + o_offset, n_buf, n_len);
636 }
637
638 /* must update the iph total length here */
639 iph = skb->nh.iph;
640 iph->tot_len = htons(skb->len);
641
642 LeaveFunction(9);
643 return 0;
644}
645
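
A brief usage sketch (the caller and buffers here are hypothetical, not taken from this patch): an FTP-style helper that has located an address/port string of o_len bytes at pointer p inside a linearized skb, and wants to substitute an n_len-byte replacement held in n_buf, could call

	if (ip_vs_skb_replace(skb, GFP_ATOMIC, p, o_len, n_buf, n_len))
		return 0;	/* -ENOMEM: could not expand the skb */

On success, skb->len and the IP header's tot_len have already been adjusted; the caller is still expected to return n_len - o_len through its *diff parameter so that vs_seq_update() above can record the shift.
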
646
647int ip_vs_app_init(void)
648{
649 /* we will replace it with proc_net_ipvs_create() soon */
650 proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops);
651 return 0;
652}
653
654
655void ip_vs_app_cleanup(void)
656{
657 proc_net_remove("ip_vs_app");
658}
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
new file mode 100644
index 000000000000..fd6feb5499fe
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -0,0 +1,920 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 21 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
22 *
23 * Changes:
24 *
25 */
26
27#include <linux/kernel.h>
28#include <linux/vmalloc.h>
29#include <linux/proc_fs.h> /* for proc_net_* */
30#include <linux/seq_file.h>
31#include <linux/jhash.h>
32#include <linux/random.h>
33
34#include <net/ip_vs.h>
35
36
37/*
38 * Connection hash table: for input and output packets lookups of IPVS
39 */
40static struct list_head *ip_vs_conn_tab;
41
42/* SLAB cache for IPVS connections */
43static kmem_cache_t *ip_vs_conn_cachep;
44
45/* counter for current IPVS connections */
46static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
47
48/* counter for no client port connections */
49static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
50
51/* random value for IPVS connection hash */
52static unsigned int ip_vs_conn_rnd;
53
54/*
55 * Fine locking granularity for big connection hash table
56 */
57#define CT_LOCKARRAY_BITS 4
58#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
59#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
60
61struct ip_vs_aligned_lock
62{
63 rwlock_t l;
64} __attribute__((__aligned__(SMP_CACHE_BYTES)));
65
66/* lock array for conn table */
67static struct ip_vs_aligned_lock
68__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
69
70static inline void ct_read_lock(unsigned key)
71{
72 read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
73}
74
75static inline void ct_read_unlock(unsigned key)
76{
77 read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
78}
79
80static inline void ct_write_lock(unsigned key)
81{
82 write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
83}
84
85static inline void ct_write_unlock(unsigned key)
86{
87 write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
88}
89
90static inline void ct_read_lock_bh(unsigned key)
91{
92 read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
93}
94
95static inline void ct_read_unlock_bh(unsigned key)
96{
97 read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
98}
99
100static inline void ct_write_lock_bh(unsigned key)
101{
102 write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
103}
104
105static inline void ct_write_unlock_bh(unsigned key)
106{
107 write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
108}
109
110
111/*
112 * Returns hash value for IPVS connection entry
113 */
114static unsigned int ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
115{
116 return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
117 & IP_VS_CONN_TAB_MASK;
118}
119
120
121/*
122 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
123 * returns bool success.
124 */
125static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
126{
127 unsigned hash;
128 int ret;
129
130 /* Hash by protocol, client address and port */
131 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
132
133 ct_write_lock(hash);
134
135 if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
136 list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
137 cp->flags |= IP_VS_CONN_F_HASHED;
138 atomic_inc(&cp->refcnt);
139 ret = 1;
140 } else {
141 IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
142 "called from %p\n", __builtin_return_address(0));
143 ret = 0;
144 }
145
146 ct_write_unlock(hash);
147
148 return ret;
149}
150
151
152/*
153 * UNhashes ip_vs_conn from ip_vs_conn_tab.
154 * returns bool success.
155 */
156static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
157{
158 unsigned hash;
159 int ret;
160
161 /* unhash it and decrease its reference counter */
162 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
163
164 ct_write_lock(hash);
165
166 if (cp->flags & IP_VS_CONN_F_HASHED) {
167 list_del(&cp->c_list);
168 cp->flags &= ~IP_VS_CONN_F_HASHED;
169 atomic_dec(&cp->refcnt);
170 ret = 1;
171 } else
172 ret = 0;
173
174 ct_write_unlock(hash);
175
176 return ret;
177}
178
179
180/*
181 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
182 * Called for pkts coming from OUTside-to-INside.
183 * s_addr, s_port: pkt source address (foreign host)
184 * d_addr, d_port: pkt dest address (load balancer)
185 */
186static inline struct ip_vs_conn *__ip_vs_conn_in_get
187(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
188{
189 unsigned hash;
190 struct ip_vs_conn *cp;
191
192 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
193
194 ct_read_lock(hash);
195
196 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
197 if (s_addr==cp->caddr && s_port==cp->cport &&
198 d_port==cp->vport && d_addr==cp->vaddr &&
199 protocol==cp->protocol) {
200 /* HIT */
201 atomic_inc(&cp->refcnt);
202 ct_read_unlock(hash);
203 return cp;
204 }
205 }
206
207 ct_read_unlock(hash);
208
209 return NULL;
210}
211
212struct ip_vs_conn *ip_vs_conn_in_get
213(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
214{
215 struct ip_vs_conn *cp;
216
217 cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
218 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
219 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
220
221 IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
222 ip_vs_proto_name(protocol),
223 NIPQUAD(s_addr), ntohs(s_port),
224 NIPQUAD(d_addr), ntohs(d_port),
225 cp?"hit":"not hit");
226
227 return cp;
228}
229
230
231/*
232 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
233 * Called for pkts coming from inside-to-OUTside.
234 * s_addr, s_port: pkt source address (inside host)
235 * d_addr, d_port: pkt dest address (foreign host)
236 */
237struct ip_vs_conn *ip_vs_conn_out_get
238(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
239{
240 unsigned hash;
241 struct ip_vs_conn *cp, *ret=NULL;
242
243 /*
244 * Check for "full" addressed entries
245 */
246 hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
247
248 ct_read_lock(hash);
249
250 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
251 if (d_addr == cp->caddr && d_port == cp->cport &&
252 s_port == cp->dport && s_addr == cp->daddr &&
253 protocol == cp->protocol) {
254 /* HIT */
255 atomic_inc(&cp->refcnt);
256 ret = cp;
257 break;
258 }
259 }
260
261 ct_read_unlock(hash);
262
263 IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
264 ip_vs_proto_name(protocol),
265 NIPQUAD(s_addr), ntohs(s_port),
266 NIPQUAD(d_addr), ntohs(d_port),
267 ret?"hit":"not hit");
268
269 return ret;
270}
271
272
273/*
274 * Put back the conn and restart its timer with its timeout
275 */
276void ip_vs_conn_put(struct ip_vs_conn *cp)
277{
 278 /* reset it to expire after its timeout */
279 mod_timer(&cp->timer, jiffies+cp->timeout);
280
281 __ip_vs_conn_put(cp);
282}
283
284
285/*
286 * Fill a no_client_port connection with a client port number
287 */
288void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport)
289{
290 if (ip_vs_conn_unhash(cp)) {
291 spin_lock(&cp->lock);
292 if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
293 atomic_dec(&ip_vs_conn_no_cport_cnt);
294 cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
295 cp->cport = cport;
296 }
297 spin_unlock(&cp->lock);
298
 299 /* hash it back, now keyed on the new cport */
300 ip_vs_conn_hash(cp);
301 }
302}
303
304
305/*
306 * Bind a connection entry with the corresponding packet_xmit.
307 * Called by ip_vs_conn_new.
308 */
309static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
310{
311 switch (IP_VS_FWD_METHOD(cp)) {
312 case IP_VS_CONN_F_MASQ:
313 cp->packet_xmit = ip_vs_nat_xmit;
314 break;
315
316 case IP_VS_CONN_F_TUNNEL:
317 cp->packet_xmit = ip_vs_tunnel_xmit;
318 break;
319
320 case IP_VS_CONN_F_DROUTE:
321 cp->packet_xmit = ip_vs_dr_xmit;
322 break;
323
324 case IP_VS_CONN_F_LOCALNODE:
325 cp->packet_xmit = ip_vs_null_xmit;
326 break;
327
328 case IP_VS_CONN_F_BYPASS:
329 cp->packet_xmit = ip_vs_bypass_xmit;
330 break;
331 }
332}
333
334
335static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
336{
337 return atomic_read(&dest->activeconns)
338 + atomic_read(&dest->inactconns);
339}
340
341/*
342 * Bind a connection entry with a virtual service destination
343 * Called just after a new connection entry is created.
344 */
345static inline void
346ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
347{
348 /* if dest is NULL, then return directly */
349 if (!dest)
350 return;
351
352 /* Increase the refcnt counter of the dest */
353 atomic_inc(&dest->refcnt);
354
355 /* Bind with the destination and its corresponding transmitter */
356 cp->flags |= atomic_read(&dest->conn_flags);
357 cp->dest = dest;
358
359 IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
360 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
361 ip_vs_proto_name(cp->protocol),
362 NIPQUAD(cp->caddr), ntohs(cp->cport),
363 NIPQUAD(cp->vaddr), ntohs(cp->vport),
364 NIPQUAD(cp->daddr), ntohs(cp->dport),
365 ip_vs_fwd_tag(cp), cp->state,
366 cp->flags, atomic_read(&cp->refcnt),
367 atomic_read(&dest->refcnt));
368
369 /* Update the connection counters */
370 if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
371 /* It is a normal connection, so increase the inactive
372 connection counter because it is in TCP SYNRECV
 373 state (inactive) or another protocol's inactive state */
374 atomic_inc(&dest->inactconns);
375 } else {
376 /* It is a persistent connection/template, so increase
 377 the persistent connection counter */
378 atomic_inc(&dest->persistconns);
379 }
380
381 if (dest->u_threshold != 0 &&
382 ip_vs_dest_totalconns(dest) >= dest->u_threshold)
383 dest->flags |= IP_VS_DEST_F_OVERLOAD;
384}
385
386
387/*
388 * Unbind a connection entry with its VS destination
389 * Called by the ip_vs_conn_expire function.
390 */
391static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
392{
393 struct ip_vs_dest *dest = cp->dest;
394
395 if (!dest)
396 return;
397
398 IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
399 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
400 ip_vs_proto_name(cp->protocol),
401 NIPQUAD(cp->caddr), ntohs(cp->cport),
402 NIPQUAD(cp->vaddr), ntohs(cp->vport),
403 NIPQUAD(cp->daddr), ntohs(cp->dport),
404 ip_vs_fwd_tag(cp), cp->state,
405 cp->flags, atomic_read(&cp->refcnt),
406 atomic_read(&dest->refcnt));
407
408 /* Update the connection counters */
409 if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
410 /* It is a normal connection, so decrease the inactconns
411 or activeconns counter */
412 if (cp->flags & IP_VS_CONN_F_INACTIVE) {
413 atomic_dec(&dest->inactconns);
414 } else {
415 atomic_dec(&dest->activeconns);
416 }
417 } else {
418 /* It is a persistent connection/template, so decrease
 419 the persistent connection counter */
420 atomic_dec(&dest->persistconns);
421 }
422
423 if (dest->l_threshold != 0) {
424 if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
425 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
426 } else if (dest->u_threshold != 0) {
427 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
428 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
429 } else {
430 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
431 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
432 }
433
434 /*
435 * Simply decrease the refcnt of the dest, because the
436 * dest will be either in service's destination list
437 * or in the trash.
438 */
439 atomic_dec(&dest->refcnt);
440}
441
442
443/*
444 * Checking if the destination of a connection template is available.
445 * If available, return 1, otherwise invalidate this connection
446 * template and return 0.
447 */
448int ip_vs_check_template(struct ip_vs_conn *ct)
449{
450 struct ip_vs_dest *dest = ct->dest;
451
452 /*
453 * Checking the dest server status.
454 */
455 if ((dest == NULL) ||
456 !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
457 (sysctl_ip_vs_expire_quiescent_template &&
458 (atomic_read(&dest->weight) == 0))) {
459 IP_VS_DBG(9, "check_template: dest not available for "
460 "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
461 "-> d:%u.%u.%u.%u:%d\n",
462 ip_vs_proto_name(ct->protocol),
463 NIPQUAD(ct->caddr), ntohs(ct->cport),
464 NIPQUAD(ct->vaddr), ntohs(ct->vport),
465 NIPQUAD(ct->daddr), ntohs(ct->dport));
466
467 /*
468 * Invalidate the connection template
469 */
470 if (ct->cport) {
471 if (ip_vs_conn_unhash(ct)) {
472 ct->dport = 65535;
473 ct->vport = 65535;
474 ct->cport = 0;
475 ip_vs_conn_hash(ct);
476 }
477 }
478
479 /*
480 * Simply decrease the refcnt of the template,
481 * don't restart its timer.
482 */
483 atomic_dec(&ct->refcnt);
484 return 0;
485 }
486 return 1;
487}
488
489static void ip_vs_conn_expire(unsigned long data)
490{
491 struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
492
493 cp->timeout = 60*HZ;
494
495 /*
496 * hey, I'm using it
497 */
498 atomic_inc(&cp->refcnt);
499
500 /*
501 * do I control anybody?
502 */
503 if (atomic_read(&cp->n_control))
504 goto expire_later;
505
506 /*
507 * unhash it if it is hashed in the conn table
508 */
509 if (!ip_vs_conn_unhash(cp))
510 goto expire_later;
511
512 /*
 513 * refcnt==1 implies I'm the only referrer
514 */
515 if (likely(atomic_read(&cp->refcnt) == 1)) {
516 /* delete the timer if it is activated by other users */
517 if (timer_pending(&cp->timer))
518 del_timer(&cp->timer);
519
520 /* does anybody control me? */
521 if (cp->control)
522 ip_vs_control_del(cp);
523
524 if (unlikely(cp->app != NULL))
525 ip_vs_unbind_app(cp);
526 ip_vs_unbind_dest(cp);
527 if (cp->flags & IP_VS_CONN_F_NO_CPORT)
528 atomic_dec(&ip_vs_conn_no_cport_cnt);
529 atomic_dec(&ip_vs_conn_count);
530
531 kmem_cache_free(ip_vs_conn_cachep, cp);
532 return;
533 }
534
535 /* hash it back to the table */
536 ip_vs_conn_hash(cp);
537
538 expire_later:
539 IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
540 atomic_read(&cp->refcnt)-1,
541 atomic_read(&cp->n_control));
542
543 ip_vs_conn_put(cp);
544}
545
546
547void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
548{
549 if (del_timer(&cp->timer))
550 mod_timer(&cp->timer, jiffies);
551 __ip_vs_conn_put(cp);
552}
553
554
555/*
556 * Create a new connection entry and hash it into the ip_vs_conn_tab
557 */
558struct ip_vs_conn *
559ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
560 __u32 daddr, __u16 dport, unsigned flags,
561 struct ip_vs_dest *dest)
562{
563 struct ip_vs_conn *cp;
564 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
565
566 cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
567 if (cp == NULL) {
568 IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
569 return NULL;
570 }
571
572 memset(cp, 0, sizeof(*cp));
573 INIT_LIST_HEAD(&cp->c_list);
574 init_timer(&cp->timer);
575 cp->timer.data = (unsigned long)cp;
576 cp->timer.function = ip_vs_conn_expire;
577 cp->protocol = proto;
578 cp->caddr = caddr;
579 cp->cport = cport;
580 cp->vaddr = vaddr;
581 cp->vport = vport;
582 cp->daddr = daddr;
583 cp->dport = dport;
584 cp->flags = flags;
585 spin_lock_init(&cp->lock);
586
587 /*
 588 * Mark the entry as referenced by the current thread before hashing
 589 * it in the table, so that another thread running ip_vs_random_dropentry
 590 * cannot drop this entry.
591 */
592 atomic_set(&cp->refcnt, 1);
593
594 atomic_set(&cp->n_control, 0);
595 atomic_set(&cp->in_pkts, 0);
596
597 atomic_inc(&ip_vs_conn_count);
598 if (flags & IP_VS_CONN_F_NO_CPORT)
599 atomic_inc(&ip_vs_conn_no_cport_cnt);
600
601 /* Bind the connection with a destination server */
602 ip_vs_bind_dest(cp, dest);
603
604 /* Set its state and timeout */
605 cp->state = 0;
606 cp->timeout = 3*HZ;
607
608 /* Bind its packet transmitter */
609 ip_vs_bind_xmit(cp);
610
611 if (unlikely(pp && atomic_read(&pp->appcnt)))
612 ip_vs_bind_app(cp, pp);
613
614 /* Hash it in the ip_vs_conn_tab finally */
615 ip_vs_conn_hash(cp);
616
617 return cp;
618}
619
620
621/*
622 * /proc/net/ip_vs_conn entries
623 */
624#ifdef CONFIG_PROC_FS
625
626static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
627{
628 int idx;
629 struct ip_vs_conn *cp;
630
631 for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
632 ct_read_lock_bh(idx);
633 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
634 if (pos-- == 0) {
635 seq->private = &ip_vs_conn_tab[idx];
636 return cp;
637 }
638 }
639 ct_read_unlock_bh(idx);
640 }
641
642 return NULL;
643}
644
645static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
646{
647 seq->private = NULL;
648 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
649}
650
651static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
652{
653 struct ip_vs_conn *cp = v;
654 struct list_head *e, *l = seq->private;
655 int idx;
656
657 ++*pos;
658 if (v == SEQ_START_TOKEN)
659 return ip_vs_conn_array(seq, 0);
660
661 /* more on same hash chain? */
662 if ((e = cp->c_list.next) != l)
663 return list_entry(e, struct ip_vs_conn, c_list);
664
665 idx = l - ip_vs_conn_tab;
666 ct_read_unlock_bh(idx);
667
668 while (++idx < IP_VS_CONN_TAB_SIZE) {
669 ct_read_lock_bh(idx);
670 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
671 seq->private = &ip_vs_conn_tab[idx];
672 return cp;
673 }
674 ct_read_unlock_bh(idx);
675 }
676 seq->private = NULL;
677 return NULL;
678}
679
680static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
681{
682 struct list_head *l = seq->private;
683
684 if (l)
685 ct_read_unlock_bh(l - ip_vs_conn_tab);
686}
687
688static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
689{
690
691 if (v == SEQ_START_TOKEN)
692 seq_puts(seq,
693 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n");
694 else {
695 const struct ip_vs_conn *cp = v;
696
697 seq_printf(seq,
698 "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
699 ip_vs_proto_name(cp->protocol),
700 ntohl(cp->caddr), ntohs(cp->cport),
701 ntohl(cp->vaddr), ntohs(cp->vport),
702 ntohl(cp->daddr), ntohs(cp->dport),
703 ip_vs_state_name(cp->protocol, cp->state),
704 (cp->timer.expires-jiffies)/HZ);
705 }
706 return 0;
707}
708
709static struct seq_operations ip_vs_conn_seq_ops = {
710 .start = ip_vs_conn_seq_start,
711 .next = ip_vs_conn_seq_next,
712 .stop = ip_vs_conn_seq_stop,
713 .show = ip_vs_conn_seq_show,
714};
715
716static int ip_vs_conn_open(struct inode *inode, struct file *file)
717{
718 return seq_open(file, &ip_vs_conn_seq_ops);
719}
720
721static struct file_operations ip_vs_conn_fops = {
722 .owner = THIS_MODULE,
723 .open = ip_vs_conn_open,
724 .read = seq_read,
725 .llseek = seq_lseek,
726 .release = seq_release,
727};
728#endif
729
730
731/*
732 * Randomly drop connection entries before running out of memory
733 */
734static inline int todrop_entry(struct ip_vs_conn *cp)
735{
736 /*
737 * The drop rate array needs tuning for real environments.
738 * Called from timer bh only => no locking
739 */
740 static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
741 static char todrop_counter[9] = {0};
742 int i;
743
744 /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
 745 This will leave enough time for normal connections to get
746 through. */
747 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
748 return 0;
749
750 /* Don't drop the entry if its number of incoming packets is not
751 located in [0, 8] */
752 i = atomic_read(&cp->in_pkts);
753 if (i > 8 || i < 0) return 0;
754
755 if (!todrop_rate[i]) return 0;
756 if (--todrop_counter[i] > 0) return 0;
757
758 todrop_counter[i] = todrop_rate[i];
759 return 1;
760}
761
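
Spelling out the policy above with the table as shipped: for an eligible entry whose in_pkts count is i (1 <= i <= 8), todrop_rate[i] equals i, so roughly one out of every i such entries examined is dropped (i = 1 means every time, i = 8 means one in eight). Entries with i = 0, or with more than 8 incoming packets, are never dropped by this path, and entries younger than about 60 seconds are always spared.
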
762
763void ip_vs_random_dropentry(void)
764{
765 int idx;
766 struct ip_vs_conn *cp;
767 struct ip_vs_conn *ct;
768
769 /*
770 * Randomly scan 1/32 of the whole table every second
771 */
772 for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
773 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
774
775 /*
776 * Lock is actually needed in this loop.
777 */
778 ct_write_lock(hash);
779
780 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
781 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
782 /* connection template */
783 continue;
784
785 if (cp->protocol == IPPROTO_TCP) {
786 switch(cp->state) {
787 case IP_VS_TCP_S_SYN_RECV:
788 case IP_VS_TCP_S_SYNACK:
789 break;
790
791 case IP_VS_TCP_S_ESTABLISHED:
792 if (todrop_entry(cp))
793 break;
794 continue;
795
796 default:
797 continue;
798 }
799 } else {
800 if (!todrop_entry(cp))
801 continue;
802 }
803
804 /*
805 * Drop the entry, and drop its ct if not referenced
806 */
807 atomic_inc(&cp->refcnt);
808 ct_write_unlock(hash);
809
810 if ((ct = cp->control))
811 atomic_inc(&ct->refcnt);
812 IP_VS_DBG(4, "del connection\n");
813 ip_vs_conn_expire_now(cp);
814 if (ct) {
815 IP_VS_DBG(4, "del conn template\n");
816 ip_vs_conn_expire_now(ct);
817 }
818 ct_write_lock(hash);
819 }
820 ct_write_unlock(hash);
821 }
822}
823
824
825/*
826 * Flush all the connection entries in the ip_vs_conn_tab
827 */
828static void ip_vs_conn_flush(void)
829{
830 int idx;
831 struct ip_vs_conn *cp;
832 struct ip_vs_conn *ct;
833
834 flush_again:
835 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
836 /*
837 * Lock is actually needed in this loop.
838 */
839 ct_write_lock_bh(idx);
840
841 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
842 atomic_inc(&cp->refcnt);
843 ct_write_unlock(idx);
844
845 if ((ct = cp->control))
846 atomic_inc(&ct->refcnt);
847 IP_VS_DBG(4, "del connection\n");
848 ip_vs_conn_expire_now(cp);
849 if (ct) {
850 IP_VS_DBG(4, "del conn template\n");
851 ip_vs_conn_expire_now(ct);
852 }
853 ct_write_lock(idx);
854 }
855 ct_write_unlock_bh(idx);
856 }
857
 858 /* the counter may not be zero, because some conn entries may still
 859 be handled by the slow timer handler, or be unhashed but still referenced */
860 if (atomic_read(&ip_vs_conn_count) != 0) {
861 schedule();
862 goto flush_again;
863 }
864}
865
866
867int ip_vs_conn_init(void)
868{
869 int idx;
870
871 /*
872 * Allocate the connection hash table and initialize its list heads
873 */
874 ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
875 if (!ip_vs_conn_tab)
876 return -ENOMEM;
877
878 /* Allocate ip_vs_conn slab cache */
879 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
880 sizeof(struct ip_vs_conn), 0,
881 SLAB_HWCACHE_ALIGN, NULL, NULL);
882 if (!ip_vs_conn_cachep) {
883 vfree(ip_vs_conn_tab);
884 return -ENOMEM;
885 }
886
887 IP_VS_INFO("Connection hash table configured "
888 "(size=%d, memory=%ldKbytes)\n",
889 IP_VS_CONN_TAB_SIZE,
890 (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
891 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
892 sizeof(struct ip_vs_conn));
893
894 for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
895 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
896 }
897
898 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
899 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
900 }
901
902 proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops);
903
904 /* calculate the random value for connection hash */
905 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
906
907 return 0;
908}
909
910
911void ip_vs_conn_cleanup(void)
912{
913 /* flush all the connection entries first */
914 ip_vs_conn_flush();
915
916 /* Release the empty cache */
917 kmem_cache_destroy(ip_vs_conn_cachep);
918 proc_net_remove("ip_vs_conn");
919 vfree(ip_vs_conn_tab);
920}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
new file mode 100644
index 000000000000..5fb257dd07cb
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -0,0 +1,1191 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
21 * and others.
22 *
23 * Changes:
24 * Paul `Rusty' Russell properly handle non-linear skbs
25 *
26 */
27
28#include <linux/module.h>
29#include <linux/kernel.h>
30#include <linux/ip.h>
31#include <linux/tcp.h>
32#include <linux/icmp.h>
33
34#include <net/ip.h>
35#include <net/tcp.h>
36#include <net/udp.h>
37#include <net/icmp.h> /* for icmp_send */
38#include <net/route.h>
39
40#include <linux/netfilter.h>
41#include <linux/netfilter_ipv4.h>
42
43#include <net/ip_vs.h>
44
45
46EXPORT_SYMBOL(register_ip_vs_scheduler);
47EXPORT_SYMBOL(unregister_ip_vs_scheduler);
48EXPORT_SYMBOL(ip_vs_skb_replace);
49EXPORT_SYMBOL(ip_vs_proto_name);
50EXPORT_SYMBOL(ip_vs_conn_new);
51EXPORT_SYMBOL(ip_vs_conn_in_get);
52EXPORT_SYMBOL(ip_vs_conn_out_get);
53#ifdef CONFIG_IP_VS_PROTO_TCP
54EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
55#endif
56EXPORT_SYMBOL(ip_vs_conn_put);
57#ifdef CONFIG_IP_VS_DEBUG
58EXPORT_SYMBOL(ip_vs_get_debug_level);
59#endif
60EXPORT_SYMBOL(ip_vs_make_skb_writable);
61
62
63/* ID used in ICMP lookups */
64#define icmp_id(icmph) (((icmph)->un).echo.id)
65
66const char *ip_vs_proto_name(unsigned proto)
67{
68 static char buf[20];
69
70 switch (proto) {
71 case IPPROTO_IP:
72 return "IP";
73 case IPPROTO_UDP:
74 return "UDP";
75 case IPPROTO_TCP:
76 return "TCP";
77 case IPPROTO_ICMP:
78 return "ICMP";
79 default:
80 sprintf(buf, "IP_%d", proto);
81 return buf;
82 }
83}
84
85void ip_vs_init_hash_table(struct list_head *table, int rows)
86{
87 while (--rows >= 0)
88 INIT_LIST_HEAD(&table[rows]);
89}
90
91static inline void
92ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
93{
94 struct ip_vs_dest *dest = cp->dest;
95 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
96 spin_lock(&dest->stats.lock);
97 dest->stats.inpkts++;
98 dest->stats.inbytes += skb->len;
99 spin_unlock(&dest->stats.lock);
100
101 spin_lock(&dest->svc->stats.lock);
102 dest->svc->stats.inpkts++;
103 dest->svc->stats.inbytes += skb->len;
104 spin_unlock(&dest->svc->stats.lock);
105
106 spin_lock(&ip_vs_stats.lock);
107 ip_vs_stats.inpkts++;
108 ip_vs_stats.inbytes += skb->len;
109 spin_unlock(&ip_vs_stats.lock);
110 }
111}
112
113
114static inline void
115ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
116{
117 struct ip_vs_dest *dest = cp->dest;
118 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
119 spin_lock(&dest->stats.lock);
120 dest->stats.outpkts++;
121 dest->stats.outbytes += skb->len;
122 spin_unlock(&dest->stats.lock);
123
124 spin_lock(&dest->svc->stats.lock);
125 dest->svc->stats.outpkts++;
126 dest->svc->stats.outbytes += skb->len;
127 spin_unlock(&dest->svc->stats.lock);
128
129 spin_lock(&ip_vs_stats.lock);
130 ip_vs_stats.outpkts++;
131 ip_vs_stats.outbytes += skb->len;
132 spin_unlock(&ip_vs_stats.lock);
133 }
134}
135
136
137static inline void
138ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
139{
140 spin_lock(&cp->dest->stats.lock);
141 cp->dest->stats.conns++;
142 spin_unlock(&cp->dest->stats.lock);
143
144 spin_lock(&svc->stats.lock);
145 svc->stats.conns++;
146 spin_unlock(&svc->stats.lock);
147
148 spin_lock(&ip_vs_stats.lock);
149 ip_vs_stats.conns++;
150 spin_unlock(&ip_vs_stats.lock);
151}
152
153
154static inline int
155ip_vs_set_state(struct ip_vs_conn *cp, int direction,
156 const struct sk_buff *skb,
157 struct ip_vs_protocol *pp)
158{
159 if (unlikely(!pp->state_transition))
160 return 0;
161 return pp->state_transition(cp, direction, skb, pp);
162}
163
164
165int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len)
166{
167 struct sk_buff *skb = *pskb;
168
169 /* skb is already used, better copy skb and its payload */
170 if (unlikely(skb_shared(skb) || skb->sk))
171 goto copy_skb;
172
173 /* skb data is already used, copy it */
174 if (unlikely(skb_cloned(skb)))
175 goto copy_data;
176
177 return pskb_may_pull(skb, writable_len);
178
179 copy_data:
180 if (unlikely(writable_len > skb->len))
181 return 0;
182 return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
183
184 copy_skb:
185 if (unlikely(writable_len > skb->len))
186 return 0;
187 skb = skb_copy(skb, GFP_ATOMIC);
188 if (!skb)
189 return 0;
190 BUG_ON(skb_is_nonlinear(skb));
191
192 /* Rest of kernel will get very unhappy if we pass it a
193 suddenly-orphaned skbuff */
194 if ((*pskb)->sk)
195 skb_set_owner_w(skb, (*pskb)->sk);
196 kfree_skb(*pskb);
197 *pskb = skb;
198 return 1;
199}
200
201/*
202 * IPVS persistent scheduling function
203 * It creates a connection entry according to its template if exists,
204 * or selects a server and creates a connection entry plus a template.
205 * Locking: we are svc user (svc->refcnt), so we hold all dests too
206 * Protocols supported: TCP, UDP
207 */
208static struct ip_vs_conn *
209ip_vs_sched_persist(struct ip_vs_service *svc,
210 const struct sk_buff *skb,
211 __u16 ports[2])
212{
213 struct ip_vs_conn *cp = NULL;
214 struct iphdr *iph = skb->nh.iph;
215 struct ip_vs_dest *dest;
216 struct ip_vs_conn *ct;
217 __u16 dport; /* destination port to forward */
218 __u32 snet; /* source network of the client, after masking */
219
220 /* Mask saddr with the netmask to adjust template granularity */
221 snet = iph->saddr & svc->netmask;
222
223 IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
224 "mnet %u.%u.%u.%u\n",
225 NIPQUAD(iph->saddr), ntohs(ports[0]),
226 NIPQUAD(iph->daddr), ntohs(ports[1]),
227 NIPQUAD(snet));
228
229 /*
230 * As far as we know, FTP is a very complicated network protocol, and
231 * it uses a control connection and data connections. For active FTP,
232 * the FTP server initiates the data connection to the client, usually
233 * from source port 20. For passive FTP, the FTP server tells the client
234 * the port on which it passively listens, and the client issues the
235 * data connection. In tunneling or direct routing mode, the load
236 * balancer only sees the client-to-server half of the connection, so
237 * the port number is unknown to it. Therefore, a conn template like
238 * <caddr, 0, vaddr, 0, daddr, 0> is created for a persistent FTP
239 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
240 * is created for other persistent services.
241 */
242 if (ports[1] == svc->port) {
243 /* Check if a template already exists */
244 if (svc->port != FTPPORT)
245 ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
246 iph->daddr, ports[1]);
247 else
248 ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
249 iph->daddr, 0);
250
251 if (!ct || !ip_vs_check_template(ct)) {
252 /*
253 * No template found or the dest of the connection
254 * template is not available.
255 */
256 dest = svc->scheduler->schedule(svc, skb);
257 if (dest == NULL) {
258 IP_VS_DBG(1, "p-schedule: no dest found.\n");
259 return NULL;
260 }
261
262 /*
263 * Create a template like <protocol,caddr,0,
264 * vaddr,vport,daddr,dport> for non-ftp service,
265 * and <protocol,caddr,0,vaddr,0,daddr,0>
266 * for ftp service.
267 */
268 if (svc->port != FTPPORT)
269 ct = ip_vs_conn_new(iph->protocol,
270 snet, 0,
271 iph->daddr,
272 ports[1],
273 dest->addr, dest->port,
274 0,
275 dest);
276 else
277 ct = ip_vs_conn_new(iph->protocol,
278 snet, 0,
279 iph->daddr, 0,
280 dest->addr, 0,
281 0,
282 dest);
283 if (ct == NULL)
284 return NULL;
285
286 ct->timeout = svc->timeout;
287 } else {
288 /* set destination with the found template */
289 dest = ct->dest;
290 }
291 dport = dest->port;
292 } else {
293 /*
294 * Note: persistent fwmark-based services and persistent
295 * port zero service are handled here.
296 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
297 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
298 */
299 if (svc->fwmark)
300 ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0,
301 htonl(svc->fwmark), 0);
302 else
303 ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
304 iph->daddr, 0);
305
306 if (!ct || !ip_vs_check_template(ct)) {
307 /*
308 * If it is not persistent port zero, return NULL,
309 * otherwise create a connection template.
310 */
311 if (svc->port)
312 return NULL;
313
314 dest = svc->scheduler->schedule(svc, skb);
315 if (dest == NULL) {
316 IP_VS_DBG(1, "p-schedule: no dest found.\n");
317 return NULL;
318 }
319
320 /*
321 * Create a template according to the service
322 */
323 if (svc->fwmark)
324 ct = ip_vs_conn_new(IPPROTO_IP,
325 snet, 0,
326 htonl(svc->fwmark), 0,
327 dest->addr, 0,
328 0,
329 dest);
330 else
331 ct = ip_vs_conn_new(iph->protocol,
332 snet, 0,
333 iph->daddr, 0,
334 dest->addr, 0,
335 0,
336 dest);
337 if (ct == NULL)
338 return NULL;
339
340 ct->timeout = svc->timeout;
341 } else {
342 /* set destination with the found template */
343 dest = ct->dest;
344 }
345 dport = ports[1];
346 }
347
348 /*
349 * Create a new connection according to the template
350 */
351 cp = ip_vs_conn_new(iph->protocol,
352 iph->saddr, ports[0],
353 iph->daddr, ports[1],
354 dest->addr, dport,
355 0,
356 dest);
357 if (cp == NULL) {
358 ip_vs_conn_put(ct);
359 return NULL;
360 }
361
362 /*
363 * Add its control
364 */
365 ip_vs_control_add(cp, ct);
366 ip_vs_conn_put(ct);
367
368 ip_vs_conn_stats(cp, svc);
369 return cp;
370}
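/*
 * Summary of the template tuples created above, written as
 * <caddr, cport, vaddr, vport, daddr, dport> with snet = caddr & svc->netmask:
 *
 *   persistent non-FTP service:   <snet, 0, vaddr, vport,       daddr, dport>
 *   persistent FTP service:       <snet, 0, vaddr, 0,           daddr, 0>
 *   fwmark-based service:         <snet, 0, htonl(fwmark), 0,   daddr, 0>  (IPPROTO_IP)
 *   persistent port-zero service: <snet, 0, vaddr, 0,           daddr, 0>
 *
 * The connection created from the template always carries the client's full
 * tuple; it forwards to dest->port when the packet hit the service port, and
 * to the packet's original destination port otherwise.
 */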
371
372
373/*
374 * IPVS main scheduling function
375 * It selects a server according to the virtual service, and
376 * creates a connection entry.
377 * Protocols supported: TCP, UDP
378 */
379struct ip_vs_conn *
380ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
381{
382 struct ip_vs_conn *cp = NULL;
383 struct iphdr *iph = skb->nh.iph;
384 struct ip_vs_dest *dest;
385 __u16 _ports[2], *pptr;
386
387 pptr = skb_header_pointer(skb, iph->ihl*4,
388 sizeof(_ports), _ports);
389 if (pptr == NULL)
390 return NULL;
391
392 /*
393 * Persistent service
394 */
395 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
396 return ip_vs_sched_persist(svc, skb, pptr);
397
398 /*
399 * Non-persistent service
400 */
401 if (!svc->fwmark && pptr[1] != svc->port) {
402 if (!svc->port)
403 IP_VS_ERR("Schedule: port zero only supported "
404 "in persistent services, "
405 "check your ipvs configuration\n");
406 return NULL;
407 }
408
409 dest = svc->scheduler->schedule(svc, skb);
410 if (dest == NULL) {
411 IP_VS_DBG(1, "Schedule: no dest found.\n");
412 return NULL;
413 }
414
415 /*
416 * Create a connection entry.
417 */
418 cp = ip_vs_conn_new(iph->protocol,
419 iph->saddr, pptr[0],
420 iph->daddr, pptr[1],
421 dest->addr, dest->port?dest->port:pptr[1],
422 0,
423 dest);
424 if (cp == NULL)
425 return NULL;
426
427 IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
428 "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
429 ip_vs_fwd_tag(cp),
430 NIPQUAD(cp->caddr), ntohs(cp->cport),
431 NIPQUAD(cp->vaddr), ntohs(cp->vport),
432 NIPQUAD(cp->daddr), ntohs(cp->dport),
433 cp->flags, atomic_read(&cp->refcnt));
434
435 ip_vs_conn_stats(cp, svc);
436 return cp;
437}
438
439
440/*
441 * Pass or drop the packet.
442 * Called by ip_vs_in, when the virtual service is available but
443 * no destination is available for a new connection.
444 */
445int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
446 struct ip_vs_protocol *pp)
447{
448 __u16 _ports[2], *pptr;
449 struct iphdr *iph = skb->nh.iph;
450
451 pptr = skb_header_pointer(skb, iph->ihl*4,
452 sizeof(_ports), _ports);
453 if (pptr == NULL) {
454 ip_vs_service_put(svc);
455 return NF_DROP;
456 }
457
458 /* if it is a fwmark-based service, the cache_bypass sysctl is
459 enabled and the destination is RTN_UNICAST (and not local), then
460 create a cache_bypass connection entry */
461 if (sysctl_ip_vs_cache_bypass && svc->fwmark
462 && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
463 int ret, cs;
464 struct ip_vs_conn *cp;
465
466 ip_vs_service_put(svc);
467
468 /* create a new connection entry */
469 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
470 cp = ip_vs_conn_new(iph->protocol,
471 iph->saddr, pptr[0],
472 iph->daddr, pptr[1],
473 0, 0,
474 IP_VS_CONN_F_BYPASS,
475 NULL);
476 if (cp == NULL)
477 return NF_DROP;
478
479 /* statistics */
480 ip_vs_in_stats(cp, skb);
481
482 /* set state */
483 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
484
485 /* transmit the first SYN packet */
486 ret = cp->packet_xmit(skb, cp, pp);
487 /* do not touch skb anymore */
488
489 atomic_inc(&cp->in_pkts);
490 ip_vs_conn_put(cp);
491 return ret;
492 }
493
494 /*
495 * When a virtual FTP service is present, packets destined for
496 * other services on the VIP (except those listed in the ipvs
497 * table) may get here; pass them through, because it is not
498 * ipvs's job to decide whether to drop them.
499 */
500 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
501 ip_vs_service_put(svc);
502 return NF_ACCEPT;
503 }
504
505 ip_vs_service_put(svc);
506
507 /*
508 * Notify the client that the destination is unreachable, and
509 * release the socket buffer.
510 * Since we are at the IP layer and no TCP socket is actually
511 * created, a TCP RST packet cannot be sent; instead,
512 * ICMP_PORT_UNREACH is sent here whether it is TCP or UDP. --WZ
513 */
514 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
515 return NF_DROP;
516}
517
518
519/*
520 * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
521 * chain, and is used for VS/NAT.
522 * It detects packets for VS/NAT connections and sends them out
523 * immediately. This avoids having iptable_nat mangle packets
524 * belonging to VS/NAT connections.
525 */
526static unsigned int ip_vs_post_routing(unsigned int hooknum,
527 struct sk_buff **pskb,
528 const struct net_device *in,
529 const struct net_device *out,
530 int (*okfn)(struct sk_buff *))
531{
532 if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
533 return NF_ACCEPT;
534
535 /* The packet was sent from IPVS, exit this chain */
536 (*okfn)(*pskb);
537
538 return NF_STOLEN;
539}
540
541u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
542{
543 return (u16) csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
544}
545
546static inline struct sk_buff *
547ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
548{
549 skb = ip_defrag(skb, user);
550 if (skb)
551 ip_send_check(skb->nh.iph);
552 return skb;
553}
554
555/*
556 * Packet has been made sufficiently writable in caller
557 * - inout: 1=in->out, 0=out->in
558 */
559void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
560 struct ip_vs_conn *cp, int inout)
561{
562 struct iphdr *iph = skb->nh.iph;
563 unsigned int icmp_offset = iph->ihl*4;
564 struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset);
565 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
566
567 if (inout) {
568 iph->saddr = cp->vaddr;
569 ip_send_check(iph);
570 ciph->daddr = cp->vaddr;
571 ip_send_check(ciph);
572 } else {
573 iph->daddr = cp->daddr;
574 ip_send_check(iph);
575 ciph->saddr = cp->daddr;
576 ip_send_check(ciph);
577 }
578
579 /* the TCP/UDP port */
580 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
581 __u16 *ports = (void *)ciph + ciph->ihl*4;
582
583 if (inout)
584 ports[1] = cp->vport;
585 else
586 ports[0] = cp->dport;
587 }
588
589 /* And finally the ICMP checksum */
590 icmph->checksum = 0;
591 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
592 skb->ip_summed = CHECKSUM_UNNECESSARY;
593
594 if (inout)
595 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
596 "Forwarding altered outgoing ICMP");
597 else
598 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
599 "Forwarding altered incoming ICMP");
600}
601
602/*
603 * Handle ICMP messages in the inside-to-outside direction (outgoing).
604 * Find any that might be relevant, check against existing connections,
605 * forward to the right destination host if relevant.
606 * Currently handles error types - unreachable, quench, ttl exceeded.
607 * (Only used in VS/NAT)
608 */
609static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
610{
611 struct sk_buff *skb = *pskb;
612 struct iphdr *iph;
613 struct icmphdr _icmph, *ic;
614 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
615 struct ip_vs_conn *cp;
616 struct ip_vs_protocol *pp;
617 unsigned int offset, ihl, verdict;
618
619 *related = 1;
620
621 /* reassemble IP fragments */
622 if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
623 skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
624 if (!skb)
625 return NF_STOLEN;
626 *pskb = skb;
627 }
628
629 iph = skb->nh.iph;
630 offset = ihl = iph->ihl * 4;
631 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
632 if (ic == NULL)
633 return NF_DROP;
634
635 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
636 ic->type, ntohs(icmp_id(ic)),
637 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
638
639 /*
640 * Work through seeing if this is for us.
641 * These checks are supposed to be in an order that means easy
642 * things are checked first to speed up processing.... however
643 * this means that some packets will manage to get a long way
644 * down this stack and then be rejected, but that's life.
645 */
646 if ((ic->type != ICMP_DEST_UNREACH) &&
647 (ic->type != ICMP_SOURCE_QUENCH) &&
648 (ic->type != ICMP_TIME_EXCEEDED)) {
649 *related = 0;
650 return NF_ACCEPT;
651 }
652
653 /* Now find the contained IP header */
654 offset += sizeof(_icmph);
655 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
656 if (cih == NULL)
657 return NF_ACCEPT; /* The packet looks wrong, ignore */
658
659 pp = ip_vs_proto_get(cih->protocol);
660 if (!pp)
661 return NF_ACCEPT;
662
663 /* Is the embedded protocol header present? */
664 if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
665 pp->dont_defrag))
666 return NF_ACCEPT;
667
668 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
669
670 offset += cih->ihl * 4;
671
672 /* The embedded headers contain source and dest in reverse order */
673 cp = pp->conn_out_get(skb, pp, cih, offset, 1);
674 if (!cp)
675 return NF_ACCEPT;
676
677 verdict = NF_DROP;
678
679 if (IP_VS_FWD_METHOD(cp) != 0) {
680 IP_VS_ERR("shouldn't reach here, because the box is on the "
681 "half connection in the tun/dr module.\n");
682 }
683
684 /* Ensure the checksum is correct */
685 if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
686 ip_vs_checksum_complete(skb, ihl)) {
687 /* Failed checksum! */
688 IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
689 NIPQUAD(iph->saddr));
690 goto out;
691 }
692
693 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
694 offset += 2 * sizeof(__u16);
695 if (!ip_vs_make_skb_writable(pskb, offset))
696 goto out;
697 skb = *pskb;
698
699 ip_vs_nat_icmp(skb, pp, cp, 1);
700
701 /* do the statistics and put it back */
702 ip_vs_out_stats(cp, skb);
703
704 skb->nfcache |= NFC_IPVS_PROPERTY;
705 verdict = NF_ACCEPT;
706
707 out:
708 __ip_vs_conn_put(cp);
709
710 return verdict;
711}
712
713static inline int is_tcp_reset(const struct sk_buff *skb)
714{
715 struct tcphdr _tcph, *th;
716
717 th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
718 sizeof(_tcph), &_tcph);
719 if (th == NULL)
720 return 0;
721 return th->rst;
722}
723
724/*
725 * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
726 * Check if outgoing packet belongs to the established ip_vs_conn,
727 * rewrite addresses of the packet and send it on its way...
728 */
729static unsigned int
730ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
731 const struct net_device *in, const struct net_device *out,
732 int (*okfn)(struct sk_buff *))
733{
734 struct sk_buff *skb = *pskb;
735 struct iphdr *iph;
736 struct ip_vs_protocol *pp;
737 struct ip_vs_conn *cp;
738 int ihl;
739
740 EnterFunction(11);
741
742 if (skb->nfcache & NFC_IPVS_PROPERTY)
743 return NF_ACCEPT;
744
745 iph = skb->nh.iph;
746 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
747 int related, verdict = ip_vs_out_icmp(pskb, &related);
748
749 if (related)
750 return verdict;
751 skb = *pskb;
752 iph = skb->nh.iph;
753 }
754
755 pp = ip_vs_proto_get(iph->protocol);
756 if (unlikely(!pp))
757 return NF_ACCEPT;
758
759 /* reassemble IP fragments */
760 if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
761 !pp->dont_defrag)) {
762 skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
763 if (!skb)
764 return NF_STOLEN;
765 iph = skb->nh.iph;
766 *pskb = skb;
767 }
768
769 ihl = iph->ihl << 2;
770
771 /*
772 * Check if the packet belongs to an existing entry
773 */
774 cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
775
776 if (unlikely(!cp)) {
777 if (sysctl_ip_vs_nat_icmp_send &&
778 (pp->protocol == IPPROTO_TCP ||
779 pp->protocol == IPPROTO_UDP)) {
780 __u16 _ports[2], *pptr;
781
782 pptr = skb_header_pointer(skb, ihl,
783 sizeof(_ports), _ports);
784 if (pptr == NULL)
785 return NF_ACCEPT; /* Not for me */
786 if (ip_vs_lookup_real_service(iph->protocol,
787 iph->saddr, pptr[0])) {
788 /*
789 * Notify the real server that there is no
790 * existing entry, unless the packet is a
791 * TCP RST.
792 */
793 if (iph->protocol != IPPROTO_TCP
794 || !is_tcp_reset(skb)) {
795 icmp_send(skb,ICMP_DEST_UNREACH,
796 ICMP_PORT_UNREACH, 0);
797 return NF_DROP;
798 }
799 }
800 }
801 IP_VS_DBG_PKT(12, pp, skb, 0,
802 "packet continues traversal as normal");
803 return NF_ACCEPT;
804 }
805
806 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
807
808 if (!ip_vs_make_skb_writable(pskb, ihl))
809 goto drop;
810
811 /* mangle the packet */
812 if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
813 goto drop;
814 skb = *pskb;
815 skb->nh.iph->saddr = cp->vaddr;
816 ip_send_check(skb->nh.iph);
817
818 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
819
820 ip_vs_out_stats(cp, skb);
821 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
822 ip_vs_conn_put(cp);
823
824 skb->nfcache |= NFC_IPVS_PROPERTY;
825
826 LeaveFunction(11);
827 return NF_ACCEPT;
828
829 drop:
830 ip_vs_conn_put(cp);
831 kfree_skb(*pskb);
832 return NF_STOLEN;
833}
834
835
836/*
837 * Handle ICMP messages in the outside-to-inside direction (incoming).
838 * Find any that might be relevant, check against existing connections,
839 * forward to the right destination host if relevant.
840 * Currently handles error types - unreachable, quench, ttl exceeded.
841 */
842static int
843ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
844{
845 struct sk_buff *skb = *pskb;
846 struct iphdr *iph;
847 struct icmphdr _icmph, *ic;
848 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
849 struct ip_vs_conn *cp;
850 struct ip_vs_protocol *pp;
851 unsigned int offset, ihl, verdict;
852
853 *related = 1;
854
855 /* reassemble IP fragments */
856 if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
857 skb = ip_vs_gather_frags(skb,
858 hooknum == NF_IP_LOCAL_IN ?
859 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
860 if (!skb)
861 return NF_STOLEN;
862 *pskb = skb;
863 }
864
865 iph = skb->nh.iph;
866 offset = ihl = iph->ihl * 4;
867 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
868 if (ic == NULL)
869 return NF_DROP;
870
871 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
872 ic->type, ntohs(icmp_id(ic)),
873 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
874
875 /*
876 * Work through seeing if this is for us.
877 * These checks are supposed to be in an order that means easy
878 * things are checked first to speed up processing.... however
879 * this means that some packets will manage to get a long way
880 * down this stack and then be rejected, but that's life.
881 */
882 if ((ic->type != ICMP_DEST_UNREACH) &&
883 (ic->type != ICMP_SOURCE_QUENCH) &&
884 (ic->type != ICMP_TIME_EXCEEDED)) {
885 *related = 0;
886 return NF_ACCEPT;
887 }
888
889 /* Now find the contained IP header */
890 offset += sizeof(_icmph);
891 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
892 if (cih == NULL)
893 return NF_ACCEPT; /* The packet looks wrong, ignore */
894
895 pp = ip_vs_proto_get(cih->protocol);
896 if (!pp)
897 return NF_ACCEPT;
898
899 /* Is the embedded protocol header present? */
900 if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
901 pp->dont_defrag))
902 return NF_ACCEPT;
903
904 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
905
906 offset += cih->ihl * 4;
907
908 /* The embedded headers contain source and dest in reverse order */
909 cp = pp->conn_in_get(skb, pp, cih, offset, 1);
910 if (!cp)
911 return NF_ACCEPT;
912
913 verdict = NF_DROP;
914
915 /* Ensure the checksum is correct */
916 if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
917 ip_vs_checksum_complete(skb, ihl)) {
918 /* Failed checksum! */
919 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
920 NIPQUAD(iph->saddr));
921 goto out;
922 }
923
924 /* do the statistics and put it back */
925 ip_vs_in_stats(cp, skb);
926 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
927 offset += 2 * sizeof(__u16);
928 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
929 /* do not touch skb anymore */
930
931 out:
932 __ip_vs_conn_put(cp);
933
934 return verdict;
935}
936
937/*
938 * Check if it's for virtual services, look it up,
939 * and send it on its way...
940 */
941static unsigned int
942ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
943 const struct net_device *in, const struct net_device *out,
944 int (*okfn)(struct sk_buff *))
945{
946 struct sk_buff *skb = *pskb;
947 struct iphdr *iph;
948 struct ip_vs_protocol *pp;
949 struct ip_vs_conn *cp;
950 int ret, restart;
951 int ihl;
952
953 /*
954 * Big tappo: only PACKET_HOST (neither loopback nor mcasts)
955 * ... don't know why 1st test DOES NOT include 2nd (?)
956 */
957 if (unlikely(skb->pkt_type != PACKET_HOST
958 || skb->dev == &loopback_dev || skb->sk)) {
959 IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
960 skb->pkt_type,
961 skb->nh.iph->protocol,
962 NIPQUAD(skb->nh.iph->daddr));
963 return NF_ACCEPT;
964 }
965
966 iph = skb->nh.iph;
967 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
968 int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);
969
970 if (related)
971 return verdict;
972 skb = *pskb;
973 iph = skb->nh.iph;
974 }
975
976 /* Protocol supported? */
977 pp = ip_vs_proto_get(iph->protocol);
978 if (unlikely(!pp))
979 return NF_ACCEPT;
980
981 ihl = iph->ihl << 2;
982
983 /*
984 * Check if the packet belongs to an existing connection entry
985 */
986 cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
987
988 if (unlikely(!cp)) {
989 int v;
990
991 if (!pp->conn_schedule(skb, pp, &v, &cp))
992 return v;
993 }
994
995 if (unlikely(!cp)) {
996 /* sorry, all this trouble for a no-hit :) */
997 IP_VS_DBG_PKT(12, pp, skb, 0,
998 "packet continues traversal as normal");
999 return NF_ACCEPT;
1000 }
1001
1002 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1003
1004 /* Check the server status */
1005 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1006 /* the destination server is not available */
1007
1008 if (sysctl_ip_vs_expire_nodest_conn) {
1009 /* try to expire the connection immediately */
1010 ip_vs_conn_expire_now(cp);
1011 } else {
1012 /* don't restart its timer, and silently
1013 drop the packet. */
1014 __ip_vs_conn_put(cp);
1015 }
1016 return NF_DROP;
1017 }
1018
1019 ip_vs_in_stats(cp, skb);
1020 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1021 if (cp->packet_xmit)
1022 ret = cp->packet_xmit(skb, cp, pp);
1023 /* do not touch skb anymore */
1024 else {
1025 IP_VS_DBG_RL("warning: packet_xmit is null");
1026 ret = NF_ACCEPT;
1027 }
1028
1029 /* increase its packet counter and check whether it needs
1030 to be synchronized */
1031 atomic_inc(&cp->in_pkts);
1032 if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1033 (cp->protocol != IPPROTO_TCP ||
1034 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1035 (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
1036 == sysctl_ip_vs_sync_threshold[0]))
1037 ip_vs_sync_conn(cp);
1038
1039 ip_vs_conn_put(cp);
1040 return ret;
1041}
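As a rough worked example of the sync-threshold test above (illustrative userspace code, not kernel code), with the default sysctl_ip_vs_sync_threshold of {3, 50} an established connection is synced on its 3rd, 53rd, 103rd, ... incoming packet:

#include <stdio.h>

int main(void)
{
	int threshold[2] = { 3, 50 };	/* default sysctl_ip_vs_sync_threshold */
	int in_pkts;

	/* same modulo test that ip_vs_in() applies before ip_vs_sync_conn() */
	for (in_pkts = 1; in_pkts <= 120; in_pkts++)
		if (in_pkts % threshold[1] == threshold[0])
			printf("sync at packet %d\n", in_pkts);	/* 3, 53, 103 */
	return 0;
}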
1042
1043
1044/*
1045 * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
1046 * related packets destined for 0.0.0.0/0.
1047 * When fwmark-based virtual service is used, such as transparent
1048 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1049 * but ICMP destined for 0.0.0.0/0 cannot easily be marked and
1050 * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
1051 * and send them to ip_vs_in_icmp.
1052 */
1053static unsigned int
1054ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
1055 const struct net_device *in, const struct net_device *out,
1056 int (*okfn)(struct sk_buff *))
1057{
1058 int r;
1059
1060 if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
1061 return NF_ACCEPT;
1062
1063 return ip_vs_in_icmp(pskb, &r, hooknum);
1064}
1065
1066
1067/* After packet filtering, forward packet through VS/DR, VS/TUN,
1068 or VS/NAT(change destination), so that filtering rules can be
1069 applied to IPVS. */
1070static struct nf_hook_ops ip_vs_in_ops = {
1071 .hook = ip_vs_in,
1072 .owner = THIS_MODULE,
1073 .pf = PF_INET,
1074 .hooknum = NF_IP_LOCAL_IN,
1075 .priority = 100,
1076};
1077
1078/* After packet filtering, change source only for VS/NAT */
1079static struct nf_hook_ops ip_vs_out_ops = {
1080 .hook = ip_vs_out,
1081 .owner = THIS_MODULE,
1082 .pf = PF_INET,
1083 .hooknum = NF_IP_FORWARD,
1084 .priority = 100,
1085};
1086
1087/* After packet filtering (but before ip_vs_out_icmp), catch icmp
1088 destined for 0.0.0.0/0, which is for incoming IPVS connections */
1089static struct nf_hook_ops ip_vs_forward_icmp_ops = {
1090 .hook = ip_vs_forward_icmp,
1091 .owner = THIS_MODULE,
1092 .pf = PF_INET,
1093 .hooknum = NF_IP_FORWARD,
1094 .priority = 99,
1095};
1096
1097/* Before the netfilter connection tracking, exit from POST_ROUTING */
1098static struct nf_hook_ops ip_vs_post_routing_ops = {
1099 .hook = ip_vs_post_routing,
1100 .owner = THIS_MODULE,
1101 .pf = PF_INET,
1102 .hooknum = NF_IP_POST_ROUTING,
1103 .priority = NF_IP_PRI_NAT_SRC-1,
1104};
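/*
 * Ordering note: netfilter invokes the hooks of a chain in ascending
 * priority order, so on NF_IP_FORWARD ip_vs_forward_icmp (priority 99)
 * runs just before ip_vs_out (priority 100), and ip_vs_post_routing
 * runs just before the netfilter source-NAT hook (NF_IP_PRI_NAT_SRC).
 */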
1105
1106
1107/*
1108 * Initialize IP Virtual Server
1109 */
1110static int __init ip_vs_init(void)
1111{
1112 int ret;
1113
1114 ret = ip_vs_control_init();
1115 if (ret < 0) {
1116 IP_VS_ERR("can't setup control.\n");
1117 goto cleanup_nothing;
1118 }
1119
1120 ip_vs_protocol_init();
1121
1122 ret = ip_vs_app_init();
1123 if (ret < 0) {
1124 IP_VS_ERR("can't setup application helper.\n");
1125 goto cleanup_protocol;
1126 }
1127
1128 ret = ip_vs_conn_init();
1129 if (ret < 0) {
1130 IP_VS_ERR("can't setup connection table.\n");
1131 goto cleanup_app;
1132 }
1133
1134 ret = nf_register_hook(&ip_vs_in_ops);
1135 if (ret < 0) {
1136 IP_VS_ERR("can't register in hook.\n");
1137 goto cleanup_conn;
1138 }
1139
1140 ret = nf_register_hook(&ip_vs_out_ops);
1141 if (ret < 0) {
1142 IP_VS_ERR("can't register out hook.\n");
1143 goto cleanup_inops;
1144 }
1145 ret = nf_register_hook(&ip_vs_post_routing_ops);
1146 if (ret < 0) {
1147 IP_VS_ERR("can't register post_routing hook.\n");
1148 goto cleanup_outops;
1149 }
1150 ret = nf_register_hook(&ip_vs_forward_icmp_ops);
1151 if (ret < 0) {
1152 IP_VS_ERR("can't register forward_icmp hook.\n");
1153 goto cleanup_postroutingops;
1154 }
1155
1156 IP_VS_INFO("ipvs loaded.\n");
1157 return ret;
1158
1159 cleanup_postroutingops:
1160 nf_unregister_hook(&ip_vs_post_routing_ops);
1161 cleanup_outops:
1162 nf_unregister_hook(&ip_vs_out_ops);
1163 cleanup_inops:
1164 nf_unregister_hook(&ip_vs_in_ops);
1165 cleanup_conn:
1166 ip_vs_conn_cleanup();
1167 cleanup_app:
1168 ip_vs_app_cleanup();
1169 cleanup_protocol:
1170 ip_vs_protocol_cleanup();
1171 ip_vs_control_cleanup();
1172 cleanup_nothing:
1173 return ret;
1174}
1175
1176static void __exit ip_vs_cleanup(void)
1177{
1178 nf_unregister_hook(&ip_vs_forward_icmp_ops);
1179 nf_unregister_hook(&ip_vs_post_routing_ops);
1180 nf_unregister_hook(&ip_vs_out_ops);
1181 nf_unregister_hook(&ip_vs_in_ops);
1182 ip_vs_conn_cleanup();
1183 ip_vs_app_cleanup();
1184 ip_vs_protocol_cleanup();
1185 ip_vs_control_cleanup();
1186 IP_VS_INFO("ipvs unloaded.\n");
1187}
1188
1189module_init(ip_vs_init);
1190module_exit(ip_vs_cleanup);
1191MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
new file mode 100644
index 000000000000..218d9701036e
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -0,0 +1,2391 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * Changes:
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/init.h>
25#include <linux/types.h>
26#include <linux/fs.h>
27#include <linux/sysctl.h>
28#include <linux/proc_fs.h>
29#include <linux/workqueue.h>
30#include <linux/swap.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33
34#include <linux/netfilter.h>
35#include <linux/netfilter_ipv4.h>
36
37#include <net/ip.h>
38#include <net/sock.h>
39
40#include <asm/uaccess.h>
41
42#include <net/ip_vs.h>
43
44/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
45static DECLARE_MUTEX(__ip_vs_mutex);
46
47/* lock for service table */
48static DEFINE_RWLOCK(__ip_vs_svc_lock);
49
50/* lock for table with the real services */
51static DEFINE_RWLOCK(__ip_vs_rs_lock);
52
53/* lock for state and timeout tables */
54static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
55
56/* lock for drop entry handling */
57static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
58
59/* lock for drop packet handling */
60static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
61
62/* 1/rate drop and drop-entry variables */
63int ip_vs_drop_rate = 0;
64int ip_vs_drop_counter = 0;
65static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
66
67/* number of virtual services */
68static int ip_vs_num_services = 0;
69
70/* sysctl variables */
71static int sysctl_ip_vs_drop_entry = 0;
72static int sysctl_ip_vs_drop_packet = 0;
73static int sysctl_ip_vs_secure_tcp = 0;
74static int sysctl_ip_vs_amemthresh = 1024;
75static int sysctl_ip_vs_am_droprate = 10;
76int sysctl_ip_vs_cache_bypass = 0;
77int sysctl_ip_vs_expire_nodest_conn = 0;
78int sysctl_ip_vs_expire_quiescent_template = 0;
79int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
80int sysctl_ip_vs_nat_icmp_send = 0;
81
82
83#ifdef CONFIG_IP_VS_DEBUG
84static int sysctl_ip_vs_debug_level = 0;
85
86int ip_vs_get_debug_level(void)
87{
88 return sysctl_ip_vs_debug_level;
89}
90#endif
91
92/*
93 * update_defense_level is called from keventd and from sysctl.
94 */
95static void update_defense_level(void)
96{
97 struct sysinfo i;
98 static int old_secure_tcp = 0;
99 int availmem;
100 int nomem;
101 int to_change = -1;
102
103 /* we only count free and buffered memory (in pages) */
104 si_meminfo(&i);
105 availmem = i.freeram + i.bufferram;
106 /* however, in linux 2.5 the i.bufferram is the total page cache size;
107 we need to adjust it */
108 /* si_swapinfo(&i); */
109 /* availmem = availmem - (i.totalswap - i.freeswap); */
110
111 nomem = (availmem < sysctl_ip_vs_amemthresh);
112
113 /* drop_entry */
114 spin_lock(&__ip_vs_dropentry_lock);
115 switch (sysctl_ip_vs_drop_entry) {
116 case 0:
117 atomic_set(&ip_vs_dropentry, 0);
118 break;
119 case 1:
120 if (nomem) {
121 atomic_set(&ip_vs_dropentry, 1);
122 sysctl_ip_vs_drop_entry = 2;
123 } else {
124 atomic_set(&ip_vs_dropentry, 0);
125 }
126 break;
127 case 2:
128 if (nomem) {
129 atomic_set(&ip_vs_dropentry, 1);
130 } else {
131 atomic_set(&ip_vs_dropentry, 0);
132 sysctl_ip_vs_drop_entry = 1;
133 };
134 break;
135 case 3:
136 atomic_set(&ip_vs_dropentry, 1);
137 break;
138 }
139 spin_unlock(&__ip_vs_dropentry_lock);
140
141 /* drop_packet */
142 spin_lock(&__ip_vs_droppacket_lock);
143 switch (sysctl_ip_vs_drop_packet) {
144 case 0:
145 ip_vs_drop_rate = 0;
146 break;
147 case 1:
148 if (nomem) {
149 ip_vs_drop_rate = ip_vs_drop_counter
150 = sysctl_ip_vs_amemthresh /
151 (sysctl_ip_vs_amemthresh-availmem);
152 sysctl_ip_vs_drop_packet = 2;
153 } else {
154 ip_vs_drop_rate = 0;
155 }
156 break;
157 case 2:
158 if (nomem) {
159 ip_vs_drop_rate = ip_vs_drop_counter
160 = sysctl_ip_vs_amemthresh /
161 (sysctl_ip_vs_amemthresh-availmem);
162 } else {
163 ip_vs_drop_rate = 0;
164 sysctl_ip_vs_drop_packet = 1;
165 }
166 break;
167 case 3:
168 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
169 break;
170 }
171 spin_unlock(&__ip_vs_droppacket_lock);
172
173 /* secure_tcp */
174 write_lock(&__ip_vs_securetcp_lock);
175 switch (sysctl_ip_vs_secure_tcp) {
176 case 0:
177 if (old_secure_tcp >= 2)
178 to_change = 0;
179 break;
180 case 1:
181 if (nomem) {
182 if (old_secure_tcp < 2)
183 to_change = 1;
184 sysctl_ip_vs_secure_tcp = 2;
185 } else {
186 if (old_secure_tcp >= 2)
187 to_change = 0;
188 }
189 break;
190 case 2:
191 if (nomem) {
192 if (old_secure_tcp < 2)
193 to_change = 1;
194 } else {
195 if (old_secure_tcp >= 2)
196 to_change = 0;
197 sysctl_ip_vs_secure_tcp = 1;
198 }
199 break;
200 case 3:
201 if (old_secure_tcp < 2)
202 to_change = 1;
203 break;
204 }
205 old_secure_tcp = sysctl_ip_vs_secure_tcp;
206 if (to_change >= 0)
207 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
208 write_unlock(&__ip_vs_securetcp_lock);
209}
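As a rough worked example of the drop_packet arithmetic above (illustrative userspace code; the availmem value is made up), with the default amemthresh of 1024 pages and 768 available pages the rate comes out to 4, i.e. roughly one in four:

#include <stdio.h>

int main(void)
{
	int amemthresh = 1024;	/* default sysctl_ip_vs_amemthresh */
	int availmem   = 768;	/* hypothetical free + buffered pages */

	if (availmem < amemthresh) {
		/* same integer division as update_defense_level() */
		int rate = amemthresh / (amemthresh - availmem);
		printf("drop rate = 1 in %d\n", rate);	/* 1 in 4 */
	}
	return 0;
}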
210
211
212/*
213 * Timer for checking the defense
214 */
215#define DEFENSE_TIMER_PERIOD 1*HZ
216static void defense_work_handler(void *data);
217static DECLARE_WORK(defense_work, defense_work_handler, NULL);
218
219static void defense_work_handler(void *data)
220{
221 update_defense_level();
222 if (atomic_read(&ip_vs_dropentry))
223 ip_vs_random_dropentry();
224
225 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
226}
227
228int
229ip_vs_use_count_inc(void)
230{
231 return try_module_get(THIS_MODULE);
232}
233
234void
235ip_vs_use_count_dec(void)
236{
237 module_put(THIS_MODULE);
238}
239
240
241/*
242 * Hash table: for virtual service lookups
243 */
244#define IP_VS_SVC_TAB_BITS 8
245#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
246#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
247
248/* the service table hashed by <protocol, addr, port> */
249static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
250/* the service table hashed by fwmark */
251static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
252
253/*
254 * Hash table: for real service lookups
255 */
256#define IP_VS_RTAB_BITS 4
257#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
258#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
259
260static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
261
262/*
263 * Trash for destinations
264 */
265static LIST_HEAD(ip_vs_dest_trash);
266
267/*
268 * FTP & NULL virtual service counters
269 */
270static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
271static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
272
273
274/*
275 * Returns hash value for virtual service
276 */
277static __inline__ unsigned
278ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
279{
280 register unsigned porth = ntohs(port);
281
282 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
283 & IP_VS_SVC_TAB_MASK;
284}
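A small illustration of the hash key above (userspace sketch; the address and port are arbitrary and already converted to host byte order): TCP 192.168.0.1:80 lands in bucket 87.

#include <stdio.h>

#define TAB_BITS 8
#define TAB_MASK ((1 << TAB_BITS) - 1)	/* IP_VS_SVC_TAB_MASK for 8 bits */

int main(void)
{
	unsigned proto = 6;		/* IPPROTO_TCP */
	unsigned addr  = 0xC0A80001;	/* 192.168.0.1, host byte order */
	unsigned porth = 80;

	/* same mix as ip_vs_svc_hashkey() after ntohl()/ntohs() */
	unsigned hash = (proto ^ addr ^ (porth >> TAB_BITS) ^ porth) & TAB_MASK;
	printf("bucket = %u\n", hash);	/* prints 87 */
	return 0;
}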
285
286/*
287 * Returns hash value of fwmark for virtual service lookup
288 */
289static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
290{
291 return fwmark & IP_VS_SVC_TAB_MASK;
292}
293
294/*
295 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
296 * or in the ip_vs_svc_fwm_table by fwmark.
297 * Should be called with locked tables.
298 */
299static int ip_vs_svc_hash(struct ip_vs_service *svc)
300{
301 unsigned hash;
302
303 if (svc->flags & IP_VS_SVC_F_HASHED) {
304 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
305 "called from %p\n", __builtin_return_address(0));
306 return 0;
307 }
308
309 if (svc->fwmark == 0) {
310 /*
311 * Hash it by <protocol,addr,port> in ip_vs_svc_table
312 */
313 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
314 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
315 } else {
316 /*
317 * Hash it by fwmark in ip_vs_svc_fwm_table
318 */
319 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
320 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
321 }
322
323 svc->flags |= IP_VS_SVC_F_HASHED;
324 /* increase its refcnt because it is referenced by the svc table */
325 atomic_inc(&svc->refcnt);
326 return 1;
327}
328
329
330/*
331 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
332 * Should be called with locked tables.
333 */
334static int ip_vs_svc_unhash(struct ip_vs_service *svc)
335{
336 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
337 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
338 "called from %p\n", __builtin_return_address(0));
339 return 0;
340 }
341
342 if (svc->fwmark == 0) {
343 /* Remove it from the ip_vs_svc_table table */
344 list_del(&svc->s_list);
345 } else {
346 /* Remove it from the ip_vs_svc_fwm_table table */
347 list_del(&svc->f_list);
348 }
349
350 svc->flags &= ~IP_VS_SVC_F_HASHED;
351 atomic_dec(&svc->refcnt);
352 return 1;
353}
354
355
356/*
357 * Get service by {proto,addr,port} in the service table.
358 */
359static __inline__ struct ip_vs_service *
360__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
361{
362 unsigned hash;
363 struct ip_vs_service *svc;
364
365 /* Check for "full" addressed entries */
366 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
367
368 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
369 if ((svc->addr == vaddr)
370 && (svc->port == vport)
371 && (svc->protocol == protocol)) {
372 /* HIT */
373 atomic_inc(&svc->usecnt);
374 return svc;
375 }
376 }
377
378 return NULL;
379}
380
381
382/*
383 * Get service by {fwmark} in the service table.
384 */
385static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
386{
387 unsigned hash;
388 struct ip_vs_service *svc;
389
390 /* Check for fwmark addressed entries */
391 hash = ip_vs_svc_fwm_hashkey(fwmark);
392
393 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
394 if (svc->fwmark == fwmark) {
395 /* HIT */
396 atomic_inc(&svc->usecnt);
397 return svc;
398 }
399 }
400
401 return NULL;
402}
403
404struct ip_vs_service *
405ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
406{
407 struct ip_vs_service *svc;
408
409 read_lock(&__ip_vs_svc_lock);
410
411 /*
412 * Check the table hashed by fwmark first
413 */
414 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
415 goto out;
416
417 /*
418 * Check the table hashed by <protocol,addr,port>
419 * for "full" addressed entries
420 */
421 svc = __ip_vs_service_get(protocol, vaddr, vport);
422
423 if (svc == NULL
424 && protocol == IPPROTO_TCP
425 && atomic_read(&ip_vs_ftpsvc_counter)
426 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
427 /*
428 * Check if ftp service entry exists, the packet
429 * might belong to FTP data connections.
430 */
431 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
432 }
433
434 if (svc == NULL
435 && atomic_read(&ip_vs_nullsvc_counter)) {
436 /*
437 * Check if the catch-all port (port zero) exists
438 */
439 svc = __ip_vs_service_get(protocol, vaddr, 0);
440 }
441
442 out:
443 read_unlock(&__ip_vs_svc_lock);
444
445 IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
446 fwmark, ip_vs_proto_name(protocol),
447 NIPQUAD(vaddr), ntohs(vport),
448 svc?"hit":"not hit");
449
450 return svc;
451}
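/*
 * Lookup order used above: the fwmark table first (when fwmark != 0), then
 * the <protocol,vaddr,vport> table; on a miss, a TCP packet whose destination
 * port is FTPDATA or an unprivileged port falls back to the
 * <protocol,vaddr,FTPPORT> entry if an FTP virtual service exists, and
 * finally any protocol falls back to the catch-all port-zero entry
 * <protocol,vaddr,0> if one exists.
 */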
452
453
454static inline void
455__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
456{
457 atomic_inc(&svc->refcnt);
458 dest->svc = svc;
459}
460
461static inline void
462__ip_vs_unbind_svc(struct ip_vs_dest *dest)
463{
464 struct ip_vs_service *svc = dest->svc;
465
466 dest->svc = NULL;
467 if (atomic_dec_and_test(&svc->refcnt))
468 kfree(svc);
469}
470
471
472/*
473 * Returns hash value for real service
474 */
475static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
476{
477 register unsigned porth = ntohs(port);
478
479 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
480 & IP_VS_RTAB_MASK;
481}
482
483/*
484 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
485 * should be called with locked tables.
486 */
487static int ip_vs_rs_hash(struct ip_vs_dest *dest)
488{
489 unsigned hash;
490
491 if (!list_empty(&dest->d_list)) {
492 return 0;
493 }
494
495 /*
496 * Hash by proto,addr,port,
497 * which are the parameters of the real service.
498 */
499 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
500 list_add(&dest->d_list, &ip_vs_rtable[hash]);
501
502 return 1;
503}
504
505/*
506 * UNhashes ip_vs_dest from ip_vs_rtable.
507 * should be called with locked tables.
508 */
509static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
510{
511 /*
512 * Remove it from the ip_vs_rtable table.
513 */
514 if (!list_empty(&dest->d_list)) {
515 list_del(&dest->d_list);
516 INIT_LIST_HEAD(&dest->d_list);
517 }
518
519 return 1;
520}
521
522/*
523 * Lookup real service by <proto,addr,port> in the real service table.
524 */
525struct ip_vs_dest *
526ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
527{
528 unsigned hash;
529 struct ip_vs_dest *dest;
530
531 /*
532 * Check for "full" addressed entries
533 * Return the first found entry
534 */
535 hash = ip_vs_rs_hashkey(daddr, dport);
536
537 read_lock(&__ip_vs_rs_lock);
538 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
539 if ((dest->addr == daddr)
540 && (dest->port == dport)
541 && ((dest->protocol == protocol) ||
542 dest->vfwmark)) {
543 /* HIT */
544 read_unlock(&__ip_vs_rs_lock);
545 return dest;
546 }
547 }
548 read_unlock(&__ip_vs_rs_lock);
549
550 return NULL;
551}
552
553/*
554 * Lookup destination by {addr,port} in the given service
555 */
556static struct ip_vs_dest *
557ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
558{
559 struct ip_vs_dest *dest;
560
561 /*
562 * Find the destination for the given service
563 */
564 list_for_each_entry(dest, &svc->destinations, n_list) {
565 if ((dest->addr == daddr) && (dest->port == dport)) {
566 /* HIT */
567 return dest;
568 }
569 }
570
571 return NULL;
572}
573
574
575/*
576 * Lookup dest by {svc,addr,port} in the destination trash.
577 * The destination trash is used to hold the destinations that are removed
578 * from the service table but are still referenced by some conn entries.
579 * The trash exists because when a dest is temporarily taken down
580 * (either by the administrator or by a monitor program), it can be
581 * picked back from the trash: the remaining connections to the dest
582 * can continue, and the dest's counters remain useful for
583 * scheduling.
584 */
585static struct ip_vs_dest *
586ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
587{
588 struct ip_vs_dest *dest, *nxt;
589
590 /*
591 * Find the destination in trash
592 */
593 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
594 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
595 "refcnt=%d\n",
596 dest->vfwmark,
597 NIPQUAD(dest->addr), ntohs(dest->port),
598 atomic_read(&dest->refcnt));
599 if (dest->addr == daddr &&
600 dest->port == dport &&
601 dest->vfwmark == svc->fwmark &&
602 dest->protocol == svc->protocol &&
603 (svc->fwmark ||
604 (dest->vaddr == svc->addr &&
605 dest->vport == svc->port))) {
606 /* HIT */
607 return dest;
608 }
609
610 /*
611 * Try to purge the destination from trash if not referenced
612 */
613 if (atomic_read(&dest->refcnt) == 1) {
614 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
615 "from trash\n",
616 dest->vfwmark,
617 NIPQUAD(dest->addr), ntohs(dest->port));
618 list_del(&dest->n_list);
619 ip_vs_dst_reset(dest);
620 __ip_vs_unbind_svc(dest);
621 kfree(dest);
622 }
623 }
624
625 return NULL;
626}
627
628
629/*
630 * Clean up all the destinations in the trash
631 * Called by the ip_vs_control_cleanup()
632 *
633 * When ip_vs_control_cleanup is invoked on ipvs module exit,
634 * the service tables have already been flushed and all the
635 * connections have expired, so the refcnt of each destination in
636 * the trash must be 1, and we simply release them here.
637 */
638static void ip_vs_trash_cleanup(void)
639{
640 struct ip_vs_dest *dest, *nxt;
641
642 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
643 list_del(&dest->n_list);
644 ip_vs_dst_reset(dest);
645 __ip_vs_unbind_svc(dest);
646 kfree(dest);
647 }
648}
649
650
651static void
652ip_vs_zero_stats(struct ip_vs_stats *stats)
653{
654 spin_lock_bh(&stats->lock);
655 memset(stats, 0, (char *)&stats->lock - (char *)stats);
656 spin_unlock_bh(&stats->lock);
657 ip_vs_zero_estimator(stats);
658}
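A minimal sketch of the partial memset trick above (illustrative userspace code with a toy struct, not the kernel ip_vs_stats layout): only the members laid out before the lock are cleared.

#include <stdio.h>
#include <string.h>

struct toy_stats {
	unsigned long conns;
	unsigned long inpkts;
	int lock;		/* stands in for the spinlock */
	int estimator;		/* members after the lock are preserved */
};

int main(void)
{
	struct toy_stats s = { 1, 2, 42, 7 };

	/* same pointer arithmetic as ip_vs_zero_stats() */
	memset(&s, 0, (char *)&s.lock - (char *)&s);

	printf("conns=%lu lock=%d estimator=%d\n", s.conns, s.lock, s.estimator);
	return 0;	/* prints: conns=0 lock=42 estimator=7 */
}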
659
660/*
661 * Update a destination in the given service
662 */
663static void
664__ip_vs_update_dest(struct ip_vs_service *svc,
665 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
666{
667 int conn_flags;
668
669 /* set the weight and the flags */
670 atomic_set(&dest->weight, udest->weight);
671 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
672
673 /* check if local node and update the flags */
674 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
675 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
676 | IP_VS_CONN_F_LOCALNODE;
677 }
678
679 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
680 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
681 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
682 } else {
683 /*
684 * Put the real service in ip_vs_rtable if not present.
685 * For now only for NAT!
686 */
687 write_lock_bh(&__ip_vs_rs_lock);
688 ip_vs_rs_hash(dest);
689 write_unlock_bh(&__ip_vs_rs_lock);
690 }
691 atomic_set(&dest->conn_flags, conn_flags);
692
693 /* bind the service */
694 if (!dest->svc) {
695 __ip_vs_bind_svc(dest, svc);
696 } else {
697 if (dest->svc != svc) {
698 __ip_vs_unbind_svc(dest);
699 ip_vs_zero_stats(&dest->stats);
700 __ip_vs_bind_svc(dest, svc);
701 }
702 }
703
704 /* set the dest status flags */
705 dest->flags |= IP_VS_DEST_F_AVAILABLE;
706
707 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
708 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
709 dest->u_threshold = udest->u_threshold;
710 dest->l_threshold = udest->l_threshold;
711}
712
713
714/*
715 * Create a destination for the given service
716 */
717static int
718ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
719 struct ip_vs_dest **dest_p)
720{
721 struct ip_vs_dest *dest;
722 unsigned atype;
723
724 EnterFunction(2);
725
726 atype = inet_addr_type(udest->addr);
727 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
728 return -EINVAL;
729
730 dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
731 if (dest == NULL) {
732 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
733 return -ENOMEM;
734 }
735 memset(dest, 0, sizeof(struct ip_vs_dest));
736
737 dest->protocol = svc->protocol;
738 dest->vaddr = svc->addr;
739 dest->vport = svc->port;
740 dest->vfwmark = svc->fwmark;
741 dest->addr = udest->addr;
742 dest->port = udest->port;
743
744 atomic_set(&dest->activeconns, 0);
745 atomic_set(&dest->inactconns, 0);
746 atomic_set(&dest->persistconns, 0);
747 atomic_set(&dest->refcnt, 0);
748
749 INIT_LIST_HEAD(&dest->d_list);
750 spin_lock_init(&dest->dst_lock);
751 spin_lock_init(&dest->stats.lock);
752 __ip_vs_update_dest(svc, dest, udest);
753 ip_vs_new_estimator(&dest->stats);
754
755 *dest_p = dest;
756
757 LeaveFunction(2);
758 return 0;
759}
760
761
762/*
763 * Add a destination into an existing service
764 */
765static int
766ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
767{
768 struct ip_vs_dest *dest;
769 __u32 daddr = udest->addr;
770 __u16 dport = udest->port;
771 int ret;
772
773 EnterFunction(2);
774
775 if (udest->weight < 0) {
776 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
777 return -ERANGE;
778 }
779
780 if (udest->l_threshold > udest->u_threshold) {
781 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
782 "upper threshold\n");
783 return -ERANGE;
784 }
785
786 /*
787 * Check if the dest already exists in the list
788 */
789 dest = ip_vs_lookup_dest(svc, daddr, dport);
790 if (dest != NULL) {
791 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
792 return -EEXIST;
793 }
794
795 /*
796 * Check if the dest already exists in the trash and
797 * is from the same service
798 */
799 dest = ip_vs_trash_get_dest(svc, daddr, dport);
800 if (dest != NULL) {
801 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
802 "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
803 NIPQUAD(daddr), ntohs(dport),
804 atomic_read(&dest->refcnt),
805 dest->vfwmark,
806 NIPQUAD(dest->vaddr),
807 ntohs(dest->vport));
808 __ip_vs_update_dest(svc, dest, udest);
809
810 /*
811 * Get the destination from the trash
812 */
813 list_del(&dest->n_list);
814
815 ip_vs_new_estimator(&dest->stats);
816
817 write_lock_bh(&__ip_vs_svc_lock);
818
819 /*
820 * Wait until all other svc users go away.
821 */
822 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
823
824 list_add(&dest->n_list, &svc->destinations);
825 svc->num_dests++;
826
827 /* call the update_service function of its scheduler */
828 svc->scheduler->update_service(svc);
829
830 write_unlock_bh(&__ip_vs_svc_lock);
831 return 0;
832 }
833
834 /*
835 * Allocate and initialize the dest structure
836 */
837 ret = ip_vs_new_dest(svc, udest, &dest);
838 if (ret) {
839 return ret;
840 }
841
842 /*
843 * Add the dest entry into the list
844 */
845 atomic_inc(&dest->refcnt);
846
847 write_lock_bh(&__ip_vs_svc_lock);
848
849 /*
850 * Wait until all other svc users go away.
851 */
852 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
853
854 list_add(&dest->n_list, &svc->destinations);
855 svc->num_dests++;
856
857 /* call the update_service function of its scheduler */
858 svc->scheduler->update_service(svc);
859
860 write_unlock_bh(&__ip_vs_svc_lock);
861
862 LeaveFunction(2);
863
864 return 0;
865}
866
867
868/*
869 * Edit a destination in the given service
870 */
871static int
872ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
873{
874 struct ip_vs_dest *dest;
875 __u32 daddr = udest->addr;
876 __u16 dport = udest->port;
877
878 EnterFunction(2);
879
880 if (udest->weight < 0) {
881 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
882 return -ERANGE;
883 }
884
885 if (udest->l_threshold > udest->u_threshold) {
886 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
887 "upper threshold\n");
888 return -ERANGE;
889 }
890
891 /*
892 * Lookup the destination list
893 */
894 dest = ip_vs_lookup_dest(svc, daddr, dport);
895 if (dest == NULL) {
896 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
897 return -ENOENT;
898 }
899
900 __ip_vs_update_dest(svc, dest, udest);
901
902 write_lock_bh(&__ip_vs_svc_lock);
903
904 /* Wait until all other svc users go away */
905 while (atomic_read(&svc->usecnt) > 1) {};
906
907 /* call the update_service, because server weight may be changed */
908 svc->scheduler->update_service(svc);
909
910 write_unlock_bh(&__ip_vs_svc_lock);
911
912 LeaveFunction(2);
913
914 return 0;
915}
916
917
918/*
919 * Delete a destination (must be already unlinked from the service)
920 */
921static void __ip_vs_del_dest(struct ip_vs_dest *dest)
922{
923 ip_vs_kill_estimator(&dest->stats);
924
925 /*
926 * Remove it from the d-linked list with the real services.
927 */
928 write_lock_bh(&__ip_vs_rs_lock);
929 ip_vs_rs_unhash(dest);
930 write_unlock_bh(&__ip_vs_rs_lock);
931
932 /*
933 * Decrease the refcnt of the dest, and free the dest
934 * if nobody refers to it (refcnt=0). Otherwise, throw
935 * the destination into the trash.
936 */
937 if (atomic_dec_and_test(&dest->refcnt)) {
938 ip_vs_dst_reset(dest);
939 /* simply decrease svc->refcnt here, let the caller check
940 and release the service if nobody refers to it.
941 Only user context can release destination and service,
942 and only one user context can update virtual service at a
943 time, so the operation here is OK */
944 atomic_dec(&dest->svc->refcnt);
945 kfree(dest);
946 } else {
947 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
948 NIPQUAD(dest->addr), ntohs(dest->port),
949 atomic_read(&dest->refcnt));
950 list_add(&dest->n_list, &ip_vs_dest_trash);
951 atomic_inc(&dest->refcnt);
952 }
953}
954
955
956/*
957 * Unlink a destination from the given service
958 */
959static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
960 struct ip_vs_dest *dest,
961 int svcupd)
962{
963 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
964
965 /*
966 * Remove it from the d-linked destination list.
967 */
968 list_del(&dest->n_list);
969 svc->num_dests--;
970 if (svcupd) {
971 /*
972 * Call the update_service function of its scheduler
973 */
974 svc->scheduler->update_service(svc);
975 }
976}
977
978
979/*
980 * Delete a destination server in the given service
981 */
982static int
983ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
984{
985 struct ip_vs_dest *dest;
986 __u32 daddr = udest->addr;
987 __u16 dport = udest->port;
988
989 EnterFunction(2);
990
991 dest = ip_vs_lookup_dest(svc, daddr, dport);
992 if (dest == NULL) {
993 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
994 return -ENOENT;
995 }
996
997 write_lock_bh(&__ip_vs_svc_lock);
998
999 /*
1000 * Wait until all other svc users go away.
1001 */
1002 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1003
1004 /*
1005 * Unlink dest from the service
1006 */
1007 __ip_vs_unlink_dest(svc, dest, 1);
1008
1009 write_unlock_bh(&__ip_vs_svc_lock);
1010
1011 /*
1012 * Delete the destination
1013 */
1014 __ip_vs_del_dest(dest);
1015
1016 LeaveFunction(2);
1017
1018 return 0;
1019}
1020
1021
1022/*
1023 * Add a service into the service hash table
1024 */
1025static int
1026ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1027{
1028 int ret = 0;
1029 struct ip_vs_scheduler *sched = NULL;
1030 struct ip_vs_service *svc = NULL;
1031
1032 /* increase the module use count */
1033 ip_vs_use_count_inc();
1034
1035 /* Lookup the scheduler by 'u->sched_name' */
1036 sched = ip_vs_scheduler_get(u->sched_name);
1037 if (sched == NULL) {
1038 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1039 u->sched_name);
1040 ret = -ENOENT;
1041 goto out_mod_dec;
1042 }
1043
1044 svc = (struct ip_vs_service *)
1045 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1046 if (svc == NULL) {
1047 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1048 ret = -ENOMEM;
1049 goto out_err;
1050 }
1051 memset(svc, 0, sizeof(struct ip_vs_service));
1052
1053 /* I'm the first user of the service */
1054 atomic_set(&svc->usecnt, 1);
1055 atomic_set(&svc->refcnt, 0);
1056
1057 svc->protocol = u->protocol;
1058 svc->addr = u->addr;
1059 svc->port = u->port;
1060 svc->fwmark = u->fwmark;
1061 svc->flags = u->flags;
1062 svc->timeout = u->timeout * HZ;
1063 svc->netmask = u->netmask;
1064
1065 INIT_LIST_HEAD(&svc->destinations);
1066 rwlock_init(&svc->sched_lock);
1067 spin_lock_init(&svc->stats.lock);
1068
1069 /* Bind the scheduler */
1070 ret = ip_vs_bind_scheduler(svc, sched);
1071 if (ret)
1072 goto out_err;
1073 sched = NULL;
1074
1075 /* Update the virtual service counters */
1076 if (svc->port == FTPPORT)
1077 atomic_inc(&ip_vs_ftpsvc_counter);
1078 else if (svc->port == 0)
1079 atomic_inc(&ip_vs_nullsvc_counter);
1080
1081 ip_vs_new_estimator(&svc->stats);
1082 ip_vs_num_services++;
1083
1084 /* Hash the service into the service table */
1085 write_lock_bh(&__ip_vs_svc_lock);
1086 ip_vs_svc_hash(svc);
1087 write_unlock_bh(&__ip_vs_svc_lock);
1088
1089 *svc_p = svc;
1090 return 0;
1091
1092 out_err:
1093 if (svc != NULL) {
1094 if (svc->scheduler)
1095 ip_vs_unbind_scheduler(svc);
1096 if (svc->inc) {
1097 local_bh_disable();
1098 ip_vs_app_inc_put(svc->inc);
1099 local_bh_enable();
1100 }
1101 kfree(svc);
1102 }
1103 ip_vs_scheduler_put(sched);
1104
1105 out_mod_dec:
1106 /* decrease the module use count */
1107 ip_vs_use_count_dec();
1108
1109 return ret;
1110}
1111
1112
1113/*
1114 * Edit a service and bind it with a new scheduler
1115 */
1116static int
1117ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1118{
1119 struct ip_vs_scheduler *sched, *old_sched;
1120 int ret = 0;
1121
1122 /*
1123 * Lookup the scheduler, by 'u->sched_name'
1124 */
1125 sched = ip_vs_scheduler_get(u->sched_name);
1126 if (sched == NULL) {
1127 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1128 u->sched_name);
1129 return -ENOENT;
1130 }
1131 old_sched = sched;
1132
1133 write_lock_bh(&__ip_vs_svc_lock);
1134
1135 /*
1136 * Wait until all other svc users go away.
1137 */
1138 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1139
1140 /*
1141 * Set the flags and timeout value
1142 */
1143 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1144 svc->timeout = u->timeout * HZ;
1145 svc->netmask = u->netmask;
1146
1147 old_sched = svc->scheduler;
1148 if (sched != old_sched) {
1149 /*
1150 * Unbind the old scheduler
1151 */
1152 if ((ret = ip_vs_unbind_scheduler(svc))) {
1153 old_sched = sched;
1154 goto out;
1155 }
1156
1157 /*
1158 * Bind the new scheduler
1159 */
1160 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1161 /*
1162 * If ip_vs_bind_scheduler fails, restore the old
1163 * scheduler.
1164 * The most likely reason for failure is running out of memory.
1165 *
1166 * The question is whether the old scheduler can always
1167 * be restored. TODO: if it cannot be restored at some
1168 * point, we must delete the service, otherwise the
1169 * system may crash.
1170 */
1171 ip_vs_bind_scheduler(svc, old_sched);
1172 old_sched = sched;
1173 goto out;
1174 }
1175 }
1176
1177 out:
1178 write_unlock_bh(&__ip_vs_svc_lock);
1179
1180 if (old_sched)
1181 ip_vs_scheduler_put(old_sched);
1182
1183 return ret;
1184}
1185
1186
1187/*
1188 * Delete a service from the service list
1189 * - The service must be unlinked, unlocked and not referenced!
1190 * - We are called under _bh lock
1191 */
1192static void __ip_vs_del_service(struct ip_vs_service *svc)
1193{
1194 struct ip_vs_dest *dest, *nxt;
1195 struct ip_vs_scheduler *old_sched;
1196
1197 ip_vs_num_services--;
1198 ip_vs_kill_estimator(&svc->stats);
1199
1200 /* Unbind scheduler */
1201 old_sched = svc->scheduler;
1202 ip_vs_unbind_scheduler(svc);
1203 if (old_sched)
1204 ip_vs_scheduler_put(old_sched);
1205
1206 /* Unbind app inc */
1207 if (svc->inc) {
1208 ip_vs_app_inc_put(svc->inc);
1209 svc->inc = NULL;
1210 }
1211
1212 /*
1213 * Unlink the whole destination list
1214 */
1215 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1216 __ip_vs_unlink_dest(svc, dest, 0);
1217 __ip_vs_del_dest(dest);
1218 }
1219
1220 /*
1221 * Update the virtual service counters
1222 */
1223 if (svc->port == FTPPORT)
1224 atomic_dec(&ip_vs_ftpsvc_counter);
1225 else if (svc->port == 0)
1226 atomic_dec(&ip_vs_nullsvc_counter);
1227
1228 /*
1229 * Free the service if nobody refers to it
1230 */
1231 if (atomic_read(&svc->refcnt) == 0)
1232 kfree(svc);
1233
1234 /* decrease the module use count */
1235 ip_vs_use_count_dec();
1236}
1237
1238/*
1239 * Delete a service from the service list
1240 */
1241static int ip_vs_del_service(struct ip_vs_service *svc)
1242{
1243 if (svc == NULL)
1244 return -EEXIST;
1245
1246 /*
1247 * Unhash it from the service table
1248 */
1249 write_lock_bh(&__ip_vs_svc_lock);
1250
1251 ip_vs_svc_unhash(svc);
1252
1253 /*
1254 * Wait until all the svc users go away.
1255 */
1256 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1257
1258 __ip_vs_del_service(svc);
1259
1260 write_unlock_bh(&__ip_vs_svc_lock);
1261
1262 return 0;
1263}
1264
1265
1266/*
1267 * Flush all the virtual services
1268 */
1269static int ip_vs_flush(void)
1270{
1271 int idx;
1272 struct ip_vs_service *svc, *nxt;
1273
1274 /*
1275 * Flush the service table hashed by <protocol,addr,port>
1276 */
1277 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1278 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1279 write_lock_bh(&__ip_vs_svc_lock);
1280 ip_vs_svc_unhash(svc);
1281 /*
1282 * Wait until all the svc users go away.
1283 */
1284 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1285 __ip_vs_del_service(svc);
1286 write_unlock_bh(&__ip_vs_svc_lock);
1287 }
1288 }
1289
1290 /*
1291 * Flush the service table hashed by fwmark
1292 */
1293 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1294 list_for_each_entry_safe(svc, nxt,
1295 &ip_vs_svc_fwm_table[idx], f_list) {
1296 write_lock_bh(&__ip_vs_svc_lock);
1297 ip_vs_svc_unhash(svc);
1298 /*
1299 * Wait until all the svc users go away.
1300 */
1301 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1302 __ip_vs_del_service(svc);
1303 write_unlock_bh(&__ip_vs_svc_lock);
1304 }
1305 }
1306
1307 return 0;
1308}
1309
1310
1311/*
1312 * Zero counters in a service or all services
1313 */
1314static int ip_vs_zero_service(struct ip_vs_service *svc)
1315{
1316 struct ip_vs_dest *dest;
1317
1318 write_lock_bh(&__ip_vs_svc_lock);
1319 list_for_each_entry(dest, &svc->destinations, n_list) {
1320 ip_vs_zero_stats(&dest->stats);
1321 }
1322 ip_vs_zero_stats(&svc->stats);
1323 write_unlock_bh(&__ip_vs_svc_lock);
1324 return 0;
1325}
1326
1327static int ip_vs_zero_all(void)
1328{
1329 int idx;
1330 struct ip_vs_service *svc;
1331
1332 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1333 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1334 ip_vs_zero_service(svc);
1335 }
1336 }
1337
1338 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1339 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1340 ip_vs_zero_service(svc);
1341 }
1342 }
1343
1344 ip_vs_zero_stats(&ip_vs_stats);
1345 return 0;
1346}
1347
1348
1349static int
1350proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1351 void __user *buffer, size_t *lenp, loff_t *ppos)
1352{
1353 int *valp = table->data;
1354 int val = *valp;
1355 int rc;
1356
1357 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1358 if (write && (*valp != val)) {
1359 if ((*valp < 0) || (*valp > 3)) {
1360 /* Restore the correct value */
1361 *valp = val;
1362 } else {
1363 local_bh_disable();
1364 update_defense_level();
1365 local_bh_enable();
1366 }
1367 }
1368 return rc;
1369}
1370
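/*
 * Editor's note: illustrative user-space sketch, not part of the
 * original patch.  proc_do_defense_mode() above backs the drop_entry,
 * drop_packet and secure_tcp sysctls declared further down; it only
 * accepts values 0..3 and silently restores the old value otherwise.
 * A write such as the one below ends up in this handler:
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/vs/drop_entry", "w");

	if (!f) {
		perror("drop_entry");
		return 1;
	}
	fprintf(f, "1\n");	/* any value outside 0..3 would be rejected */
	fclose(f);
	return 0;
}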
1371
1372static int
1373proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1374 void __user *buffer, size_t *lenp, loff_t *ppos)
1375{
1376 int *valp = table->data;
1377 int val[2];
1378 int rc;
1379
1380 /* backup the value first */
1381 memcpy(val, valp, sizeof(val));
1382
1383 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1384 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1385 /* Restore the correct value */
1386 memcpy(valp, val, sizeof(val));
1387 }
1388 return rc;
1389}
1390
1391
1392/*
1393 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1394 */
1395
1396static struct ctl_table vs_vars[] = {
1397 {
1398 .ctl_name = NET_IPV4_VS_AMEMTHRESH,
1399 .procname = "amemthresh",
1400 .data = &sysctl_ip_vs_amemthresh,
1401 .maxlen = sizeof(int),
1402 .mode = 0644,
1403 .proc_handler = &proc_dointvec,
1404 },
1405#ifdef CONFIG_IP_VS_DEBUG
1406 {
1407 .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
1408 .procname = "debug_level",
1409 .data = &sysctl_ip_vs_debug_level,
1410 .maxlen = sizeof(int),
1411 .mode = 0644,
1412 .proc_handler = &proc_dointvec,
1413 },
1414#endif
1415 {
1416 .ctl_name = NET_IPV4_VS_AMDROPRATE,
1417 .procname = "am_droprate",
1418 .data = &sysctl_ip_vs_am_droprate,
1419 .maxlen = sizeof(int),
1420 .mode = 0644,
1421 .proc_handler = &proc_dointvec,
1422 },
1423 {
1424 .ctl_name = NET_IPV4_VS_DROP_ENTRY,
1425 .procname = "drop_entry",
1426 .data = &sysctl_ip_vs_drop_entry,
1427 .maxlen = sizeof(int),
1428 .mode = 0644,
1429 .proc_handler = &proc_do_defense_mode,
1430 },
1431 {
1432 .ctl_name = NET_IPV4_VS_DROP_PACKET,
1433 .procname = "drop_packet",
1434 .data = &sysctl_ip_vs_drop_packet,
1435 .maxlen = sizeof(int),
1436 .mode = 0644,
1437 .proc_handler = &proc_do_defense_mode,
1438 },
1439 {
1440 .ctl_name = NET_IPV4_VS_SECURE_TCP,
1441 .procname = "secure_tcp",
1442 .data = &sysctl_ip_vs_secure_tcp,
1443 .maxlen = sizeof(int),
1444 .mode = 0644,
1445 .proc_handler = &proc_do_defense_mode,
1446 },
1447#if 0
1448 {
1449 .ctl_name = NET_IPV4_VS_TO_ES,
1450 .procname = "timeout_established",
1451 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1452 .maxlen = sizeof(int),
1453 .mode = 0644,
1454 .proc_handler = &proc_dointvec_jiffies,
1455 },
1456 {
1457 .ctl_name = NET_IPV4_VS_TO_SS,
1458 .procname = "timeout_synsent",
1459 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1460 .maxlen = sizeof(int),
1461 .mode = 0644,
1462 .proc_handler = &proc_dointvec_jiffies,
1463 },
1464 {
1465 .ctl_name = NET_IPV4_VS_TO_SR,
1466 .procname = "timeout_synrecv",
1467 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1468 .maxlen = sizeof(int),
1469 .mode = 0644,
1470 .proc_handler = &proc_dointvec_jiffies,
1471 },
1472 {
1473 .ctl_name = NET_IPV4_VS_TO_FW,
1474 .procname = "timeout_finwait",
1475 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1476 .maxlen = sizeof(int),
1477 .mode = 0644,
1478 .proc_handler = &proc_dointvec_jiffies,
1479 },
1480 {
1481 .ctl_name = NET_IPV4_VS_TO_TW,
1482 .procname = "timeout_timewait",
1483 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1484 .maxlen = sizeof(int),
1485 .mode = 0644,
1486 .proc_handler = &proc_dointvec_jiffies,
1487 },
1488 {
1489 .ctl_name = NET_IPV4_VS_TO_CL,
1490 .procname = "timeout_close",
1491 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1492 .maxlen = sizeof(int),
1493 .mode = 0644,
1494 .proc_handler = &proc_dointvec_jiffies,
1495 },
1496 {
1497 .ctl_name = NET_IPV4_VS_TO_CW,
1498 .procname = "timeout_closewait",
1499 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1500 .maxlen = sizeof(int),
1501 .mode = 0644,
1502 .proc_handler = &proc_dointvec_jiffies,
1503 },
1504 {
1505 .ctl_name = NET_IPV4_VS_TO_LA,
1506 .procname = "timeout_lastack",
1507 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1508 .maxlen = sizeof(int),
1509 .mode = 0644,
1510 .proc_handler = &proc_dointvec_jiffies,
1511 },
1512 {
1513 .ctl_name = NET_IPV4_VS_TO_LI,
1514 .procname = "timeout_listen",
1515 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1516 .maxlen = sizeof(int),
1517 .mode = 0644,
1518 .proc_handler = &proc_dointvec_jiffies,
1519 },
1520 {
1521 .ctl_name = NET_IPV4_VS_TO_SA,
1522 .procname = "timeout_synack",
1523 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1524 .maxlen = sizeof(int),
1525 .mode = 0644,
1526 .proc_handler = &proc_dointvec_jiffies,
1527 },
1528 {
1529 .ctl_name = NET_IPV4_VS_TO_UDP,
1530 .procname = "timeout_udp",
1531 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1532 .maxlen = sizeof(int),
1533 .mode = 0644,
1534 .proc_handler = &proc_dointvec_jiffies,
1535 },
1536 {
1537 .ctl_name = NET_IPV4_VS_TO_ICMP,
1538 .procname = "timeout_icmp",
1539 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1540 .maxlen = sizeof(int),
1541 .mode = 0644,
1542 .proc_handler = &proc_dointvec_jiffies,
1543 },
1544#endif
1545 {
1546 .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
1547 .procname = "cache_bypass",
1548 .data = &sysctl_ip_vs_cache_bypass,
1549 .maxlen = sizeof(int),
1550 .mode = 0644,
1551 .proc_handler = &proc_dointvec,
1552 },
1553 {
1554 .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1555 .procname = "expire_nodest_conn",
1556 .data = &sysctl_ip_vs_expire_nodest_conn,
1557 .maxlen = sizeof(int),
1558 .mode = 0644,
1559 .proc_handler = &proc_dointvec,
1560 },
1561 {
1562 .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1563 .procname = "expire_quiescent_template",
1564 .data = &sysctl_ip_vs_expire_quiescent_template,
1565 .maxlen = sizeof(int),
1566 .mode = 0644,
1567 .proc_handler = &proc_dointvec,
1568 },
1569 {
1570 .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
1571 .procname = "sync_threshold",
1572 .data = &sysctl_ip_vs_sync_threshold,
1573 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1574 .mode = 0644,
1575 .proc_handler = &proc_do_sync_threshold,
1576 },
1577 {
1578 .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
1579 .procname = "nat_icmp_send",
1580 .data = &sysctl_ip_vs_nat_icmp_send,
1581 .maxlen = sizeof(int),
1582 .mode = 0644,
1583 .proc_handler = &proc_dointvec,
1584 },
1585 { .ctl_name = 0 }
1586};
1587
1588static ctl_table vs_table[] = {
1589 {
1590 .ctl_name = NET_IPV4_VS,
1591 .procname = "vs",
1592 .mode = 0555,
1593 .child = vs_vars
1594 },
1595 { .ctl_name = 0 }
1596};
1597
1598static ctl_table ipv4_table[] = {
1599 {
1600 .ctl_name = NET_IPV4,
1601 .procname = "ipv4",
1602 .mode = 0555,
1603 .child = vs_table,
1604 },
1605 { .ctl_name = 0 }
1606};
1607
1608static ctl_table vs_root_table[] = {
1609 {
1610 .ctl_name = CTL_NET,
1611 .procname = "net",
1612 .mode = 0555,
1613 .child = ipv4_table,
1614 },
1615 { .ctl_name = 0 }
1616};
1617
1618static struct ctl_table_header * sysctl_header;
1619
1620#ifdef CONFIG_PROC_FS
1621
1622struct ip_vs_iter {
1623 struct list_head *table;
1624 int bucket;
1625};
1626
1627/*
1628 * Write the contents of the VS rule table to a PROCfs file.
1629 * (It is kept just for backward compatibility)
1630 */
1631static inline const char *ip_vs_fwd_name(unsigned flags)
1632{
1633 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1634 case IP_VS_CONN_F_LOCALNODE:
1635 return "Local";
1636 case IP_VS_CONN_F_TUNNEL:
1637 return "Tunnel";
1638 case IP_VS_CONN_F_DROUTE:
1639 return "Route";
1640 default:
1641 return "Masq";
1642 }
1643}
1644
1645
1646/* Get the Nth entry in the two lists */
1647static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1648{
1649 struct ip_vs_iter *iter = seq->private;
1650 int idx;
1651 struct ip_vs_service *svc;
1652
1653 /* look in hash by protocol */
1654 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1655 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1656 if (pos-- == 0){
1657 iter->table = ip_vs_svc_table;
1658 iter->bucket = idx;
1659 return svc;
1660 }
1661 }
1662 }
1663
1664 /* keep looking in fwmark */
1665 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1666 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1667 if (pos-- == 0) {
1668 iter->table = ip_vs_svc_fwm_table;
1669 iter->bucket = idx;
1670 return svc;
1671 }
1672 }
1673 }
1674
1675 return NULL;
1676}
1677
1678static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1679{
1680
1681 read_lock_bh(&__ip_vs_svc_lock);
1682 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1683}
1684
1685
1686static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1687{
1688 struct list_head *e;
1689 struct ip_vs_iter *iter;
1690 struct ip_vs_service *svc;
1691
1692 ++*pos;
1693 if (v == SEQ_START_TOKEN)
1694 return ip_vs_info_array(seq,0);
1695
1696 svc = v;
1697 iter = seq->private;
1698
1699 if (iter->table == ip_vs_svc_table) {
1700 /* next service in table hashed by protocol */
1701 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1702 return list_entry(e, struct ip_vs_service, s_list);
1703
1704
1705 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1706 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1707 s_list) {
1708 return svc;
1709 }
1710 }
1711
1712 iter->table = ip_vs_svc_fwm_table;
1713 iter->bucket = -1;
1714 goto scan_fwmark;
1715 }
1716
1717 /* next service in hashed by fwmark */
1718 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1719 return list_entry(e, struct ip_vs_service, f_list);
1720
1721 scan_fwmark:
1722 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1723 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1724 f_list)
1725 return svc;
1726 }
1727
1728 return NULL;
1729}
1730
1731static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1732{
1733 read_unlock_bh(&__ip_vs_svc_lock);
1734}
1735
1736
1737static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1738{
1739 if (v == SEQ_START_TOKEN) {
1740 seq_printf(seq,
1741 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1742 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1743 seq_puts(seq,
1744 "Prot LocalAddress:Port Scheduler Flags\n");
1745 seq_puts(seq,
1746 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1747 } else {
1748 const struct ip_vs_service *svc = v;
1749 const struct ip_vs_iter *iter = seq->private;
1750 const struct ip_vs_dest *dest;
1751
1752 if (iter->table == ip_vs_svc_table)
1753 seq_printf(seq, "%s %08X:%04X %s ",
1754 ip_vs_proto_name(svc->protocol),
1755 ntohl(svc->addr),
1756 ntohs(svc->port),
1757 svc->scheduler->name);
1758 else
1759 seq_printf(seq, "FWM %08X %s ",
1760 svc->fwmark, svc->scheduler->name);
1761
1762 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1763 seq_printf(seq, "persistent %d %08X\n",
1764 svc->timeout,
1765 ntohl(svc->netmask));
1766 else
1767 seq_putc(seq, '\n');
1768
1769 list_for_each_entry(dest, &svc->destinations, n_list) {
1770 seq_printf(seq,
1771 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1772 ntohl(dest->addr), ntohs(dest->port),
1773 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1774 atomic_read(&dest->weight),
1775 atomic_read(&dest->activeconns),
1776 atomic_read(&dest->inactconns));
1777 }
1778 }
1779 return 0;
1780}
1781
1782static struct seq_operations ip_vs_info_seq_ops = {
1783 .start = ip_vs_info_seq_start,
1784 .next = ip_vs_info_seq_next,
1785 .stop = ip_vs_info_seq_stop,
1786 .show = ip_vs_info_seq_show,
1787};
1788
1789static int ip_vs_info_open(struct inode *inode, struct file *file)
1790{
1791 struct seq_file *seq;
1792 int rc = -ENOMEM;
1793 struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1794
1795 if (!s)
1796 goto out;
1797
1798 rc = seq_open(file, &ip_vs_info_seq_ops);
1799 if (rc)
1800 goto out_kfree;
1801
1802 seq = file->private_data;
1803 seq->private = s;
1804 memset(s, 0, sizeof(*s));
1805out:
1806 return rc;
1807out_kfree:
1808 kfree(s);
1809 goto out;
1810}
1811
1812static struct file_operations ip_vs_info_fops = {
1813 .owner = THIS_MODULE,
1814 .open = ip_vs_info_open,
1815 .read = seq_read,
1816 .llseek = seq_lseek,
1817 .release = seq_release_private,
1818};
1819
1820#endif
1821
1822struct ip_vs_stats ip_vs_stats;
1823
1824#ifdef CONFIG_PROC_FS
1825static int ip_vs_stats_show(struct seq_file *seq, void *v)
1826{
1827
1828/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1829 seq_puts(seq,
1830 " Total Incoming Outgoing Incoming Outgoing\n");
1831 seq_printf(seq,
1832 " Conns Packets Packets Bytes Bytes\n");
1833
1834 spin_lock_bh(&ip_vs_stats.lock);
1835 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1836 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1837 (unsigned long long) ip_vs_stats.inbytes,
1838 (unsigned long long) ip_vs_stats.outbytes);
1839
1840/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1841 seq_puts(seq,
1842 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1843 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1844 ip_vs_stats.cps,
1845 ip_vs_stats.inpps,
1846 ip_vs_stats.outpps,
1847 ip_vs_stats.inbps,
1848 ip_vs_stats.outbps);
1849 spin_unlock_bh(&ip_vs_stats.lock);
1850
1851 return 0;
1852}
1853
1854static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1855{
1856 return single_open(file, ip_vs_stats_show, NULL);
1857}
1858
1859static struct file_operations ip_vs_stats_fops = {
1860 .owner = THIS_MODULE,
1861 .open = ip_vs_stats_seq_open,
1862 .read = seq_read,
1863 .llseek = seq_lseek,
1864 .release = single_release,
1865};
1866
1867#endif
1868
1869/*
1870 * Set timeout values for tcp tcpfin udp in the timeout_table.
1871 */
1872static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1873{
1874 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1875 u->tcp_timeout,
1876 u->tcp_fin_timeout,
1877 u->udp_timeout);
1878
1879#ifdef CONFIG_IP_VS_PROTO_TCP
1880 if (u->tcp_timeout) {
1881 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1882 = u->tcp_timeout * HZ;
1883 }
1884
1885 if (u->tcp_fin_timeout) {
1886 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1887 = u->tcp_fin_timeout * HZ;
1888 }
1889#endif
1890
1891#ifdef CONFIG_IP_VS_PROTO_UDP
1892 if (u->udp_timeout) {
1893 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1894 = u->udp_timeout * HZ;
1895 }
1896#endif
1897 return 0;
1898}
1899
1900
1901#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
1902#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
1903#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
1904 sizeof(struct ip_vs_dest_user))
1905#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
1906#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
1907#define MAX_ARG_LEN SVCDEST_ARG_LEN
1908
1909static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1910 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1911 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1912 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1913 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1914 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1915 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1916 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1917 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1918 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1919 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1920 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
1921};
1922
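/*
 * Editor's note: illustrative user-space sketch, not part of the
 * original patch.  It shows how these commands reach do_ip_vs_set_ctl():
 * a CAP_NET_ADMIN process issues setsockopt() on a raw IPv4 socket with
 * exactly the argument length listed in set_arglen[] (this is assumed to
 * be how the ipvsadm tool drives the interface; the IP_VS_SO_SET_*
 * constants are taken from the user-visible IPVS header, assumed here to
 * be available as <linux/ip_vs.h> or a local copy of it).
 */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ip_vs.h>	/* assumed location of the user-visible ABI */

int main(void)
{
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* IP_VS_SO_SET_FLUSH takes a zero-length argument (see set_arglen[]) */
	if (setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_FLUSH, NULL, 0) < 0)
		perror("IP_VS_SO_SET_FLUSH");
	return 0;
}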
1923static int
1924do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1925{
1926 int ret;
1927 unsigned char arg[MAX_ARG_LEN];
1928 struct ip_vs_service_user *usvc;
1929 struct ip_vs_service *svc;
1930 struct ip_vs_dest_user *udest;
1931
1932 if (!capable(CAP_NET_ADMIN))
1933 return -EPERM;
1934
1935 if (len != set_arglen[SET_CMDID(cmd)]) {
1936 IP_VS_ERR("set_ctl: len %u != %u\n",
1937 len, set_arglen[SET_CMDID(cmd)]);
1938 return -EINVAL;
1939 }
1940
1941 if (copy_from_user(arg, user, len) != 0)
1942 return -EFAULT;
1943
1944 /* increase the module use count */
1945 ip_vs_use_count_inc();
1946
1947 if (down_interruptible(&__ip_vs_mutex)) {
1948 ret = -ERESTARTSYS;
1949 goto out_dec;
1950 }
1951
1952 if (cmd == IP_VS_SO_SET_FLUSH) {
1953 /* Flush the virtual service */
1954 ret = ip_vs_flush();
1955 goto out_unlock;
1956 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1957 /* Set timeout values for (tcp tcpfin udp) */
1958 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1959 goto out_unlock;
1960 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1961 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1962 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1963 goto out_unlock;
1964 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1965 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1966 ret = stop_sync_thread(dm->state);
1967 goto out_unlock;
1968 }
1969
1970 usvc = (struct ip_vs_service_user *)arg;
1971 udest = (struct ip_vs_dest_user *)(usvc + 1);
1972
1973 if (cmd == IP_VS_SO_SET_ZERO) {
1974 /* if no service address is set, zero counters in all */
1975 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1976 ret = ip_vs_zero_all();
1977 goto out_unlock;
1978 }
1979 }
1980
1981 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1982 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1983 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1984 usvc->protocol, NIPQUAD(usvc->addr),
1985 ntohs(usvc->port), usvc->sched_name);
1986 ret = -EFAULT;
1987 goto out_unlock;
1988 }
1989
1990 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1991 if (usvc->fwmark == 0)
1992 svc = __ip_vs_service_get(usvc->protocol,
1993 usvc->addr, usvc->port);
1994 else
1995 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1996
1997 if (cmd != IP_VS_SO_SET_ADD
1998 && (svc == NULL || svc->protocol != usvc->protocol)) {
1999 ret = -ESRCH;
2000 goto out_unlock;
2001 }
2002
2003 switch (cmd) {
2004 case IP_VS_SO_SET_ADD:
2005 if (svc != NULL)
2006 ret = -EEXIST;
2007 else
2008 ret = ip_vs_add_service(usvc, &svc);
2009 break;
2010 case IP_VS_SO_SET_EDIT:
2011 ret = ip_vs_edit_service(svc, usvc);
2012 break;
2013 case IP_VS_SO_SET_DEL:
2014 ret = ip_vs_del_service(svc);
2015 if (!ret)
2016 goto out_unlock;
2017 break;
2018 case IP_VS_SO_SET_ZERO:
2019 ret = ip_vs_zero_service(svc);
2020 break;
2021 case IP_VS_SO_SET_ADDDEST:
2022 ret = ip_vs_add_dest(svc, udest);
2023 break;
2024 case IP_VS_SO_SET_EDITDEST:
2025 ret = ip_vs_edit_dest(svc, udest);
2026 break;
2027 case IP_VS_SO_SET_DELDEST:
2028 ret = ip_vs_del_dest(svc, udest);
2029 break;
2030 default:
2031 ret = -EINVAL;
2032 }
2033
2034 if (svc)
2035 ip_vs_service_put(svc);
2036
2037 out_unlock:
2038 up(&__ip_vs_mutex);
2039 out_dec:
2040 /* decrease the module use count */
2041 ip_vs_use_count_dec();
2042
2043 return ret;
2044}
2045
2046
2047static void
2048ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2049{
2050 spin_lock_bh(&src->lock);
2051 memcpy(dst, src, (char*)&src->lock - (char*)src);
2052 spin_unlock_bh(&src->lock);
2053}
2054
2055static void
2056ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2057{
2058 dst->protocol = src->protocol;
2059 dst->addr = src->addr;
2060 dst->port = src->port;
2061 dst->fwmark = src->fwmark;
2062 strcpy(dst->sched_name, src->scheduler->name);
2063 dst->flags = src->flags;
2064 dst->timeout = src->timeout / HZ;
2065 dst->netmask = src->netmask;
2066 dst->num_dests = src->num_dests;
2067 ip_vs_copy_stats(&dst->stats, &src->stats);
2068}
2069
2070static inline int
2071__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2072 struct ip_vs_get_services __user *uptr)
2073{
2074 int idx, count=0;
2075 struct ip_vs_service *svc;
2076 struct ip_vs_service_entry entry;
2077 int ret = 0;
2078
2079 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081 if (count >= get->num_services)
2082 goto out;
2083 ip_vs_copy_service(&entry, svc);
2084 if (copy_to_user(&uptr->entrytable[count],
2085 &entry, sizeof(entry))) {
2086 ret = -EFAULT;
2087 goto out;
2088 }
2089 count++;
2090 }
2091 }
2092
2093 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2094 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2095 if (count >= get->num_services)
2096 goto out;
2097 ip_vs_copy_service(&entry, svc);
2098 if (copy_to_user(&uptr->entrytable[count],
2099 &entry, sizeof(entry))) {
2100 ret = -EFAULT;
2101 goto out;
2102 }
2103 count++;
2104 }
2105 }
2106 out:
2107 return ret;
2108}
2109
2110static inline int
2111__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2112 struct ip_vs_get_dests __user *uptr)
2113{
2114 struct ip_vs_service *svc;
2115 int ret = 0;
2116
2117 if (get->fwmark)
2118 svc = __ip_vs_svc_fwm_get(get->fwmark);
2119 else
2120 svc = __ip_vs_service_get(get->protocol,
2121 get->addr, get->port);
2122 if (svc) {
2123 int count = 0;
2124 struct ip_vs_dest *dest;
2125 struct ip_vs_dest_entry entry;
2126
2127 list_for_each_entry(dest, &svc->destinations, n_list) {
2128 if (count >= get->num_dests)
2129 break;
2130
2131 entry.addr = dest->addr;
2132 entry.port = dest->port;
2133 entry.conn_flags = atomic_read(&dest->conn_flags);
2134 entry.weight = atomic_read(&dest->weight);
2135 entry.u_threshold = dest->u_threshold;
2136 entry.l_threshold = dest->l_threshold;
2137 entry.activeconns = atomic_read(&dest->activeconns);
2138 entry.inactconns = atomic_read(&dest->inactconns);
2139 entry.persistconns = atomic_read(&dest->persistconns);
2140 ip_vs_copy_stats(&entry.stats, &dest->stats);
2141 if (copy_to_user(&uptr->entrytable[count],
2142 &entry, sizeof(entry))) {
2143 ret = -EFAULT;
2144 break;
2145 }
2146 count++;
2147 }
2148 ip_vs_service_put(svc);
2149 } else
2150 ret = -ESRCH;
2151 return ret;
2152}
2153
2154static inline void
2155__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2156{
2157#ifdef CONFIG_IP_VS_PROTO_TCP
2158 u->tcp_timeout =
2159 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2160 u->tcp_fin_timeout =
2161 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2162#endif
2163#ifdef CONFIG_IP_VS_PROTO_UDP
2164 u->udp_timeout =
2165 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2166#endif
2167}
2168
2169
2170#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2171#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2172#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2173#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2174#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2175#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2176#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2177
2178static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2179 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2180 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2181 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2182 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2183 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2184 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2185 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2186};
2187
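/*
 * Editor's note: illustrative user-space sketch, not part of the
 * original patch.  do_ip_vs_get_ctl() below answers getsockopt()
 * queries on a raw IPv4 socket; IP_VS_SO_GET_VERSION fills a string
 * of at most 64 bytes (see get_arglen[]).  Constants are assumed to
 * come from the user-visible IPVS header, as in the previous sketch.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ip_vs.h>	/* assumed location of the user-visible ABI */

int main(void)
{
	char buf[64];
	socklen_t len = sizeof(buf);
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_VERSION, buf, &len) == 0)
		printf("%s\n", buf);	/* version/size string built in the kernel */
	else
		perror("IP_VS_SO_GET_VERSION");
	return 0;
}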
2188static int
2189do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2190{
2191 unsigned char arg[128];
2192 int ret = 0;
2193
2194 if (!capable(CAP_NET_ADMIN))
2195 return -EPERM;
2196
2197 if (*len < get_arglen[GET_CMDID(cmd)]) {
2198 IP_VS_ERR("get_ctl: len %u < %u\n",
2199 *len, get_arglen[GET_CMDID(cmd)]);
2200 return -EINVAL;
2201 }
2202
2203 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2204 return -EFAULT;
2205
2206 if (down_interruptible(&__ip_vs_mutex))
2207 return -ERESTARTSYS;
2208
2209 switch (cmd) {
2210 case IP_VS_SO_GET_VERSION:
2211 {
2212 char buf[64];
2213
2214 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2215 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2216 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2217 ret = -EFAULT;
2218 goto out;
2219 }
2220 *len = strlen(buf)+1;
2221 }
2222 break;
2223
2224 case IP_VS_SO_GET_INFO:
2225 {
2226 struct ip_vs_getinfo info;
2227 info.version = IP_VS_VERSION_CODE;
2228 info.size = IP_VS_CONN_TAB_SIZE;
2229 info.num_services = ip_vs_num_services;
2230 if (copy_to_user(user, &info, sizeof(info)) != 0)
2231 ret = -EFAULT;
2232 }
2233 break;
2234
2235 case IP_VS_SO_GET_SERVICES:
2236 {
2237 struct ip_vs_get_services *get;
2238 int size;
2239
2240 get = (struct ip_vs_get_services *)arg;
2241 size = sizeof(*get) +
2242 sizeof(struct ip_vs_service_entry) * get->num_services;
2243 if (*len != size) {
2244 IP_VS_ERR("length: %u != %u\n", *len, size);
2245 ret = -EINVAL;
2246 goto out;
2247 }
2248 ret = __ip_vs_get_service_entries(get, user);
2249 }
2250 break;
2251
2252 case IP_VS_SO_GET_SERVICE:
2253 {
2254 struct ip_vs_service_entry *entry;
2255 struct ip_vs_service *svc;
2256
2257 entry = (struct ip_vs_service_entry *)arg;
2258 if (entry->fwmark)
2259 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2260 else
2261 svc = __ip_vs_service_get(entry->protocol,
2262 entry->addr, entry->port);
2263 if (svc) {
2264 ip_vs_copy_service(entry, svc);
2265 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2266 ret = -EFAULT;
2267 ip_vs_service_put(svc);
2268 } else
2269 ret = -ESRCH;
2270 }
2271 break;
2272
2273 case IP_VS_SO_GET_DESTS:
2274 {
2275 struct ip_vs_get_dests *get;
2276 int size;
2277
2278 get = (struct ip_vs_get_dests *)arg;
2279 size = sizeof(*get) +
2280 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2281 if (*len != size) {
2282 IP_VS_ERR("length: %u != %u\n", *len, size);
2283 ret = -EINVAL;
2284 goto out;
2285 }
2286 ret = __ip_vs_get_dest_entries(get, user);
2287 }
2288 break;
2289
2290 case IP_VS_SO_GET_TIMEOUT:
2291 {
2292 struct ip_vs_timeout_user t;
2293
2294 __ip_vs_get_timeouts(&t);
2295 if (copy_to_user(user, &t, sizeof(t)) != 0)
2296 ret = -EFAULT;
2297 }
2298 break;
2299
2300 case IP_VS_SO_GET_DAEMON:
2301 {
2302 struct ip_vs_daemon_user d[2];
2303
2304 memset(&d, 0, sizeof(d));
2305 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2306 d[0].state = IP_VS_STATE_MASTER;
2307 strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
2308 d[0].syncid = ip_vs_master_syncid;
2309 }
2310 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2311 d[1].state = IP_VS_STATE_BACKUP;
2312 strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
2313 d[1].syncid = ip_vs_backup_syncid;
2314 }
2315 if (copy_to_user(user, &d, sizeof(d)) != 0)
2316 ret = -EFAULT;
2317 }
2318 break;
2319
2320 default:
2321 ret = -EINVAL;
2322 }
2323
2324 out:
2325 up(&__ip_vs_mutex);
2326 return ret;
2327}
2328
2329
2330static struct nf_sockopt_ops ip_vs_sockopts = {
2331 .pf = PF_INET,
2332 .set_optmin = IP_VS_BASE_CTL,
2333 .set_optmax = IP_VS_SO_SET_MAX+1,
2334 .set = do_ip_vs_set_ctl,
2335 .get_optmin = IP_VS_BASE_CTL,
2336 .get_optmax = IP_VS_SO_GET_MAX+1,
2337 .get = do_ip_vs_get_ctl,
2338};
2339
2340
2341int ip_vs_control_init(void)
2342{
2343 int ret;
2344 int idx;
2345
2346 EnterFunction(2);
2347
2348 ret = nf_register_sockopt(&ip_vs_sockopts);
2349 if (ret) {
2350 IP_VS_ERR("cannot register sockopt.\n");
2351 return ret;
2352 }
2353
2354 proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2355 proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2356
2357 sysctl_header = register_sysctl_table(vs_root_table, 0);
2358
2359 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2360 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2361 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2362 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2363 }
2364 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2365 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2366 }
2367
2368 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2369 spin_lock_init(&ip_vs_stats.lock);
2370 ip_vs_new_estimator(&ip_vs_stats);
2371
2372 /* Hook the defense timer */
2373 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2374
2375 LeaveFunction(2);
2376 return 0;
2377}
2378
2379
2380void ip_vs_control_cleanup(void)
2381{
2382 EnterFunction(2);
2383 ip_vs_trash_cleanup();
2384 cancel_rearming_delayed_work(&defense_work);
2385 ip_vs_kill_estimator(&ip_vs_stats);
2386 unregister_sysctl_table(sysctl_header);
2387 proc_net_remove("ip_vs_stats");
2388 proc_net_remove("ip_vs");
2389 nf_unregister_sockopt(&ip_vs_sockopts);
2390 LeaveFunction(2);
2391}
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
new file mode 100644
index 000000000000..f3bc320dce93
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -0,0 +1,258 @@
1/*
2 * IPVS: Destination Hashing scheduling module
3 *
4 * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * Inspired by the consistent hashing scheduler patch from
9 * Thomas Proell <proellt@gmx.de>
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 *
16 * Changes:
17 *
18 */
19
20/*
21 * The dh algorithm is to select server by the hash key of destination IP
22 * address. The pseudo code is as follows:
23 *
24 * n <- servernode[dest_ip];
25 * if (n is dead) OR
26 * (n is overloaded) OR (n.weight <= 0) then
27 * return NULL;
28 *
29 * return n;
30 *
31 * Note that servernode is a 256-bucket hash table that maps the hash
32 * index derived from the packet destination IP address to the current
33 * server array. If the dh scheduler is used in a cache cluster, it is
34 * good to combine it with the cache_bypass feature. When the statically
35 * assigned server is dead or overloaded, the load balancer can bypass
36 * the cache server and send requests to the original server directly.
37 *
38 */
39
40#include <linux/module.h>
41#include <linux/kernel.h>
42
43#include <net/ip_vs.h>
44
45
46/*
47 * IPVS DH bucket
48 */
49struct ip_vs_dh_bucket {
50 struct ip_vs_dest *dest; /* real server (cache) */
51};
52
53/*
54 * for IPVS DH entry hash table
55 */
56#ifndef CONFIG_IP_VS_DH_TAB_BITS
57#define CONFIG_IP_VS_DH_TAB_BITS 8
58#endif
59#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
60#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
61#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
62
63
64/*
65 * Returns hash value for IPVS DH entry
66 */
67static inline unsigned ip_vs_dh_hashkey(__u32 addr)
68{
69 return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
70}
71
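/*
 * Editor's note: illustrative user-space sketch, not part of the
 * original patch.  ip_vs_dh_hashkey() above is a multiplicative hash
 * (2654435761 is 2^32 divided by the golden ratio), so nearby
 * destination addresses still spread across the 256 buckets of the
 * default table.  The same expression, on host-order addresses for
 * simplicity:
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t addr = 0xc0a80001;	/* 192.168.0.1 in host byte order */

	for (int i = 0; i < 4; i++, addr++)
		printf("0x%08x -> bucket %u\n", addr,
		       (unsigned int)((addr * 2654435761UL) & 0xFF));
	return 0;
}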
72
73/*
74 * Get ip_vs_dest associated with supplied parameters.
75 */
76static inline struct ip_vs_dest *
77ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr)
78{
79 return (tbl[ip_vs_dh_hashkey(addr)]).dest;
80}
81
82
83/*
84 * Assign all the hash buckets of the specified table with the service.
85 */
86static int
87ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
88{
89 int i;
90 struct ip_vs_dh_bucket *b;
91 struct list_head *p;
92 struct ip_vs_dest *dest;
93
94 b = tbl;
95 p = &svc->destinations;
96 for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
97 if (list_empty(p)) {
98 b->dest = NULL;
99 } else {
100 if (p == &svc->destinations)
101 p = p->next;
102
103 dest = list_entry(p, struct ip_vs_dest, n_list);
104 atomic_inc(&dest->refcnt);
105 b->dest = dest;
106
107 p = p->next;
108 }
109 b++;
110 }
111 return 0;
112}
113
114
115/*
116 * Flush all the hash buckets of the specified table.
117 */
118static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
119{
120 int i;
121 struct ip_vs_dh_bucket *b;
122
123 b = tbl;
124 for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
125 if (b->dest) {
126 atomic_dec(&b->dest->refcnt);
127 b->dest = NULL;
128 }
129 b++;
130 }
131}
132
133
134static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
135{
136 struct ip_vs_dh_bucket *tbl;
137
138 /* allocate the DH table for this service */
139 tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
140 GFP_ATOMIC);
141 if (tbl == NULL) {
142 IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
143 return -ENOMEM;
144 }
145 svc->sched_data = tbl;
146 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
147 "current service\n",
148 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
149
150 /* assign the hash buckets with the updated service */
151 ip_vs_dh_assign(tbl, svc);
152
153 return 0;
154}
155
156
157static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
158{
159 struct ip_vs_dh_bucket *tbl = svc->sched_data;
160
161 /* got to clean up hash buckets here */
162 ip_vs_dh_flush(tbl);
163
164 /* release the table itself */
165 kfree(svc->sched_data);
166 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
167 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
168
169 return 0;
170}
171
172
173static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
174{
175 struct ip_vs_dh_bucket *tbl = svc->sched_data;
176
177 /* got to clean up hash buckets here */
178 ip_vs_dh_flush(tbl);
179
180 /* assign the hash buckets with the updated service */
181 ip_vs_dh_assign(tbl, svc);
182
183 return 0;
184}
185
186
187/*
188 * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
189 * consider that the server is overloaded here.
190 */
191static inline int is_overloaded(struct ip_vs_dest *dest)
192{
193 return dest->flags & IP_VS_DEST_F_OVERLOAD;
194}
195
196
197/*
198 * Destination hashing scheduling
199 */
200static struct ip_vs_dest *
201ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
202{
203 struct ip_vs_dest *dest;
204 struct ip_vs_dh_bucket *tbl;
205 struct iphdr *iph = skb->nh.iph;
206
207 IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
208
209 tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
210 dest = ip_vs_dh_get(tbl, iph->daddr);
211 if (!dest
212 || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
213 || atomic_read(&dest->weight) <= 0
214 || is_overloaded(dest)) {
215 return NULL;
216 }
217
218 IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
219 "--> server %u.%u.%u.%u:%d\n",
220 NIPQUAD(iph->daddr),
221 NIPQUAD(dest->addr),
222 ntohs(dest->port));
223
224 return dest;
225}
226
227
228/*
229 * IPVS DH Scheduler structure
230 */
231static struct ip_vs_scheduler ip_vs_dh_scheduler =
232{
233 .name = "dh",
234 .refcnt = ATOMIC_INIT(0),
235 .module = THIS_MODULE,
236 .init_service = ip_vs_dh_init_svc,
237 .done_service = ip_vs_dh_done_svc,
238 .update_service = ip_vs_dh_update_svc,
239 .schedule = ip_vs_dh_schedule,
240};
241
242
243static int __init ip_vs_dh_init(void)
244{
245 INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list);
246 return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
247}
248
249
250static void __exit ip_vs_dh_cleanup(void)
251{
252 unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
253}
254
255
256module_init(ip_vs_dh_init);
257module_exit(ip_vs_dh_cleanup);
258MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
new file mode 100644
index 000000000000..67b3e2fc1fa1
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -0,0 +1,200 @@
1/*
2 * ip_vs_est.c: simple rate estimator for IPVS
3 *
4 * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16#include <linux/kernel.h>
17#include <linux/types.h>
18
19#include <net/ip_vs.h>
20
21/*
22 This code estimates the rate over a short interval (such as 8
23 seconds) for virtual services and real servers. To measure the rate
24 over a long interval, it is easy to implement a user-level daemon
25 that periodically reads these statistical counters and computes rates.
26
27 Currently, the measurement is driven by a slow timer handler. Hopefully
28 this measurement will not introduce too much load.
29
30 We measure rate during the last 8 seconds every 2 seconds:
31
32 avgrate = avgrate*(1-W) + rate*W
33
34 where W = 2^(-2)
35
36 NOTES.
37
38 * The stored value for average bps is scaled by 2^5, so that maximal
39 rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
40
41 * A lot of code is taken from net/sched/estimator.c
42 */
43
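/*
 * Editor's note: an illustrative user-space sketch, not part of the
 * original patch.  It mirrors the fixed-point EWMA update that
 * estimation_timer() below applies every 2 seconds: the smoothed value
 * is kept scaled by 2^10 and W = 2^(-2) is applied with a shift,
 * exactly as for e->cps / s->cps.
 */
#include <stdio.h>

int main(void)
{
	unsigned int total = 0, last = 0, avg = 0;	/* avg scaled by 2^10 */
	unsigned int samples[] = { 100, 120, 80, 90 };	/* new conns per 2s tick */

	for (int i = 0; i < 4; i++) {
		unsigned int rate;

		total += samples[i];
		rate = (total - last) << 9;	/* per second, scaled by 2^10 */
		last = total;
		avg += ((long)rate - (long)avg) >> 2;	/* avg += (rate - avg) * W */
		printf("tick %d: cps = %u\n", i, (avg + 0x1FF) >> 10);
	}
	return 0;
}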
44
45struct ip_vs_estimator
46{
47 struct ip_vs_estimator *next;
48 struct ip_vs_stats *stats;
49
50 u32 last_conns;
51 u32 last_inpkts;
52 u32 last_outpkts;
53 u64 last_inbytes;
54 u64 last_outbytes;
55
56 u32 cps;
57 u32 inpps;
58 u32 outpps;
59 u32 inbps;
60 u32 outbps;
61};
62
63
64static struct ip_vs_estimator *est_list = NULL;
65static DEFINE_RWLOCK(est_lock);
66static struct timer_list est_timer;
67
68static void estimation_timer(unsigned long arg)
69{
70 struct ip_vs_estimator *e;
71 struct ip_vs_stats *s;
72 u32 n_conns;
73 u32 n_inpkts, n_outpkts;
74 u64 n_inbytes, n_outbytes;
75 u32 rate;
76
77 read_lock(&est_lock);
78 for (e = est_list; e; e = e->next) {
79 s = e->stats;
80
81 spin_lock(&s->lock);
82 n_conns = s->conns;
83 n_inpkts = s->inpkts;
84 n_outpkts = s->outpkts;
85 n_inbytes = s->inbytes;
86 n_outbytes = s->outbytes;
87
88 /* scaled by 2^10, but divided 2 seconds */
89 rate = (n_conns - e->last_conns)<<9;
90 e->last_conns = n_conns;
91 e->cps += ((long)rate - (long)e->cps)>>2;
92 s->cps = (e->cps+0x1FF)>>10;
93
94 rate = (n_inpkts - e->last_inpkts)<<9;
95 e->last_inpkts = n_inpkts;
96 e->inpps += ((long)rate - (long)e->inpps)>>2;
97 s->inpps = (e->inpps+0x1FF)>>10;
98
99 rate = (n_outpkts - e->last_outpkts)<<9;
100 e->last_outpkts = n_outpkts;
101 e->outpps += ((long)rate - (long)e->outpps)>>2;
102 s->outpps = (e->outpps+0x1FF)>>10;
103
104 rate = (n_inbytes - e->last_inbytes)<<4;
105 e->last_inbytes = n_inbytes;
106 e->inbps += ((long)rate - (long)e->inbps)>>2;
107 s->inbps = (e->inbps+0xF)>>5;
108
109 rate = (n_outbytes - e->last_outbytes)<<4;
110 e->last_outbytes = n_outbytes;
111 e->outbps += ((long)rate - (long)e->outbps)>>2;
112 s->outbps = (e->outbps+0xF)>>5;
113 spin_unlock(&s->lock);
114 }
115 read_unlock(&est_lock);
116 mod_timer(&est_timer, jiffies + 2*HZ);
117}
118
119int ip_vs_new_estimator(struct ip_vs_stats *stats)
120{
121 struct ip_vs_estimator *est;
122
123 est = kmalloc(sizeof(*est), GFP_KERNEL);
124 if (est == NULL)
125 return -ENOMEM;
126
127 memset(est, 0, sizeof(*est));
128 est->stats = stats;
129 est->last_conns = stats->conns;
130 est->cps = stats->cps<<10;
131
132 est->last_inpkts = stats->inpkts;
133 est->inpps = stats->inpps<<10;
134
135 est->last_outpkts = stats->outpkts;
136 est->outpps = stats->outpps<<10;
137
138 est->last_inbytes = stats->inbytes;
139 est->inbps = stats->inbps<<5;
140
141 est->last_outbytes = stats->outbytes;
142 est->outbps = stats->outbps<<5;
143
144 write_lock_bh(&est_lock);
145 est->next = est_list;
146 if (est->next == NULL) {
147 init_timer(&est_timer);
148 est_timer.expires = jiffies + 2*HZ;
149 est_timer.function = estimation_timer;
150 add_timer(&est_timer);
151 }
152 est_list = est;
153 write_unlock_bh(&est_lock);
154 return 0;
155}
156
157void ip_vs_kill_estimator(struct ip_vs_stats *stats)
158{
159 struct ip_vs_estimator *est, **pest;
160 int killed = 0;
161
162 write_lock_bh(&est_lock);
163 pest = &est_list;
164 while ((est=*pest) != NULL) {
165 if (est->stats != stats) {
166 pest = &est->next;
167 continue;
168 }
169 *pest = est->next;
170 kfree(est);
171 killed++;
172 }
173 if (killed && est_list == NULL)
174 del_timer_sync(&est_timer);
175 write_unlock_bh(&est_lock);
176}
177
178void ip_vs_zero_estimator(struct ip_vs_stats *stats)
179{
180 struct ip_vs_estimator *e;
181
182 write_lock_bh(&est_lock);
183 for (e = est_list; e; e = e->next) {
184 if (e->stats != stats)
185 continue;
186
187 /* set counters zero */
188 e->last_conns = 0;
189 e->last_inpkts = 0;
190 e->last_outpkts = 0;
191 e->last_inbytes = 0;
192 e->last_outbytes = 0;
193 e->cps = 0;
194 e->inpps = 0;
195 e->outpps = 0;
196 e->inbps = 0;
197 e->outbps = 0;
198 }
199 write_unlock_bh(&est_lock);
200}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
new file mode 100644
index 000000000000..a19a33ceb811
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_ftp.c
@@ -0,0 +1,400 @@
1/*
2 * ip_vs_ftp.c: IPVS ftp application module
3 *
4 * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * Changes:
9 *
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 *
16 * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
17 * is that the ip_vs_ftp module handles the reverse direction of ip_masq_ftp.
18 *
19 * IP_MASQ_FTP ftp masquerading module
20 *
21 * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
22 *
23 * Author: Wouter Gadeyne
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/moduleparam.h>
29#include <linux/kernel.h>
30#include <linux/skbuff.h>
31#include <linux/in.h>
32#include <linux/ip.h>
33#include <net/protocol.h>
34#include <net/tcp.h>
35
36#include <net/ip_vs.h>
37
38
39#define SERVER_STRING "227 Entering Passive Mode ("
40#define CLIENT_STRING "PORT "
41
42
43/*
44 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
45 * First port is set to the default port.
46 */
47static int ports[IP_VS_APP_MAX_PORTS] = {21, 0};
48module_param_array(ports, int, NULL, 0);
49
50/*
51 * Debug level
52 */
53#ifdef CONFIG_IP_VS_DEBUG
54static int debug=0;
55module_param(debug, int, 0);
56#endif
57
58
59/* Dummy variable */
60static int ip_vs_ftp_pasv;
61
62
63static int
64ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
65{
66 return 0;
67}
68
69
70static int
71ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
72{
73 return 0;
74}
75
76
77/*
78 * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", which starts
79 * with the "pattern" and is terminated by the "term" character.
80 * <addr,port> is in network order.
81 */
82static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
83 const char *pattern, size_t plen, char term,
84 __u32 *addr, __u16 *port,
85 char **start, char **end)
86{
87 unsigned char p[6];
88 int i = 0;
89
90 if (data_limit - data < plen) {
91 /* check if there is partial match */
92 if (strnicmp(data, pattern, data_limit - data) == 0)
93 return -1;
94 else
95 return 0;
96 }
97
98 if (strnicmp(data, pattern, plen) != 0) {
99 return 0;
100 }
101 *start = data + plen;
102
103 for (data = *start; *data != term; data++) {
104 if (data == data_limit)
105 return -1;
106 }
107 *end = data;
108
109 memset(p, 0, sizeof(p));
110 for (data = *start; data != *end; data++) {
111 if (*data >= '0' && *data <= '9') {
112 p[i] = p[i]*10 + *data - '0';
113 } else if (*data == ',' && i < 5) {
114 i++;
115 } else {
116 /* unexpected character */
117 return -1;
118 }
119 }
120
121 if (i != 5)
122 return -1;
123
124 *addr = (p[3]<<24) | (p[2]<<16) | (p[1]<<8) | p[0];
125 *port = (p[5]<<8) | p[4];
126 return 1;
127}
128
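/*
 * Editor's note: illustrative user-space sketch, not part of the
 * original patch.  It replays the "h1,h2,h3,h4,p1,p2" parsing done by
 * ip_vs_ftp_get_addrport() above: the first four numbers are the IPv4
 * address and the port is p1*256 + p2 (both are handed back by the
 * kernel function in network byte order).
 */
#include <stdio.h>

int main(void)
{
	const char *s = "192,168,0,1,4,1";	/* from "227 Entering Passive Mode (...)" */
	unsigned int p[6] = { 0 };
	int i = 0;

	for (; *s; s++) {
		if (*s >= '0' && *s <= '9')
			p[i] = p[i] * 10 + (*s - '0');
		else if (*s == ',' && i < 5)
			i++;
	}
	printf("addr %u.%u.%u.%u port %u\n",
	       p[0], p[1], p[2], p[3], p[4] * 256 + p[5]);
	return 0;
}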
129
130/*
131 * Look at outgoing ftp packets to catch the response to a PASV command
132 * from the server (inside-to-outside).
133 * When we see one, we build a connection entry with the client address,
134 * client port 0 (unknown at the moment), the server address and the
135 * server port. Mark the current connection entry as a control channel
136 * of the new entry. All this work is just so that the data connection
137 * can be scheduled to the right server later.
138 *
139 * The outgoing packet should be something like
140 * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
141 * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
142 */
143static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
144 struct sk_buff **pskb, int *diff)
145{
146 struct iphdr *iph;
147 struct tcphdr *th;
148 char *data, *data_limit;
149 char *start, *end;
150 __u32 from;
151 __u16 port;
152 struct ip_vs_conn *n_cp;
153 char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
154 unsigned buf_len;
155 int ret;
156
157 *diff = 0;
158
159 /* Only useful for established sessions */
160 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
161 return 1;
162
163 /* Linear packets are much easier to deal with. */
164 if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
165 return 0;
166
167 if (cp->app_data == &ip_vs_ftp_pasv) {
168 iph = (*pskb)->nh.iph;
169 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
170 data = (char *)th + (th->doff << 2);
171 data_limit = (*pskb)->tail;
172
173 if (ip_vs_ftp_get_addrport(data, data_limit,
174 SERVER_STRING,
175 sizeof(SERVER_STRING)-1, ')',
176 &from, &port,
177 &start, &end) != 1)
178 return 1;
179
180 IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> "
181 "%u.%u.%u.%u:%d detected\n",
182 NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0);
183
184 /*
185 * Now update or create a connection entry for it
186 */
187 n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
188 cp->caddr, 0);
189 if (!n_cp) {
190 n_cp = ip_vs_conn_new(IPPROTO_TCP,
191 cp->caddr, 0,
192 cp->vaddr, port,
193 from, port,
194 IP_VS_CONN_F_NO_CPORT,
195 cp->dest);
196 if (!n_cp)
197 return 0;
198
199 /* add its controller */
200 ip_vs_control_add(n_cp, cp);
201 }
202
203 /*
204 * Replace the old passive address with the new one
205 */
206 from = n_cp->vaddr;
207 port = n_cp->vport;
208 sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
209 port&255, (port>>8)&255);
210 buf_len = strlen(buf);
211
212 /*
213 * Calculate required delta-offset to keep TCP happy
214 */
215 *diff = buf_len - (end-start);
216
217 if (*diff == 0) {
218 /* simply replace it with new passive address */
219 memcpy(start, buf, buf_len);
220 ret = 1;
221 } else {
222 ret = !ip_vs_skb_replace(*pskb, GFP_ATOMIC, start,
223 end-start, buf, buf_len);
224 }
225
226 cp->app_data = NULL;
227 ip_vs_tcp_conn_listen(n_cp);
228 ip_vs_conn_put(n_cp);
229 return ret;
230 }
231 return 1;
232}
233
234
235/*
236 * Look at incoming ftp packets to catch the PASV/PORT command
237 * (outside-to-inside).
238 *
239 * The incoming packet having the PORT command should be something like
240 * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
241 * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
242 * In this case, we create a connection entry using the client address and
243 * port, so that the active ftp data connection from the server can reach
244 * the client.
245 */
246static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
247 struct sk_buff **pskb, int *diff)
248{
249 struct iphdr *iph;
250 struct tcphdr *th;
251 char *data, *data_start, *data_limit;
252 char *start, *end;
253 __u32 to;
254 __u16 port;
255 struct ip_vs_conn *n_cp;
256
257 /* no diff required for incoming packets */
258 *diff = 0;
259
260 /* Only useful for established sessions */
261 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
262 return 1;
263
264 /* Linear packets are much easier to deal with. */
265 if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
266 return 0;
267
268 /*
269 * Detecting whether it is passive
270 */
271 iph = (*pskb)->nh.iph;
272 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
273
274 /* Since there may be options in the TCP packet and HLEN is
275 the length of the header in 32-bit words, it is accurate
276 to calculate the data address as th + HLEN*4 */
277 data = data_start = (char *)th + (th->doff << 2);
278 data_limit = (*pskb)->tail;
279
280 while (data <= data_limit - 6) {
281 if (strnicmp(data, "PASV\r\n", 6) == 0) {
282 /* Passive mode on */
283 IP_VS_DBG(1-debug, "got PASV at %zd of %zd\n",
284 data - data_start,
285 data_limit - data_start);
286 cp->app_data = &ip_vs_ftp_pasv;
287 return 1;
288 }
289 data++;
290 }
291
292 /*
293 * To support a virtual FTP server, the scenario is as follows:
294 * FTP client ----> Load Balancer ----> FTP server
295 * First detect the port number in the application data,
296 * then create a new connection entry for the incoming data
297 * connection.
298 */
299 if (ip_vs_ftp_get_addrport(data_start, data_limit,
300 CLIENT_STRING, sizeof(CLIENT_STRING)-1,
301 '\r', &to, &port,
302 &start, &end) != 1)
303 return 1;
304
305 IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n",
306 NIPQUAD(to), ntohs(port));
307
308 /* Passive mode off */
309 cp->app_data = NULL;
310
311 /*
312 * Now update or create a connection entry for it
313 */
314 IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
315 ip_vs_proto_name(iph->protocol),
316 NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0);
317
318 n_cp = ip_vs_conn_in_get(iph->protocol,
319 to, port,
320 cp->vaddr, htons(ntohs(cp->vport)-1));
321 if (!n_cp) {
322 n_cp = ip_vs_conn_new(IPPROTO_TCP,
323 to, port,
324 cp->vaddr, htons(ntohs(cp->vport)-1),
325 cp->daddr, htons(ntohs(cp->dport)-1),
326 0,
327 cp->dest);
328 if (!n_cp)
329 return 0;
330
331 /* add its controller */
332 ip_vs_control_add(n_cp, cp);
333 }
334
335 /*
336 * Move the new connection to the listening state
337 */
338 ip_vs_tcp_conn_listen(n_cp);
339 ip_vs_conn_put(n_cp);
340
341 return 1;
342}
343
344
345static struct ip_vs_app ip_vs_ftp = {
346 .name = "ftp",
347 .type = IP_VS_APP_TYPE_FTP,
348 .protocol = IPPROTO_TCP,
349 .module = THIS_MODULE,
350 .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
351 .init_conn = ip_vs_ftp_init_conn,
352 .done_conn = ip_vs_ftp_done_conn,
353 .bind_conn = NULL,
354 .unbind_conn = NULL,
355 .pkt_out = ip_vs_ftp_out,
356 .pkt_in = ip_vs_ftp_in,
357};
358
359
360/*
361 * ip_vs_ftp initialization
362 */
363static int __init ip_vs_ftp_init(void)
364{
365 int i, ret;
366 struct ip_vs_app *app = &ip_vs_ftp;
367
368 ret = register_ip_vs_app(app);
369 if (ret)
370 return ret;
371
372 for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
373 if (!ports[i])
374 continue;
375 ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
376 if (ret)
377 break;
378 IP_VS_DBG(1-debug, "%s: loaded support on port[%d] = %d\n",
379 app->name, i, ports[i]);
380 }
381
382 if (ret)
383 unregister_ip_vs_app(app);
384
385 return ret;
386}
387
388
389/*
390 * ip_vs_ftp finish.
391 */
392static void __exit ip_vs_ftp_exit(void)
393{
394 unregister_ip_vs_app(&ip_vs_ftp);
395}
396
397
398module_init(ip_vs_ftp_init);
399module_exit(ip_vs_ftp_exit);
400MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
new file mode 100644
index 000000000000..c035838b780a
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -0,0 +1,624 @@
1/*
2 * IPVS: Locality-Based Least-Connection scheduling module
3 *
4 * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Martin Hamilton : fixed the terrible locking bugs
15 * *lock(tbl->lock) ==> *lock(&tbl->lock)
16 * Wensong Zhang : fixed the uninitialized tbl->lock bug
17 * Wensong Zhang : added full expiration check to
18 * collect stale entries of 24+ hours when
19 * no partial expire check in a half hour
20 * Julian Anastasov : replaced del_timer call with del_timer_sync
21 * to avoid the possible race between timer
22 * handler and del_timer thread in SMP
23 *
24 */
25
26/*
27 * The lblc algorithm is as follows (pseudo code):
28 *
29 * if cachenode[dest_ip] is null then
30 * n, cachenode[dest_ip] <- {weighted least-conn node};
31 * else
32 * n <- cachenode[dest_ip];
33 * if (n is dead) OR
34 * (n.conns>n.weight AND
35 * there is a node m with m.conns<m.weight/2) then
36 * n, cachenode[dest_ip] <- {weighted least-conn node};
37 *
38 * return n;
39 *
40 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
41 * me to write this module.
42 */
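/*
 * In the code below, cachenode[dest_ip] corresponds to the per-service
 * ip_vs_lblc_entry hash table, and the "weighted least-conn node" is
 * picked by __ip_vs_wlc_schedule().
 */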
43
44#include <linux/module.h>
45#include <linux/kernel.h>
46
47/* for sysctl */
48#include <linux/fs.h>
49#include <linux/sysctl.h>
50
51#include <net/ip_vs.h>
52
53
54/*
55 * It is for garbage collection of stale IPVS lblc entries,
56 * when the table is full.
57 */
58#define CHECK_EXPIRE_INTERVAL (60*HZ)
59#define ENTRY_TIMEOUT (6*60*HZ)
60
61/*
62 * It is for full expiration check.
63 * When there is no partial expiration check (garbage collection)
64 * in a half hour, do a full expiration check to collect stale
65 * entries that haven't been touched for a day.
66 */
67#define COUNT_FOR_FULL_EXPIRATION 30
68static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
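/*
 * Rough timing: the expire timer fires every CHECK_EXPIRE_INTERVAL
 * (60s), and the counter only advances on rounds where no partial
 * collection is needed, so about COUNT_FOR_FULL_EXPIRATION (30) such
 * rounds, i.e. roughly half an hour, trigger the full check.
 */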
69
70
71/*
72 * for IPVS lblc entry hash table
73 */
74#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
75#define CONFIG_IP_VS_LBLC_TAB_BITS 10
76#endif
77#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
78#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
79#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
80
81
82/*
83 * IPVS lblc entry represents an association between destination
84 * IP address and its destination server
85 */
86struct ip_vs_lblc_entry {
87 struct list_head list;
88 __u32 addr; /* destination IP address */
89 struct ip_vs_dest *dest; /* real server (cache) */
90 unsigned long lastuse; /* last used time */
91};
92
93
94/*
95 * IPVS lblc hash table
96 */
97struct ip_vs_lblc_table {
98 rwlock_t lock; /* lock for this table */
99 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
100 atomic_t entries; /* number of entries */
101 int max_size; /* maximum size of entries */
102 struct timer_list periodic_timer; /* collect stale entries */
103 int rover; /* rover for expire check */
104 int counter; /* counter for no expire */
105};
106
107
108/*
109 * IPVS LBLC sysctl table
110 */
111
112static ctl_table vs_vars_table[] = {
113 {
114 .ctl_name = NET_IPV4_VS_LBLC_EXPIRE,
115 .procname = "lblc_expiration",
116 .data = &sysctl_ip_vs_lblc_expiration,
117 .maxlen = sizeof(int),
118 .mode = 0644,
119 .proc_handler = &proc_dointvec_jiffies,
120 },
121 { .ctl_name = 0 }
122};
123
124static ctl_table vs_table[] = {
125 {
126 .ctl_name = NET_IPV4_VS,
127 .procname = "vs",
128 .mode = 0555,
129 .child = vs_vars_table
130 },
131 { .ctl_name = 0 }
132};
133
134static ctl_table ipv4_table[] = {
135 {
136 .ctl_name = NET_IPV4,
137 .procname = "ipv4",
138 .mode = 0555,
139 .child = vs_table
140 },
141 { .ctl_name = 0 }
142};
143
144static ctl_table lblc_root_table[] = {
145 {
146 .ctl_name = CTL_NET,
147 .procname = "net",
148 .mode = 0555,
149 .child = ipv4_table
150 },
151 { .ctl_name = 0 }
152};
153
154static struct ctl_table_header * sysctl_header;
155
156/*
157 * new/free an ip_vs_lblc_entry, which is a mapping of a destination
158 * IP address to a server.
159 */
160static inline struct ip_vs_lblc_entry *
161ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
162{
163 struct ip_vs_lblc_entry *en;
164
165 en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
166 if (en == NULL) {
167 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
168 return NULL;
169 }
170
171 INIT_LIST_HEAD(&en->list);
172 en->addr = daddr;
173
174 atomic_inc(&dest->refcnt);
175 en->dest = dest;
176
177 return en;
178}
179
180
181static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
182{
183 list_del(&en->list);
184 /*
185 * We don't kfree dest because it is referred either by its service
186 * or the trash dest list.
187 */
188 atomic_dec(&en->dest->refcnt);
189 kfree(en);
190}
191
192
193/*
194 * Returns hash value for IPVS LBLC entry
195 */
196static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
197{
198 return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
199}
200
201
202/*
203 * Hash an entry in the ip_vs_lblc_table.
204 * returns bool success.
205 */
206static int
207ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
208{
209 unsigned hash;
210
211 if (!list_empty(&en->list)) {
212 IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
213 "called from %p\n", __builtin_return_address(0));
214 return 0;
215 }
216
217 /*
218 * Hash by destination IP address
219 */
220 hash = ip_vs_lblc_hashkey(en->addr);
221
222 write_lock(&tbl->lock);
223 list_add(&en->list, &tbl->bucket[hash]);
224 atomic_inc(&tbl->entries);
225 write_unlock(&tbl->lock);
226
227 return 1;
228}
229
230
231#if 0000
232/*
233 * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
234 * returns bool success.
235 */
236static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
237 struct ip_vs_lblc_entry *en)
238{
239 if (list_empty(&en->list)) {
240 IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
241 "called from %p\n", __builtin_return_address(0));
242 return 0;
243 }
244
245 /*
246 * Remove it from the table
247 */
248 write_lock(&tbl->lock);
249 list_del(&en->list);
250 INIT_LIST_HEAD(&en->list);
251 write_unlock(&tbl->lock);
252
253 return 1;
254}
255#endif
256
257
258/*
259 * Get ip_vs_lblc_entry associated with supplied parameters.
260 */
261static inline struct ip_vs_lblc_entry *
262ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
263{
264 unsigned hash;
265 struct ip_vs_lblc_entry *en;
266
267 hash = ip_vs_lblc_hashkey(addr);
268
269 read_lock(&tbl->lock);
270
271 list_for_each_entry(en, &tbl->bucket[hash], list) {
272 if (en->addr == addr) {
273 /* HIT */
274 read_unlock(&tbl->lock);
275 return en;
276 }
277 }
278
279 read_unlock(&tbl->lock);
280
281 return NULL;
282}
283
284
285/*
286 * Flush all the entries of the specified table.
287 */
288static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
289{
290 int i;
291 struct ip_vs_lblc_entry *en, *nxt;
292
293 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
294 write_lock(&tbl->lock);
295 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
296 ip_vs_lblc_free(en);
297 atomic_dec(&tbl->entries);
298 }
299 write_unlock(&tbl->lock);
300 }
301}
302
303
304static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
305{
306 unsigned long now = jiffies;
307 int i, j;
308 struct ip_vs_lblc_entry *en, *nxt;
309
310 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
311 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
312
313 write_lock(&tbl->lock);
314 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
315 if (time_before(now,
316 en->lastuse + sysctl_ip_vs_lblc_expiration))
317 continue;
318
319 ip_vs_lblc_free(en);
320 atomic_dec(&tbl->entries);
321 }
322 write_unlock(&tbl->lock);
323 }
324 tbl->rover = j;
325}
326
327
328/*
329 * Periodic timer handler for the IPVS lblc table
330 * It is used to collect stale entries when the number of entries
331 * exceeds the maximum size of the table.
332 *
333 * Fixme: we probably need a more complicated algorithm to collect
334 * entries that have not been used for a long time even
335 * if the number of entries doesn't exceed the maximum size
336 * of the table.
337 * The full expiration check is for this purpose now.
338 */
339static void ip_vs_lblc_check_expire(unsigned long data)
340{
341 struct ip_vs_lblc_table *tbl;
342 unsigned long now = jiffies;
343 int goal;
344 int i, j;
345 struct ip_vs_lblc_entry *en, *nxt;
346
347 tbl = (struct ip_vs_lblc_table *)data;
348
349 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
350 /* do full expiration check */
351 ip_vs_lblc_full_check(tbl);
352 tbl->counter = 1;
353 goto out;
354 }
355
356 if (atomic_read(&tbl->entries) <= tbl->max_size) {
357 tbl->counter++;
358 goto out;
359 }
360
361 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
362 if (goal > tbl->max_size/2)
363 goal = tbl->max_size/2;
364
365 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
366 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
367
368 write_lock(&tbl->lock);
369 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
370 if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
371 continue;
372
373 ip_vs_lblc_free(en);
374 atomic_dec(&tbl->entries);
375 goal--;
376 }
377 write_unlock(&tbl->lock);
378 if (goal <= 0)
379 break;
380 }
381 tbl->rover = j;
382
383 out:
384 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
385}
386
387
388static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
389{
390 int i;
391 struct ip_vs_lblc_table *tbl;
392
393 /*
394 * Allocate the ip_vs_lblc_table for this service
395 */
396 tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
397 if (tbl == NULL) {
398 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
399 return -ENOMEM;
400 }
401 svc->sched_data = tbl;
402 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
403 "current service\n",
404 sizeof(struct ip_vs_lblc_table));
405
406 /*
407 * Initialize the hash buckets
408 */
409 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
410 INIT_LIST_HEAD(&tbl->bucket[i]);
411 }
412 rwlock_init(&tbl->lock);
413 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
414 tbl->rover = 0;
415 tbl->counter = 1;
416
417 /*
418 * Hook periodic timer for garbage collection
419 */
420 init_timer(&tbl->periodic_timer);
421 tbl->periodic_timer.data = (unsigned long)tbl;
422 tbl->periodic_timer.function = ip_vs_lblc_check_expire;
423 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
424 add_timer(&tbl->periodic_timer);
425
426 return 0;
427}
428
429
430static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
431{
432 struct ip_vs_lblc_table *tbl = svc->sched_data;
433
434 /* remove periodic timer */
435 del_timer_sync(&tbl->periodic_timer);
436
437 /* got to clean up table entries here */
438 ip_vs_lblc_flush(tbl);
439
440 /* release the table itself */
441 kfree(svc->sched_data);
442 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
443 sizeof(struct ip_vs_lblc_table));
444
445 return 0;
446}
447
448
449static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
450{
451 return 0;
452}
453
454
455static inline struct ip_vs_dest *
456__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
457{
458 struct ip_vs_dest *dest, *least;
459 int loh, doh;
460
461 /*
462 * We think the overhead of processing active connections is fifty
463 * times higher than that of inactive connections on average. (This
464 * fifty times might not be accurate, we will change it later.) We
465 * use the following formula to estimate the overhead:
466 * dest->activeconns*50 + dest->inactconns
467 * and the load:
468 * (dest overhead) / dest->weight
469 *
470 * Remember -- no floats in kernel mode!!!
471 * The comparison of h1*w2 > h2*w1 is equivalent to that of
472 * h1/w1 > h2/w2
473 * if every weight is larger than zero.
474 *
475 * The server with weight=0 is quiesced and will not receive any
476 * new connection.
477 */
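	/*
	 * Illustrative example: loh=100 with weight 1 versus doh=150 with
	 * weight 2 gives 100*2 > 150*1, so the second server carries the
	 * smaller weighted load and becomes the new "least".
	 */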
478 list_for_each_entry(dest, &svc->destinations, n_list) {
479 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
480 continue;
481 if (atomic_read(&dest->weight) > 0) {
482 least = dest;
483 loh = atomic_read(&least->activeconns) * 50
484 + atomic_read(&least->inactconns);
485 goto nextstage;
486 }
487 }
488 return NULL;
489
490 /*
491 * Find the destination with the least load.
492 */
493 nextstage:
494 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
495 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
496 continue;
497
498 doh = atomic_read(&dest->activeconns) * 50
499 + atomic_read(&dest->inactconns);
500 if (loh * atomic_read(&dest->weight) >
501 doh * atomic_read(&least->weight)) {
502 least = dest;
503 loh = doh;
504 }
505 }
506
507 IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
508 "activeconns %d refcnt %d weight %d overhead %d\n",
509 NIPQUAD(least->addr), ntohs(least->port),
510 atomic_read(&least->activeconns),
511 atomic_read(&least->refcnt),
512 atomic_read(&least->weight), loh);
513
514 return least;
515}
516
517
518/*
519 * If this destination server is overloaded and there is a less loaded
520 * server, then return true.
521 */
522static inline int
523is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
524{
525 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
526 struct ip_vs_dest *d;
527
528 list_for_each_entry(d, &svc->destinations, n_list) {
529 if (atomic_read(&d->activeconns)*2
530 < atomic_read(&d->weight)) {
531 return 1;
532 }
533 }
534 }
535 return 0;
536}
537
538
539/*
540 * Locality-Based (weighted) Least-Connection scheduling
541 */
542static struct ip_vs_dest *
543ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
544{
545 struct ip_vs_dest *dest;
546 struct ip_vs_lblc_table *tbl;
547 struct ip_vs_lblc_entry *en;
548 struct iphdr *iph = skb->nh.iph;
549
550 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
551
552 tbl = (struct ip_vs_lblc_table *)svc->sched_data;
553 en = ip_vs_lblc_get(tbl, iph->daddr);
554 if (en == NULL) {
555 dest = __ip_vs_wlc_schedule(svc, iph);
556 if (dest == NULL) {
557 IP_VS_DBG(1, "no destination available\n");
558 return NULL;
559 }
560 en = ip_vs_lblc_new(iph->daddr, dest);
561 if (en == NULL) {
562 return NULL;
563 }
564 ip_vs_lblc_hash(tbl, en);
565 } else {
566 dest = en->dest;
567 if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
568 || atomic_read(&dest->weight) <= 0
569 || is_overloaded(dest, svc)) {
570 dest = __ip_vs_wlc_schedule(svc, iph);
571 if (dest == NULL) {
572 IP_VS_DBG(1, "no destination available\n");
573 return NULL;
574 }
575 atomic_dec(&en->dest->refcnt);
576 atomic_inc(&dest->refcnt);
577 en->dest = dest;
578 }
579 }
580 en->lastuse = jiffies;
581
582 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
583 "--> server %u.%u.%u.%u:%d\n",
584 NIPQUAD(en->addr),
585 NIPQUAD(dest->addr),
586 ntohs(dest->port));
587
588 return dest;
589}
590
591
592/*
593 * IPVS LBLC Scheduler structure
594 */
595static struct ip_vs_scheduler ip_vs_lblc_scheduler =
596{
597 .name = "lblc",
598 .refcnt = ATOMIC_INIT(0),
599 .module = THIS_MODULE,
600 .init_service = ip_vs_lblc_init_svc,
601 .done_service = ip_vs_lblc_done_svc,
602 .update_service = ip_vs_lblc_update_svc,
603 .schedule = ip_vs_lblc_schedule,
604};
605
606
607static int __init ip_vs_lblc_init(void)
608{
609 INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
610 sysctl_header = register_sysctl_table(lblc_root_table, 0);
611 return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
612}
613
614
615static void __exit ip_vs_lblc_cleanup(void)
616{
617 unregister_sysctl_table(sysctl_header);
618 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
619}
620
621
622module_init(ip_vs_lblc_init);
623module_exit(ip_vs_lblc_cleanup);
624MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
new file mode 100644
index 000000000000..22b5dd55d271
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -0,0 +1,888 @@
1/*
2 * IPVS: Locality-Based Least-Connection with Replication scheduler
3 *
4 * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Julian Anastasov : Added the missing (dest->weight>0)
15 * condition in the ip_vs_dest_set_max.
16 *
17 */
18
19/*
20 * The lblc/r algorithm is as follows (pseudo code):
21 *
22 * if serverSet[dest_ip] is null then
23 * n, serverSet[dest_ip] <- {weighted least-conn node};
24 * else
25 * n <- {least-conn (alive) node in serverSet[dest_ip]};
26 * if (n is null) OR
27 * (n.conns>n.weight AND
28 * there is a node m with m.conns<m.weight/2) then
29 * n <- {weighted least-conn node};
30 * add n to serverSet[dest_ip];
31 * if |serverSet[dest_ip]| > 1 AND
32 * now - serverSet[dest_ip].lastMod > T then
33 * m <- {most conn node in serverSet[dest_ip]};
34 * remove m from serverSet[dest_ip];
35 * if serverSet[dest_ip] changed then
36 * serverSet[dest_ip].lastMod <- now;
37 *
38 * return n;
39 *
40 */
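/*
 * In the code below, serverSet[dest_ip] is the ip_vs_dest_set attached
 * to each ip_vs_lblcr_entry, the "weighted least-conn node" comes from
 * __ip_vs_wlc_schedule(), and T corresponds to
 * sysctl_ip_vs_lblcr_expiration.
 */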
41
42#include <linux/module.h>
43#include <linux/kernel.h>
44
45/* for sysctl */
46#include <linux/fs.h>
47#include <linux/sysctl.h>
48/* for proc_net_create/proc_net_remove */
49#include <linux/proc_fs.h>
50
51#include <net/ip_vs.h>
52
53
54/*
55 * It is for garbage collection of stale IPVS lblcr entries,
56 * when the table is full.
57 */
58#define CHECK_EXPIRE_INTERVAL (60*HZ)
59#define ENTRY_TIMEOUT (6*60*HZ)
60
61/*
62 * It is for full expiration check.
63 * When there is no partial expiration check (garbage collection)
64 * in a half hour, do a full expiration check to collect stale
65 * entries that haven't been touched for a day.
66 */
67#define COUNT_FOR_FULL_EXPIRATION 30
68static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
69
70
71/*
72 * for IPVS lblcr entry hash table
73 */
74#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
75#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
76#endif
77#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
78#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
79#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
80
81
82/*
83 * IPVS destination set structure and operations
84 */
85struct ip_vs_dest_list {
86 struct ip_vs_dest_list *next; /* list link */
87 struct ip_vs_dest *dest; /* destination server */
88};
89
90struct ip_vs_dest_set {
91 atomic_t size; /* set size */
92 unsigned long lastmod; /* last modified time */
93 struct ip_vs_dest_list *list; /* destination list */
94 rwlock_t lock; /* lock for this list */
95};
96
97
98static struct ip_vs_dest_list *
99ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
100{
101 struct ip_vs_dest_list *e;
102
103 for (e=set->list; e!=NULL; e=e->next) {
104 if (e->dest == dest)
105 /* already existed */
106 return NULL;
107 }
108
109 e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
110 if (e == NULL) {
111 IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
112 return NULL;
113 }
114
115 atomic_inc(&dest->refcnt);
116 e->dest = dest;
117
118 /* link it to the list */
119 write_lock(&set->lock);
120 e->next = set->list;
121 set->list = e;
122 atomic_inc(&set->size);
123 write_unlock(&set->lock);
124
125 set->lastmod = jiffies;
126 return e;
127}
128
129static void
130ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
131{
132 struct ip_vs_dest_list *e, **ep;
133
134 write_lock(&set->lock);
135 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
136 if (e->dest == dest) {
137 /* HIT */
138 *ep = e->next;
139 atomic_dec(&set->size);
140 set->lastmod = jiffies;
141 atomic_dec(&e->dest->refcnt);
142 kfree(e);
143 break;
144 }
145 ep = &e->next;
146 }
147 write_unlock(&set->lock);
148}
149
150static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
151{
152 struct ip_vs_dest_list *e, **ep;
153
154 write_lock(&set->lock);
155 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
156 *ep = e->next;
157 /*
158 * We don't kfree dest because it is referred either
159 * by its service or by the trash dest list.
160 */
161 atomic_dec(&e->dest->refcnt);
162 kfree(e);
163 }
164 write_unlock(&set->lock);
165}
166
167/* get weighted least-connection node in the destination set */
168static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
169{
170 register struct ip_vs_dest_list *e;
171 struct ip_vs_dest *dest, *least;
172 int loh, doh;
173
174 if (set == NULL)
175 return NULL;
176
177 read_lock(&set->lock);
178 /* select the first destination server whose weight > 0 */
179 for (e=set->list; e!=NULL; e=e->next) {
180 least = e->dest;
181 if (least->flags & IP_VS_DEST_F_OVERLOAD)
182 continue;
183
184 if ((atomic_read(&least->weight) > 0)
185 && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
186 loh = atomic_read(&least->activeconns) * 50
187 + atomic_read(&least->inactconns);
188 goto nextstage;
189 }
190 }
191 read_unlock(&set->lock);
192 return NULL;
193
194 /* find the destination with the weighted least load */
195 nextstage:
196 for (e=e->next; e!=NULL; e=e->next) {
197 dest = e->dest;
198 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
199 continue;
200
201 doh = atomic_read(&dest->activeconns) * 50
202 + atomic_read(&dest->inactconns);
203 if ((loh * atomic_read(&dest->weight) >
204 doh * atomic_read(&least->weight))
205 && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
206 least = dest;
207 loh = doh;
208 }
209 }
210 read_unlock(&set->lock);
211
212 IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
213 "activeconns %d refcnt %d weight %d overhead %d\n",
214 NIPQUAD(least->addr), ntohs(least->port),
215 atomic_read(&least->activeconns),
216 atomic_read(&least->refcnt),
217 atomic_read(&least->weight), loh);
218 return least;
219}
220
221
222/* get weighted most-connection node in the destination set */
223static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
224{
225 register struct ip_vs_dest_list *e;
226 struct ip_vs_dest *dest, *most;
227 int moh, doh;
228
229 if (set == NULL)
230 return NULL;
231
232 read_lock(&set->lock);
233 /* select the first destination server whose weight > 0 */
234 for (e=set->list; e!=NULL; e=e->next) {
235 most = e->dest;
236 if (atomic_read(&most->weight) > 0) {
237 moh = atomic_read(&most->activeconns) * 50
238 + atomic_read(&most->inactconns);
239 goto nextstage;
240 }
241 }
242 read_unlock(&set->lock);
243 return NULL;
244
245 /* find the destination with the weighted most load */
246 nextstage:
247 for (e=e->next; e!=NULL; e=e->next) {
248 dest = e->dest;
249 doh = atomic_read(&dest->activeconns) * 50
250 + atomic_read(&dest->inactconns);
251 /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
252 if ((moh * atomic_read(&dest->weight) <
253 doh * atomic_read(&most->weight))
254 && (atomic_read(&dest->weight) > 0)) {
255 most = dest;
256 moh = doh;
257 }
258 }
259 read_unlock(&set->lock);
260
261 IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
262 "activeconns %d refcnt %d weight %d overhead %d\n",
263 NIPQUAD(most->addr), ntohs(most->port),
264 atomic_read(&most->activeconns),
265 atomic_read(&most->refcnt),
266 atomic_read(&most->weight), moh);
267 return most;
268}
269
270
271/*
272 * IPVS lblcr entry represents an association between destination
273 * IP address and its destination server set
274 */
275struct ip_vs_lblcr_entry {
276 struct list_head list;
277 __u32 addr; /* destination IP address */
278 struct ip_vs_dest_set set; /* destination server set */
279 unsigned long lastuse; /* last used time */
280};
281
282
283/*
284 * IPVS lblcr hash table
285 */
286struct ip_vs_lblcr_table {
287 rwlock_t lock; /* lock for this table */
288 struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
289 atomic_t entries; /* number of entries */
290 int max_size; /* maximum size of entries */
291 struct timer_list periodic_timer; /* collect stale entries */
292 int rover; /* rover for expire check */
293 int counter; /* counter for no expire */
294};
295
296
297/*
298 * IPVS LBLCR sysctl table
299 */
300
301static ctl_table vs_vars_table[] = {
302 {
303 .ctl_name = NET_IPV4_VS_LBLCR_EXPIRE,
304 .procname = "lblcr_expiration",
305 .data = &sysctl_ip_vs_lblcr_expiration,
306 .maxlen = sizeof(int),
307 .mode = 0644,
308 .proc_handler = &proc_dointvec_jiffies,
309 },
310 { .ctl_name = 0 }
311};
312
313static ctl_table vs_table[] = {
314 {
315 .ctl_name = NET_IPV4_VS,
316 .procname = "vs",
317 .mode = 0555,
318 .child = vs_vars_table
319 },
320 { .ctl_name = 0 }
321};
322
323static ctl_table ipv4_table[] = {
324 {
325 .ctl_name = NET_IPV4,
326 .procname = "ipv4",
327 .mode = 0555,
328 .child = vs_table
329 },
330 { .ctl_name = 0 }
331};
332
333static ctl_table lblcr_root_table[] = {
334 {
335 .ctl_name = CTL_NET,
336 .procname = "net",
337 .mode = 0555,
338 .child = ipv4_table
339 },
340 { .ctl_name = 0 }
341};
342
343static struct ctl_table_header * sysctl_header;
344
345/*
346 * new/free an ip_vs_lblcr_entry, which is a mapping of a destination
347 * IP address to a server.
348 */
349static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr)
350{
351 struct ip_vs_lblcr_entry *en;
352
353 en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
354 if (en == NULL) {
355 IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
356 return NULL;
357 }
358
359 INIT_LIST_HEAD(&en->list);
360 en->addr = daddr;
361
362 /* initialize its dest set */
363 atomic_set(&(en->set.size), 0);
364 en->set.list = NULL;
365 rwlock_init(&en->set.lock);
366
367 return en;
368}
369
370
371static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
372{
373 list_del(&en->list);
374 ip_vs_dest_set_eraseall(&en->set);
375 kfree(en);
376}
377
378
379/*
380 * Returns hash value for IPVS LBLCR entry
381 */
382static inline unsigned ip_vs_lblcr_hashkey(__u32 addr)
383{
384 return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
385}
386
387
388/*
389 * Hash an entry in the ip_vs_lblcr_table.
390 * returns bool success.
391 */
392static int
393ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
394{
395 unsigned hash;
396
397 if (!list_empty(&en->list)) {
398 IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
399 "called from %p\n", __builtin_return_address(0));
400 return 0;
401 }
402
403 /*
404 * Hash by destination IP address
405 */
406 hash = ip_vs_lblcr_hashkey(en->addr);
407
408 write_lock(&tbl->lock);
409 list_add(&en->list, &tbl->bucket[hash]);
410 atomic_inc(&tbl->entries);
411 write_unlock(&tbl->lock);
412
413 return 1;
414}
415
416
417#if 0000
418/*
419 * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
420 * returns bool success.
421 */
422static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
423 struct ip_vs_lblcr_entry *en)
424{
425 if (list_empty(&en->list)) {
426 IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
427 "called from %p\n", __builtin_return_address(0));
428 return 0;
429 }
430
431 /*
432 * Remove it from the table
433 */
434 write_lock(&tbl->lock);
435 list_del(&en->list);
436 INIT_LIST_HEAD(&en->list);
437 write_unlock(&tbl->lock);
438
439 return 1;
440}
441#endif
442
443
444/*
445 * Get ip_vs_lblcr_entry associated with supplied parameters.
446 */
447static inline struct ip_vs_lblcr_entry *
448ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr)
449{
450 unsigned hash;
451 struct ip_vs_lblcr_entry *en;
452
453 hash = ip_vs_lblcr_hashkey(addr);
454
455 read_lock(&tbl->lock);
456
457 list_for_each_entry(en, &tbl->bucket[hash], list) {
458 if (en->addr == addr) {
459 /* HIT */
460 read_unlock(&tbl->lock);
461 return en;
462 }
463 }
464
465 read_unlock(&tbl->lock);
466
467 return NULL;
468}
469
470
471/*
472 * Flush all the entries of the specified table.
473 */
474static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
475{
476 int i;
477 struct ip_vs_lblcr_entry *en, *nxt;
478
479 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
480 write_lock(&tbl->lock);
481 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
482 ip_vs_lblcr_free(en);
483 atomic_dec(&tbl->entries);
484 }
485 write_unlock(&tbl->lock);
486 }
487}
488
489
490static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
491{
492 unsigned long now = jiffies;
493 int i, j;
494 struct ip_vs_lblcr_entry *en, *nxt;
495
496 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
497 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
498
499 write_lock(&tbl->lock);
500 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
501 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
502 now))
503 continue;
504
505 ip_vs_lblcr_free(en);
506 atomic_dec(&tbl->entries);
507 }
508 write_unlock(&tbl->lock);
509 }
510 tbl->rover = j;
511}
512
513
514/*
515 * Periodic timer handler for the IPVS lblcr table
516 * It is used to collect stale entries when the number of entries
517 * exceeds the maximum size of the table.
518 *
519 * Fixme: we probably need a more complicated algorithm to collect
520 * entries that have not been used for a long time even
521 * if the number of entries doesn't exceed the maximum size
522 * of the table.
523 * The full expiration check is for this purpose now.
524 */
525static void ip_vs_lblcr_check_expire(unsigned long data)
526{
527 struct ip_vs_lblcr_table *tbl;
528 unsigned long now = jiffies;
529 int goal;
530 int i, j;
531 struct ip_vs_lblcr_entry *en, *nxt;
532
533 tbl = (struct ip_vs_lblcr_table *)data;
534
535 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
536 /* do full expiration check */
537 ip_vs_lblcr_full_check(tbl);
538 tbl->counter = 1;
539 goto out;
540 }
541
542 if (atomic_read(&tbl->entries) <= tbl->max_size) {
543 tbl->counter++;
544 goto out;
545 }
546
547 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
548 if (goal > tbl->max_size/2)
549 goal = tbl->max_size/2;
550
551 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
552 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
553
554 write_lock(&tbl->lock);
555 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
556 if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
557 continue;
558
559 ip_vs_lblcr_free(en);
560 atomic_dec(&tbl->entries);
561 goal--;
562 }
563 write_unlock(&tbl->lock);
564 if (goal <= 0)
565 break;
566 }
567 tbl->rover = j;
568
569 out:
570 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
571}
572
573
574#ifdef CONFIG_IP_VS_LBLCR_DEBUG
575static struct ip_vs_lblcr_table *lblcr_table_list;
576
577/*
578 * /proc/net/ip_vs_lblcr to display the mappings of
579 * destination IP address <==> its serverSet
580 */
581static int
582ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length)
583{
584 off_t pos=0, begin;
585 int len=0, size;
586 struct ip_vs_lblcr_table *tbl;
587 unsigned long now = jiffies;
588 int i;
589 struct ip_vs_lblcr_entry *en;
590
591 tbl = lblcr_table_list;
592
593 size = sprintf(buffer, "LastTime Dest IP address Server set\n");
594 pos += size;
595 len += size;
596
597 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
598 read_lock_bh(&tbl->lock);
599 list_for_each_entry(en, &tbl->bucket[i], list) {
600 char tbuf[16];
601 struct ip_vs_dest_list *d;
602
603 sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr));
604 size = sprintf(buffer+len, "%8lu %-16s ",
605 now-en->lastuse, tbuf);
606
607 read_lock(&en->set.lock);
608 for (d=en->set.list; d!=NULL; d=d->next) {
609 size += sprintf(buffer+len+size,
610 "%u.%u.%u.%u ",
611 NIPQUAD(d->dest->addr));
612 }
613 read_unlock(&en->set.lock);
614 size += sprintf(buffer+len+size, "\n");
615 len += size;
616 pos += size;
617 if (pos <= offset)
618 len=0;
619 if (pos >= offset+length) {
620 read_unlock_bh(&tbl->lock);
621 goto done;
622 }
623 }
624 read_unlock_bh(&tbl->lock);
625 }
626
627 done:
628 begin = len - (pos - offset);
629 *start = buffer + begin;
630 len -= begin;
631 if(len>length)
632 len = length;
633 return len;
634}
635#endif
636
637
638static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
639{
640 int i;
641 struct ip_vs_lblcr_table *tbl;
642
643 /*
644 * Allocate the ip_vs_lblcr_table for this service
645 */
646 tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
647 if (tbl == NULL) {
648 IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
649 return -ENOMEM;
650 }
651 svc->sched_data = tbl;
652 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
653 "current service\n",
654 sizeof(struct ip_vs_lblcr_table));
655
656 /*
657 * Initialize the hash buckets
658 */
659 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
660 INIT_LIST_HEAD(&tbl->bucket[i]);
661 }
662 rwlock_init(&tbl->lock);
663 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
664 tbl->rover = 0;
665 tbl->counter = 1;
666
667 /*
668 * Hook periodic timer for garbage collection
669 */
670 init_timer(&tbl->periodic_timer);
671 tbl->periodic_timer.data = (unsigned long)tbl;
672 tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
673 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
674 add_timer(&tbl->periodic_timer);
675
676#ifdef CONFIG_IP_VS_LBLCR_DEBUG
677 lblcr_table_list = tbl;
678#endif
679 return 0;
680}
681
682
683static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
684{
685 struct ip_vs_lblcr_table *tbl = svc->sched_data;
686
687 /* remove periodic timer */
688 del_timer_sync(&tbl->periodic_timer);
689
690 /* got to clean up table entries here */
691 ip_vs_lblcr_flush(tbl);
692
693 /* release the table itself */
694 kfree(svc->sched_data);
695 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
696 sizeof(struct ip_vs_lblcr_table));
697
698 return 0;
699}
700
701
702static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
703{
704 return 0;
705}
706
707
708static inline struct ip_vs_dest *
709__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
710{
711 struct ip_vs_dest *dest, *least;
712 int loh, doh;
713
714 /*
715 * We think the overhead of processing active connections is fifty
716 * times higher than that of inactive connections on average. (This
717 * fifty times might not be accurate, we will change it later.) We
718 * use the following formula to estimate the overhead:
719 * dest->activeconns*50 + dest->inactconns
720 * and the load:
721 * (dest overhead) / dest->weight
722 *
723 * Remember -- no floats in kernel mode!!!
724 * The comparison of h1*w2 > h2*w1 is equivalent to that of
725 * h1/w1 > h2/w2
726 * if every weight is larger than zero.
727 *
728 * The server with weight=0 is quiesced and will not receive any
729 * new connection.
730 */
731 list_for_each_entry(dest, &svc->destinations, n_list) {
732 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
733 continue;
734
735 if (atomic_read(&dest->weight) > 0) {
736 least = dest;
737 loh = atomic_read(&least->activeconns) * 50
738 + atomic_read(&least->inactconns);
739 goto nextstage;
740 }
741 }
742 return NULL;
743
744 /*
745 * Find the destination with the least load.
746 */
747 nextstage:
748 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
749 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
750 continue;
751
752 doh = atomic_read(&dest->activeconns) * 50
753 + atomic_read(&dest->inactconns);
754 if (loh * atomic_read(&dest->weight) >
755 doh * atomic_read(&least->weight)) {
756 least = dest;
757 loh = doh;
758 }
759 }
760
761 IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
762 "activeconns %d refcnt %d weight %d overhead %d\n",
763 NIPQUAD(least->addr), ntohs(least->port),
764 atomic_read(&least->activeconns),
765 atomic_read(&least->refcnt),
766 atomic_read(&least->weight), loh);
767
768 return least;
769}
770
771
772/*
773 * If this destination server is overloaded and there is a less loaded
774 * server, then return true.
775 */
776static inline int
777is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
778{
779 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
780 struct ip_vs_dest *d;
781
782 list_for_each_entry(d, &svc->destinations, n_list) {
783 if (atomic_read(&d->activeconns)*2
784 < atomic_read(&d->weight)) {
785 return 1;
786 }
787 }
788 }
789 return 0;
790}
791
792
793/*
794 * Locality-Based (weighted) Least-Connection with Replication scheduling
795 */
796static struct ip_vs_dest *
797ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
798{
799 struct ip_vs_dest *dest;
800 struct ip_vs_lblcr_table *tbl;
801 struct ip_vs_lblcr_entry *en;
802 struct iphdr *iph = skb->nh.iph;
803
804 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
805
806 tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
807 en = ip_vs_lblcr_get(tbl, iph->daddr);
808 if (en == NULL) {
809 dest = __ip_vs_wlc_schedule(svc, iph);
810 if (dest == NULL) {
811 IP_VS_DBG(1, "no destination available\n");
812 return NULL;
813 }
814 en = ip_vs_lblcr_new(iph->daddr);
815 if (en == NULL) {
816 return NULL;
817 }
818 ip_vs_dest_set_insert(&en->set, dest);
819 ip_vs_lblcr_hash(tbl, en);
820 } else {
821 dest = ip_vs_dest_set_min(&en->set);
822 if (!dest || is_overloaded(dest, svc)) {
823 dest = __ip_vs_wlc_schedule(svc, iph);
824 if (dest == NULL) {
825 IP_VS_DBG(1, "no destination available\n");
826 return NULL;
827 }
828 ip_vs_dest_set_insert(&en->set, dest);
829 }
830 if (atomic_read(&en->set.size) > 1 &&
831 jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
832 struct ip_vs_dest *m;
833 m = ip_vs_dest_set_max(&en->set);
834 if (m)
835 ip_vs_dest_set_erase(&en->set, m);
836 }
837 }
838 en->lastuse = jiffies;
839
840 IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
841 "--> server %u.%u.%u.%u:%d\n",
842 NIPQUAD(en->addr),
843 NIPQUAD(dest->addr),
844 ntohs(dest->port));
845
846 return dest;
847}
848
849
850/*
851 * IPVS LBLCR Scheduler structure
852 */
853static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
854{
855 .name = "lblcr",
856 .refcnt = ATOMIC_INIT(0),
857 .module = THIS_MODULE,
858 .init_service = ip_vs_lblcr_init_svc,
859 .done_service = ip_vs_lblcr_done_svc,
860 .update_service = ip_vs_lblcr_update_svc,
861 .schedule = ip_vs_lblcr_schedule,
862};
863
864
865static int __init ip_vs_lblcr_init(void)
866{
867 INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
868 sysctl_header = register_sysctl_table(lblcr_root_table, 0);
869#ifdef CONFIG_IP_VS_LBLCR_DEBUG
870 proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo);
871#endif
872 return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
873}
874
875
876static void __exit ip_vs_lblcr_cleanup(void)
877{
878#ifdef CONFIG_IP_VS_LBLCR_DEBUG
879 proc_net_remove("ip_vs_lblcr");
880#endif
881 unregister_sysctl_table(sysctl_header);
882 unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
883}
884
885
886module_init(ip_vs_lblcr_init);
887module_exit(ip_vs_lblcr_cleanup);
888MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
new file mode 100644
index 000000000000..d88fef90a641
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lc.c
@@ -0,0 +1,123 @@
1/*
2 * IPVS: Least-Connection Scheduling module
3 *
4 * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Wensong Zhang : added the ip_vs_lc_update_svc
15 * Wensong Zhang : added any dest with weight=0 is quiesced
16 *
17 */
18
19#include <linux/module.h>
20#include <linux/kernel.h>
21
22#include <net/ip_vs.h>
23
24
25static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
26{
27 return 0;
28}
29
30
31static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
32{
33 return 0;
34}
35
36
37static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
38{
39 return 0;
40}
41
42
43static inline unsigned int
44ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
45{
46 /*
47 * We think the overhead of processing active connections is 256
48 * times higher than that of inactive connections on average. (This
49 * 256 times might not be accurate, we will change it later) We
50 * use the following formula to estimate the overhead now:
51 * dest->activeconns*256 + dest->inactconns
52 */
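	/*
	 * E.g. 3 active and 10 inactive connections give an overhead of
	 * 3*256 + 10 = 778.
	 */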
53 return (atomic_read(&dest->activeconns) << 8) +
54 atomic_read(&dest->inactconns);
55}
56
57
58/*
59 * Least Connection scheduling
60 */
61static struct ip_vs_dest *
62ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
63{
64 struct ip_vs_dest *dest, *least = NULL;
65 unsigned int loh = 0, doh;
66
67 IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
68
69 /*
70 * Simply select the server with the least value of
71 * (activeconns<<8) + inactconns
72 * except those whose weight is zero.
73 * A weight of zero means that the server is quiesced: existing
74 * connections to the server are still served, but no new
75 * connections are assigned to it.
76 */
77
78 list_for_each_entry(dest, &svc->destinations, n_list) {
79 if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
80 atomic_read(&dest->weight) == 0)
81 continue;
82 doh = ip_vs_lc_dest_overhead(dest);
83 if (!least || doh < loh) {
84 least = dest;
85 loh = doh;
86 }
87 }
88
89 if (least)
90 IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
91 NIPQUAD(least->addr), ntohs(least->port),
92 atomic_read(&least->activeconns),
93 atomic_read(&least->inactconns));
94
95 return least;
96}
97
98
99static struct ip_vs_scheduler ip_vs_lc_scheduler = {
100 .name = "lc",
101 .refcnt = ATOMIC_INIT(0),
102 .module = THIS_MODULE,
103 .init_service = ip_vs_lc_init_svc,
104 .done_service = ip_vs_lc_done_svc,
105 .update_service = ip_vs_lc_update_svc,
106 .schedule = ip_vs_lc_schedule,
107};
108
109
110static int __init ip_vs_lc_init(void)
111{
112 INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
113 return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
114}
115
116static void __exit ip_vs_lc_cleanup(void)
117{
118 unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
119}
120
121module_init(ip_vs_lc_init);
122module_exit(ip_vs_lc_cleanup);
123MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
new file mode 100644
index 000000000000..bc2a9e5f2a7b
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_nq.c
@@ -0,0 +1,161 @@
1/*
2 * IPVS: Never Queue scheduling module
3 *
4 * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16
17/*
18 * The NQ algorithm adopts a two-speed model. When there is an idle server
19 * available, the job will be sent to the idle server, instead of waiting
20 * for a fast one. When there is no idle server available, the job will be
21 * sent to the server that minimizes its expected delay (the Shortest
22 * Expected Delay scheduling algorithm).
23 *
24 * See the following paper for more information:
25 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
26 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
27 * pages 986-994, 1988.
28 *
29 * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
30 *
31 * The difference between NQ and SED is that NQ can improve overall
32 * system utilization.
33 *
34 */
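/*
 * Mapping to the code below: an "idle server" is one with
 * activeconns == 0 and is returned immediately; otherwise the scheduler
 * falls back to the SED-style comparison of (activeconns+1)/weight,
 * done with cross-multiplication to avoid division.
 */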
35
36#include <linux/module.h>
37#include <linux/kernel.h>
38
39#include <net/ip_vs.h>
40
41
42static int
43ip_vs_nq_init_svc(struct ip_vs_service *svc)
44{
45 return 0;
46}
47
48
49static int
50ip_vs_nq_done_svc(struct ip_vs_service *svc)
51{
52 return 0;
53}
54
55
56static int
57ip_vs_nq_update_svc(struct ip_vs_service *svc)
58{
59 return 0;
60}
61
62
63static inline unsigned int
64ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
65{
66 /*
67 * We only use the active connection number in the cost
68 * calculation here.
69 */
70 return atomic_read(&dest->activeconns) + 1;
71}
72
73
74/*
75 * Never Queue scheduling
76 */
77static struct ip_vs_dest *
78ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
79{
80 struct ip_vs_dest *dest, *least = NULL;
81 unsigned int loh = 0, doh;
82
83 IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
84
85 /*
86 * We calculate the load of each dest server as follows:
87 * (server expected overhead) / dest->weight
88 *
89 * Remember -- no floats in kernel mode!!!
90 * The comparison of h1*w2 > h2*w1 is equivalent to that of
91 * h1/w1 > h2/w2
92 * if every weight is larger than zero.
93 *
94 * The server with weight=0 is quiesced and will not receive any
95 * new connections.
96 */
97
98 list_for_each_entry(dest, &svc->destinations, n_list) {
99
100 if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
101 !atomic_read(&dest->weight))
102 continue;
103
104 doh = ip_vs_nq_dest_overhead(dest);
105
106 /* return the server directly if it is idle */
107 if (atomic_read(&dest->activeconns) == 0) {
108 least = dest;
109 loh = doh;
110 goto out;
111 }
112
113 if (!least ||
114 (loh * atomic_read(&dest->weight) >
115 doh * atomic_read(&least->weight))) {
116 least = dest;
117 loh = doh;
118 }
119 }
120
121 if (!least)
122 return NULL;
123
124 out:
125 IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
126 "activeconns %d refcnt %d weight %d overhead %d\n",
127 NIPQUAD(least->addr), ntohs(least->port),
128 atomic_read(&least->activeconns),
129 atomic_read(&least->refcnt),
130 atomic_read(&least->weight), loh);
131
132 return least;
133}
134
135
136static struct ip_vs_scheduler ip_vs_nq_scheduler =
137{
138 .name = "nq",
139 .refcnt = ATOMIC_INIT(0),
140 .module = THIS_MODULE,
141 .init_service = ip_vs_nq_init_svc,
142 .done_service = ip_vs_nq_done_svc,
143 .update_service = ip_vs_nq_update_svc,
144 .schedule = ip_vs_nq_schedule,
145};
146
147
148static int __init ip_vs_nq_init(void)
149{
150 INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list);
151 return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
152}
153
154static void __exit ip_vs_nq_cleanup(void)
155{
156 unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
157}
158
159module_init(ip_vs_nq_init);
160module_exit(ip_vs_nq_cleanup);
161MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
new file mode 100644
index 000000000000..253c46252bd5
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -0,0 +1,244 @@
1/*
2 * ip_vs_proto.c: transport protocol load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/module.h>
19#include <linux/kernel.h>
20#include <linux/skbuff.h>
21#include <linux/in.h>
22#include <linux/ip.h>
23#include <net/protocol.h>
24#include <net/tcp.h>
25#include <net/udp.h>
26#include <asm/system.h>
27#include <linux/stat.h>
28#include <linux/proc_fs.h>
29
30#include <net/ip_vs.h>
31
32
33/*
34 * IPVS protocols can only be registered/unregistered when the ipvs
35 * module is loaded/unloaded, so no lock is needed in accessing the
36 * ipvs protocol table.
37 */
38
39#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
40#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
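/*
 * The hash is simply the low 5 bits of the IP protocol number, e.g.
 * TCP (6) -> bucket 6, UDP (17) -> bucket 17, ESP (50) -> bucket 18,
 * AH (51) -> bucket 19.
 */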
41
42static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
43
44
45/*
46 * register an ipvs protocol
47 */
48static int register_ip_vs_protocol(struct ip_vs_protocol *pp)
49{
50 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
51
52 pp->next = ip_vs_proto_table[hash];
53 ip_vs_proto_table[hash] = pp;
54
55 if (pp->init != NULL)
56 pp->init(pp);
57
58 return 0;
59}
60
61
62/*
63 * unregister an ipvs protocol
64 */
65static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
66{
67 struct ip_vs_protocol **pp_p;
68 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
69
70 pp_p = &ip_vs_proto_table[hash];
71 for (; *pp_p; pp_p = &(*pp_p)->next) {
72 if (*pp_p == pp) {
73 *pp_p = pp->next;
74 if (pp->exit != NULL)
75 pp->exit(pp);
76 return 0;
77 }
78 }
79
80 return -ESRCH;
81}
82
83
84/*
85 * get ip_vs_protocol object by its proto.
86 */
87struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
88{
89 struct ip_vs_protocol *pp;
90 unsigned hash = IP_VS_PROTO_HASH(proto);
91
92 for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
93 if (pp->protocol == proto)
94 return pp;
95 }
96
97 return NULL;
98}
99
100
101/*
102 * Propagate event for state change to all protocols
103 */
104void ip_vs_protocol_timeout_change(int flags)
105{
106 struct ip_vs_protocol *pp;
107 int i;
108
109 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
110 for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
111 if (pp->timeout_change)
112 pp->timeout_change(pp, flags);
113 }
114 }
115}
116
117
118int *
119ip_vs_create_timeout_table(int *table, int size)
120{
121 int *t;
122
123 t = kmalloc(size, GFP_ATOMIC);
124 if (t == NULL)
125 return NULL;
126 memcpy(t, table, size);
127 return t;
128}
129
130
131/*
132 * Set timeout value for state specified by name
133 */
134int
135ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
136{
137 int i;
138
139 if (!table || !name || !to)
140 return -EINVAL;
141
142 for (i = 0; i < num; i++) {
143 if (strcmp(names[i], name))
144 continue;
145 table[i] = to * HZ;
146 return 0;
147 }
148 return -ENOENT;
149}
150
151
152const char * ip_vs_state_name(__u16 proto, int state)
153{
154 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
155
156 if (pp == NULL || pp->state_name == NULL)
157 return "ERR!";
158 return pp->state_name(state);
159}
160
161
162void
163ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
164 const struct sk_buff *skb,
165 int offset,
166 const char *msg)
167{
168 char buf[128];
169 struct iphdr _iph, *ih;
170
171 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
172 if (ih == NULL)
173 sprintf(buf, "%s TRUNCATED", pp->name);
174 else if (ih->frag_off & __constant_htons(IP_OFFSET))
175 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
176 pp->name, NIPQUAD(ih->saddr),
177 NIPQUAD(ih->daddr));
178 else {
179 __u16 _ports[2], *pptr;
180
181 pptr = skb_header_pointer(skb, offset + ih->ihl*4,
182 sizeof(_ports), _ports);
183 if (pptr == NULL)
184 sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
185 pp->name,
186 NIPQUAD(ih->saddr),
187 NIPQUAD(ih->daddr));
188 else
189 sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
190 pp->name,
191 NIPQUAD(ih->saddr),
192 ntohs(pptr[0]),
193 NIPQUAD(ih->daddr),
194 ntohs(pptr[1]));
195 }
196
197 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
198}
199
200
201int ip_vs_protocol_init(void)
202{
203 char protocols[64];
204#define REGISTER_PROTOCOL(p) \
205 do { \
206 register_ip_vs_protocol(p); \
207 strcat(protocols, ", "); \
208 strcat(protocols, (p)->name); \
209 } while (0)
210
211 protocols[0] = '\0';
212 protocols[2] = '\0';
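	/*
	 * protocols[2] is cleared as well so that &protocols[2], printed
	 * below to skip the leading ", ", is a valid empty string even
	 * when no protocol gets registered.
	 */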
213#ifdef CONFIG_IP_VS_PROTO_TCP
214 REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
215#endif
216#ifdef CONFIG_IP_VS_PROTO_UDP
217 REGISTER_PROTOCOL(&ip_vs_protocol_udp);
218#endif
219#ifdef CONFIG_IP_VS_PROTO_ICMP
220 REGISTER_PROTOCOL(&ip_vs_protocol_icmp);
221#endif
222#ifdef CONFIG_IP_VS_PROTO_AH
223 REGISTER_PROTOCOL(&ip_vs_protocol_ah);
224#endif
225#ifdef CONFIG_IP_VS_PROTO_ESP
226 REGISTER_PROTOCOL(&ip_vs_protocol_esp);
227#endif
228 IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
229
230 return 0;
231}
232
233
234void ip_vs_protocol_cleanup(void)
235{
236 struct ip_vs_protocol *pp;
237 int i;
238
239 /* unregister all the ipvs protocols */
240 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
241 while ((pp = ip_vs_proto_table[i]) != NULL)
242 unregister_ip_vs_protocol(pp);
243 }
244}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
new file mode 100644
index 000000000000..453e94a0bbd7
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -0,0 +1,177 @@
1/*
2 * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
5 *
6 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
7 * Wensong Zhang <wensong@linuxvirtualserver.org>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * version 2 as published by the Free Software Foundation;
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
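/*
 * AH itself carries no ports, so the handlers below look up the UDP/500
 * (ISAKMP) connection entry; this lets IPsec traffic follow the real
 * server chosen for the IKE negotiation.
 */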
39
40
41static struct ip_vs_conn *
42ah_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 __constant_htons(PORT_ISAKMP),
54 iph->daddr,
55 __constant_htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 __constant_htons(PORT_ISAKMP),
60 iph->saddr,
61 __constant_htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 __constant_htons(PORT_ISAKMP),
91 iph->daddr,
92 __constant_htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 __constant_htons(PORT_ISAKMP),
97 iph->saddr,
98 __constant_htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115ah_conn_schedule(struct sk_buff *skb,
116 struct ip_vs_protocol *pp,
117 int *verdict, struct ip_vs_conn **cpp)
118{
119 /*
120 * AH is related traffic only. Pass the packet to the IP stack.
121 */
122 *verdict = NF_ACCEPT;
123 return 0;
124}
125
126
127static void
128ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
129 int offset, const char *msg)
130{
131 char buf[256];
132 struct iphdr _iph, *ih;
133
134 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
135 if (ih == NULL)
136 sprintf(buf, "%s TRUNCATED", pp->name);
137 else
138 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
139 pp->name, NIPQUAD(ih->saddr),
140 NIPQUAD(ih->daddr));
141
142 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
143}
144
145
146static void ah_init(struct ip_vs_protocol *pp)
147{
148 /* nothing to do now */
149}
150
151
152static void ah_exit(struct ip_vs_protocol *pp)
153{
154 /* nothing to do now */
155}
156
157
158struct ip_vs_protocol ip_vs_protocol_ah = {
159 .name = "AH",
160 .protocol = IPPROTO_AH,
161 .dont_defrag = 1,
162 .init = ah_init,
163 .exit = ah_exit,
164 .conn_schedule = ah_conn_schedule,
165 .conn_in_get = ah_conn_in_get,
166 .conn_out_get = ah_conn_out_get,
167 .snat_handler = NULL,
168 .dnat_handler = NULL,
169 .csum_check = NULL,
170 .state_transition = NULL,
171 .register_app = NULL,
172 .unregister_app = NULL,
173 .app_conn_bind = NULL,
174 .debug_packet = ah_debug_packet,
175 .timeout_change = NULL, /* ISAKMP */
176 .set_state_timeout = NULL,
177};
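
The handler above never schedules AH packets on its own; both lookup directions key on the ISAKMP control connection, i.e. UDP port 500 on both ends. A minimal user-space sketch of the lookup key it effectively builds (the struct and helper are illustrative only, not part of the kernel sources):

#include <stdint.h>
#include <netinet/in.h>		/* IPPROTO_UDP */
#include <arpa/inet.h>		/* htons */

struct isakmp_key {
	uint8_t  protocol;
	uint32_t saddr, daddr;	/* network byte order */
	uint16_t sport, dport;	/* network byte order */
};

/* Both directions use the same fixed ports: UDP 500 <-> 500. */
static struct isakmp_key make_isakmp_key(uint32_t saddr, uint32_t daddr)
{
	struct isakmp_key k = {
		.protocol = IPPROTO_UDP,
		.saddr = saddr, .daddr = daddr,
		.sport = htons(500), .dport = htons(500),
	};
	return k;
}
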
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
new file mode 100644
index 000000000000..478e5c7c7e8e
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -0,0 +1,175 @@
1/*
2 * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
5 *
6 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
7 * Wensong Zhang <wensong@linuxvirtualserver.org>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * version 2 as published by the Free Software Foundation;
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
39
40
41static struct ip_vs_conn *
42esp_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 __constant_htons(PORT_ISAKMP),
54 iph->daddr,
55 __constant_htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 __constant_htons(PORT_ISAKMP),
60 iph->saddr,
61 __constant_htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 __constant_htons(PORT_ISAKMP),
91 iph->daddr,
92 __constant_htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 __constant_htons(PORT_ISAKMP),
97 iph->saddr,
98 __constant_htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
116 int *verdict, struct ip_vs_conn **cpp)
117{
118 /*
119	 * ESP packets are only ever related traffic; pass the packet to the IP stack.
120 */
121 *verdict = NF_ACCEPT;
122 return 0;
123}
124
125
126static void
127esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
128 int offset, const char *msg)
129{
130 char buf[256];
131 struct iphdr _iph, *ih;
132
133 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
134 if (ih == NULL)
135 sprintf(buf, "%s TRUNCATED", pp->name);
136 else
137 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
138 pp->name, NIPQUAD(ih->saddr),
139 NIPQUAD(ih->daddr));
140
141 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
142}
143
144
145static void esp_init(struct ip_vs_protocol *pp)
146{
147 /* nothing to do now */
148}
149
150
151static void esp_exit(struct ip_vs_protocol *pp)
152{
153 /* nothing to do now */
154}
155
156
157struct ip_vs_protocol ip_vs_protocol_esp = {
158 .name = "ESP",
159 .protocol = IPPROTO_ESP,
160 .dont_defrag = 1,
161 .init = esp_init,
162 .exit = esp_exit,
163 .conn_schedule = esp_conn_schedule,
164 .conn_in_get = esp_conn_in_get,
165 .conn_out_get = esp_conn_out_get,
166 .snat_handler = NULL,
167 .dnat_handler = NULL,
168 .csum_check = NULL,
169 .state_transition = NULL,
170 .register_app = NULL,
171 .unregister_app = NULL,
172 .app_conn_bind = NULL,
173 .debug_packet = esp_debug_packet,
174 .timeout_change = NULL, /* ISAKMP */
175};
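
ip_vs_proto_esp.c repeats ip_vs_proto_ah.c almost line for line; the two lookup pairs differ only in which connection-table function they call. A possible factoring, shown purely as a sketch (ip_vs_conn_in_get/ip_vs_conn_out_get and the types are the real symbols used above; the shared helper is hypothetical):

typedef struct ip_vs_conn *(*isakmp_lookup_t)(int protocol,
					      __u32 s_addr, __u16 s_port,
					      __u32 d_addr, __u16 d_port);

static struct ip_vs_conn *
isakmp_conn_get(isakmp_lookup_t lookup, const struct iphdr *iph, int inverse)
{
	__u32 saddr = inverse ? iph->daddr : iph->saddr;
	__u32 daddr = inverse ? iph->saddr : iph->daddr;

	/* Key on the ISAKMP control connection, not on the ESP/AH packet. */
	return lookup(IPPROTO_UDP, saddr, __constant_htons(PORT_ISAKMP),
		      daddr, __constant_htons(PORT_ISAKMP));
}
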
diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c
new file mode 100644
index 000000000000..191e94aa1c1f
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_icmp.c
@@ -0,0 +1,182 @@
1/*
2 * ip_vs_proto_icmp.c: ICMP load balancing support for IP Virtual Server
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, March 2002
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * version 2 as published by the Free Software Foundation;
9 *
10 */
11
12#include <linux/module.h>
13#include <linux/kernel.h>
14#include <linux/icmp.h>
15#include <linux/netfilter.h>
16#include <linux/netfilter_ipv4.h>
17
18#include <net/ip_vs.h>
19
20
21static int icmp_timeouts[1] = { 1*60*HZ };
22
23static char * icmp_state_name_table[1] = { "ICMP" };
24
25static struct ip_vs_conn *
26icmp_conn_in_get(const struct sk_buff *skb,
27 struct ip_vs_protocol *pp,
28 const struct iphdr *iph,
29 unsigned int proto_off,
30 int inverse)
31{
32#if 0
33 struct ip_vs_conn *cp;
34
35 if (likely(!inverse)) {
36 cp = ip_vs_conn_in_get(iph->protocol,
37 iph->saddr, 0,
38 iph->daddr, 0);
39 } else {
40 cp = ip_vs_conn_in_get(iph->protocol,
41 iph->daddr, 0,
42 iph->saddr, 0);
43 }
44
45 return cp;
46
47#else
48 return NULL;
49#endif
50}
51
52static struct ip_vs_conn *
53icmp_conn_out_get(const struct sk_buff *skb,
54 struct ip_vs_protocol *pp,
55 const struct iphdr *iph,
56 unsigned int proto_off,
57 int inverse)
58{
59#if 0
60 struct ip_vs_conn *cp;
61
62 if (likely(!inverse)) {
63 cp = ip_vs_conn_out_get(iph->protocol,
64 iph->saddr, 0,
65 iph->daddr, 0);
66 } else {
67		cp = ip_vs_conn_out_get(iph->protocol,
68 iph->daddr, 0,
69 iph->saddr, 0);
70 }
71
72 return cp;
73#else
74 return NULL;
75#endif
76}
77
78static int
79icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
80 int *verdict, struct ip_vs_conn **cpp)
81{
82 *verdict = NF_ACCEPT;
83 return 0;
84}
85
86static int
87icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
88{
89 if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) {
90 if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
91 if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) {
92 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for");
93 return 0;
94 }
95 }
96 }
97 return 1;
98}
99
100static void
101icmp_debug_packet(struct ip_vs_protocol *pp,
102 const struct sk_buff *skb,
103 int offset,
104 const char *msg)
105{
106 char buf[256];
107 struct iphdr _iph, *ih;
108
109 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
110 if (ih == NULL)
111 sprintf(buf, "%s TRUNCATED", pp->name);
112 else if (ih->frag_off & __constant_htons(IP_OFFSET))
113 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
114 pp->name, NIPQUAD(ih->saddr),
115 NIPQUAD(ih->daddr));
116 else {
117 struct icmphdr _icmph, *ic;
118
119 ic = skb_header_pointer(skb, offset + ih->ihl*4,
120 sizeof(_icmph), &_icmph);
121 if (ic == NULL)
122 sprintf(buf, "%s TRUNCATED to %u bytes\n",
123 pp->name, skb->len - offset);
124 else
125 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d",
126 pp->name, NIPQUAD(ih->saddr),
127 NIPQUAD(ih->daddr),
128 ic->type, ic->code);
129 }
130 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
131}
132
133static int
134icmp_state_transition(struct ip_vs_conn *cp, int direction,
135 const struct sk_buff *skb,
136 struct ip_vs_protocol *pp)
137{
138 cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
139 return 1;
140}
141
142static int
143icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
144{
145 int num;
146 char **names;
147
148 num = IP_VS_ICMP_S_LAST;
149 names = icmp_state_name_table;
150 return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to);
151}
152
153
154static void icmp_init(struct ip_vs_protocol *pp)
155{
156 pp->timeout_table = icmp_timeouts;
157}
158
159static void icmp_exit(struct ip_vs_protocol *pp)
160{
161}
162
163struct ip_vs_protocol ip_vs_protocol_icmp = {
164 .name = "ICMP",
165 .protocol = IPPROTO_ICMP,
166 .dont_defrag = 0,
167 .init = icmp_init,
168 .exit = icmp_exit,
169 .conn_schedule = icmp_conn_schedule,
170 .conn_in_get = icmp_conn_in_get,
171 .conn_out_get = icmp_conn_out_get,
172 .snat_handler = NULL,
173 .dnat_handler = NULL,
174 .csum_check = icmp_csum_check,
175 .state_transition = icmp_state_transition,
176 .register_app = NULL,
177 .unregister_app = NULL,
178 .app_conn_bind = NULL,
179 .debug_packet = icmp_debug_packet,
180 .timeout_change = NULL,
181 .set_state_timeout = icmp_set_state_timeout,
182};
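
icmp_csum_check above verifies the ICMP checksum only for non-fragments that the hardware has not already vouched for, delegating the arithmetic to ip_vs_checksum_complete (built on the kernel's skb_checksum and csum_fold helpers). For reference, a stand-alone user-space version of the underlying 16-bit one's-complement sum from RFC 1071 (illustrative only):

#include <stdint.h>
#include <stddef.h>

static uint16_t inet_checksum(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t sum = 0;

	while (len > 1) {			/* sum 16-bit words */
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)				/* odd trailing byte */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
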
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
new file mode 100644
index 000000000000..e65de675da74
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -0,0 +1,640 @@
1/*
2 * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/ip.h>
20#include <linux/tcp.h> /* for tcphdr */
21#include <net/ip.h>
22#include <net/tcp.h> /* for csum_tcpudp_magic */
23#include <linux/netfilter_ipv4.h>
24
25#include <net/ip_vs.h>
26
27
28static struct ip_vs_conn *
29tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
30 const struct iphdr *iph, unsigned int proto_off, int inverse)
31{
32 __u16 _ports[2], *pptr;
33
34 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
35 if (pptr == NULL)
36 return NULL;
37
38 if (likely(!inverse)) {
39 return ip_vs_conn_in_get(iph->protocol,
40 iph->saddr, pptr[0],
41 iph->daddr, pptr[1]);
42 } else {
43 return ip_vs_conn_in_get(iph->protocol,
44 iph->daddr, pptr[1],
45 iph->saddr, pptr[0]);
46 }
47}
48
49static struct ip_vs_conn *
50tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
51 const struct iphdr *iph, unsigned int proto_off, int inverse)
52{
53 __u16 _ports[2], *pptr;
54
55 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
56 if (pptr == NULL)
57 return NULL;
58
59 if (likely(!inverse)) {
60 return ip_vs_conn_out_get(iph->protocol,
61 iph->saddr, pptr[0],
62 iph->daddr, pptr[1]);
63 } else {
64 return ip_vs_conn_out_get(iph->protocol,
65 iph->daddr, pptr[1],
66 iph->saddr, pptr[0]);
67 }
68}
69
70
71static int
72tcp_conn_schedule(struct sk_buff *skb,
73 struct ip_vs_protocol *pp,
74 int *verdict, struct ip_vs_conn **cpp)
75{
76 struct ip_vs_service *svc;
77 struct tcphdr _tcph, *th;
78
79 th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
80 sizeof(_tcph), &_tcph);
81 if (th == NULL) {
82 *verdict = NF_DROP;
83 return 0;
84 }
85
86 if (th->syn &&
87 (svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
88 skb->nh.iph->daddr, th->dest))) {
89 if (ip_vs_todrop()) {
90 /*
91 * It seems that we are very loaded.
92 * We have to drop this packet :(
93 */
94 ip_vs_service_put(svc);
95 *verdict = NF_DROP;
96 return 0;
97 }
98
99 /*
100 * Let the virtual server select a real server for the
101 * incoming connection, and create a connection entry.
102 */
103 *cpp = ip_vs_schedule(svc, skb);
104 if (!*cpp) {
105 *verdict = ip_vs_leave(svc, skb, pp);
106 return 0;
107 }
108 ip_vs_service_put(svc);
109 }
110 return 1;
111}
112
113
114static inline void
115tcp_fast_csum_update(struct tcphdr *tcph, u32 oldip, u32 newip,
116 u16 oldport, u16 newport)
117{
118 tcph->check =
119 ip_vs_check_diff(~oldip, newip,
120 ip_vs_check_diff(oldport ^ 0xFFFF,
121 newport, tcph->check));
122}
123
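
tcp_fast_csum_update never touches the payload: ip_vs_check_diff folds the difference between the old and new address and port into the existing checksum, so rewriting a packet costs only a few additions. A stand-alone sketch of the same idea for a single 16-bit field, in the spirit of RFC 1624 (illustrative only, not the kernel helper):

#include <stdint.h>

static uint16_t csum_update16(uint16_t check, uint16_t old_val, uint16_t new_val)
{
	/* HC' = ~(~HC + ~m + m'), all in one's-complement arithmetic */
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old_val;
	sum += new_val;
	while (sum >> 16)		/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
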
124
125static int
126tcp_snat_handler(struct sk_buff **pskb,
127 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
128{
129 struct tcphdr *tcph;
130 unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
131
132 /* csum_check requires unshared skb */
133 if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
134 return 0;
135
136 if (unlikely(cp->app != NULL)) {
137 /* Some checks before mangling */
138 if (pp->csum_check && !pp->csum_check(*pskb, pp))
139 return 0;
140
141 /* Call application helper if needed */
142 if (!ip_vs_app_pkt_out(cp, pskb))
143 return 0;
144 }
145
146 tcph = (void *)(*pskb)->nh.iph + tcphoff;
147 tcph->source = cp->vport;
148
149 /* Adjust TCP checksums */
150 if (!cp->app) {
151 /* Only port and addr are changed, do fast csum update */
152 tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
153 cp->dport, cp->vport);
154 if ((*pskb)->ip_summed == CHECKSUM_HW)
155 (*pskb)->ip_summed = CHECKSUM_NONE;
156 } else {
157 /* full checksum calculation */
158 tcph->check = 0;
159 (*pskb)->csum = skb_checksum(*pskb, tcphoff,
160 (*pskb)->len - tcphoff, 0);
161 tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
162 (*pskb)->len - tcphoff,
163 cp->protocol,
164 (*pskb)->csum);
165 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
166 pp->name, tcph->check,
167 (char*)&(tcph->check) - (char*)tcph);
168 }
169 return 1;
170}
171
172
173static int
174tcp_dnat_handler(struct sk_buff **pskb,
175 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
176{
177 struct tcphdr *tcph;
178 unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
179
180 /* csum_check requires unshared skb */
181 if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
182 return 0;
183
184 if (unlikely(cp->app != NULL)) {
185 /* Some checks before mangling */
186 if (pp->csum_check && !pp->csum_check(*pskb, pp))
187 return 0;
188
189 /*
190 * Attempt ip_vs_app call.
191 * It will fix ip_vs_conn and iph ack_seq stuff
192 */
193 if (!ip_vs_app_pkt_in(cp, pskb))
194 return 0;
195 }
196
197 tcph = (void *)(*pskb)->nh.iph + tcphoff;
198 tcph->dest = cp->dport;
199
200 /*
201 * Adjust TCP checksums
202 */
203 if (!cp->app) {
204 /* Only port and addr are changed, do fast csum update */
205 tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
206 cp->vport, cp->dport);
207 if ((*pskb)->ip_summed == CHECKSUM_HW)
208 (*pskb)->ip_summed = CHECKSUM_NONE;
209 } else {
210 /* full checksum calculation */
211 tcph->check = 0;
212 (*pskb)->csum = skb_checksum(*pskb, tcphoff,
213 (*pskb)->len - tcphoff, 0);
214 tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
215 (*pskb)->len - tcphoff,
216 cp->protocol,
217 (*pskb)->csum);
218 (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
219 }
220 return 1;
221}
222
223
224static int
225tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
226{
227 unsigned int tcphoff = skb->nh.iph->ihl*4;
228
229 switch (skb->ip_summed) {
230 case CHECKSUM_NONE:
231 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
232 case CHECKSUM_HW:
233 if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
234 skb->len - tcphoff,
235 skb->nh.iph->protocol, skb->csum)) {
236 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
237 "Failed checksum for");
238 return 0;
239 }
240 break;
241 default:
242 /* CHECKSUM_UNNECESSARY */
243 break;
244 }
245
246 return 1;
247}
248
249
250#define TCP_DIR_INPUT 0
251#define TCP_DIR_OUTPUT 4
252#define TCP_DIR_INPUT_ONLY 8
253
254static int tcp_state_off[IP_VS_DIR_LAST] = {
255 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
256 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
257 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
258};
259
260/*
261 * Timeout table[state]
262 */
263static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
264 [IP_VS_TCP_S_NONE] = 2*HZ,
265 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
266 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
267 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
268 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
269 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
270 [IP_VS_TCP_S_CLOSE] = 10*HZ,
271 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
272 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
273 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
274 [IP_VS_TCP_S_SYNACK] = 120*HZ,
275 [IP_VS_TCP_S_LAST] = 2*HZ,
276};
277
278
279#if 0
280
281/* FIXME: This is going to die */
282
283static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
284 [IP_VS_TCP_S_NONE] = 2*HZ,
285 [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ,
286 [IP_VS_TCP_S_SYN_SENT] = 60*HZ,
287 [IP_VS_TCP_S_SYN_RECV] = 10*HZ,
288 [IP_VS_TCP_S_FIN_WAIT] = 60*HZ,
289 [IP_VS_TCP_S_TIME_WAIT] = 60*HZ,
290 [IP_VS_TCP_S_CLOSE] = 10*HZ,
291 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
292 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
293 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
294 [IP_VS_TCP_S_SYNACK] = 100*HZ,
295 [IP_VS_TCP_S_LAST] = 2*HZ,
296};
297
298#endif
299
300static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
301 [IP_VS_TCP_S_NONE] = "NONE",
302 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
303 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
304 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
305 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
306 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
307 [IP_VS_TCP_S_CLOSE] = "CLOSE",
308 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
309 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
310 [IP_VS_TCP_S_LISTEN] = "LISTEN",
311 [IP_VS_TCP_S_SYNACK] = "SYNACK",
312 [IP_VS_TCP_S_LAST] = "BUG!",
313};
314
315#define sNO IP_VS_TCP_S_NONE
316#define sES IP_VS_TCP_S_ESTABLISHED
317#define sSS IP_VS_TCP_S_SYN_SENT
318#define sSR IP_VS_TCP_S_SYN_RECV
319#define sFW IP_VS_TCP_S_FIN_WAIT
320#define sTW IP_VS_TCP_S_TIME_WAIT
321#define sCL IP_VS_TCP_S_CLOSE
322#define sCW IP_VS_TCP_S_CLOSE_WAIT
323#define sLA IP_VS_TCP_S_LAST_ACK
324#define sLI IP_VS_TCP_S_LISTEN
325#define sSA IP_VS_TCP_S_SYNACK
326
327struct tcp_states_t {
328 int next_state[IP_VS_TCP_S_LAST];
329};
330
331static const char * tcp_state_name(int state)
332{
333 if (state >= IP_VS_TCP_S_LAST)
334 return "ERR!";
335 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
336}
337
338static struct tcp_states_t tcp_states [] = {
339/* INPUT */
340/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
341/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
342/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
343/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
344/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
345
346/* OUTPUT */
347/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
348/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
349/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
350/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
351/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
352
353/* INPUT-ONLY */
354/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
355/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
356/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
357/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
358/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
359};
360
361static struct tcp_states_t tcp_states_dos [] = {
362/* INPUT */
363/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
364/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
365/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
366/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
367/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
368
369/* OUTPUT */
370/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
371/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
372/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
373/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
374/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
375
376/* INPUT-ONLY */
377/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
378/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
379/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
380/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
381/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
382};
383
384static struct tcp_states_t *tcp_state_table = tcp_states;
385
386
387static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
388{
389 int on = (flags & 1); /* secure_tcp */
390
391 /*
392 ** FIXME: change secure_tcp to independent sysctl var
393 ** or make it per-service or per-app because it is valid
394 ** for most if not for all of the applications. Something
395 ** like "capabilities" (flags) for each object.
396 */
397 tcp_state_table = (on? tcp_states_dos : tcp_states);
398}
399
400static int
401tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
402{
403 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
404 tcp_state_name_table, sname, to);
405}
406
407static inline int tcp_state_idx(struct tcphdr *th)
408{
409 if (th->rst)
410 return 3;
411 if (th->syn)
412 return 0;
413 if (th->fin)
414 return 1;
415 if (th->ack)
416 return 2;
417 return -1;
418}
419
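
The three blocks of tcp_states[] are indexed by direction offset plus event index, and the current connection state selects the column. A worked example against the default table: an ACK (tcp_state_idx == 2) seen in the OUTPUT direction for a connection in SYN_RECV.

	int row  = TCP_DIR_OUTPUT + 2;	/* "ack" row of the OUTPUT block */
	int next = tcp_states[row].next_state[IP_VS_TCP_S_SYN_RECV];
	/* next == IP_VS_TCP_S_ESTABLISHED (sES); set_tcp_state() below then
	 * gives the connection the ESTABLISHED timeout from tcp_timeouts[]. */
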
420static inline void
421set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
422 int direction, struct tcphdr *th)
423{
424 int state_idx;
425 int new_state = IP_VS_TCP_S_CLOSE;
426 int state_off = tcp_state_off[direction];
427
428 /*
429 * Update state offset to INPUT_ONLY if necessary
430 * or delete NO_OUTPUT flag if output packet detected
431 */
432 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
433 if (state_off == TCP_DIR_OUTPUT)
434 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
435 else
436 state_off = TCP_DIR_INPUT_ONLY;
437 }
438
439 if ((state_idx = tcp_state_idx(th)) < 0) {
440 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
441 goto tcp_state_out;
442 }
443
444 new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
445
446 tcp_state_out:
447 if (new_state != cp->state) {
448 struct ip_vs_dest *dest = cp->dest;
449
450 IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
451 "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
452 pp->name,
453 (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
454 th->syn? 'S' : '.',
455 th->fin? 'F' : '.',
456 th->ack? 'A' : '.',
457 th->rst? 'R' : '.',
458 NIPQUAD(cp->daddr), ntohs(cp->dport),
459 NIPQUAD(cp->caddr), ntohs(cp->cport),
460 tcp_state_name(cp->state),
461 tcp_state_name(new_state),
462 atomic_read(&cp->refcnt));
463 if (dest) {
464 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
465 (new_state != IP_VS_TCP_S_ESTABLISHED)) {
466 atomic_dec(&dest->activeconns);
467 atomic_inc(&dest->inactconns);
468 cp->flags |= IP_VS_CONN_F_INACTIVE;
469 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
470 (new_state == IP_VS_TCP_S_ESTABLISHED)) {
471 atomic_inc(&dest->activeconns);
472 atomic_dec(&dest->inactconns);
473 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
474 }
475 }
476 }
477
478 cp->timeout = pp->timeout_table[cp->state = new_state];
479}
480
481
482/*
483 * Handle state transitions
484 */
485static int
486tcp_state_transition(struct ip_vs_conn *cp, int direction,
487 const struct sk_buff *skb,
488 struct ip_vs_protocol *pp)
489{
490 struct tcphdr _tcph, *th;
491
492 th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
493 sizeof(_tcph), &_tcph);
494 if (th == NULL)
495 return 0;
496
497 spin_lock(&cp->lock);
498 set_tcp_state(pp, cp, direction, th);
499 spin_unlock(&cp->lock);
500
501 return 1;
502}
503
504
505/*
506 * Hash table for TCP application incarnations
507 */
508#define TCP_APP_TAB_BITS 4
509#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
510#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
511
512static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
513static DEFINE_SPINLOCK(tcp_app_lock);
514
515static inline __u16 tcp_app_hashkey(__u16 port)
516{
517 return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK;
518}
519
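
Application helpers are filed in this 16-bucket table keyed by their registered port, which is kept in network byte order, the same form later compared against cp->vport. Assuming the FTP helper registers its default control port 21, the bucket would be chosen like this (illustration only):

	__u16 port   = __constant_htons(21);	/* assumed ip_vs_ftp default */
	__u16 bucket = ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK;
	/* tcp_register_app() then chains the incarnation onto tcp_apps[bucket]. */
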
520
521static int tcp_register_app(struct ip_vs_app *inc)
522{
523 struct ip_vs_app *i;
524 __u16 hash, port = inc->port;
525 int ret = 0;
526
527 hash = tcp_app_hashkey(port);
528
529 spin_lock_bh(&tcp_app_lock);
530 list_for_each_entry(i, &tcp_apps[hash], p_list) {
531 if (i->port == port) {
532 ret = -EEXIST;
533 goto out;
534 }
535 }
536 list_add(&inc->p_list, &tcp_apps[hash]);
537 atomic_inc(&ip_vs_protocol_tcp.appcnt);
538
539 out:
540 spin_unlock_bh(&tcp_app_lock);
541 return ret;
542}
543
544
545static void
546tcp_unregister_app(struct ip_vs_app *inc)
547{
548 spin_lock_bh(&tcp_app_lock);
549 atomic_dec(&ip_vs_protocol_tcp.appcnt);
550 list_del(&inc->p_list);
551 spin_unlock_bh(&tcp_app_lock);
552}
553
554
555static int
556tcp_app_conn_bind(struct ip_vs_conn *cp)
557{
558 int hash;
559 struct ip_vs_app *inc;
560 int result = 0;
561
562 /* Default binding: bind app only for NAT */
563 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
564 return 0;
565
566 /* Lookup application incarnations and bind the right one */
567 hash = tcp_app_hashkey(cp->vport);
568
569 spin_lock(&tcp_app_lock);
570 list_for_each_entry(inc, &tcp_apps[hash], p_list) {
571 if (inc->port == cp->vport) {
572 if (unlikely(!ip_vs_app_inc_get(inc)))
573 break;
574 spin_unlock(&tcp_app_lock);
575
576 IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
577 "%u.%u.%u.%u:%u to app %s on port %u\n",
578 __FUNCTION__,
579 NIPQUAD(cp->caddr), ntohs(cp->cport),
580 NIPQUAD(cp->vaddr), ntohs(cp->vport),
581 inc->name, ntohs(inc->port));
582 cp->app = inc;
583 if (inc->init_conn)
584 result = inc->init_conn(inc, cp);
585 goto out;
586 }
587 }
588 spin_unlock(&tcp_app_lock);
589
590 out:
591 return result;
592}
593
594
595/*
596 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
597 */
598void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
599{
600 spin_lock(&cp->lock);
601 cp->state = IP_VS_TCP_S_LISTEN;
602 cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
603 spin_unlock(&cp->lock);
604}
605
606
607static void tcp_init(struct ip_vs_protocol *pp)
608{
609 IP_VS_INIT_HASH_TABLE(tcp_apps);
610 pp->timeout_table = tcp_timeouts;
611}
612
613
614static void tcp_exit(struct ip_vs_protocol *pp)
615{
616}
617
618
619struct ip_vs_protocol ip_vs_protocol_tcp = {
620 .name = "TCP",
621 .protocol = IPPROTO_TCP,
622 .dont_defrag = 0,
623 .appcnt = ATOMIC_INIT(0),
624 .init = tcp_init,
625 .exit = tcp_exit,
626 .register_app = tcp_register_app,
627 .unregister_app = tcp_unregister_app,
628 .conn_schedule = tcp_conn_schedule,
629 .conn_in_get = tcp_conn_in_get,
630 .conn_out_get = tcp_conn_out_get,
631 .snat_handler = tcp_snat_handler,
632 .dnat_handler = tcp_dnat_handler,
633 .csum_check = tcp_csum_check,
634 .state_name = tcp_state_name,
635 .state_transition = tcp_state_transition,
636 .app_conn_bind = tcp_app_conn_bind,
637 .debug_packet = ip_vs_tcpudp_debug_packet,
638 .timeout_change = tcp_timeout_change,
639 .set_state_timeout = tcp_set_state_timeout,
640};
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
new file mode 100644
index 000000000000..8ae5f2e0aefa
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -0,0 +1,427 @@
1/*
2 * ip_vs_proto_udp.c: UDP load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/netfilter_ipv4.h>
20
21#include <net/ip_vs.h>
22
23
24static struct ip_vs_conn *
25udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
26 const struct iphdr *iph, unsigned int proto_off, int inverse)
27{
28 struct ip_vs_conn *cp;
29 __u16 _ports[2], *pptr;
30
31 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
32 if (pptr == NULL)
33 return NULL;
34
35 if (likely(!inverse)) {
36 cp = ip_vs_conn_in_get(iph->protocol,
37 iph->saddr, pptr[0],
38 iph->daddr, pptr[1]);
39 } else {
40 cp = ip_vs_conn_in_get(iph->protocol,
41 iph->daddr, pptr[1],
42 iph->saddr, pptr[0]);
43 }
44
45 return cp;
46}
47
48
49static struct ip_vs_conn *
50udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
51 const struct iphdr *iph, unsigned int proto_off, int inverse)
52{
53 struct ip_vs_conn *cp;
54 __u16 _ports[2], *pptr;
55
56 pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4,
57 sizeof(_ports), _ports);
58 if (pptr == NULL)
59 return NULL;
60
61 if (likely(!inverse)) {
62 cp = ip_vs_conn_out_get(iph->protocol,
63 iph->saddr, pptr[0],
64 iph->daddr, pptr[1]);
65 } else {
66 cp = ip_vs_conn_out_get(iph->protocol,
67 iph->daddr, pptr[1],
68 iph->saddr, pptr[0]);
69 }
70
71 return cp;
72}
73
74
75static int
76udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
77 int *verdict, struct ip_vs_conn **cpp)
78{
79 struct ip_vs_service *svc;
80 struct udphdr _udph, *uh;
81
82 uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
83 sizeof(_udph), &_udph);
84 if (uh == NULL) {
85 *verdict = NF_DROP;
86 return 0;
87 }
88
89 if ((svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
90 skb->nh.iph->daddr, uh->dest))) {
91 if (ip_vs_todrop()) {
92 /*
93 * It seems that we are very loaded.
94 * We have to drop this packet :(
95 */
96 ip_vs_service_put(svc);
97 *verdict = NF_DROP;
98 return 0;
99 }
100
101 /*
102 * Let the virtual server select a real server for the
103 * incoming connection, and create a connection entry.
104 */
105 *cpp = ip_vs_schedule(svc, skb);
106 if (!*cpp) {
107 *verdict = ip_vs_leave(svc, skb, pp);
108 return 0;
109 }
110 ip_vs_service_put(svc);
111 }
112 return 1;
113}
114
115
116static inline void
117udp_fast_csum_update(struct udphdr *uhdr, u32 oldip, u32 newip,
118 u16 oldport, u16 newport)
119{
120 uhdr->check =
121 ip_vs_check_diff(~oldip, newip,
122 ip_vs_check_diff(oldport ^ 0xFFFF,
123 newport, uhdr->check));
124 if (!uhdr->check)
125 uhdr->check = 0xFFFF;
126}
127
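
Both the fast update above and the full recalculation in the SNAT/DNAT handlers below respect the RFC 768 rule that an all-zero UDP checksum means "no checksum computed": a sum that happens to fold to zero must therefore go on the wire as 0xFFFF (the two encode the same value in one's-complement arithmetic). As a tiny sketch of that rule:

/* Illustration only: map a folded checksum to its on-wire UDP form. */
static inline __u16 udp_wire_csum(__u16 folded)
{
	return folded ? folded : 0xFFFF;
}
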
128static int
129udp_snat_handler(struct sk_buff **pskb,
130 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
131{
132 struct udphdr *udph;
133 unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
134
135 /* csum_check requires unshared skb */
136 if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
137 return 0;
138
139 if (unlikely(cp->app != NULL)) {
140 /* Some checks before mangling */
141 if (pp->csum_check && !pp->csum_check(*pskb, pp))
142 return 0;
143
144 /*
145 * Call application helper if needed
146 */
147 if (!ip_vs_app_pkt_out(cp, pskb))
148 return 0;
149 }
150
151 udph = (void *)(*pskb)->nh.iph + udphoff;
152 udph->source = cp->vport;
153
154 /*
155 * Adjust UDP checksums
156 */
157 if (!cp->app && (udph->check != 0)) {
158 /* Only port and addr are changed, do fast csum update */
159 udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
160 cp->dport, cp->vport);
161 if ((*pskb)->ip_summed == CHECKSUM_HW)
162 (*pskb)->ip_summed = CHECKSUM_NONE;
163 } else {
164 /* full checksum calculation */
165 udph->check = 0;
166 (*pskb)->csum = skb_checksum(*pskb, udphoff,
167 (*pskb)->len - udphoff, 0);
168 udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
169 (*pskb)->len - udphoff,
170 cp->protocol,
171 (*pskb)->csum);
172 if (udph->check == 0)
173 udph->check = 0xFFFF;
174 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
175 pp->name, udph->check,
176 (char*)&(udph->check) - (char*)udph);
177 }
178 return 1;
179}
180
181
182static int
183udp_dnat_handler(struct sk_buff **pskb,
184 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
185{
186 struct udphdr *udph;
187 unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
188
189 /* csum_check requires unshared skb */
190 if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
191 return 0;
192
193 if (unlikely(cp->app != NULL)) {
194 /* Some checks before mangling */
195 if (pp->csum_check && !pp->csum_check(*pskb, pp))
196 return 0;
197
198 /*
199 * Attempt ip_vs_app call.
200 * It will fix ip_vs_conn
201 */
202 if (!ip_vs_app_pkt_in(cp, pskb))
203 return 0;
204 }
205
206 udph = (void *)(*pskb)->nh.iph + udphoff;
207 udph->dest = cp->dport;
208
209 /*
210 * Adjust UDP checksums
211 */
212 if (!cp->app && (udph->check != 0)) {
213 /* Only port and addr are changed, do fast csum update */
214 udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
215 cp->vport, cp->dport);
216 if ((*pskb)->ip_summed == CHECKSUM_HW)
217 (*pskb)->ip_summed = CHECKSUM_NONE;
218 } else {
219 /* full checksum calculation */
220 udph->check = 0;
221 (*pskb)->csum = skb_checksum(*pskb, udphoff,
222 (*pskb)->len - udphoff, 0);
223 udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
224 (*pskb)->len - udphoff,
225 cp->protocol,
226 (*pskb)->csum);
227 if (udph->check == 0)
228 udph->check = 0xFFFF;
229 (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
230 }
231 return 1;
232}
233
234
235static int
236udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
237{
238 struct udphdr _udph, *uh;
239 unsigned int udphoff = skb->nh.iph->ihl*4;
240
241 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
242 if (uh == NULL)
243 return 0;
244
245 if (uh->check != 0) {
246 switch (skb->ip_summed) {
247 case CHECKSUM_NONE:
248 skb->csum = skb_checksum(skb, udphoff,
249 skb->len - udphoff, 0);
250 case CHECKSUM_HW:
251 if (csum_tcpudp_magic(skb->nh.iph->saddr,
252 skb->nh.iph->daddr,
253 skb->len - udphoff,
254 skb->nh.iph->protocol,
255 skb->csum)) {
256 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
257 "Failed checksum for");
258 return 0;
259 }
260 break;
261 default:
262 /* CHECKSUM_UNNECESSARY */
263 break;
264 }
265 }
266 return 1;
267}
268
269
270/*
271 * Note: the caller guarantees that only one of register_app,
272 * unregister_app or app_conn_bind is called each time.
273 */
274
275#define UDP_APP_TAB_BITS 4
276#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
277#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
278
279static struct list_head udp_apps[UDP_APP_TAB_SIZE];
280static DEFINE_SPINLOCK(udp_app_lock);
281
282static inline __u16 udp_app_hashkey(__u16 port)
283{
284 return ((port >> UDP_APP_TAB_BITS) ^ port) & UDP_APP_TAB_MASK;
285}
286
287
288static int udp_register_app(struct ip_vs_app *inc)
289{
290 struct ip_vs_app *i;
291 __u16 hash, port = inc->port;
292 int ret = 0;
293
294 hash = udp_app_hashkey(port);
295
296
297 spin_lock_bh(&udp_app_lock);
298 list_for_each_entry(i, &udp_apps[hash], p_list) {
299 if (i->port == port) {
300 ret = -EEXIST;
301 goto out;
302 }
303 }
304 list_add(&inc->p_list, &udp_apps[hash]);
305 atomic_inc(&ip_vs_protocol_udp.appcnt);
306
307 out:
308 spin_unlock_bh(&udp_app_lock);
309 return ret;
310}
311
312
313static void
314udp_unregister_app(struct ip_vs_app *inc)
315{
316 spin_lock_bh(&udp_app_lock);
317 atomic_dec(&ip_vs_protocol_udp.appcnt);
318 list_del(&inc->p_list);
319 spin_unlock_bh(&udp_app_lock);
320}
321
322
323static int udp_app_conn_bind(struct ip_vs_conn *cp)
324{
325 int hash;
326 struct ip_vs_app *inc;
327 int result = 0;
328
329 /* Default binding: bind app only for NAT */
330 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
331 return 0;
332
333 /* Lookup application incarnations and bind the right one */
334 hash = udp_app_hashkey(cp->vport);
335
336 spin_lock(&udp_app_lock);
337 list_for_each_entry(inc, &udp_apps[hash], p_list) {
338 if (inc->port == cp->vport) {
339 if (unlikely(!ip_vs_app_inc_get(inc)))
340 break;
341 spin_unlock(&udp_app_lock);
342
343 IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
344 "%u.%u.%u.%u:%u to app %s on port %u\n",
345 __FUNCTION__,
346 NIPQUAD(cp->caddr), ntohs(cp->cport),
347 NIPQUAD(cp->vaddr), ntohs(cp->vport),
348 inc->name, ntohs(inc->port));
349 cp->app = inc;
350 if (inc->init_conn)
351 result = inc->init_conn(inc, cp);
352 goto out;
353 }
354 }
355 spin_unlock(&udp_app_lock);
356
357 out:
358 return result;
359}
360
361
362static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
363 [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
364 [IP_VS_UDP_S_LAST] = 2*HZ,
365};
366
367static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
368 [IP_VS_UDP_S_NORMAL] = "UDP",
369 [IP_VS_UDP_S_LAST] = "BUG!",
370};
371
372
373static int
374udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
375{
376 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
377 udp_state_name_table, sname, to);
378}
379
380static const char * udp_state_name(int state)
381{
382 if (state >= IP_VS_UDP_S_LAST)
383 return "ERR!";
384 return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
385}
386
387static int
388udp_state_transition(struct ip_vs_conn *cp, int direction,
389 const struct sk_buff *skb,
390 struct ip_vs_protocol *pp)
391{
392 cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
393 return 1;
394}
395
396static void udp_init(struct ip_vs_protocol *pp)
397{
398 IP_VS_INIT_HASH_TABLE(udp_apps);
399 pp->timeout_table = udp_timeouts;
400}
401
402static void udp_exit(struct ip_vs_protocol *pp)
403{
404}
405
406
407struct ip_vs_protocol ip_vs_protocol_udp = {
408 .name = "UDP",
409 .protocol = IPPROTO_UDP,
410 .dont_defrag = 0,
411 .init = udp_init,
412 .exit = udp_exit,
413 .conn_schedule = udp_conn_schedule,
414 .conn_in_get = udp_conn_in_get,
415 .conn_out_get = udp_conn_out_get,
416 .snat_handler = udp_snat_handler,
417 .dnat_handler = udp_dnat_handler,
418 .csum_check = udp_csum_check,
419 .state_transition = udp_state_transition,
420 .state_name = udp_state_name,
421 .register_app = udp_register_app,
422 .unregister_app = udp_unregister_app,
423 .app_conn_bind = udp_app_conn_bind,
424 .debug_packet = ip_vs_tcpudp_debug_packet,
425 .timeout_change = NULL,
426 .set_state_timeout = udp_set_state_timeout,
427};
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
new file mode 100644
index 000000000000..b23bab231cab
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_rr.c
@@ -0,0 +1,118 @@
1/*
2 * IPVS: Round-Robin Scheduling module
3 *
4 * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Peter Kese <peter.kese@ijs.si>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Fixes/Changes:
15 * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
16 * Julian Anastasov : fixed the NULL pointer access bug in debugging
17 *     Wensong Zhang            :     changed some cosmetic things for debugging
18 * Wensong Zhang : changed for the d-linked destination list
19 * Wensong Zhang : added the ip_vs_rr_update_svc
20 *     Wensong Zhang            :     added quiescing of any dest with weight=0
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26
27#include <net/ip_vs.h>
28
29
30static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
31{
32 svc->sched_data = &svc->destinations;
33 return 0;
34}
35
36
37static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
38{
39 return 0;
40}
41
42
43static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
44{
45 svc->sched_data = &svc->destinations;
46 return 0;
47}
48
49
50/*
51 * Round-Robin Scheduling
52 */
53static struct ip_vs_dest *
54ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
55{
56 struct list_head *p, *q;
57 struct ip_vs_dest *dest;
58
59 IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
60
61 write_lock(&svc->sched_lock);
62 p = (struct list_head *)svc->sched_data;
63 p = p->next;
64 q = p;
65 do {
66 /* skip list head */
67 if (q == &svc->destinations) {
68 q = q->next;
69 continue;
70 }
71
72 dest = list_entry(q, struct ip_vs_dest, n_list);
73 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
74 atomic_read(&dest->weight) > 0)
75 /* HIT */
76 goto out;
77 q = q->next;
78 } while (q != p);
79 write_unlock(&svc->sched_lock);
80 return NULL;
81
82 out:
83 svc->sched_data = q;
84 write_unlock(&svc->sched_lock);
85 IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
86 "activeconns %d refcnt %d weight %d\n",
87 NIPQUAD(dest->addr), ntohs(dest->port),
88 atomic_read(&dest->activeconns),
89 atomic_read(&dest->refcnt), atomic_read(&dest->weight));
90
91 return dest;
92}
93
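
The scheduler starts one entry past the saved cursor (svc->sched_data), skips the list head and any destination that is overloaded or quiesced with weight 0, and records the hit as the new cursor. The same logic over a plain array, as a user-space sketch (illustrative only):

static int rr_pick(const int *weight, int n, int *cursor)
{
	for (int tried = 0; tried < n; tried++) {
		*cursor = (*cursor + 1) % n;	/* advance circularly */
		if (weight[*cursor] > 0)
			return *cursor;		/* HIT */
	}
	return -1;				/* every destination quiesced */
}
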
94
95static struct ip_vs_scheduler ip_vs_rr_scheduler = {
96 .name = "rr", /* name */
97 .refcnt = ATOMIC_INIT(0),
98 .module = THIS_MODULE,
99 .init_service = ip_vs_rr_init_svc,
100 .done_service = ip_vs_rr_done_svc,
101 .update_service = ip_vs_rr_update_svc,
102 .schedule = ip_vs_rr_schedule,
103};
104
105static int __init ip_vs_rr_init(void)
106{
107 INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
108 return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
109}
110
111static void __exit ip_vs_rr_cleanup(void)
112{
113 unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
114}
115
116module_init(ip_vs_rr_init);
117module_exit(ip_vs_rr_cleanup);
118MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c
new file mode 100644
index 000000000000..0f7c56a225bd
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sched.c
@@ -0,0 +1,251 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 *
18 * Changes:
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/sched.h>
24#include <linux/spinlock.h>
25#include <asm/string.h>
26#include <linux/kmod.h>
27
28#include <net/ip_vs.h>
29
30/*
31 * IPVS scheduler list
32 */
33static LIST_HEAD(ip_vs_schedulers);
34
35/* lock for service table */
36static DEFINE_RWLOCK(__ip_vs_sched_lock);
37
38
39/*
40 * Bind a service with a scheduler
41 */
42int ip_vs_bind_scheduler(struct ip_vs_service *svc,
43 struct ip_vs_scheduler *scheduler)
44{
45 int ret;
46
47 if (svc == NULL) {
48 IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
49 return -EINVAL;
50 }
51 if (scheduler == NULL) {
52 IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
53 return -EINVAL;
54 }
55
56 svc->scheduler = scheduler;
57
58 if (scheduler->init_service) {
59 ret = scheduler->init_service(svc);
60 if (ret) {
61 IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
62 return ret;
63 }
64 }
65
66 return 0;
67}
68
69
70/*
71 *  Unbind a service from its scheduler
72 */
73int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
74{
75 struct ip_vs_scheduler *sched;
76
77 if (svc == NULL) {
78 IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
79 return -EINVAL;
80 }
81
82 sched = svc->scheduler;
83 if (sched == NULL) {
84 IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
85 return -EINVAL;
86 }
87
88 if (sched->done_service) {
89 if (sched->done_service(svc) != 0) {
90 IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
91 return -EINVAL;
92 }
93 }
94
95 svc->scheduler = NULL;
96 return 0;
97}
98
99
100/*
101 * Get scheduler in the scheduler list by name
102 */
103static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
104{
105 struct ip_vs_scheduler *sched;
106
107 IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
108 sched_name);
109
110 read_lock_bh(&__ip_vs_sched_lock);
111
112 list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
113 /*
114 * Test and get the modules atomically
115 */
116 if (sched->module && !try_module_get(sched->module)) {
117 /*
118 * This scheduler is just deleted
119 */
120 continue;
121 }
122 if (strcmp(sched_name, sched->name)==0) {
123 /* HIT */
124 read_unlock_bh(&__ip_vs_sched_lock);
125 return sched;
126 }
127 if (sched->module)
128 module_put(sched->module);
129 }
130
131 read_unlock_bh(&__ip_vs_sched_lock);
132 return NULL;
133}
134
135
136/*
137 * Lookup scheduler and try to load it if it doesn't exist
138 */
139struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
140{
141 struct ip_vs_scheduler *sched;
142
143 /*
144 * Search for the scheduler by sched_name
145 */
146 sched = ip_vs_sched_getbyname(sched_name);
147
148 /*
149 * If scheduler not found, load the module and search again
150 */
151 if (sched == NULL) {
152 request_module("ip_vs_%s", sched_name);
153 sched = ip_vs_sched_getbyname(sched_name);
154 }
155
156 return sched;
157}
158
159void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
160{
161 if (scheduler->module)
162 module_put(scheduler->module);
163}
164
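
These helpers are meant to be called in a fixed sequence: look the scheduler up by name (possibly autoloading ip_vs_<name>), bind it to the service, and drop the module reference when the binding fails or is torn down. A condensed sketch of that contract (function name and error handling are illustrative, not taken from ip_vs_ctl.c):

static int example_bind(struct ip_vs_service *svc, const char *name)
{
	struct ip_vs_scheduler *sched;
	int ret;

	sched = ip_vs_scheduler_get(name);	/* may request_module("ip_vs_<name>") */
	if (sched == NULL)
		return -ENOENT;

	ret = ip_vs_bind_scheduler(svc, sched);
	if (ret)
		ip_vs_scheduler_put(sched);	/* give back the module reference */
	return ret;
}
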
165
166/*
167 * Register a scheduler in the scheduler list
168 */
169int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
170{
171 struct ip_vs_scheduler *sched;
172
173 if (!scheduler) {
174 IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
175 return -EINVAL;
176 }
177
178 if (!scheduler->name) {
179 IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
180 return -EINVAL;
181 }
182
183 /* increase the module use count */
184 ip_vs_use_count_inc();
185
186 /*
187 * Make sure that the scheduler with this name doesn't exist
188 * in the scheduler list.
189 */
190 sched = ip_vs_sched_getbyname(scheduler->name);
191 if (sched) {
192 ip_vs_scheduler_put(sched);
193 ip_vs_use_count_dec();
194 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
195 "already existed in the system\n", scheduler->name);
196 return -EINVAL;
197 }
198
199 write_lock_bh(&__ip_vs_sched_lock);
200
201 if (scheduler->n_list.next != &scheduler->n_list) {
202 write_unlock_bh(&__ip_vs_sched_lock);
203 ip_vs_use_count_dec();
204 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
205 "already linked\n", scheduler->name);
206 return -EINVAL;
207 }
208
209 /*
210 * Add it into the d-linked scheduler list
211 */
212 list_add(&scheduler->n_list, &ip_vs_schedulers);
213 write_unlock_bh(&__ip_vs_sched_lock);
214
215 IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
216
217 return 0;
218}
219
220
221/*
222 * Unregister a scheduler from the scheduler list
223 */
224int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
225{
226 if (!scheduler) {
227 IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
228 return -EINVAL;
229 }
230
231 write_lock_bh(&__ip_vs_sched_lock);
232 if (scheduler->n_list.next == &scheduler->n_list) {
233 write_unlock_bh(&__ip_vs_sched_lock);
234 IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
235 "is not in the list. failed\n", scheduler->name);
236 return -EINVAL;
237 }
238
239 /*
240 * Remove it from the d-linked scheduler list
241 */
242 list_del(&scheduler->n_list);
243 write_unlock_bh(&__ip_vs_sched_lock);
244
245 /* decrease the module use count */
246 ip_vs_use_count_dec();
247
248 IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
249
250 return 0;
251}
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
new file mode 100644
index 000000000000..ff366f7390d9
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sed.c
@@ -0,0 +1,163 @@
1/*
2 * IPVS: Shortest Expected Delay scheduling module
3 *
4 * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16
17/*
18 * The SED algorithm attempts to minimize each job's expected delay until
19 * completion. The expected delay that the job will experience is
20 * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
21 * jobs on the ith server and Ui is the fixed service rate (weight) of
22 * the ith server. The SED algorithm adopts a greedy policy in which each
23 * job does what is in its own best interest, i.e. it joins the queue that
24 * would minimize its expected delay of completion.
25 *
26 * See the following paper for more information:
27 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
28 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
29 * pages 986-994, 1988.
30 *
31 * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
32 *
33 * The difference between SED and WLC is that SED includes the incoming
34 * job in the cost function (the increment of 1). SED may outperform
35 * WLC when scheduling big jobs in highly heterogeneous systems
36 * (where the server weights vary a lot).
37 *
38 */
39
40#include <linux/module.h>
41#include <linux/kernel.h>
42
43#include <net/ip_vs.h>
44
45
46static int
47ip_vs_sed_init_svc(struct ip_vs_service *svc)
48{
49 return 0;
50}
51
52
53static int
54ip_vs_sed_done_svc(struct ip_vs_service *svc)
55{
56 return 0;
57}
58
59
60static int
61ip_vs_sed_update_svc(struct ip_vs_service *svc)
62{
63 return 0;
64}
65
66
67static inline unsigned int
68ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
69{
70 /*
71 * We only use the active connection number in the cost
72 * calculation here.
73 */
74 return atomic_read(&dest->activeconns) + 1;
75}
76
77
78/*
79 *	Shortest Expected Delay scheduling
80 */
81static struct ip_vs_dest *
82ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
83{
84 struct ip_vs_dest *dest, *least;
85 unsigned int loh, doh;
86
87 IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
88
89 /*
90 * We calculate the load of each dest server as follows:
91 * (server expected overhead) / dest->weight
92 *
93 * Remember -- no floats in kernel mode!!!
94 * The comparison of h1*w2 > h2*w1 is equivalent to that of
95 * h1/w1 > h2/w2
96 * if every weight is larger than zero.
97 *
98 * The server with weight=0 is quiesced and will not receive any
99 * new connections.
100 */
101
102 list_for_each_entry(dest, &svc->destinations, n_list) {
103 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
104 atomic_read(&dest->weight) > 0) {
105 least = dest;
106 loh = ip_vs_sed_dest_overhead(least);
107 goto nextstage;
108 }
109 }
110 return NULL;
111
112 /*
113 * Find the destination with the least load.
114 */
115 nextstage:
116 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
117 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
118 continue;
119 doh = ip_vs_sed_dest_overhead(dest);
120 if (loh * atomic_read(&dest->weight) >
121 doh * atomic_read(&least->weight)) {
122 least = dest;
123 loh = doh;
124 }
125 }
126
127 IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u "
128 "activeconns %d refcnt %d weight %d overhead %d\n",
129 NIPQUAD(least->addr), ntohs(least->port),
130 atomic_read(&least->activeconns),
131 atomic_read(&least->refcnt),
132 atomic_read(&least->weight), loh);
133
134 return least;
135}
136
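
A worked instance of the integer comparison in the loop above, with illustrative numbers:

/*
 * server A: 3 active connections, weight 2  ->  overhead loh = 3 + 1 = 4
 * server B: 5 active connections, weight 4  ->  overhead doh = 5 + 1 = 6
 *
 * The expected delays are 4/2 = 2.0 and 6/4 = 1.5, but no floats are
 * allowed in kernel mode, so the code cross-multiplies instead:
 *
 *	loh * weight(B) > doh * weight(A)  <=>  4 * 4 > 6 * 2  <=>  16 > 12
 *
 * which holds, so B replaces A as the current "least" server.
 */
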
137
138static struct ip_vs_scheduler ip_vs_sed_scheduler =
139{
140 .name = "sed",
141 .refcnt = ATOMIC_INIT(0),
142 .module = THIS_MODULE,
143 .init_service = ip_vs_sed_init_svc,
144 .done_service = ip_vs_sed_done_svc,
145 .update_service = ip_vs_sed_update_svc,
146 .schedule = ip_vs_sed_schedule,
147};
148
149
150static int __init ip_vs_sed_init(void)
151{
152 INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list);
153 return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
154}
155
156static void __exit ip_vs_sed_cleanup(void)
157{
158 unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
159}
160
161module_init(ip_vs_sed_init);
162module_exit(ip_vs_sed_cleanup);
163MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
new file mode 100644
index 000000000000..6f7c50e44a39
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -0,0 +1,255 @@
1/*
2 * IPVS: Source Hashing scheduling module
3 *
4 * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16
17/*
18 * The sh algorithm selects the server by the hash key of the source IP
19 * address. The pseudo code is as follows:
20 *
21 * n <- servernode[src_ip];
22 * if (n is dead) OR
23 * (n is overloaded) or (n.weight <= 0) then
24 * return NULL;
25 *
26 * return n;
27 *
28 * Note that servernode is a 256-bucket hash table that maps the hash
29 * index derived from the packet source IP address to the current server
30 * array. If the sh scheduler is used in a cache cluster, it is good to
31 * combine it with the cache_bypass feature. When the statically assigned
32 * server is dead or overloaded, the load balancer can bypass the cache
33 * server and send requests to the original server directly.
34 *
35 */
36
37#include <linux/module.h>
38#include <linux/kernel.h>
39
40#include <net/ip_vs.h>
41
42
43/*
44 * IPVS SH bucket
45 */
46struct ip_vs_sh_bucket {
47 struct ip_vs_dest *dest; /* real server (cache) */
48};
49
50/*
51 * for IPVS SH entry hash table
52 */
53#ifndef CONFIG_IP_VS_SH_TAB_BITS
54#define CONFIG_IP_VS_SH_TAB_BITS 8
55#endif
56#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
57#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
58#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
59
60
61/*
62 * Returns hash value for IPVS SH entry
63 */
64static inline unsigned ip_vs_sh_hashkey(__u32 addr)
65{
66 return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
67}
68
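
2654435761 is a constant widely used for multiplicative (Fibonacci-style) hashing, close to 2^32 divided by the golden ratio; multiplying the host-order address by it scrambles the high bits, and the mask keeps the low IP_VS_SH_TAB_BITS of the product. A stand-alone user-space equivalent (illustrative only):

#include <stdint.h>
#include <arpa/inet.h>		/* ntohl, inet_addr */

static unsigned int sh_bucket(uint32_t addr_be, unsigned int tab_mask)
{
	return (uint32_t)(ntohl(addr_be) * 2654435761UL) & tab_mask;
}

/* e.g. sh_bucket(inet_addr("192.0.2.10"), 0xff) always lands the same client
 * in the same bucket, hence on the same real server, as long as the
 * destination table is not reassigned. */
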
69
70/*
71 * Get ip_vs_dest associated with supplied parameters.
72 */
73static inline struct ip_vs_dest *
74ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr)
75{
76 return (tbl[ip_vs_sh_hashkey(addr)]).dest;
77}
78
79
80/*
81 * Assign all the hash buckets of the specified table with the service.
82 */
83static int
84ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
85{
86 int i;
87 struct ip_vs_sh_bucket *b;
88 struct list_head *p;
89 struct ip_vs_dest *dest;
90
91 b = tbl;
92 p = &svc->destinations;
93 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
94 if (list_empty(p)) {
95 b->dest = NULL;
96 } else {
97 if (p == &svc->destinations)
98 p = p->next;
99
100 dest = list_entry(p, struct ip_vs_dest, n_list);
101 atomic_inc(&dest->refcnt);
102 b->dest = dest;
103
104 p = p->next;
105 }
106 b++;
107 }
108 return 0;
109}
110
111
112/*
113 * Flush all the hash buckets of the specified table.
114 */
115static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
116{
117 int i;
118 struct ip_vs_sh_bucket *b;
119
120 b = tbl;
121 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
122 if (b->dest) {
123 atomic_dec(&b->dest->refcnt);
124 b->dest = NULL;
125 }
126 b++;
127 }
128}
129
130
131static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
132{
133 struct ip_vs_sh_bucket *tbl;
134
135 /* allocate the SH table for this service */
136 tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
137 GFP_ATOMIC);
138 if (tbl == NULL) {
139 IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
140 return -ENOMEM;
141 }
142 svc->sched_data = tbl;
143 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
144 "current service\n",
145 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
146
147 /* assign the hash buckets with the updated service */
148 ip_vs_sh_assign(tbl, svc);
149
150 return 0;
151}
152
153
154static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
155{
156 struct ip_vs_sh_bucket *tbl = svc->sched_data;
157
158 /* got to clean up hash buckets here */
159 ip_vs_sh_flush(tbl);
160
161 /* release the table itself */
162 kfree(svc->sched_data);
163 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
164 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
165
166 return 0;
167}
168
169
170static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
171{
172 struct ip_vs_sh_bucket *tbl = svc->sched_data;
173
174 /* got to clean up hash buckets here */
175 ip_vs_sh_flush(tbl);
176
177 /* assign the hash buckets with the updated service */
178 ip_vs_sh_assign(tbl, svc);
179
180 return 0;
181}
182
183
184/*
185 * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
186 * consider that the server is overloaded here.
187 */
188static inline int is_overloaded(struct ip_vs_dest *dest)
189{
190 return dest->flags & IP_VS_DEST_F_OVERLOAD;
191}
192
193
194/*
195 * Source Hashing scheduling
196 */
197static struct ip_vs_dest *
198ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
199{
200 struct ip_vs_dest *dest;
201 struct ip_vs_sh_bucket *tbl;
202 struct iphdr *iph = skb->nh.iph;
203
204 IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
205
206 tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
207 dest = ip_vs_sh_get(tbl, iph->saddr);
208 if (!dest
209 || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
210 || atomic_read(&dest->weight) <= 0
211 || is_overloaded(dest)) {
212 return NULL;
213 }
214
215 IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
216 "--> server %u.%u.%u.%u:%d\n",
217 NIPQUAD(iph->saddr),
218 NIPQUAD(dest->addr),
219 ntohs(dest->port));
220
221 return dest;
222}
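/*
 * In other words, SH provides a stateless form of persistence: every
 * connection from a given client address maps to the same bucket and
 * hence the same real server, and NULL is returned (no fallback inside
 * this scheduler) when that server is unavailable, quiesced (weight 0)
 * or overloaded.
 */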
223
224
225/*
226 * IPVS SH Scheduler structure
227 */
228static struct ip_vs_scheduler ip_vs_sh_scheduler =
229{
230 .name = "sh",
231 .refcnt = ATOMIC_INIT(0),
232 .module = THIS_MODULE,
233 .init_service = ip_vs_sh_init_svc,
234 .done_service = ip_vs_sh_done_svc,
235 .update_service = ip_vs_sh_update_svc,
236 .schedule = ip_vs_sh_schedule,
237};
238
239
240static int __init ip_vs_sh_init(void)
241{
242 INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list);
243 return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
244}
245
246
247static void __exit ip_vs_sh_cleanup(void)
248{
249 unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
250}
251
252
253module_init(ip_vs_sh_init);
254module_exit(ip_vs_sh_cleanup);
255MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
new file mode 100644
index 000000000000..25c479550a32
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -0,0 +1,892 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 *
12 * ip_vs_sync: sync connection info from master load balancer to backups
13 * through multicast
14 *
15 * Changes:
16 * Alexandre Cassen : Added master & backup support at a time.
17 * Alexandre Cassen : Added SyncID support for incoming sync
18 * messages filtering.
19 * Justin Ossevoort : Fix endian problem on sync message size.
20 */
21
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/net.h>
25#include <linux/completion.h>
26#include <linux/delay.h>
27#include <linux/skbuff.h>
28#include <linux/in.h>
29#include <linux/igmp.h> /* for ip_mc_join_group */
30
31#include <net/ip.h>
32#include <net/sock.h>
33#include <asm/uaccess.h> /* for get_fs and set_fs */
34
35#include <net/ip_vs.h>
36
37#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
38#define IP_VS_SYNC_PORT 8848 /* multicast port */
39
40
41/*
42 * IPVS sync connection entry
43 */
44struct ip_vs_sync_conn {
45 __u8 reserved;
46
47 /* Protocol, addresses and port numbers */
48 __u8 protocol; /* Which protocol (TCP/UDP) */
49 __u16 cport;
50 __u16 vport;
51 __u16 dport;
52 __u32 caddr; /* client address */
53 __u32 vaddr; /* virtual address */
54 __u32 daddr; /* destination address */
55
56 /* Flags and state transition */
57 __u16 flags; /* status flags */
58 __u16 state; /* state info */
59
60 /* The sequence options start here */
61};
62
63struct ip_vs_sync_conn_options {
64 struct ip_vs_seq in_seq; /* incoming seq. struct */
65 struct ip_vs_seq out_seq; /* outgoing seq. struct */
66};
67
68#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
69#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
70#define FULL_CONN_SIZE \
71(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
72
73
74/*
75      The master multicasts messages to the backup load balancers in the
76 following format.
77
78 0 1 2 3
79 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
80 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
81 | Count Conns | SyncID | Size |
82 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83 | |
84 | IPVS Sync Connection (1) |
85 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
86 | . |
87 | . |
88 | . |
89 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 | |
91 | IPVS Sync Connection (n) |
92 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
93*/
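/*
 * For example, a datagram carrying two connection entries without
 * sequence options is SYNC_MESG_HEADER_LEN + 2 * SIMPLE_CONN_SIZE
 * bytes long; "Size" is the total message length and is transmitted in
 * network byte order (see ip_vs_send_sync_msg() and
 * ip_vs_process_message() below).
 */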
94
95#define SYNC_MESG_HEADER_LEN 4
96
97struct ip_vs_sync_mesg {
98 __u8 nr_conns;
99 __u8 syncid;
100 __u16 size;
101
102 /* ip_vs_sync_conn entries start here */
103};
104
105/* the maximum length of sync (sending/receiving) message */
106static int sync_send_mesg_maxlen;
107static int sync_recv_mesg_maxlen;
108
109struct ip_vs_sync_buff {
110 struct list_head list;
111 unsigned long firstuse;
112
113 /* pointers for the message data */
114 struct ip_vs_sync_mesg *mesg;
115 unsigned char *head;
116 unsigned char *end;
117};
118
119
120/* the sync_buff list head and the lock */
121static LIST_HEAD(ip_vs_sync_queue);
122static DEFINE_SPINLOCK(ip_vs_sync_lock);
123
124/* current sync_buff for accepting new conn entries */
125static struct ip_vs_sync_buff *curr_sb = NULL;
126static DEFINE_SPINLOCK(curr_sb_lock);
127
128/* ipvs sync daemon state */
129volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
130volatile int ip_vs_master_syncid = 0;
131volatile int ip_vs_backup_syncid = 0;
132
133/* multicast interface name */
134char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
135char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
136
137/* multicast addr */
138static struct sockaddr_in mcast_addr;
139
140
141static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
142{
143 spin_lock(&ip_vs_sync_lock);
144 list_add_tail(&sb->list, &ip_vs_sync_queue);
145 spin_unlock(&ip_vs_sync_lock);
146}
147
148static inline struct ip_vs_sync_buff * sb_dequeue(void)
149{
150 struct ip_vs_sync_buff *sb;
151
152 spin_lock_bh(&ip_vs_sync_lock);
153 if (list_empty(&ip_vs_sync_queue)) {
154 sb = NULL;
155 } else {
156 sb = list_entry(ip_vs_sync_queue.next,
157 struct ip_vs_sync_buff,
158 list);
159 list_del(&sb->list);
160 }
161 spin_unlock_bh(&ip_vs_sync_lock);
162
163 return sb;
164}
165
166static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
167{
168 struct ip_vs_sync_buff *sb;
169
170 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
171 return NULL;
172
173 if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
174 kfree(sb);
175 return NULL;
176 }
177 sb->mesg->nr_conns = 0;
178 sb->mesg->syncid = ip_vs_master_syncid;
179 sb->mesg->size = 4;
180 sb->head = (unsigned char *)sb->mesg + 4;
181 sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
182 sb->firstuse = jiffies;
183 return sb;
184}
185
186static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
187{
188 kfree(sb->mesg);
189 kfree(sb);
190}
191
192/*
193 * Get the current sync buffer if it has been created for more
194 * than the specified time or the specified time is zero.
195 */
196static inline struct ip_vs_sync_buff *
197get_curr_sync_buff(unsigned long time)
198{
199 struct ip_vs_sync_buff *sb;
200
201 spin_lock_bh(&curr_sb_lock);
202 if (curr_sb && (time == 0 ||
203 time_before(jiffies - curr_sb->firstuse, time))) {
204 sb = curr_sb;
205 curr_sb = NULL;
206 } else
207 sb = NULL;
208 spin_unlock_bh(&curr_sb_lock);
209 return sb;
210}
211
212
213/*
214 *      Add the information of an ip_vs_conn entry into the current sync_buff.
215 * Called by ip_vs_in.
216 */
217void ip_vs_sync_conn(struct ip_vs_conn *cp)
218{
219 struct ip_vs_sync_mesg *m;
220 struct ip_vs_sync_conn *s;
221 int len;
222
223 spin_lock(&curr_sb_lock);
224 if (!curr_sb) {
225 if (!(curr_sb=ip_vs_sync_buff_create())) {
226 spin_unlock(&curr_sb_lock);
227 IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
228 return;
229 }
230 }
231
232 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
233 SIMPLE_CONN_SIZE;
234 m = curr_sb->mesg;
235 s = (struct ip_vs_sync_conn *)curr_sb->head;
236
237 /* copy members */
238 s->protocol = cp->protocol;
239 s->cport = cp->cport;
240 s->vport = cp->vport;
241 s->dport = cp->dport;
242 s->caddr = cp->caddr;
243 s->vaddr = cp->vaddr;
244 s->daddr = cp->daddr;
245 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
246 s->state = htons(cp->state);
247 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
248 struct ip_vs_sync_conn_options *opt =
249 (struct ip_vs_sync_conn_options *)&s[1];
250 memcpy(opt, &cp->in_seq, sizeof(*opt));
251 }
252
253 m->nr_conns++;
254 m->size += len;
255 curr_sb->head += len;
256
257 /* check if there is a space for next one */
258 if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
259 sb_queue_tail(curr_sb);
260 curr_sb = NULL;
261 }
262 spin_unlock(&curr_sb_lock);
263
264	/* synchronize its controller if it has one */
265 if (cp->control)
266 ip_vs_sync_conn(cp->control);
267}
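/*
 * Addresses and ports are copied as-is (they are already in network
 * byte order in the connection entry); only flags and state need the
 * htons() conversion above.  The recursive call at the end also
 * synchronizes the controlling connection (e.g. the FTP control
 * connection of a data connection), if there is one.
 */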
268
269
270/*
271 * Process received multicast message and create the corresponding
272 * ip_vs_conn entries.
273 */
274static void ip_vs_process_message(const char *buffer, const size_t buflen)
275{
276 struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
277 struct ip_vs_sync_conn *s;
278 struct ip_vs_sync_conn_options *opt;
279 struct ip_vs_conn *cp;
280 char *p;
281 int i;
282
283 /* Convert size back to host byte order */
284 m->size = ntohs(m->size);
285
286 if (buflen != m->size) {
287 IP_VS_ERR("bogus message\n");
288 return;
289 }
290
291 /* SyncID sanity check */
292 if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
293 IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
294 m->syncid);
295 return;
296 }
297
298 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
299 for (i=0; i<m->nr_conns; i++) {
300 s = (struct ip_vs_sync_conn *)p;
301 cp = ip_vs_conn_in_get(s->protocol,
302 s->caddr, s->cport,
303 s->vaddr, s->vport);
304 if (!cp) {
305 cp = ip_vs_conn_new(s->protocol,
306 s->caddr, s->cport,
307 s->vaddr, s->vport,
308 s->daddr, s->dport,
309 ntohs(s->flags), NULL);
310 if (!cp) {
311 IP_VS_ERR("ip_vs_conn_new failed\n");
312 return;
313 }
314 cp->state = ntohs(s->state);
315 } else if (!cp->dest) {
316 /* it is an entry created by the synchronization */
317 cp->state = ntohs(s->state);
318 cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED;
319 } /* Note that we don't touch its state and flags
320 if it is a normal entry. */
321
322 if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) {
323 opt = (struct ip_vs_sync_conn_options *)&s[1];
324 memcpy(&cp->in_seq, opt, sizeof(*opt));
325 p += FULL_CONN_SIZE;
326 } else
327 p += SIMPLE_CONN_SIZE;
328
329 atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
330 cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
331 ip_vs_conn_put(cp);
332
333 if (p > buffer+buflen) {
334 IP_VS_ERR("bogus message\n");
335 return;
336 }
337 }
338}
339
340
341/*
342 * Setup loopback of outgoing multicasts on a sending socket
343 */
344static void set_mcast_loop(struct sock *sk, u_char loop)
345{
346 struct inet_sock *inet = inet_sk(sk);
347
348 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
349 lock_sock(sk);
350 inet->mc_loop = loop ? 1 : 0;
351 release_sock(sk);
352}
353
354/*
355 * Specify TTL for outgoing multicasts on a sending socket
356 */
357static void set_mcast_ttl(struct sock *sk, u_char ttl)
358{
359 struct inet_sock *inet = inet_sk(sk);
360
361 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
362 lock_sock(sk);
363 inet->mc_ttl = ttl;
364 release_sock(sk);
365}
366
367/*
369 *      Specify the default interface for outgoing multicasts
369 */
370static int set_mcast_if(struct sock *sk, char *ifname)
371{
372 struct net_device *dev;
373 struct inet_sock *inet = inet_sk(sk);
374
375 if ((dev = __dev_get_by_name(ifname)) == NULL)
376 return -ENODEV;
377
378 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
379 return -EINVAL;
380
381 lock_sock(sk);
382 inet->mc_index = dev->ifindex;
383 /* inet->mc_addr = 0; */
384 release_sock(sk);
385
386 return 0;
387}
388
389
390/*
391 * Set the maximum length of sync message according to the
392 * specified interface's MTU.
393 */
394static int set_sync_mesg_maxlen(int sync_state)
395{
396 struct net_device *dev;
397 int num;
398
399 if (sync_state == IP_VS_STATE_MASTER) {
400 if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
401 return -ENODEV;
402
403 num = (dev->mtu - sizeof(struct iphdr) -
404 sizeof(struct udphdr) -
405 SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
406 sync_send_mesg_maxlen =
407 SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
408 IP_VS_DBG(7, "setting the maximum length of sync sending "
409 "message %d.\n", sync_send_mesg_maxlen);
410 } else if (sync_state == IP_VS_STATE_BACKUP) {
411 if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
412 return -ENODEV;
413
414 sync_recv_mesg_maxlen = dev->mtu -
415 sizeof(struct iphdr) - sizeof(struct udphdr);
416 IP_VS_DBG(7, "setting the maximum length of sync receiving "
417 "message %d.\n", sync_recv_mesg_maxlen);
418 }
419
420 return 0;
421}
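/*
 * Worked example (assuming sizeof(struct ip_vs_sync_conn) == 24 on the
 * local ABI): for a master interface with MTU 1500,
 *      num = (1500 - 20 - 8 - 4 - 20) / 24 = 60
 * so sync_send_mesg_maxlen = 4 + 24 * 60 = 1444 bytes, which keeps a
 * full sync message comfortably inside one unfragmented UDP datagram.
 */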
422
423
424/*
425 * Join a multicast group.
426 *      The group is specified by a class D multicast address (224.0.0.0/4)
427 * in the in_addr structure passed in as a parameter.
428 */
429static int
430join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
431{
432 struct ip_mreqn mreq;
433 struct net_device *dev;
434 int ret;
435
436 memset(&mreq, 0, sizeof(mreq));
437 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
438
439 if ((dev = __dev_get_by_name(ifname)) == NULL)
440 return -ENODEV;
441 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
442 return -EINVAL;
443
444 mreq.imr_ifindex = dev->ifindex;
445
446 lock_sock(sk);
447 ret = ip_mc_join_group(sk, &mreq);
448 release_sock(sk);
449
450 return ret;
451}
452
453
454static int bind_mcastif_addr(struct socket *sock, char *ifname)
455{
456 struct net_device *dev;
457 u32 addr;
458 struct sockaddr_in sin;
459
460 if ((dev = __dev_get_by_name(ifname)) == NULL)
461 return -ENODEV;
462
463 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
464 if (!addr)
465 IP_VS_ERR("You probably need to specify IP address on "
466 "multicast interface.\n");
467
468 IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
469 ifname, NIPQUAD(addr));
470
471 /* Now bind the socket with the address of multicast interface */
472 sin.sin_family = AF_INET;
473 sin.sin_addr.s_addr = addr;
474 sin.sin_port = 0;
475
476 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
477}
478
479/*
480 * Set up sending multicast socket over UDP
481 */
482static struct socket * make_send_sock(void)
483{
484 struct socket *sock;
485
486 /* First create a socket */
487 if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
488 IP_VS_ERR("Error during creation of socket; terminating\n");
489 return NULL;
490 }
491
492 if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
493 IP_VS_ERR("Error setting outbound mcast interface\n");
494 goto error;
495 }
496
497 set_mcast_loop(sock->sk, 0);
498 set_mcast_ttl(sock->sk, 1);
499
500 if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
501 IP_VS_ERR("Error binding address of the mcast interface\n");
502 goto error;
503 }
504
505 if (sock->ops->connect(sock,
506 (struct sockaddr*)&mcast_addr,
507 sizeof(struct sockaddr), 0) < 0) {
508 IP_VS_ERR("Error connecting to the multicast addr\n");
509 goto error;
510 }
511
512 return sock;
513
514 error:
515 sock_release(sock);
516 return NULL;
517}
518
519
520/*
521 * Set up receiving multicast socket over UDP
522 */
523static struct socket * make_receive_sock(void)
524{
525 struct socket *sock;
526
527 /* First create a socket */
528 if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
529 IP_VS_ERR("Error during creation of socket; terminating\n");
530 return NULL;
531 }
532
533 /* it is equivalent to the REUSEADDR option in user-space */
534 sock->sk->sk_reuse = 1;
535
536 if (sock->ops->bind(sock,
537 (struct sockaddr*)&mcast_addr,
538 sizeof(struct sockaddr)) < 0) {
539 IP_VS_ERR("Error binding to the multicast addr\n");
540 goto error;
541 }
542
543 /* join the multicast group */
544 if (join_mcast_group(sock->sk,
545 (struct in_addr*)&mcast_addr.sin_addr,
546 ip_vs_backup_mcast_ifn) < 0) {
547 IP_VS_ERR("Error joining to the multicast group\n");
548 goto error;
549 }
550
551 return sock;
552
553 error:
554 sock_release(sock);
555 return NULL;
556}
557
558
559static int
560ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
561{
562 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
563 struct kvec iov;
564 int len;
565
566 EnterFunction(7);
567 iov.iov_base = (void *)buffer;
568 iov.iov_len = length;
569
570 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
571
572 LeaveFunction(7);
573 return len;
574}
575
576static void
577ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
578{
579 int msize;
580
581 msize = msg->size;
582
583 /* Put size in network byte order */
584 msg->size = htons(msg->size);
585
586 if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
587 IP_VS_ERR("ip_vs_send_async error\n");
588}
589
590static int
591ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
592{
593 struct msghdr msg = {NULL,};
594 struct kvec iov;
595 int len;
596
597 EnterFunction(7);
598
599 /* Receive a packet */
600 iov.iov_base = buffer;
601 iov.iov_len = (size_t)buflen;
602
603 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
604
605 if (len < 0)
606 return -1;
607
608 LeaveFunction(7);
609 return len;
610}
611
612
613static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
614static pid_t sync_master_pid = 0;
615static pid_t sync_backup_pid = 0;
616
617static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
618static int stop_master_sync = 0;
619static int stop_backup_sync = 0;
620
621static void sync_master_loop(void)
622{
623 struct socket *sock;
624 struct ip_vs_sync_buff *sb;
625
626 /* create the sending multicast socket */
627 sock = make_send_sock();
628 if (!sock)
629 return;
630
631 IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
632 "syncid = %d\n",
633 ip_vs_master_mcast_ifn, ip_vs_master_syncid);
634
635 for (;;) {
636 while ((sb=sb_dequeue())) {
637 ip_vs_send_sync_msg(sock, sb->mesg);
638 ip_vs_sync_buff_release(sb);
639 }
640
641 /* check if entries stay in curr_sb for 2 seconds */
642 if ((sb = get_curr_sync_buff(2*HZ))) {
643 ip_vs_send_sync_msg(sock, sb->mesg);
644 ip_vs_sync_buff_release(sb);
645 }
646
647 if (stop_master_sync)
648 break;
649
650 ssleep(1);
651 }
652
653 /* clean up the sync_buff queue */
654 while ((sb=sb_dequeue())) {
655 ip_vs_sync_buff_release(sb);
656 }
657
658 /* clean up the current sync_buff */
659 if ((sb = get_curr_sync_buff(0))) {
660 ip_vs_sync_buff_release(sb);
661 }
662
663 /* release the sending multicast socket */
664 sock_release(sock);
665}
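/*
 * Summary of the loop above: the master wakes up roughly once per
 * second, sends every buffer queued by ip_vs_sync_conn(), and also
 * flushes a partially filled current buffer once it is older than two
 * seconds, so connection state reaches the backups with a delay of at
 * most a few seconds.
 */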
666
667
668static void sync_backup_loop(void)
669{
670 struct socket *sock;
671 char *buf;
672 int len;
673
674 if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
675 IP_VS_ERR("sync_backup_loop: kmalloc error\n");
676 return;
677 }
678
679 /* create the receiving multicast socket */
680 sock = make_receive_sock();
681 if (!sock)
682 goto out;
683
684 IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
685 "syncid = %d\n",
686 ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
687
688 for (;;) {
689		/* is there data waiting on the socket? */
690 while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
691 if ((len =
692 ip_vs_receive(sock, buf,
693 sync_recv_mesg_maxlen)) <= 0) {
694 IP_VS_ERR("receiving message error\n");
695 break;
696 }
697			/* disable bottom half, because it accesses data
698			   shared with softirq while getting/creating conns */
699 local_bh_disable();
700 ip_vs_process_message(buf, len);
701 local_bh_enable();
702 }
703
704 if (stop_backup_sync)
705 break;
706
707 ssleep(1);
708 }
709
710	/* release the receiving multicast socket */
711 sock_release(sock);
712
713 out:
714 kfree(buf);
715}
716
717
718static void set_sync_pid(int sync_state, pid_t sync_pid)
719{
720 if (sync_state == IP_VS_STATE_MASTER)
721 sync_master_pid = sync_pid;
722 else if (sync_state == IP_VS_STATE_BACKUP)
723 sync_backup_pid = sync_pid;
724}
725
726static void set_stop_sync(int sync_state, int set)
727{
728 if (sync_state == IP_VS_STATE_MASTER)
729 stop_master_sync = set;
730 else if (sync_state == IP_VS_STATE_BACKUP)
731 stop_backup_sync = set;
732 else {
733 stop_master_sync = set;
734 stop_backup_sync = set;
735 }
736}
737
738static int sync_thread(void *startup)
739{
740 DECLARE_WAITQUEUE(wait, current);
741 mm_segment_t oldmm;
742 int state;
743 const char *name;
744
745 /* increase the module use count */
746 ip_vs_use_count_inc();
747
748 if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
749 state = IP_VS_STATE_MASTER;
750 name = "ipvs_syncmaster";
751 } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
752 state = IP_VS_STATE_BACKUP;
753 name = "ipvs_syncbackup";
754 } else {
755 IP_VS_BUG();
756 ip_vs_use_count_dec();
757 return -EINVAL;
758 }
759
760 daemonize(name);
761
762 oldmm = get_fs();
763 set_fs(KERNEL_DS);
764
765 /* Block all signals */
766 spin_lock_irq(&current->sighand->siglock);
767 siginitsetinv(&current->blocked, 0);
768 recalc_sigpending();
769 spin_unlock_irq(&current->sighand->siglock);
770
771 /* set the maximum length of sync message */
772 set_sync_mesg_maxlen(state);
773
774 /* set up multicast address */
775 mcast_addr.sin_family = AF_INET;
776 mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
777 mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
778
779 add_wait_queue(&sync_wait, &wait);
780
781 set_sync_pid(state, current->pid);
782 complete((struct completion *)startup);
783
784 /* processing master/backup loop here */
785 if (state == IP_VS_STATE_MASTER)
786 sync_master_loop();
787 else if (state == IP_VS_STATE_BACKUP)
788 sync_backup_loop();
789 else IP_VS_BUG();
790
791 remove_wait_queue(&sync_wait, &wait);
792
793 /* thread exits */
794 set_sync_pid(state, 0);
795 IP_VS_INFO("sync thread stopped!\n");
796
797 set_fs(oldmm);
798
799 /* decrease the module use count */
800 ip_vs_use_count_dec();
801
802 set_stop_sync(state, 0);
803 wake_up(&stop_sync_wait);
804
805 return 0;
806}
807
808
809static int fork_sync_thread(void *startup)
810{
811 pid_t pid;
812
813	/* fork the sync thread here, so that the sync thread is reparented
814	   to the init process once this intermediate thread exits. */
815 repeat:
816 if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) {
817 IP_VS_ERR("could not create sync_thread due to %d... "
818 "retrying.\n", pid);
819 ssleep(1);
820 goto repeat;
821 }
822
823 return 0;
824}
825
826
827int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
828{
829 DECLARE_COMPLETION(startup);
830 pid_t pid;
831
832 if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
833 (state == IP_VS_STATE_BACKUP && sync_backup_pid))
834 return -EEXIST;
835
836 IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
837	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
838 sizeof(struct ip_vs_sync_conn));
839
840 ip_vs_sync_state |= state;
841 if (state == IP_VS_STATE_MASTER) {
842 strcpy(ip_vs_master_mcast_ifn, mcast_ifn);
843 ip_vs_master_syncid = syncid;
844 } else {
845 strcpy(ip_vs_backup_mcast_ifn, mcast_ifn);
846 ip_vs_backup_syncid = syncid;
847 }
848
849 repeat:
850 if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) {
851 IP_VS_ERR("could not create fork_sync_thread due to %d... "
852 "retrying.\n", pid);
853 ssleep(1);
854 goto repeat;
855 }
856
857 wait_for_completion(&startup);
858
859 return 0;
860}
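/*
 * start_sync_thread()/stop_sync_thread() are normally driven from user
 * space through ip_vs_ctl.c; for example (assuming the standard
 * ipvsadm tool):
 *
 *      ipvsadm --start-daemon master --mcast-interface eth0 --syncid 1
 *      ipvsadm --start-daemon backup --mcast-interface eth0 --syncid 1
 *
 * starts the master loop on one director and the backup loop on
 * another, both using multicast group 224.0.0.81, port 8848.
 */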
861
862
863int stop_sync_thread(int state)
864{
865 DECLARE_WAITQUEUE(wait, current);
866
867 if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
868 (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
869 return -ESRCH;
870
871 IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
872 IP_VS_INFO("stopping sync thread %d ...\n",
873 (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);
874
875 __set_current_state(TASK_UNINTERRUPTIBLE);
876 add_wait_queue(&stop_sync_wait, &wait);
877 set_stop_sync(state, 1);
878 ip_vs_sync_state -= state;
879 wake_up(&sync_wait);
880 schedule();
881 __set_current_state(TASK_RUNNING);
882 remove_wait_queue(&stop_sync_wait, &wait);
883
884 /* Note: no need to reap the sync thread, because its parent
885 process is the init process */
886
887 if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
888 (state == IP_VS_STATE_BACKUP && stop_backup_sync))
889 IP_VS_BUG();
890
891 return 0;
892}
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
new file mode 100644
index 000000000000..8a9d913261d8
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_wlc.c
@@ -0,0 +1,151 @@
1/*
2 * IPVS: Weighted Least-Connection Scheduling module
3 *
4 * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Peter Kese <peter.kese@ijs.si>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
16 * Wensong Zhang : changed to use the inactconns in scheduling
17 *     Wensong Zhang            :     changed some cosmetic things for debugging
18 * Wensong Zhang : changed for the d-linked destination list
19 * Wensong Zhang : added the ip_vs_wlc_update_svc
20 * Wensong Zhang : added any dest with weight=0 is quiesced
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26
27#include <net/ip_vs.h>
28
29
30static int
31ip_vs_wlc_init_svc(struct ip_vs_service *svc)
32{
33 return 0;
34}
35
36
37static int
38ip_vs_wlc_done_svc(struct ip_vs_service *svc)
39{
40 return 0;
41}
42
43
44static int
45ip_vs_wlc_update_svc(struct ip_vs_service *svc)
46{
47 return 0;
48}
49
50
51static inline unsigned int
52ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
53{
54 /*
55 * We think the overhead of processing active connections is 256
56	 * times higher than that of inactive connections on average. (This
57	 * factor of 256 may not be accurate; we may change it later.) We
58 * use the following formula to estimate the overhead now:
59 * dest->activeconns*256 + dest->inactconns
60 */
61 return (atomic_read(&dest->activeconns) << 8) +
62 atomic_read(&dest->inactconns);
63}
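/*
 * Example: a destination with 3 active and 10 inactive connections has
 * an overhead of 3*256 + 10 = 778, i.e. active connections dominate
 * unless the inactive count reaches the thousands.
 */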
64
65
66/*
67 * Weighted Least Connection scheduling
68 */
69static struct ip_vs_dest *
70ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
71{
72 struct ip_vs_dest *dest, *least;
73 unsigned int loh, doh;
74
75 IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
76
77 /*
78 * We calculate the load of each dest server as follows:
79 * (dest overhead) / dest->weight
80 *
81 * Remember -- no floats in kernel mode!!!
82 * The comparison of h1*w2 > h2*w1 is equivalent to that of
83 * h1/w1 > h2/w2
84 * if every weight is larger than zero.
85 *
86 * The server with weight=0 is quiesced and will not receive any
87 * new connections.
88 */
89
90 list_for_each_entry(dest, &svc->destinations, n_list) {
91 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
92 atomic_read(&dest->weight) > 0) {
93 least = dest;
94 loh = ip_vs_wlc_dest_overhead(least);
95 goto nextstage;
96 }
97 }
98 return NULL;
99
100 /*
101 * Find the destination with the least load.
102 */
103 nextstage:
104 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
105 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
106 continue;
107 doh = ip_vs_wlc_dest_overhead(dest);
108 if (loh * atomic_read(&dest->weight) >
109 doh * atomic_read(&least->weight)) {
110 least = dest;
111 loh = doh;
112 }
113 }
114
115 IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u "
116 "activeconns %d refcnt %d weight %d overhead %d\n",
117 NIPQUAD(least->addr), ntohs(least->port),
118 atomic_read(&least->activeconns),
119 atomic_read(&least->refcnt),
120 atomic_read(&least->weight), loh);
121
122 return least;
123}
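/*
 * Worked example of the cross-multiplied comparison above: with the
 * current least having loh=778 and weight 2, a candidate with doh=600
 * and weight 2 wins because 778*2 > 600*2, i.e. its overhead per unit
 * of weight (300) is lower than the current best (389).
 */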
124
125
126static struct ip_vs_scheduler ip_vs_wlc_scheduler =
127{
128 .name = "wlc",
129 .refcnt = ATOMIC_INIT(0),
130 .module = THIS_MODULE,
131 .init_service = ip_vs_wlc_init_svc,
132 .done_service = ip_vs_wlc_done_svc,
133 .update_service = ip_vs_wlc_update_svc,
134 .schedule = ip_vs_wlc_schedule,
135};
136
137
138static int __init ip_vs_wlc_init(void)
139{
140 INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
141 return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
142}
143
144static void __exit ip_vs_wlc_cleanup(void)
145{
146 unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
147}
148
149module_init(ip_vs_wlc_init);
150module_exit(ip_vs_wlc_cleanup);
151MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
new file mode 100644
index 000000000000..749fa044eca5
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_wrr.c
@@ -0,0 +1,235 @@
1/*
2 * IPVS: Weighted Round-Robin Scheduling module
3 *
4 * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
15 *     Wensong Zhang            :     changed some cosmetic things for debugging
16 * Wensong Zhang : changed for the d-linked destination list
17 * Wensong Zhang : added the ip_vs_wrr_update_svc
18 * Julian Anastasov : fixed the bug of returning destination
19 * with weight 0 when all weights are zero
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25
26#include <net/ip_vs.h>
27
28/*
29 * current destination pointer for weighted round-robin scheduling
30 */
31struct ip_vs_wrr_mark {
32 struct list_head *cl; /* current list head */
33 int cw; /* current weight */
34 int mw; /* maximum weight */
35 int di; /* decreasing interval */
36};
37
38
39/*
40 * Get the gcd of server weights
41 */
42static int gcd(int a, int b)
43{
44 int c;
45
46 while ((c = a % b)) {
47 a = b;
48 b = c;
49 }
50 return b;
51}
52
53static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
54{
55 struct ip_vs_dest *dest;
56 int weight;
57 int g = 0;
58
59 list_for_each_entry(dest, &svc->destinations, n_list) {
60 weight = atomic_read(&dest->weight);
61 if (weight > 0) {
62 if (g > 0)
63 g = gcd(weight, g);
64 else
65 g = weight;
66 }
67 }
68 return g ? g : 1;
69}
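/*
 * The gcd becomes the "decreasing interval" di of the WRR mark: the
 * current weight cw is lowered by di each time the scan wraps around
 * the destination list, so only weight levels that can actually match
 * are visited.  E.g. weights 4 and 6 give di = 2 and cw steps through
 * 6, 4, 2.
 */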
70
71
72/*
73 * Get the maximum weight of the service destinations.
74 */
75static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
76{
77 struct ip_vs_dest *dest;
78 int weight = 0;
79
80 list_for_each_entry(dest, &svc->destinations, n_list) {
81 if (atomic_read(&dest->weight) > weight)
82 weight = atomic_read(&dest->weight);
83 }
84
85 return weight;
86}
87
88
89static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
90{
91 struct ip_vs_wrr_mark *mark;
92
93 /*
94 * Allocate the mark variable for WRR scheduling
95 */
96 mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
97 if (mark == NULL) {
98 IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
99 return -ENOMEM;
100 }
101 mark->cl = &svc->destinations;
102 mark->cw = 0;
103 mark->mw = ip_vs_wrr_max_weight(svc);
104 mark->di = ip_vs_wrr_gcd_weight(svc);
105 svc->sched_data = mark;
106
107 return 0;
108}
109
110
111static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
112{
113 /*
114 * Release the mark variable
115 */
116 kfree(svc->sched_data);
117
118 return 0;
119}
120
121
122static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
123{
124 struct ip_vs_wrr_mark *mark = svc->sched_data;
125
126 mark->cl = &svc->destinations;
127 mark->mw = ip_vs_wrr_max_weight(svc);
128 mark->di = ip_vs_wrr_gcd_weight(svc);
129 if (mark->cw > mark->mw)
130 mark->cw = 0;
131 return 0;
132}
133
134
135/*
136 * Weighted Round-Robin Scheduling
137 */
138static struct ip_vs_dest *
139ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
140{
141 struct ip_vs_dest *dest;
142 struct ip_vs_wrr_mark *mark = svc->sched_data;
143 struct list_head *p;
144
145 IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
146
147 /*
148	 * This loop will always terminate, because mark->cw is in (0, max_weight]
149 * and at least one server has its weight equal to max_weight.
150 */
151 write_lock(&svc->sched_lock);
152 p = mark->cl;
153 while (1) {
154 if (mark->cl == &svc->destinations) {
155 /* it is at the head of the destination list */
156
157 if (mark->cl == mark->cl->next) {
158 /* no dest entry */
159 dest = NULL;
160 goto out;
161 }
162
163 mark->cl = svc->destinations.next;
164 mark->cw -= mark->di;
165 if (mark->cw <= 0) {
166 mark->cw = mark->mw;
167 /*
168 * Still zero, which means no available servers.
169 */
170 if (mark->cw == 0) {
171 mark->cl = &svc->destinations;
172 IP_VS_INFO("ip_vs_wrr_schedule(): "
173 "no available servers\n");
174 dest = NULL;
175 goto out;
176 }
177 }
178 } else
179 mark->cl = mark->cl->next;
180
181 if (mark->cl != &svc->destinations) {
182 /* not at the head of the list */
183 dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
184 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
185 atomic_read(&dest->weight) >= mark->cw) {
186 /* got it */
187 break;
188 }
189 }
190
191 if (mark->cl == p && mark->cw == mark->di) {
192 /* back to the start, and no dest is found.
193 It is only possible when all dests are OVERLOADED */
194 dest = NULL;
195 goto out;
196 }
197 }
198
199 IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
200 "activeconns %d refcnt %d weight %d\n",
201 NIPQUAD(dest->addr), ntohs(dest->port),
202 atomic_read(&dest->activeconns),
203 atomic_read(&dest->refcnt),
204 atomic_read(&dest->weight));
205
206 out:
207 write_unlock(&svc->sched_lock);
208 return dest;
209}
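/*
 * Example: with three destinations of weight 4, 3 and 2 (mw = 4,
 * di = 1), one full cw cycle 4, 3, 2, 1 picks the servers 4, 3 and 2
 * times respectively, so over time connections are distributed in the
 * ratio of the weights.
 */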
210
211
212static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
213 .name = "wrr",
214 .refcnt = ATOMIC_INIT(0),
215 .module = THIS_MODULE,
216 .init_service = ip_vs_wrr_init_svc,
217 .done_service = ip_vs_wrr_done_svc,
218 .update_service = ip_vs_wrr_update_svc,
219 .schedule = ip_vs_wrr_schedule,
220};
221
222static int __init ip_vs_wrr_init(void)
223{
224 INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
225	return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
226}
227
228static void __exit ip_vs_wrr_cleanup(void)
229{
230 unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
231}
232
233module_init(ip_vs_wrr_init);
234module_exit(ip_vs_wrr_cleanup);
235MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
new file mode 100644
index 000000000000..faa6176bbeb1
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -0,0 +1,563 @@
1/*
2 * ip_vs_xmit.c: various packet transmitters for IPVS
3 *
4 * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/ip.h>
20#include <linux/tcp.h> /* for tcphdr */
21#include <net/tcp.h> /* for csum_tcpudp_magic */
22#include <net/udp.h>
23#include <net/icmp.h> /* for icmp_send */
24#include <net/route.h> /* for ip_route_output */
25#include <linux/netfilter.h>
26#include <linux/netfilter_ipv4.h>
27
28#include <net/ip_vs.h>
29
30
31/*
32 * Destination cache to speed up outgoing route lookup
33 */
34static inline void
35__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
36{
37 struct dst_entry *old_dst;
38
39 old_dst = dest->dst_cache;
40 dest->dst_cache = dst;
41 dest->dst_rtos = rtos;
42 dst_release(old_dst);
43}
44
45static inline struct dst_entry *
46__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
47{
48 struct dst_entry *dst = dest->dst_cache;
49
50 if (!dst)
51 return NULL;
52 if ((dst->obsolete || rtos != dest->dst_rtos) &&
53 dst->ops->check(dst, cookie) == NULL) {
54 dest->dst_cache = NULL;
55 dst_release(dst);
56 return NULL;
57 }
58 dst_hold(dst);
59 return dst;
60}
61
62static inline struct rtable *
63__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
64{
65 struct rtable *rt; /* Route to the other host */
66 struct ip_vs_dest *dest = cp->dest;
67
68 if (dest) {
69 spin_lock(&dest->dst_lock);
70 if (!(rt = (struct rtable *)
71 __ip_vs_dst_check(dest, rtos, 0))) {
72 struct flowi fl = {
73 .oif = 0,
74 .nl_u = {
75 .ip4_u = {
76 .daddr = dest->addr,
77 .saddr = 0,
78 .tos = rtos, } },
79 };
80
81 if (ip_route_output_key(&rt, &fl)) {
82 spin_unlock(&dest->dst_lock);
83 IP_VS_DBG_RL("ip_route_output error, "
84 "dest: %u.%u.%u.%u\n",
85 NIPQUAD(dest->addr));
86 return NULL;
87 }
88 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
89 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
90 NIPQUAD(dest->addr),
91 atomic_read(&rt->u.dst.__refcnt), rtos);
92 }
93 spin_unlock(&dest->dst_lock);
94 } else {
95 struct flowi fl = {
96 .oif = 0,
97 .nl_u = {
98 .ip4_u = {
99 .daddr = cp->daddr,
100 .saddr = 0,
101 .tos = rtos, } },
102 };
103
104 if (ip_route_output_key(&rt, &fl)) {
105 IP_VS_DBG_RL("ip_route_output error, dest: "
106 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
107 return NULL;
108 }
109 }
110
111 return rt;
112}
113
114
115/*
116 * Release dest->dst_cache before a dest is removed
117 */
118void
119ip_vs_dst_reset(struct ip_vs_dest *dest)
120{
121 struct dst_entry *old_dst;
122
123 old_dst = dest->dst_cache;
124 dest->dst_cache = NULL;
125 dst_release(old_dst);
126}
127
128#define IP_VS_XMIT(skb, rt) \
129do { \
130 nf_reset_debug(skb); \
131 (skb)->nfcache |= NFC_IPVS_PROPERTY; \
132 (skb)->ip_summed = CHECKSUM_NONE; \
133 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
134 (rt)->u.dst.dev, dst_output); \
135} while (0)
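/*
 * IP_VS_XMIT tags the skb with NFC_IPVS_PROPERTY (so IPVS will not
 * process it again), forces software checksumming (CHECKSUM_NONE) and
 * re-injects the packet at the LOCAL_OUT netfilter hook, from where
 * dst_output() sends it along the route already attached to skb->dst.
 */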
136
137
138/*
139 * NULL transmitter (do nothing except return NF_ACCEPT)
140 */
141int
142ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
143 struct ip_vs_protocol *pp)
144{
145 /* we do not touch skb and do not need pskb ptr */
146 return NF_ACCEPT;
147}
148
149
150/*
151 * Bypass transmitter
152 * Let packets bypass the destination when the destination is not
153 * available, it may be only used in transparent cache cluster.
154 */
155int
156ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
157 struct ip_vs_protocol *pp)
158{
159 struct rtable *rt; /* Route to the other host */
160 struct iphdr *iph = skb->nh.iph;
161 u8 tos = iph->tos;
162 int mtu;
163 struct flowi fl = {
164 .oif = 0,
165 .nl_u = {
166 .ip4_u = {
167 .daddr = iph->daddr,
168 .saddr = 0,
169 .tos = RT_TOS(tos), } },
170 };
171
172 EnterFunction(10);
173
174 if (ip_route_output_key(&rt, &fl)) {
175 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
176 "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
177 goto tx_error_icmp;
178 }
179
180 /* MTU checking */
181 mtu = dst_mtu(&rt->u.dst);
182 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
183 ip_rt_put(rt);
184 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
185 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
186 goto tx_error;
187 }
188
189 /*
190 * Call ip_send_check because we are not sure it is called
191 * after ip_defrag. Is copy-on-write needed?
192 */
193 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
194 ip_rt_put(rt);
195 return NF_STOLEN;
196 }
197 ip_send_check(skb->nh.iph);
198
199 /* drop old route */
200 dst_release(skb->dst);
201 skb->dst = &rt->u.dst;
202
203 /* Another hack: avoid icmp_send in ip_fragment */
204 skb->local_df = 1;
205
206 IP_VS_XMIT(skb, rt);
207
208 LeaveFunction(10);
209 return NF_STOLEN;
210
211 tx_error_icmp:
212 dst_link_failure(skb);
213 tx_error:
214 kfree_skb(skb);
215 LeaveFunction(10);
216 return NF_STOLEN;
217}
218
219
220/*
221 * NAT transmitter (only for outside-to-inside nat forwarding)
222 * Not used for related ICMP
223 */
224int
225ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
226 struct ip_vs_protocol *pp)
227{
228 struct rtable *rt; /* Route to the other host */
229 int mtu;
230 struct iphdr *iph = skb->nh.iph;
231
232 EnterFunction(10);
233
234 /* check if it is a connection of no-client-port */
235 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
236 __u16 _pt, *p;
237 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
238 if (p == NULL)
239 goto tx_error;
240 ip_vs_conn_fill_cport(cp, *p);
241 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
242 }
243
244 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
245 goto tx_error_icmp;
246
247 /* MTU checking */
248 mtu = dst_mtu(&rt->u.dst);
249 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
250 ip_rt_put(rt);
251 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
252 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
253 goto tx_error;
254 }
255
256 /* copy-on-write the packet before mangling it */
257 if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
258 goto tx_error_put;
259
260 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
261 goto tx_error_put;
262
263 /* drop old route */
264 dst_release(skb->dst);
265 skb->dst = &rt->u.dst;
266
267 /* mangle the packet */
268 if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
269 goto tx_error;
270 skb->nh.iph->daddr = cp->daddr;
271 ip_send_check(skb->nh.iph);
272
273 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
274
275 /* FIXME: when application helper enlarges the packet and the length
276 is larger than the MTU of outgoing device, there will be still
277 MTU problem. */
278
279 /* Another hack: avoid icmp_send in ip_fragment */
280 skb->local_df = 1;
281
282 IP_VS_XMIT(skb, rt);
283
284 LeaveFunction(10);
285 return NF_STOLEN;
286
287 tx_error_icmp:
288 dst_link_failure(skb);
289 tx_error:
290 LeaveFunction(10);
291 kfree_skb(skb);
292 return NF_STOLEN;
293 tx_error_put:
294 ip_rt_put(rt);
295 goto tx_error;
296}
297
298
299/*
300 * IP Tunneling transmitter
301 *
302 * This function encapsulates the packet in a new IP packet, its
303 * destination will be set to cp->daddr. Most code of this function
304 * is taken from ipip.c.
305 *
306 * It is used in VS/TUN cluster. The load balancer selects a real
307 * server from a cluster based on a scheduling algorithm,
308 * encapsulates the request packet and forwards it to the selected
309 * server. For example, all real servers are configured with
310 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
311 *   the encapsulated packet, it will decapsulate the packet, process
312 *   the request and return the response packets directly to the client
313 *   without passing through the load balancer. This can greatly
314 *   increase the scalability of the virtual server.
315 *
316 * Used for ANY protocol
317 */
318int
319ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
320 struct ip_vs_protocol *pp)
321{
322 struct rtable *rt; /* Route to the other host */
323 struct net_device *tdev; /* Device to other host */
324 struct iphdr *old_iph = skb->nh.iph;
325 u8 tos = old_iph->tos;
326 u16 df = old_iph->frag_off;
327 struct iphdr *iph; /* Our new IP header */
328 int max_headroom; /* The extra header space needed */
329 int mtu;
330
331 EnterFunction(10);
332
333 if (skb->protocol != __constant_htons(ETH_P_IP)) {
334 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
335 "ETH_P_IP: %d, skb protocol: %d\n",
336 __constant_htons(ETH_P_IP), skb->protocol);
337 goto tx_error;
338 }
339
340 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
341 goto tx_error_icmp;
342
343 tdev = rt->u.dst.dev;
344
345 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
346 if (mtu < 68) {
347 ip_rt_put(rt);
348 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
349 goto tx_error;
350 }
351 if (skb->dst)
352 skb->dst->ops->update_pmtu(skb->dst, mtu);
353
354 df |= (old_iph->frag_off&__constant_htons(IP_DF));
355
356 if ((old_iph->frag_off&__constant_htons(IP_DF))
357 && mtu < ntohs(old_iph->tot_len)) {
358 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
359 ip_rt_put(rt);
360 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
361 goto tx_error;
362 }
363
364 /*
365 * Okay, now see if we can stuff it in the buffer as-is.
366 */
367 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
368
369 if (skb_headroom(skb) < max_headroom
370 || skb_cloned(skb) || skb_shared(skb)) {
371 struct sk_buff *new_skb =
372 skb_realloc_headroom(skb, max_headroom);
373 if (!new_skb) {
374 ip_rt_put(rt);
375 kfree_skb(skb);
376 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
377 return NF_STOLEN;
378 }
379 kfree_skb(skb);
380 skb = new_skb;
381 old_iph = skb->nh.iph;
382 }
383
384 skb->h.raw = (void *) old_iph;
385
386 /* fix old IP header checksum */
387 ip_send_check(old_iph);
388
389 skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
390 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
391
392 /* drop old route */
393 dst_release(skb->dst);
394 skb->dst = &rt->u.dst;
395
396 /*
397 * Push down and install the IPIP header.
398 */
399 iph = skb->nh.iph;
400 iph->version = 4;
401 iph->ihl = sizeof(struct iphdr)>>2;
402 iph->frag_off = df;
403 iph->protocol = IPPROTO_IPIP;
404 iph->tos = tos;
405 iph->daddr = rt->rt_dst;
406 iph->saddr = rt->rt_src;
407 iph->ttl = old_iph->ttl;
408 iph->tot_len = htons(skb->len);
409 ip_select_ident(iph, &rt->u.dst, NULL);
410 ip_send_check(iph);
411
412 /* Another hack: avoid icmp_send in ip_fragment */
413 skb->local_df = 1;
414
415 IP_VS_XMIT(skb, rt);
416
417 LeaveFunction(10);
418
419 return NF_STOLEN;
420
421 tx_error_icmp:
422 dst_link_failure(skb);
423 tx_error:
424 kfree_skb(skb);
425 LeaveFunction(10);
426 return NF_STOLEN;
427}
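/*
 * MTU example for the tunnel path: encapsulation adds one struct iphdr
 * (20 bytes), so with a 1500-byte route MTU the usable payload is
 * 1480 bytes; a 1490-byte packet with DF set is bounced back to the
 * sender with ICMP_FRAG_NEEDED advertising an MTU of 1480.
 */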
428
429
430/*
431 * Direct Routing transmitter
432 * Used for ANY protocol
433 */
434int
435ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
436 struct ip_vs_protocol *pp)
437{
438 struct rtable *rt; /* Route to the other host */
439 struct iphdr *iph = skb->nh.iph;
440 int mtu;
441
442 EnterFunction(10);
443
444 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
445 goto tx_error_icmp;
446
447 /* MTU checking */
448 mtu = dst_mtu(&rt->u.dst);
449 if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
450 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
451 ip_rt_put(rt);
452 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
453 goto tx_error;
454 }
455
456 /*
457 * Call ip_send_check because we are not sure it is called
458 * after ip_defrag. Is copy-on-write needed?
459 */
460 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
461 ip_rt_put(rt);
462 return NF_STOLEN;
463 }
464 ip_send_check(skb->nh.iph);
465
466 /* drop old route */
467 dst_release(skb->dst);
468 skb->dst = &rt->u.dst;
469
470 /* Another hack: avoid icmp_send in ip_fragment */
471 skb->local_df = 1;
472
473 IP_VS_XMIT(skb, rt);
474
475 LeaveFunction(10);
476 return NF_STOLEN;
477
478 tx_error_icmp:
479 dst_link_failure(skb);
480 tx_error:
481 kfree_skb(skb);
482 LeaveFunction(10);
483 return NF_STOLEN;
484}
485
486
487/*
488 * ICMP packet transmitter
489 * called by the ip_vs_in_icmp
490 */
491int
492ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
493 struct ip_vs_protocol *pp, int offset)
494{
495 struct rtable *rt; /* Route to the other host */
496 int mtu;
497 int rc;
498
499 EnterFunction(10);
500
501 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
502 forwarded directly here, because there is no need to
503 translate address/port back */
504 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
505 if (cp->packet_xmit)
506 rc = cp->packet_xmit(skb, cp, pp);
507 else
508 rc = NF_ACCEPT;
509 /* do not touch skb anymore */
510 atomic_inc(&cp->in_pkts);
511 __ip_vs_conn_put(cp);
512 goto out;
513 }
514
515 /*
516 * mangle and send the packet here (only for VS/NAT)
517 */
518
519 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
520 goto tx_error_icmp;
521
522 /* MTU checking */
523 mtu = dst_mtu(&rt->u.dst);
524 if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
525 ip_rt_put(rt);
526 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
527 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
528 goto tx_error;
529 }
530
531 /* copy-on-write the packet before mangling it */
532 if (!ip_vs_make_skb_writable(&skb, offset))
533 goto tx_error_put;
534
535 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
536 goto tx_error_put;
537
538 /* drop the old route when skb is not shared */
539 dst_release(skb->dst);
540 skb->dst = &rt->u.dst;
541
542 ip_vs_nat_icmp(skb, pp, cp, 0);
543
544 /* Another hack: avoid icmp_send in ip_fragment */
545 skb->local_df = 1;
546
547 IP_VS_XMIT(skb, rt);
548
549 rc = NF_STOLEN;
550 goto out;
551
552 tx_error_icmp:
553 dst_link_failure(skb);
554 tx_error:
555 dev_kfree_skb(skb);
556 rc = NF_STOLEN;
557 out:
558 LeaveFunction(10);
559 return rc;
560 tx_error_put:
561 ip_rt_put(rt);
562 goto tx_error;
563}