132 files changed, 5268 insertions, 1846 deletions
diff --git a/net/Kconfig b/net/Kconfig
index 4193cdcd3ae..c6cec5aa548 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -66,6 +66,13 @@ source "net/ipv6/Kconfig"
 endif # if INET
+config NETWORK_SECMARK
+        bool "Security Marking"
+        help
+          This enables security marking of network packets, similar
+          to nfmark, but designated for security purposes.
+          If you are unsure how to answer this question, answer N.
 menuconfig NETFILTER
        bool "Network packet filtering (replaces ipchains)"
        ---help---
@@ -215,6 +222,21 @@ config NET_PKTGEN
          To compile this code as a module, choose M here: the
          module will be called pktgen.
+config NET_TCPPROBE
+        tristate "TCP connection probing"
+        depends on INET && EXPERIMENTAL && PROC_FS && KPROBES
+        ---help---
+        This module allows for capturing the changes to TCP connection
+        state in response to incoming packets. It is used for debugging
+        TCP congestion avoidance modules. If you don't understand
+        what was just said, you don't need it: say N.
+        Documentation on how to use TCP connection probing can be found
+        at http://linux-net.osdl.org/index.php/TcpProbe
+        To compile this code as a module, choose M here: the
+        module will be called tcp_probe.
 endmenu
 endmenu
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 72d85298266..f92f9c94d2c 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -98,7 +98,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
                printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
                return;
        }
-        spin_lock_bh(&entry->neigh->dev->xmit_lock);    /* block clip_start_xmit() */
+        netif_tx_lock_bh(entry->neigh->dev);    /* block clip_start_xmit() */
        entry->neigh->used = jiffies;
        for (walk = &entry->vccs; *walk; walk = &(*walk)->next)
                if (*walk == clip_vcc) {
@@ -122,7 +122,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
        printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc "
               "0x%p)\n", entry, clip_vcc);
      out:
-        spin_unlock_bh(&entry->neigh->dev->xmit_lock);
+        netif_tx_unlock_bh(entry->neigh->dev);
 }
 /* The neighbour entry n->lock is held. */
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index 59556e40e14..f444c12cde5 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o
 bridge-y        := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
                        br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \
-                        br_stp_if.o br_stp_timer.o
+                        br_stp_if.o br_stp_timer.o br_netlink.o
 bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 12da21afb9c..654401ceb2d 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -30,36 +30,46 @@ static struct llc_sap *br_stp_sap;
 static int __init br_init(void)
 {
+        int err;
        br_stp_sap = llc_sap_open(LLC_SAP_BSPAN, br_stp_rcv);
        if (!br_stp_sap) {
                printk(KERN_ERR "bridge: can't register sap for STP\n");
-                return -EBUSY;
+                return -EADDRINUSE;
        }
        br_fdb_init();
-#ifdef CONFIG_BRIDGE_NETFILTER
+        err = br_netfilter_init();
-        if (br_netfilter_init())
+        if (err)
-                return 1;
+                goto err_out1;
-#endif
+        err = register_netdevice_notifier(&br_device_notifier);
+        if (err)
+                goto err_out2;
+        br_netlink_init();
        brioctl_set(br_ioctl_deviceless_stub);
        br_handle_frame_hook = br_handle_frame;
        br_fdb_get_hook = br_fdb_get;
        br_fdb_put_hook = br_fdb_put;
-        register_netdevice_notifier(&br_device_notifier);
        return 0;
+err_out2:
+        br_netfilter_fini();
+err_out1:
+        llc_sap_put(br_stp_sap);
+        return err;
 }
 static void __exit br_deinit(void)
 {
        rcu_assign_pointer(br_stp_sap->rcv_func, NULL);
-#ifdef CONFIG_BRIDGE_NETFILTER
+        br_netlink_fini();
        br_netfilter_fini();
-#endif
        unregister_netdevice_notifier(&br_device_notifier);
        brioctl_set(NULL);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 0c88a2ac32c..2afdc7c0736 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -145,9 +145,9 @@ static int br_set_tx_csum(struct net_device *dev, u32 data)
        struct net_bridge *br = netdev_priv(dev);
        if (data)
-                br->feature_mask |= NETIF_F_IP_CSUM;
+                br->feature_mask |= NETIF_F_NO_CSUM;
        else
-                br->feature_mask &= ~NETIF_F_IP_CSUM;
+                br->feature_mask &= ~NETIF_F_ALL_CSUM;
        br_features_recompute(br);
        return 0;
@@ -185,5 +185,5 @@ void br_dev_setup(struct net_device *dev)
        dev->priv_flags = IFF_EBRIDGE;
        dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
-                | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM;
+                | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_NO_CSUM;
 }
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 56f3aa47e75..0dca027ceb8 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -20,14 +20,11 @@
 #include <linux/netfilter_bridge.h>
 #include "br_private.h"
+/* Don't forward packets to originating port or if forwarding is disabled */
 static inline int should_deliver(const struct net_bridge_port *p, 
                                 const struct sk_buff *skb)
 {
-        if (skb->dev == p->dev ||
+        return (skb->dev != p->dev && p->state == BR_STATE_FORWARDING);
-            p->state != BR_STATE_FORWARDING)
-                return 0;
-        return 1;
 }
 static inline unsigned packet_length(const struct sk_buff *skb)
@@ -55,10 +52,9 @@ int br_dev_queue_push_xmit(struct sk_buff *skb)
 int br_forward_finish(struct sk_buff *skb)
 {
-        NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev,
+        return NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev,
-                        br_dev_queue_push_xmit);
+                       br_dev_queue_push_xmit);
-        return 0;
 }
 static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index f5d47bf4f96..fdec773f5b5 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -372,12 +372,17 @@ void br_features_recompute(struct net_bridge *br)
        struct net_bridge_port *p;
        unsigned long features, checksum;
-        features = br->feature_mask &~ NETIF_F_IP_CSUM;
+        checksum = br->feature_mask & NETIF_F_ALL_CSUM ? NETIF_F_NO_CSUM : 0;
-        checksum = br->feature_mask & NETIF_F_IP_CSUM;
+        features = br->feature_mask & ~NETIF_F_ALL_CSUM;
        list_for_each_entry(p, &br->port_list, list) {
-                if (!(p->dev->features 
+                if (checksum & NETIF_F_NO_CSUM &&
-                      & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)))
+                    !(p->dev->features & NETIF_F_NO_CSUM))
+                        checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
+                if (checksum & NETIF_F_HW_CSUM &&
+                    !(p->dev->features & NETIF_F_HW_CSUM))
+                        checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
+                if (!(p->dev->features & NETIF_F_IP_CSUM))
                        checksum = 0;
                features &= p->dev->features;
        }
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 3da9264449f..3e41f9d6d51 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -407,12 +407,8 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
        if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
                if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
                        goto inhdr_error;
-                if (pkt_len + sizeof(struct ipv6hdr) < skb->len) {
+                if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
-                        if (__pskb_trim(skb, pkt_len + sizeof(struct ipv6hdr)))
+                        goto inhdr_error;
-                                goto inhdr_error;
-                        if (skb->ip_summed == CHECKSUM_HW)
-                                skb->ip_summed = CHECKSUM_NONE;
-                }
        }
        if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
                goto inhdr_error;
@@ -495,11 +491,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
        if (skb->len < len || len < 4 * iph->ihl)
                goto inhdr_error;
-        if (skb->len > len) {
+        pskb_trim_rcsum(skb, len);
-                __pskb_trim(skb, len);
-                if (skb->ip_summed == CHECKSUM_HW)
-                        skb->ip_summed = CHECKSUM_NONE;
-        }
        nf_bridge_put(skb->nf_bridge);
        if (!nf_bridge_alloc(skb))
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
new file mode 100644
index 00000000000..881d7d1a732
--- /dev/null
+++ b/net/bridge/br_netlink.c
@@ -0,0 +1,199 @@
+/*
+ *      Bridge netlink control interface
+ *
+ *      Authors:
+ *      Stephen Hemminger               <shemminger@osdl.org>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/rtnetlink.h>
+#include "br_private.h"
+/*
+ * Create one netlink message for one interface
+ * Contains port and master info as well as carrier and bridge state.
+ *
+ * The message is an ifinfomsg header with family AF_BRIDGE followed by
+ * the IFLA_IFNAME, IFLA_MASTER, IFLA_MTU and IFLA_OPERSTATE attributes.
+ * IFLA_ADDRESS is added when the device has a hardware address,
+ * IFLA_LINK when iflink differs from ifindex, and IFLA_PROTINFO (the
+ * port state) only for RTM_NEWLINK events.  The operstate reported is
+ * IF_OPER_DOWN whenever the device is not running.
+ *
+ * Returns skb->len on success.  On the nlmsg_failure/rtattr_failure
+ * path the skb is trimmed back to the length it had on entry and
+ * -EINVAL is returned.  NOTE(review): those labels are presumably the
+ * jump targets of NLMSG_NEW()/RTA_PUT() when the skb runs out of room;
+ * the macros are defined elsewhere - confirm.
+ */
+static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port,
+                          u32 pid, u32 seq, int event, unsigned int flags)
+{
+        const struct net_bridge *br = port->br;
+        const struct net_device *dev = port->dev;
+        struct ifinfomsg *r;
+        struct nlmsghdr *nlh;
+        unsigned char *b = skb->tail;
+        u32 mtu = dev->mtu;
+        u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
+        u8 portstate = port->state;
+        pr_debug("br_fill_info event %d port %s master %s\n",
+                 event, dev->name, br->dev->name);
+        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
+        r = NLMSG_DATA(nlh);
+        r->ifi_family = AF_BRIDGE;
+        r->__ifi_pad = 0;
+        r->ifi_type = dev->type;
+        r->ifi_index = dev->ifindex;
+        r->ifi_flags = dev_get_flags(dev);
+        r->ifi_change = 0;
+        RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name);
+        RTA_PUT(skb, IFLA_MASTER, sizeof(int), &br->dev->ifindex);
+        if (dev->addr_len)
+                RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
+        RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu);
+        if (dev->ifindex != dev->iflink)
+                RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink);
+        RTA_PUT(skb, IFLA_OPERSTATE, sizeof(operstate), &operstate);
+        if (event == RTM_NEWLINK)
+                RTA_PUT(skb, IFLA_PROTINFO, sizeof(portstate), &portstate);
+        nlh->nlmsg_len = skb->tail - b;
+        return skb->len;
+nlmsg_failure:
+rtattr_failure:
+        skb_trim(skb, b - skb->data);
+        return -EINVAL;
+}
+/*
+ * Notify listeners of a change in port information
+ *
+ * Builds one message describing @port with br_fill_ifinfo() and
+ * broadcasts it to the RTNLGRP_LINK group.  If the skb cannot be
+ * allocated or filled, the (negative) error is reported to the group
+ * through netlink_set_err() instead.
+ */
+void br_ifinfo_notify(int event, struct net_bridge_port *port)
+{
+        struct sk_buff *skb;
+        int err = -ENOMEM;
+        pr_debug("bridge notify event=%d\n", event);
+        skb = alloc_skb(NLMSG_SPACE(sizeof(struct ifinfomsg) + 128),
+                        GFP_ATOMIC);
+        if (!skb)
+                goto err_out;
+        /*
+         * br_fill_ifinfo() returns skb->len (positive) on success, so
+         * only a negative value means the fill failed.
+         */
+        err = br_fill_ifinfo(skb, port, current->pid, 0, event, 0);
+        if (err < 0)
+                goto err_kfree;
+        NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
+        netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC);
+        return;
+err_kfree:
+        kfree_skb(skb);
+err_out:
+        netlink_set_err(rtnl, 0, RTNLGRP_LINK, err);
+}
+/*
+ * Dump information about all ports, in response to GETLINK
+ *
+ * cb->args[0] holds the index (counted over bridge ports only) of the
+ * first port that has not been dumped yet, so that a dump which did not
+ * fit into one skb can be resumed on the next call.  Returns skb->len.
+ */
+static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+        struct net_device *dev;
+        int idx;
+        int s_idx = cb->args[0];
+        int err = 0;
+        read_lock(&dev_base_lock);
+        for (dev = dev_base, idx = 0; dev; dev = dev->next) {
+                struct net_bridge_port *p = dev->br_port;
+                /* not a bridge port */
+                if (!p)
+                        continue;
+                /*
+                 * Already dumped in an earlier pass: skip it, but still
+                 * count it, otherwise idx never reaches s_idx and the
+                 * rest of the ports are never reported.
+                 */
+                if (idx < s_idx)
+                        goto cont;
+                err = br_fill_ifinfo(skb, p, NETLINK_CB(cb->skb).pid,
+                                     cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI);
+                /* stop at the first port that does not fit; resume there */
+                if (err <= 0)
+                        break;
+cont:
+                ++idx;
+        }
+        read_unlock(&dev_base_lock);
+        cb->args[0] = idx;
+        return skb->len;
+}
+/*
+ * Change state of port (ie from forwarding to blocking etc)
+ * Used by spanning tree in user space.
+ *
+ * The request must carry family AF_BRIDGE and an IFLA_PROTINFO
+ * attribute whose first byte is the new state (at most
+ * BR_STATE_BLOCKING); ifi_index selects the port device.
+ *
+ * Returns 0 on success, otherwise:
+ *   -EPFNOSUPPORT  family is not AF_BRIDGE
+ *   -EINVAL        no IFLA_PROTINFO, state out of range, or the device
+ *                  is not a bridge port
+ *   -ENODEV        no device with that index
+ *   -EBUSY         the bridge has stp_enabled set
+ *   -ENETDOWN      device not running, or carrier is off and the
+ *                  requested state is not BR_STATE_DISABLED
+ *
+ * NOTE(review): the payload length of IFLA_PROTINFO is not checked
+ * before it is dereferenced, and p->state is written without any lock
+ * taken in this function - confirm the caller guarantees both.
+ */
+static int br_rtm_setlink(struct sk_buff *skb,  struct nlmsghdr *nlh, void *arg)
+{
+        struct rtattr  **rta = arg;
+        struct ifinfomsg *ifm = NLMSG_DATA(nlh);
+        struct net_device *dev;
+        struct net_bridge_port *p;
+        u8 new_state;
+        if (ifm->ifi_family != AF_BRIDGE)
+                return -EPFNOSUPPORT;
+        /* Must pass valid state as PROTINFO */
+        if (rta[IFLA_PROTINFO-1]) {
+                u8 *pstate = RTA_DATA(rta[IFLA_PROTINFO-1]);
+                new_state = *pstate;
+        } else
+                return -EINVAL;
+        if (new_state > BR_STATE_BLOCKING)
+                return -EINVAL;
+        /* Find bridge port */
+        dev = __dev_get_by_index(ifm->ifi_index);
+        if (!dev)
+                return -ENODEV;
+        p = dev->br_port;
+        if (!p)
+                return -EINVAL;
+        /* if kernel STP is running, don't allow changes */
+        if (p->br->stp_enabled)
+                return -EBUSY;
+        if (!netif_running(dev))
+                return -ENETDOWN;
+        if (!netif_carrier_ok(dev) && new_state != BR_STATE_DISABLED)
+                return -ENETDOWN;
+        p->state = new_state;
+        br_log_state(p);
+        return 0;
+}
+/*
+ * rtnetlink handlers for the bridge family: RTM_GETLINK dumps all
+ * ports, RTM_SETLINK changes the state of one port.
+ */
+static struct rtnetlink_link bridge_rtnetlink_table[RTM_NR_MSGTYPES] = {
+        [RTM_GETLINK - RTM_BASE] = { .dumpit    = br_dump_ifinfo, },
+        [RTM_SETLINK - RTM_BASE] = { .doit      = br_rtm_setlink, },
+};
+/* Install the bridge handler table in the PF_BRIDGE slot of rtnetlink_links. */
+void __init br_netlink_init(void)
+{
+        rtnetlink_links[PF_BRIDGE] = bridge_rtnetlink_table;
+}
+/* Clear the PF_BRIDGE slot of rtnetlink_links again. */
+void __exit br_netlink_fini(void)
+{
+        rtnetlink_links[PF_BRIDGE] = NULL;
+}
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index a43a9c1d50d..20278494e4d 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -14,6 +14,7 @@
 */
 #include <linux/kernel.h>
+#include <linux/rtnetlink.h>
 #include "br_private.h"
@@ -49,6 +50,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
        case NETDEV_CHANGEADDR:
                br_fdb_changeaddr(p, dev->dev_addr);
+                br_ifinfo_notify(RTM_NEWLINK, p);
                br_stp_recalculate_bridge_id(br);
                break;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 86ecea7ed37..c491fb2f280 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -29,7 +29,7 @@
 #define BR_PORT_DEBOUNCE (HZ/10)
-#define BR_VERSION      "2.1"
+#define BR_VERSION      "2.2"
 typedef struct bridge_id bridge_id;
 typedef struct mac_addr mac_addr;
@@ -192,8 +192,13 @@ extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 extern int br_ioctl_deviceless_stub(unsigned int cmd, void __user *arg);
 /* br_netfilter.c */
+#ifdef CONFIG_BRIDGE_NETFILTER
 extern int br_netfilter_init(void);
 extern void br_netfilter_fini(void);
+#else
+#define br_netfilter_init()     (0)
+#define br_netfilter_fini()     do { } while(0)
+#endif
 /* br_stp.c */
 extern void br_log_state(const struct net_bridge_port *p);
@@ -232,6 +237,11 @@ extern struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
 extern void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
+/* br_netlink.c */
+extern void br_netlink_init(void);
+extern void br_netlink_fini(void);
+extern void br_ifinfo_notify(int event, struct net_bridge_port *port);
 #ifdef CONFIG_SYSFS
 /* br_sysfs_if.c */
 extern struct sysfs_ops brport_sysfs_ops;
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 23dea1422c9..14cd025079a 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/smp_lock.h>
 #include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
 #include "br_private.h"
 #include "br_private_stp.h"
@@ -86,6 +87,7 @@ void br_stp_disable_bridge(struct net_bridge *br)
 void br_stp_enable_port(struct net_bridge_port *p)
 {
        br_init_port(p);
+        br_ifinfo_notify(RTM_NEWLINK, p);
        br_port_state_selection(p->br);
 }
@@ -99,6 +101,8 @@ void br_stp_disable_port(struct net_bridge_port *p)
        printk(KERN_INFO "%s: port %i(%s) entering %s state\n",
               br->dev->name, p->port_no, p->dev->name, "disabled");
+        br_ifinfo_notify(RTM_DELLINK, p);
        wasroot = br_is_root_bridge(br);
        br_become_designated_port(p);
        p->state = BR_STATE_DISABLED;
diff --git a/net/core/Makefile b/net/core/Makefile
index 79fe12cced2..e9bd2467d5a 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_NET_DIVERT) += dv.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_WIRELESS_EXT) += wireless.o
 obj-$(CONFIG_NETPOLL) += netpoll.o
+obj-$(CONFIG_NET_DMA) += user_dma.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 4fba549caf2..ab39fe17cb5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,6 +115,7 @@
 #include <net/iw_handler.h>
 #include <asm/current.h>
 #include <linux/audit.h>
+#include <linux/dmaengine.h>
 /*
 *      The list of packet types we will receive (as opposed to discard)
@@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16]; /* 16 way hashed list */
 static struct list_head ptype_all;              /* Taps */
+#ifdef CONFIG_NET_DMA
+static struct dma_client *net_dma_client;
+static unsigned int net_dma_count;
+static spinlock_t net_dma_event_lock;
+#endif
 /*
 * The @dev_base list is protected by @dev_base_lock and the rtnl
 * semaphore.
@@ -1215,75 +1222,15 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 #define illegal_highdma(dev, skb)       (0)
 #endif
-/* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
-{
-        unsigned int size;
-        u8 *data;
-        long offset;
-        struct skb_shared_info *ninfo;
-        int headerlen = skb->data - skb->head;
-        int expand = (skb->tail + skb->data_len) - skb->end;
-        if (skb_shared(skb))
-                BUG();
-        if (expand <= 0)
-                expand = 0;
-        size = skb->end - skb->head + expand;
-        size = SKB_DATA_ALIGN(size);
-        data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
-        if (!data)
-                return -ENOMEM;
-        /* Copy entire thing */
-        if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
-                BUG();
-        /* Set up shinfo */
-        ninfo = (struct skb_shared_info*)(data + size);
-        atomic_set(&ninfo->dataref, 1);
-        ninfo->tso_size = skb_shinfo(skb)->tso_size;
-        ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
-        ninfo->nr_frags = 0;
-        ninfo->frag_list = NULL;
-        /* Offset between the two in bytes */
-        offset = data - skb->head;
-        /* Free old data. */
-        skb_release_data(skb);
-        skb->head = data;
-        skb->end  = data + size;
-        /* Set up new pointers */
-        skb->h.raw   += offset;
-        skb->nh.raw  += offset;
-        skb->mac.raw += offset;
-        skb->tail    += offset;
-        skb->data    += offset;
-        /* We are no longer a clone, even if we were. */
-        skb->cloned    = 0;
-        skb->tail     += skb->data_len;
-        skb->data_len  = 0;
-        return 0;
-}
 #define HARD_TX_LOCK(dev, cpu) {                        \
        if ((dev->features & NETIF_F_LLTX) == 0) {      \
-                spin_lock(&dev->xmit_lock);             \
+                netif_tx_lock(dev);                     \
-                dev->xmit_lock_owner = cpu;             \
        }                                               \
 }
 #define HARD_TX_UNLOCK(dev) {                           \
        if ((dev->features & NETIF_F_LLTX) == 0) {      \
-                dev->xmit_lock_owner = -1;              \
+                netif_tx_unlock(dev);                   \
-                spin_unlock(&dev->xmit_lock);           \
        }                                               \
 }
@@ -1321,7 +1268,7 @@ int dev_queue_xmit(struct sk_buff *skb)
        if (skb_shinfo(skb)->frag_list &&
            !(dev->features & NETIF_F_FRAGLIST) &&
-            __skb_linearize(skb, GFP_ATOMIC))
+            __skb_linearize(skb))
                goto out_kfree_skb;
        /* Fragmented skb is linearized if device does not support SG,
@@ -1330,14 +1277,14 @@ int dev_queue_xmit(struct sk_buff *skb)
         */
        if (skb_shinfo(skb)->nr_frags &&
            (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
-            __skb_linearize(skb, GFP_ATOMIC))
+            __skb_linearize(skb))
                goto out_kfree_skb;
        /* If packet is not checksummed and device does not support
         * checksumming for this protocol, complete checksumming here.
         */
        if (skb->ip_summed == CHECKSUM_HW &&
-            (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
+            (!(dev->features & NETIF_F_GEN_CSUM) &&
             (!(dev->features & NETIF_F_IP_CSUM) ||
              skb->protocol != htons(ETH_P_IP))))
                if (skb_checksum_help(skb, 0))
@@ -1382,8 +1329,8 @@ int dev_queue_xmit(struct sk_buff *skb)
        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...
-           Really, it is unlikely that xmit_lock protection is necessary here.
+           Really, it is unlikely that netif_tx_lock protection is necessary
-           (f.e. loopback and IP tunnels are clean ignoring statistics
+           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.
@@ -1846,6 +1793,19 @@ static void net_rx_action(struct softirq_action *h)
                }
        }
 out:
+#ifdef CONFIG_NET_DMA
+        /*
+         * There may not be any more sk_buffs coming right now, so push
+         * any pending DMA copies to hardware
+         */
+        if (net_dma_client) {
+                struct dma_chan *chan;
+                rcu_read_lock();
+                list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
+                        dma_async_memcpy_issue_pending(chan);
+                rcu_read_unlock();
+        }
+#endif
        local_irq_enable();
        return;
@@ -2785,7 +2745,7 @@ int register_netdevice(struct net_device *dev)
        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
        spin_lock_init(&dev->queue_lock);
-        spin_lock_init(&dev->xmit_lock);
+        spin_lock_init(&dev->_xmit_lock);
        dev->xmit_lock_owner = -1;
 #ifdef CONFIG_NET_CLS_ACT
        spin_lock_init(&dev->ingress_lock);
@@ -2829,9 +2789,7 @@ int register_netdevice(struct net_device *dev)
        /* Fix illegal SG+CSUM combinations. */
        if ((dev->features & NETIF_F_SG) &&
-            !(dev->features & (NETIF_F_IP_CSUM |
+            !(dev->features & NETIF_F_ALL_CSUM)) {
-                               NETIF_F_NO_CSUM |
-                               NETIF_F_HW_CSUM))) {
                printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
                       dev->name);
                dev->features &= ~NETIF_F_SG;
@@ -3300,6 +3258,88 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 }
 #endif /* CONFIG_HOTPLUG_CPU */
+#ifdef CONFIG_NET_DMA
+/**
+ * net_dma_rebalance - redistribute the client's DMA channels over the CPUs
+ *
+ * This is called when the number of channels allocated to the net_dma_client
+ * changes.  The net_dma_client tries to have one DMA channel per CPU.
+ *
+ * With no channels left, softnet_data.net_dma is cleared for every online
+ * CPU.  Otherwise each channel is assigned to
+ * num_online_cpus() / net_dma_count consecutive online CPUs, and the first
+ * num_online_cpus() % net_dma_count channels each take one CPU more.
+ * CPU hotplug is locked out for the duration of the update.
+ */
+static void net_dma_rebalance(void)
+{
+        unsigned int cpu, i, n;
+        struct dma_chan *chan;
+        lock_cpu_hotplug();
+        if (net_dma_count == 0) {
+                for_each_online_cpu(cpu)
+                        rcu_assign_pointer(per_cpu(softnet_data.net_dma, cpu), NULL);
+                unlock_cpu_hotplug();
+                return;
+        }
+        i = 0;
+        cpu = first_cpu(cpu_online_map);
+        rcu_read_lock();
+        list_for_each_entry(chan, &net_dma_client->channels, client_node) {
+                n = ((num_online_cpus() / net_dma_count)
+                   + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
+                while(n) {
+                        per_cpu(softnet_data.net_dma, cpu) = chan;
+                        cpu = next_cpu(cpu, cpu_online_map);
+                        n--;
+                }
+                i++;
+        }
+        rcu_read_unlock();
+        unlock_cpu_hotplug();
+}
+/**
+ * netdev_dma_event - event callback for the net_dma_client
+ * @client: should always be net_dma_client
+ * @chan: presumably the channel that was added or removed; not used here
+ * @event: DMA_RESOURCE_ADDED increments and DMA_RESOURCE_REMOVED decrements
+ *         net_dma_count, each followed by a rebalance; other events are
+ *         ignored
+ *
+ * The count update and the rebalance run under net_dma_event_lock.
+ * NOTE(review): net_dma_rebalance() calls lock_cpu_hotplug() while this
+ * spinlock is held - confirm that call cannot sleep.
+ */
+static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
+        enum dma_event event)
+{
+        spin_lock(&net_dma_event_lock);
+        switch (event) {
+        case DMA_RESOURCE_ADDED:
+                net_dma_count++;
+                net_dma_rebalance();
+                break;
+        case DMA_RESOURCE_REMOVED:
+                net_dma_count--;
+                net_dma_rebalance();
+                break;
+        default:
+                break;
+        }
+        spin_unlock(&net_dma_event_lock);
+}
+/**
+ * netdev_dma_register - register the networking subsystem as a DMA client
+ *
+ * Initialises net_dma_event_lock, registers netdev_dma_event() as the
+ * client callback and requests num_online_cpus() channels.
+ *
+ * Returns 0 on success, or -ENOMEM if the client registration returns NULL.
+ */
+static int __init netdev_dma_register(void)
+{
+        spin_lock_init(&net_dma_event_lock);
+        net_dma_client = dma_async_client_register(netdev_dma_event);
+        if (net_dma_client == NULL)
+                return -ENOMEM;
+        dma_async_client_chan_request(net_dma_client, num_online_cpus());
+        return 0;
+}
+#else
+static int __init netdev_dma_register(void) { return -ENODEV; }
+#endif /* CONFIG_NET_DMA */
 /*
 *      Initialize the DEV module. At boot time this walks the device list and
@@ -3353,6 +3393,8 @@ static int __init net_dev_init(void)
                atomic_set(&queue->backlog_dev.refcnt, 1);
        }
+        netdev_dma_register();
        dev_boot_phase = 0;
        open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
@@ -3371,7 +3413,6 @@ subsys_initcall(net_dev_init);
 EXPORT_SYMBOL(__dev_get_by_index);
 EXPORT_SYMBOL(__dev_get_by_name);
 EXPORT_SYMBOL(__dev_remove_pack);
-EXPORT_SYMBOL(__skb_linearize);
 EXPORT_SYMBOL(dev_valid_name);
 EXPORT_SYMBOL(dev_add_pack);
 EXPORT_SYMBOL(dev_alloc_name);
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index 05d60850840..c57d887da2e 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -62,7 +62,7 @@
 *      Device mc lists are changed by bh at least if IPv6 is enabled,
 *      so that it must be bh protected.
 *
- *      We block accesses to device mc filters with dev->xmit_lock.
+ *      We block accesses to device mc filters with netif_tx_lock.
 */
 /*
@@ -93,9 +93,9 @@ static void __dev_mc_upload(struct net_device *dev)
 void dev_mc_upload(struct net_device *dev)
 {
-        spin_lock_bh(&dev->xmit_lock);
+        netif_tx_lock_bh(dev);
        __dev_mc_upload(dev);
-        spin_unlock_bh(&dev->xmit_lock);
+        netif_tx_unlock_bh(dev);
 }
 /*
@@ -107,7 +107,7 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
        int err = 0;
        struct dev_mc_list *dmi, **dmip;
-        spin_lock_bh(&dev->xmit_lock);
+        netif_tx_lock_bh(dev);
        for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
                /*
@@ -139,13 +139,13 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
                         */
                        __dev_mc_upload(dev);
                        
-                        spin_unlock_bh(&dev->xmit_lock);
+                        netif_tx_unlock_bh(dev);
                        return 0;
                }
        }
        err = -ENOENT;
 done:
-        spin_unlock_bh(&dev->xmit_lock);
+        netif_tx_unlock_bh(dev);
        return err;
 }
@@ -160,7 +160,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
        dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC);
-        spin_lock_bh(&dev->xmit_lock);
+        netif_tx_lock_bh(dev);
        for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
                if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
                    dmi->dmi_addrlen == alen) {
@@ -176,7 +176,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
        }
        if ((dmi = dmi1) == NULL) {
-                spin_unlock_bh(&dev->xmit_lock);
+                netif_tx_unlock_bh(dev);
                return -ENOMEM;
        }
        memcpy(dmi->dmi_addr, addr, alen);
@@ -189,11 +189,11 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
        __dev_mc_upload(dev);
        
-        spin_unlock_bh(&dev->xmit_lock);
+        netif_tx_unlock_bh(dev);
        return 0;
 done:
-        spin_unlock_bh(&dev->xmit_lock);
+        netif_tx_unlock_bh(dev);
        kfree(dmi1);
        return err;
 }
@@ -204,7 +204,7 @@ done:
 void dev_mc_discard(struct net_device *dev)
 {
-        spin_lock_bh(&dev->xmit_lock);
+        netif_tx_lock_bh(dev);
        
        while (dev->mc_list != NULL) {
                struct dev_mc_list *tmp = dev->mc_list;
@@ -215,7 +215,7 @@ void dev_mc_discard(struct net_device *dev)
        }
        dev->mc_count = 0;
-        spin_unlock_bh(&dev->xmit_lock);
+        netif_tx_unlock_bh(dev);
 }
 #ifdef CONFIG_PROC_FS
@@ -250,7 +250,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)
        struct dev_mc_list *m;
        struct net_device *dev = v;
-        spin_lock_bh(&dev->xmit_lock);
+        netif_tx_lock_bh(dev);
        for (m = dev->mc_list; m; m = m->next) {
                int i;
@@ -262,7 +262,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)
                seq_putc(seq, '\n');
        }
-        spin_unlock_bh(&dev->xmit_lock);
+        netif_tx_unlock_bh(dev);
        return 0;
 }
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e6f76106a99..33ce7ed6afc 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -30,7 +30,7 @@ u32 ethtool_op_get_link(struct net_device *dev)
 u32 ethtool_op_get_tx_csum(struct net_device *dev)
 {
-        return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0;
+        return (dev->features & NETIF_F_ALL_CSUM) != 0;
 }
 int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
@@ -551,9 +551,7 @@ static int ethtool_set_sg(struct net_device *dev, char __user *useraddr)
                return -EFAULT;
        if (edata.data && 
-            !(dev->features & (NETIF_F_IP_CSUM |
+            !(dev->features & NETIF_F_ALL_CSUM))
-                               NETIF_F_NO_CSUM |
-                               NETIF_F_HW_CSUM)))
                return -EINVAL;
        return __ethtool_set_sg(dev, edata.data);
@@ -591,7 +589,7 @@ static int ethtool_set_tso(struct net_device *dev, char __user *useraddr)
 static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr)
 {
-        struct ethtool_value edata = { ETHTOOL_GTSO };
+        struct ethtool_value edata = { ETHTOOL_GUFO };
        if (!dev->ethtool_ops->get_ufo)
                return -EOPNOTSUPP;
@@ -600,6 +598,7 @@ static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr)
                 return -EFAULT;
        return 0;
 }
 static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr)
 {
        struct ethtool_value edata;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e8e05cebd95..9cb78183038 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -273,24 +273,21 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
        do {
                npinfo->tries--;
-                spin_lock(&np->dev->xmit_lock);
+                netif_tx_lock(np->dev);
-                np->dev->xmit_lock_owner = smp_processor_id();
                /*
                 * network drivers do not expect to be called if the queue is
                 * stopped.
                 */
                if (netif_queue_stopped(np->dev)) {
-                        np->dev->xmit_lock_owner = -1;
+                        netif_tx_unlock(np->dev);
-                        spin_unlock(&np->dev->xmit_lock);
                        netpoll_poll(np);
                        udelay(50);
                        continue;
                }
                status = np->dev->hard_start_xmit(skb, np->dev);
-                np->dev->xmit_lock_owner = -1;
+                netif_tx_unlock(np->dev);
-                spin_unlock(&np->dev->xmit_lock);
                /* success */
                if(!status) {
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index c23e9c06ee2..67ed14ddabd 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2897,7 +2897,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
                }
        }
-        spin_lock_bh(&odev->xmit_lock);
+        netif_tx_lock_bh(odev);
        if (!netif_queue_stopped(odev)) {
                atomic_inc(&(pkt_dev->skb->users));
@@ -2942,7 +2942,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
                pkt_dev->next_tx_ns = 0;
        }
-        spin_unlock_bh(&odev->xmit_lock);
+        netif_tx_unlock_bh(odev);
        /* If pkt_dev->count is zero, then run forever */
        if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fb3770f9c09..bb7210f4005 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -464,7 +464,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
        n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
        C(input_dev);
 #endif
+        skb_copy_secmark(n, skb);
 #endif
        C(truesize);
        atomic_set(&n->users, 1);
@@ -526,6 +526,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #endif
        new->tc_index   = old->tc_index;
 #endif
+        skb_copy_secmark(new, old);
        atomic_set(&new->users, 1);
        skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
        skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
@@ -800,12 +801,10 @@ struct sk_buff *skb_pad(struct sk_buff *skb, int pad)
        return nskb;
 }       
 
-/* Trims skb to length len. It can change skb pointers, if "realloc" is 1.
+/* Trims skb to length len. It can change skb pointers.
- * If realloc==0 and trimming is impossible without change of data,
- * it is BUG().
 */
-int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc)
+int ___pskb_trim(struct sk_buff *skb, unsigned int len)
 {
        int offset = skb_headlen(skb);
        int nfrags = skb_shinfo(skb)->nr_frags;
@@ -815,7 +814,6 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc)
                int end = offset + skb_shinfo(skb)->frags[i].size;
                if (end > len) {
                        if (skb_cloned(skb)) {
-                                BUG_ON(!realloc);
                                if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
                                        return -ENOMEM;
                        }
diff --git a/net/core/sock.c b/net/core/sock.c
index ed2afdb9ea2..5d820c37665 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);
+#ifdef CONFIG_NET_DMA
+                skb_queue_head_init(&newsk->sk_async_wait_queue);
+#endif
                rwlock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);
@@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock, struct sock *sk)
        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);
+#ifdef CONFIG_NET_DMA
+        skb_queue_head_init(&sk->sk_async_wait_queue);
+#endif
        sk->sk_send_head        =       NULL;
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
new file mode 100644
index 00000000000..b7c98dbcdb8
--- /dev/null
+++ b/net/core/user_dma.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+#include <linux/dmaengine.h>
+#include <linux/socket.h>
+#include <linux/rtnetlink.h> /* for BUG_TRAP */
+#include <net/tcp.h>
+#define NET_DMA_DEFAULT_COPYBREAK 4096
+int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
+/**
+ *      dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ *      @chan: DMA channel passed through to the dma_memcpy_*_to_iovec() helpers
+ *      @skb: buffer to copy
+ *      @offset: offset in the buffer to start copying from
+ *      @to: io vector to copy to
+ *      @len: amount of data to copy from buffer to iovec
+ *      @pinned_list: locked iovec buffer data
+ *
+ *      Data is taken, in order, from the linear header area of @skb, from
+ *      its page fragments, and from any skbs chained on its frag_list
+ *      (handled by recursion).
+ *
+ *      Returns the cookie of the last copy issued (0 if none was issued)
+ *      and stores it in skb->dma_cookie once all @len bytes are accounted
+ *      for.  Returns -EFAULT if a copy helper returned a negative cookie
+ *      or if @skb holds fewer than @len bytes past @offset.
+ *
+ *      Note: the iovec is modified during the copy.
+ */
+int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
+                        struct sk_buff *skb, int offset, struct iovec *to,
+                        size_t len, struct dma_pinned_list *pinned_list)
+{
+        int start = skb_headlen(skb);
+        int i, copy = start - offset;   /* header bytes at/after offset */
+        dma_cookie_t cookie = 0;
+        /* Copy the part of the request that lies in the linear header. */
+        if (copy > 0) {
+                if (copy > len)
+                        copy = len;
+                cookie = dma_memcpy_to_iovec(chan, to, pinned_list,
+                                            skb->data + offset, copy);
+                if (cookie < 0)
+                        goto fault;
+                len -= copy;
+                if (len == 0)
+                        goto end;
+                offset += copy;
+        }
+        /* Copy paged appendix: [start, end) is the skb range of fragment i. */
+        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+                int end;
+                BUG_TRAP(start <= offset + len);
+                end = start + skb_shinfo(skb)->frags[i].size;
+                copy = end - offset;    /* NOTE(review): redundant, recomputed in the condition below */
+                if ((copy = end - offset) > 0) {
+                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+                        struct page *page = frag->page;
+                        if (copy > len)
+                                copy = len;
+                        cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page,
+                                        frag->page_offset + offset - start, copy);
+                        if (cookie < 0)
+                                goto fault;
+                        len -= copy;
+                        if (len == 0)
+                                goto end;
+                        offset += copy;
+                }
+                start = end;    /* next fragment starts where this one ended */
+        }
+        if (skb_shinfo(skb)->frag_list) {       /* chained skbs, if any */
+                struct sk_buff *list = skb_shinfo(skb)->frag_list;
+                for (; list; list = list->next) {
+                        int end;
+                        BUG_TRAP(start <= offset + len);
+                        end = start + list->len;
+                        copy = end - offset;
+                        if (copy > 0) { /* recurse with an offset relative to 'list' */
+                                if (copy > len)
+                                        copy = len;
+                                cookie = dma_skb_copy_datagram_iovec(chan, list,
+                                                offset - start, to, copy,
+                                                pinned_list);
+                                if (cookie < 0)
+                                        goto fault;
+                                len -= copy;
+                                if (len == 0)
+                                        goto end;
+                                offset += copy;
+                        }
+                        start = end;
+                }
+        }
+end:    /* also reached by fall-through, possibly with len still non-zero */
+        if (!len) {
+                skb->dma_cookie = cookie;
+                return cookie;
+        }
+fault:  /* a copy helper failed, or the skb ran out before len bytes */
+        return -EFAULT;
+}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 2e0ee8355c4..5317fd3e669 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                }
                dccp_pr_debug("packet_type=%s\n",
                              dccp_packet_name(dh->dccph_type));
-                sk_eat_skb(sk, skb);
+                sk_eat_skb(sk, skb, 0);
 verify_sock_status:
                if (sock_flag(sk, SOCK_DONE)) {
                        len = 0;
@@ -773,7 +773,7 @@ verify_sock_status:
                }
        found_fin_ok:
                if (!(flags & MSG_PEEK))
-                        sk_eat_skb(sk, skb);
+                        sk_eat_skb(sk, skb, 0);
                break;
        } while (1);
 out:
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 547523b41c8..a2ba9db1c37 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -801,8 +801,7 @@ got_it:
                 * We linearize everything except data segments here.
                 */
                if (cb->nsp_flags & ~0x60) {
-                        if (unlikely(skb_is_nonlinear(skb)) &&
+                        if (unlikely(skb_linearize(skb)))
-                            skb_linearize(skb, GFP_ATOMIC) != 0)
                                goto free_out;
                }
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index e172cf98d7f..5abf7057af0 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -629,8 +629,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type
                        padlen);
        if (flags & DN_RT_PKT_CNTL) {
-                if (unlikely(skb_is_nonlinear(skb)) &&
+                if (unlikely(skb_linearize(skb)))
-                    skb_linearize(skb, GFP_ATOMIC) != 0)
                        goto dump_it;
                switch(flags & DN_RT_CNTL_MSK) {
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e40f7532237..da33393be45 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -414,6 +414,24 @@ config INET_TUNNEL
        tristate
        default n
+config INET_XFRM_MODE_TRANSPORT
+        tristate "IP: IPsec transport mode"
+        default y
+        select XFRM
+        ---help---
+          Support for IPsec transport mode.
+          If unsure, say Y.
+config INET_XFRM_MODE_TUNNEL
+        tristate "IP: IPsec tunnel mode"
+        default y
+        select XFRM
+        ---help---
+          Support for IPsec tunnel mode.
+          If unsure, say Y.
 config INET_DIAG
        tristate "INET: socket monitoring interface"
        default y
@@ -532,6 +550,38 @@ config TCP_CONG_SCALABLE
        properties, though is known to have fairness issues.
        See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+config TCP_CONG_LP
+        tristate "TCP Low Priority"
+        depends on EXPERIMENTAL
+        default n
+        ---help---
+        TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+        to utilize only the excess network bandwidth as compared to the
+        ``fair share`` of bandwidth as targeted by TCP.
+        See http://www-ece.rice.edu/networks/TCP-LP/
+config TCP_CONG_VENO
+        tristate "TCP Veno"
+        depends on EXPERIMENTAL
+        default n
+        ---help---
+        TCP Veno is a sender-side only enhancement of TCP to obtain better
+        throughput over wireless networks. TCP Veno makes use of state
+        distinguishing to circumvent the difficult judgment of the packet loss
+        type. TCP Veno cuts down less congestion window in response to random
+        loss packets.
+        See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+config TCP_CONG_COMPOUND
+        tristate "TCP Compound"
+        depends on EXPERIMENTAL
+        default n
+        ---help---
+        TCP Compound is a sender-side only change to TCP that uses
+        a mixed Reno/Vegas approach to calculate the cwnd.
+        For further details look here:
+          ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf
 endmenu
 config TCP_CONG_BIC
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9ef50a0b9d2..38b8039bdd5 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
 obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
+obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
+obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
 obj-$(CONFIG_IP_PNP) += ipconfig.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
@@ -34,6 +36,7 @@ obj-$(CONFIG_IP_VS) += ipvs/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o 
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
+obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
@@ -41,7 +44,10 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
 obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
 obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
 obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
+obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
+obj-$(CONFIG_TCP_CONG_COMPOUND) += tcp_compound.o
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
                      xfrm4_output.o
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index e2e4771fa4c..c7782230080 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -119,6 +119,7 @@ error:
 static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 {
        int ah_hlen;
+        int ihl;
        struct iphdr *iph;
        struct ip_auth_hdr *ah;
        struct ah_data *ahp;
@@ -149,13 +150,14 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
        ah = (struct ip_auth_hdr*)skb->data;
        iph = skb->nh.iph;
-        memcpy(work_buf, iph, iph->ihl*4);
+        ihl = skb->data - skb->nh.raw;
+        memcpy(work_buf, iph, ihl);
        iph->ttl = 0;
        iph->tos = 0;
        iph->frag_off = 0;
        iph->check = 0;
-        if (iph->ihl != 5) {
+        if (ihl > sizeof(*iph)) {
                u32 dummy;
                if (ip_clear_mutable_options(iph, &dummy))
                        goto out;
@@ -164,7 +166,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
                u8 auth_data[MAX_AH_AUTH_LEN];
                
                memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
-                skb_push(skb, skb->data - skb->nh.raw);
+                skb_push(skb, ihl);
                ahp->icv(ahp, skb, ah->auth_data);
                if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
                        x->stats.integrity_failed++;
@@ -172,11 +174,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
                }
        }
        ((struct iphdr*)work_buf)->protocol = ah->nexthdr;
-        skb->nh.raw = skb_pull(skb, ah_hlen);
+        skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl);
-        memcpy(skb->nh.raw, work_buf, iph->ihl*4);
+        __skb_pull(skb, ah_hlen + ihl);
-        skb->nh.iph->tot_len = htons(skb->len);
-        skb_pull(skb, skb->nh.iph->ihl*4);
-        skb->h.raw = skb->data;
        return 0;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 9d1881c07a3..9bbdd449455 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -143,10 +143,9 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
        int alen = esp->auth.icv_trunc_len;
        int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
        int nfrags;
-        int encap_len = 0;
+        int ihl;
        u8 nexthdr[2];
        struct scatterlist *sg;
-        u8 workbuf[60];
        int padlen;
        if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr)))
@@ -177,7 +176,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
        skb->ip_summed = CHECKSUM_NONE;
        esph = (struct ip_esp_hdr*)skb->data;
-        iph = skb->nh.iph;
        /* Get ivec. This can be wrong, check against another impls. */
        if (esp->conf.ivlen)
@@ -204,12 +202,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
        /* ... check padding bits here. Silly. :-) */ 
+        iph = skb->nh.iph;
+        ihl = iph->ihl * 4;
        if (x->encap) {
                struct xfrm_encap_tmpl *encap = x->encap;
-                struct udphdr *uh;
+                struct udphdr *uh = (void *)(skb->nh.raw + ihl);
-                uh = (struct udphdr *)(iph + 1);
-                encap_len = (void*)esph - (void*)uh;
                /*
                 * 1) if the NAT-T peer's IP or port changed then
@@ -246,11 +244,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
        iph->protocol = nexthdr[1];
        pskb_trim(skb, skb->len - alen - padlen - 2);
-        memcpy(workbuf, skb->nh.raw, iph->ihl*4);
+        skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl;
-        skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen);
-        skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
-        memcpy(skb->nh.raw, workbuf, iph->ihl*4);
-        skb->nh.iph->tot_len = htons(skb->len);
        return 0;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 2a0455911ee..017900172f7 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -730,7 +730,6 @@ out_err:
 static void icmp_redirect(struct sk_buff *skb)
 {
        struct iphdr *iph;
-        unsigned long ip;
        if (skb->len < sizeof(struct iphdr))
                goto out_err;
@@ -742,7 +741,6 @@ static void icmp_redirect(struct sk_buff *skb)
                goto out;
        iph = (struct iphdr *)skb->data;
-        ip = iph->daddr;
        switch (skb->h.icmph->code & 7) {
        case ICMP_REDIR_NET:
@@ -752,7 +750,8 @@ static void icmp_redirect(struct sk_buff *skb)
                 */
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
-                ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway,
+                ip_rt_redirect(skb->nh.iph->saddr, iph->daddr,
+                               skb->h.icmph->un.gateway,
                               iph->saddr, skb->dev);
                break;
        }
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d512239a147..ab680c851aa 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2361,7 +2361,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
                }
                seq_printf(seq,
-                           "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n",
+                           "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
                           im->multiaddr, im->users,
                           im->tm_running, im->tm_running ?
                           jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cff9c3a72da..8538aac3d14 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -410,6 +410,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
        nf_bridge_get(to->nf_bridge);
 #endif
 #endif
+        skb_copy_secmark(to, from);
 }
 /*
@@ -839,7 +840,7 @@ int ip_append_data(struct sock *sk,
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
-            rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
+            rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
            !exthdrlen)
                csummode = CHECKSUM_HW;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 95278b22b66..3ed8b57a100 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -45,7 +45,6 @@ static LIST_HEAD(ipcomp_tfms_list);
 static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
 {
        int err, plen, dlen;
-        struct iphdr *iph;
        struct ipcomp_data *ipcd = x->data;
        u8 *start, *scratch;
        struct crypto_tfm *tfm;
@@ -74,8 +73,6 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
                
        skb_put(skb, dlen - plen);
        memcpy(skb->data, scratch, dlen);
-        iph = skb->nh.iph;
-        iph->tot_len = htons(dlen + iph->ihl * 4);
 out:    
        put_cpu();
        return err;
@@ -83,34 +80,21 @@ out:
 static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
 {
-        u8 nexthdr;
+        int err = -ENOMEM;
-        int err = 0;
        struct iphdr *iph;
-        union {
+        struct ip_comp_hdr *ipch;
-                struct iphdr    iph;
-                char            buf[60];
-        } tmp_iph;
-        if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
+        if (skb_linearize_cow(skb))
-            skb_linearize(skb, GFP_ATOMIC) != 0) {
-                err = -ENOMEM;
                goto out;
-        }
        skb->ip_summed = CHECKSUM_NONE;
        /* Remove ipcomp header and decompress original payload */      
        iph = skb->nh.iph;
-        memcpy(&tmp_iph, iph, iph->ihl * 4);
+        ipch = (void *)skb->data;
-        nexthdr = *(u8 *)skb->data;
+        iph->protocol = ipch->nexthdr;
-        skb_pull(skb, sizeof(struct ip_comp_hdr));
+        skb->h.raw = skb->nh.raw + sizeof(*ipch);
-        skb->nh.raw += sizeof(struct ip_comp_hdr);
+        __skb_pull(skb, sizeof(*ipch));
-        memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4);
-        iph = skb->nh.iph;
-        iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr));
-        iph->protocol = nexthdr;
-        skb->h.raw = skb->data;
        err = ipcomp_decompress(x, skb);
 out:    
@@ -171,10 +155,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
                goto out_ok;
        }
-        if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
+        if (skb_linearize_cow(skb))
-            skb_linearize(skb, GFP_ATOMIC) != 0) {
                goto out_ok;
-        }
        
        err = ipcomp_compress(x, skb);
        iph = skb->nh.iph;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d4072533da2..e1d7f5fbc52 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK
          of packets, but this mark value is kept in the conntrack session
          instead of the individual packets.
        
+config IP_NF_CONNTRACK_SECMARK
+        bool  'Connection tracking security mark support'
+        depends on IP_NF_CONNTRACK && NETWORK_SECMARK
+        help
+          This option enables security markings to be applied to
+          connections.  Typically they are copied to connections from
+          packets using the CONNSECMARK target and copied back from
+          connections to packets with the same target, with the packets
+          being originally labeled via SECMARK.
+          If unsure, say 'N'.
 config IP_NF_CONNTRACK_EVENTS
        bool "Connection tracking events (EXPERIMENTAL)"
        depends on EXPERIMENTAL && IP_NF_CONNTRACK
@@ -142,6 +154,8 @@ config IP_NF_TFTP
 config IP_NF_AMANDA
        tristate "Amanda backup protocol support"
        depends on IP_NF_CONNTRACK
+        select TEXTSEARCH
+        select TEXTSEARCH_KMP
        help
          If you are running the Amanda backup package <http://www.amanda.org/>
          on this machine or machines that will be MASQUERADED through this
@@ -181,14 +195,26 @@ config IP_NF_H323
          With this module you can support H.323 on a connection tracking/NAT
          firewall.
-          This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP
+          This module supports RAS, Fast Start, H.245 Tunnelling, Call
-          and T.120 based data and applications including audio, video, FAX,
+          Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat,
-          chat, whiteboard, file transfer, etc. For more information, please
+          whiteboard, file transfer, etc. For more information, please
-          see http://nath323.sourceforge.net/.
+          visit http://nath323.sourceforge.net/.
          If you want to compile it as a module, say 'M' here and read
          Documentation/modules.txt.  If unsure, say 'N'.
+config IP_NF_SIP
+        tristate "SIP protocol support (EXPERIMENTAL)"
+        depends on IP_NF_CONNTRACK && EXPERIMENTAL
+        help
+          SIP is an application-layer control protocol that can establish,
+          modify, and terminate multimedia sessions (conferences) such as
+          Internet telephony calls. With the ip_conntrack_sip and
+          the ip_nat_sip modules you can support the protocol on a connection
+          tracking/NATing firewall.
+          To compile it as a module, choose M here.  If unsure, say Y.
 config IP_NF_QUEUE
        tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
        help
@@ -501,6 +527,12 @@ config IP_NF_NAT_H323
        default IP_NF_NAT if IP_NF_H323=y
        default m if IP_NF_H323=m
+config IP_NF_NAT_SIP
+        tristate
+        depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+        default IP_NF_NAT if IP_NF_SIP=y
+        default m if IP_NF_SIP=m
 # mangle + specific targets
 config IP_NF_MANGLE
        tristate "Packet mangling"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 461cb1eb5de..3ded4a3af59 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
 obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
 obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
 obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o
 obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
 # NAT helpers 
@@ -40,6 +41,7 @@ obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
 obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
 obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
 obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
+obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index a604b1ccfda..0a7bd7f0406 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -17,33 +17,29 @@
 *      this value.
 *
 */
-#include <linux/in.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/ip.h>
 #include <linux/moduleparam.h>
+#include <linux/textsearch.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/udp.h>
-#include <net/checksum.h>
-#include <net/udp.h>
+#include <linux/netfilter.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
 static unsigned int master_timeout = 300;
+static char *ts_algo = "kmp";
 MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
 MODULE_DESCRIPTION("Amanda connection tracking module");
 MODULE_LICENSE("GPL");
 module_param(master_timeout, uint, 0600);
 MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
+module_param(ts_algo, charp, 0400);
-static const char *conns[] = { "DATA ", "MESG ", "INDEX " };
+MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)");
-/* This is slow, but it's simple. --RR */
-static char *amanda_buffer;
-static DEFINE_SPINLOCK(amanda_buffer_lock);
 unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
                                   enum ip_conntrack_info ctinfo,
@@ -52,12 +48,48 @@ unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
                                   struct ip_conntrack_expect *exp);
 EXPORT_SYMBOL_GPL(ip_nat_amanda_hook);
+enum amanda_strings {   /* indices into the search[] pattern table */
+        SEARCH_CONNECT,
+        SEARCH_NEWLINE,
+        SEARCH_DATA,    /* help() iterates SEARCH_DATA..SEARCH_INDEX as a range */
+        SEARCH_MESG,
+        SEARCH_INDEX,
+};
+static struct { /* one entry per enum amanda_strings value */
+        char                    *string;        /* pattern text */
+        size_t                  len;    /* pattern length, excluding the NUL */
+        struct ts_config        *ts;    /* handed to skb_find_text(); not set by this initializer */
+} search[] = {
+        [SEARCH_CONNECT] = {
+                .string = "CONNECT ",
+                .len    = 8,
+        },
+        [SEARCH_NEWLINE] = {
+                .string = "\n",
+                .len    = 1,
+        },
+        [SEARCH_DATA] = {
+                .string = "DATA ",
+                .len    = 5,
+        },
+        [SEARCH_MESG] = {
+                .string = "MESG ",
+                .len    = 5,
+        },
+        [SEARCH_INDEX] = {
+                .string = "INDEX ",
+                .len    = 6,
+        },
+};
 static int help(struct sk_buff **pskb,
                struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
 {
+        struct ts_state ts;
        struct ip_conntrack_expect *exp;
-        char *data, *data_limit, *tmp;
+        unsigned int dataoff, start, stop, off, i;
-        unsigned int dataoff, i;
+        char pbuf[sizeof("65535")], *tmp;
        u_int16_t port, len;
        int ret = NF_ACCEPT;
@@ -77,29 +109,34 @@ static int help(struct sk_buff **pskb,
                return NF_ACCEPT;
        }
-        spin_lock_bh(&amanda_buffer_lock);
+        memset(&ts, 0, sizeof(ts));
-        skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
+        start = skb_find_text(*pskb, dataoff, (*pskb)->len,
-        data = amanda_buffer;
+                              search[SEARCH_CONNECT].ts, &ts);
-        data_limit = amanda_buffer + (*pskb)->len - dataoff;
+        if (start == UINT_MAX)
-        *data_limit = '\0';
-        /* Search for the CONNECT string */
-        data = strstr(data, "CONNECT ");
-        if (!data)
                goto out;
-        data += strlen("CONNECT ");
+        start += dataoff + search[SEARCH_CONNECT].len;
-        /* Only search first line. */   
+        memset(&ts, 0, sizeof(ts));
-        if ((tmp = strchr(data, '\n')))
+        stop = skb_find_text(*pskb, start, (*pskb)->len,
-                *tmp = '\0';
+                             search[SEARCH_NEWLINE].ts, &ts);
+        if (stop == UINT_MAX)
+                goto out;
+        stop += start;
-        for (i = 0; i < ARRAY_SIZE(conns); i++) {
+        for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) {
-                char *match = strstr(data, conns[i]);
+                memset(&ts, 0, sizeof(ts));
-                if (!match)
+                off = skb_find_text(*pskb, start, stop, search[i].ts, &ts);
+                if (off == UINT_MAX)
                        continue;
-                tmp = data = match + strlen(conns[i]);
+                off += start + search[i].len;
-                port = simple_strtoul(data, &data, 10);
-                len = data - tmp;
+                len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off);
+                if (skb_copy_bits(*pskb, off, pbuf, len))
+                        break;
+                pbuf[len] = '\0';
+                port = simple_strtoul(pbuf, &tmp, 10);
+                len = tmp - pbuf;
                if (port == 0 || len > 5)
                        break;
@@ -125,8 +162,7 @@ static int help(struct sk_buff **pskb,
                exp->mask.dst.u.tcp.port = 0xFFFF;
                if (ip_nat_amanda_hook)
-                        ret = ip_nat_amanda_hook(pskb, ctinfo,
+                        ret = ip_nat_amanda_hook(pskb, ctinfo, off - dataoff,
-                                                 tmp - amanda_buffer,
                                                 len, exp);
                else if (ip_conntrack_expect_related(exp) != 0)
                        ret = NF_DROP;
@@ -134,12 +170,11 @@ static int help(struct sk_buff **pskb,
        }
 out:
-        spin_unlock_bh(&amanda_buffer_lock);
        return ret;
 }
 static struct ip_conntrack_helper amanda_helper = {
-        .max_expected = ARRAY_SIZE(conns),
+        .max_expected = 3,
        .timeout = 180,
        .me = THIS_MODULE,
        .help = help,
@@ -155,26 +190,36 @@ static struct ip_conntrack_helper amanda_helper = {
 static void __exit ip_conntrack_amanda_fini(void)
 {
+        int i;
         ip_conntrack_helper_unregister(&amanda_helper);
-        kfree(amanda_buffer);
+        for (i = 0; i < ARRAY_SIZE(search); i++)        /* after unregistering, destroy each pattern's textsearch config */
+                textsearch_destroy(search[i].ts);
 }
 static int __init ip_conntrack_amanda_init(void)
 {
-        int ret;
+        int ret, i;
-        amanda_buffer = kmalloc(65536, GFP_KERNEL);
+        ret = -ENOMEM;
-        if (!amanda_buffer)
+        for (i = 0; i < ARRAY_SIZE(search); i++) {
-                return -ENOMEM;
+                search[i].ts = textsearch_prepare(ts_algo, search[i].string,
+                                                  search[i].len,
-        ret = ip_conntrack_helper_register(&amanda_helper);
+                                                  GFP_KERNEL, TS_AUTOLOAD);
-        if (ret < 0) {
+                if (search[i].ts == NULL)
-                kfree(amanda_buffer);
+                        goto err;
-                return ret;
        }
+        ret = ip_conntrack_helper_register(&amanda_helper);
+        if (ret < 0)
+                goto err;
        return 0;
+err:
+        for (; i >= 0; i--) {
+                if (search[i].ts)
+                        textsearch_destroy(search[i].ts);
+        }
+        return ret;
 }
 module_init(ip_conntrack_amanda_init);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index a297da7bbef..7e4cf9a4d15 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -724,6 +724,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple,
                /* this is ugly, but there is no other place where to put it */
                conntrack->nat.masq_index = exp->master->nat.masq_index;
 #endif
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+                conntrack->secmark = exp->master->secmark;
+#endif
                nf_conntrack_get(&conntrack->master->ct_general);
                CONNTRACK_STAT_INC(expect_new);
        } else {
@@ -1130,6 +1133,12 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct,
        write_lock_bh(&ip_conntrack_lock);
+        /* Only update if this is not a fixed timeout */
+        if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
+                write_unlock_bh(&ip_conntrack_lock);
+                return;
+        }
        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index 3e542bf28a9..4dcf526c394 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -56,37 +56,48 @@ static int try_eprt(const char *, size_t, u_int32_t [], char);
 static int try_epsv_response(const char *, size_t, u_int32_t [], char);
 static const struct ftp_search {
-        enum ip_conntrack_dir dir;
        const char *pattern;
        size_t plen;
        char skip;
        char term;
        enum ip_ct_ftp_type ftptype;
        int (*getnum)(const char *, size_t, u_int32_t[], char);
-} search[] = {
+} search[IP_CT_DIR_MAX][2] = {
-        {
+        [IP_CT_DIR_ORIGINAL] = {
-                IP_CT_DIR_ORIGINAL,
+                {
-                "PORT", sizeof("PORT") - 1, ' ', '\r',
+                        .pattern        =  "PORT",
-                IP_CT_FTP_PORT,
+                        .plen           = sizeof("PORT") - 1,
-                try_rfc959,
+                        .skip           = ' ',
+                        .term           = '\r',
+                        .ftptype        = IP_CT_FTP_PORT,
+                        .getnum         = try_rfc959,
+                },
+                {
+                        .pattern        = "EPRT",
+                        .plen           = sizeof("EPRT") - 1,
+                        .skip           = ' ',
+                        .term           = '\r',
+                        .ftptype        = IP_CT_FTP_EPRT,
+                        .getnum         = try_eprt,
+                },
        },
-        {
+        [IP_CT_DIR_REPLY] = {
-                IP_CT_DIR_REPLY,
+                {
-                "227 ", sizeof("227 ") - 1, '(', ')',
+                        .pattern        = "227 ",
-                IP_CT_FTP_PASV,
+                        .plen           = sizeof("227 ") - 1,
-                try_rfc959,
+                        .skip           = '(',
-        },
+                        .term           = ')',
-        {
+                        .ftptype        = IP_CT_FTP_PASV,
-                IP_CT_DIR_ORIGINAL,
+                        .getnum         = try_rfc959,
-                "EPRT", sizeof("EPRT") - 1, ' ', '\r',
+                },
-                IP_CT_FTP_EPRT,
+                {
-                try_eprt,
+                        .pattern        = "229 ",
-        },
+                        .plen           = sizeof("229 ") - 1,
-        {
+                        .skip           = '(',
-                IP_CT_DIR_REPLY,
+                        .term           = ')',
-                "229 ", sizeof("229 ") - 1, '(', ')',
+                        .ftptype        = IP_CT_FTP_EPSV,
-                IP_CT_FTP_EPSV,
+                        .getnum         = try_epsv_response,
-                try_epsv_response,
+                },
        },
 };
@@ -346,17 +357,15 @@ static int help(struct sk_buff **pskb,
        array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF;
        array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF;
-        for (i = 0; i < ARRAY_SIZE(search); i++) {
+        for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
-                if (search[i].dir != dir) continue;
                found = find_pattern(fb_ptr, (*pskb)->len - dataoff,
-                                     search[i].pattern,
+                                     search[dir][i].pattern,
-                                     search[i].plen,
+                                     search[dir][i].plen,
-                                     search[i].skip,
+                                     search[dir][i].skip,
-                                     search[i].term,
+                                     search[dir][i].term,
                                     &matchoff, &matchlen,
                                     array,
-                                     search[i].getnum);
+                                     search[dir][i].getnum);
                if (found) break;
        }
        if (found == -1) {
@@ -366,7 +375,7 @@ static int help(struct sk_buff **pskb,
                   this case. */
                if (net_ratelimit())
                        printk("conntrack_ftp: partial %s %u+%u\n",
-                               search[i].pattern,
+                               search[dir][i].pattern,
                               ntohl(th->seq), datalen);
                ret = NF_DROP;
                goto out;
@@ -426,7 +435,7 @@ static int help(struct sk_buff **pskb,
        /* Now, NAT might want to mangle the packet, and register the
         * (possibly changed) expectation itself. */
        if (ip_nat_ftp_hook)
-                ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
+                ret = ip_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype,
                                      matchoff, matchlen, exp, &seq);
        else {
                /* Can't expect this?  Best to drop packet now. */
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
index 518f581d39e..0665674218c 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
@@ -22,6 +22,8 @@
 #include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
 #include <linux/netfilter_ipv4/ip_conntrack_h323.h>
 #include <linux/moduleparam.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
 #if 0
 #define DEBUGP printk
@@ -38,6 +40,12 @@ static int gkrouted_only = 1;
 module_param(gkrouted_only, int, 0600);
 MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
+static int callforward_filter = 1;
+module_param(callforward_filter, bool, 0600);
+MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
+                                     "if both endpoints are on different sides "
+                                     "(determined by routing information)");
 /* Hooks for NAT */
 int (*set_h245_addr_hook) (struct sk_buff ** pskb,
                           unsigned char **data, int dataoff,
@@ -77,6 +85,12 @@ int (*nat_h245_hook) (struct sk_buff ** pskb,
                      unsigned char **data, int dataoff,
                      TransportAddress * addr, u_int16_t port,
                      struct ip_conntrack_expect * exp);
+int (*nat_callforwarding_hook) (struct sk_buff ** pskb,
+                                struct ip_conntrack * ct,
+                                enum ip_conntrack_info ctinfo,
+                                unsigned char **data, int dataoff,
+                                TransportAddress * addr, u_int16_t port,
+                                struct ip_conntrack_expect * exp);
 int (*nat_q931_hook) (struct sk_buff ** pskb,
                      struct ip_conntrack * ct,
                      enum ip_conntrack_info ctinfo,
@@ -683,6 +697,92 @@ static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
        return ret;
 }
+/* Forward declaration */
+void ip_conntrack_q931_expect(struct ip_conntrack *new,
+                              struct ip_conntrack_expect *this);
+/****************************************************************************/
+static int expect_callforwarding(struct sk_buff **pskb,
+                                 struct ip_conntrack *ct,
+                                 enum ip_conntrack_info ctinfo,
+                                 unsigned char **data, int dataoff,
+                                 TransportAddress * addr)
+{
+        int dir = CTINFO2DIR(ctinfo);
+        int ret = 0;    /* also used as the "same side" flag inside the filter block below */
+        u_int32_t ip;
+        u_int16_t port;
+        struct ip_conntrack_expect *exp = NULL;
+        /* Read alternativeAddress; give up if get_h225_addr() fails or the port is 0 */
+        if (!get_h225_addr(*data, addr, &ip, &port) || port == 0)
+                return 0;
+        /* If the calling party is on the same side as the forward-to party,
+         * we don't need to track the second call */
+        if (callforward_filter) {
+                struct rtable *rt1, *rt2;
+                struct flowi fl1 = {
+                        .fl4_dst = ip,
+                };
+                struct flowi fl2 = {
+                        .fl4_dst = ct->tuplehash[!dir].tuple.src.ip,
+                };
+                if (ip_route_output_key(&rt1, &fl1) == 0) {
+                        if (ip_route_output_key(&rt2, &fl2) == 0) {
+                                if (rt1->rt_gateway == rt2->rt_gateway &&
+                                    rt1->u.dst.dev  == rt2->u.dst.dev)
+                                        ret = 1;        /* same gateway and same output device: same side */
+                                dst_release(&rt2->u.dst);
+                        }
+                        dst_release(&rt1->u.dst);
+                }
+                if (ret) {
+                        DEBUGP("ip_ct_q931: Call Forwarding not tracked\n");
+                        return 0;       /* nothing to expect; not reported as an error */
+                }
+        }
+        /* Create expect for the second call leg */
+        if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
+                return -1;      /* allocation failed */
+        exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+        exp->tuple.src.u.tcp.port = 0;
+        exp->tuple.dst.ip = ip;
+        exp->tuple.dst.u.tcp.port = htons(port);
+        exp->tuple.dst.protonum = IPPROTO_TCP;
+        exp->mask.src.ip = 0xFFFFFFFF;
+        exp->mask.src.u.tcp.port = 0;
+        exp->mask.dst.ip = 0xFFFFFFFF;
+        exp->mask.dst.u.tcp.port = 0xFFFF;
+        exp->mask.dst.protonum = 0xFF;
+        exp->flags = 0;
+        if (ct->tuplehash[dir].tuple.src.ip !=
+            ct->tuplehash[!dir].tuple.dst.ip && nat_callforwarding_hook) {
+                /* Need NAT: the hook's return value becomes our result */
+                ret = nat_callforwarding_hook(pskb, ct, ctinfo, data, dataoff,
+                                              addr, port, exp);
+        } else {                /* Conntrack only */
+                exp->expectfn = ip_conntrack_q931_expect;
+                if (ip_conntrack_expect_related(exp) == 0) {
+                        DEBUGP("ip_ct_q931: expect Call Forwarding "
+                               "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+                               NIPQUAD(exp->tuple.src.ip),
+                               ntohs(exp->tuple.src.u.tcp.port),
+                               NIPQUAD(exp->tuple.dst.ip),
+                               ntohs(exp->tuple.dst.u.tcp.port));
+                } else
+                        ret = -1;       /* the expectation could not be registered */
+        }
+        ip_conntrack_expect_put(exp);
+        return ret;
+}
 /****************************************************************************/
 static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct,
                         enum ip_conntrack_info ctinfo,
@@ -878,6 +978,15 @@ static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct,
        DEBUGP("ip_ct_q931: Facility\n");
+        if (facility->reason.choice == eFacilityReason_callForwarded) {
+                if (facility->options & eFacility_UUIE_alternativeAddress)
+                        return expect_callforwarding(pskb, ct, ctinfo, data,
+                                                     dataoff,
+                                                     &facility->
+                                                     alternativeAddress);
+                return 0;
+        }
        if (facility->options & eFacility_UUIE_h245Address) {
                ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
                                  &facility->h245Address);
@@ -1677,7 +1786,6 @@ static int __init init(void)
                fini();
                return ret;
        }
        DEBUGP("ip_ct_h323: init success\n");
        return 0;
 }
@@ -1696,6 +1804,7 @@ EXPORT_SYMBOL_GPL(set_ras_addr_hook);
 EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
 EXPORT_SYMBOL_GPL(nat_t120_hook);
 EXPORT_SYMBOL_GPL(nat_h245_hook);
+EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
 EXPORT_SYMBOL_GPL(nat_q931_hook);
 MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
index 022c47b9f6c..4b359618bed 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
@@ -1,4 +1,4 @@
-/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006
+/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006
 *
 * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
 *
@@ -1069,8 +1069,8 @@ static field_t _Facility_UUIE_fastStart[] = {	/* SEQUENCE OF */
 static field_t _Facility_UUIE[] = {     /* SEQUENCE */
        {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
-        {FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0,
+        {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
-         _TransportAddress},
+         offsetof(Facility_UUIE, alternativeAddress), _TransportAddress},
        {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
         _Facility_UUIE_alternativeAliasAddress},
        {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 01bd7cab936..33891bb1fde 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -399,38 +399,54 @@ nfattr_failure:
 static int ctnetlink_done(struct netlink_callback *cb)
 {
         DEBUGP("entered %s\n", __FUNCTION__);
+        if (cb->args[1])        /* a dump stopped mid-table: drop the reference ctnetlink_dump_table() took on its resume entry */
+                ip_conntrack_put((struct ip_conntrack *)cb->args[1]);
         return 0;
 }
 static int
 ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 {
-        struct ip_conntrack *ct = NULL;
+        struct ip_conntrack *ct, *last;
        struct ip_conntrack_tuple_hash *h;
        struct list_head *i;
-        u_int32_t *id = (u_int32_t *) &cb->args[1];
        DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, 
                        cb->args[0], *id);
        read_lock_bh(&ip_conntrack_lock);
-        for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+        for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) {
+restart:
+                last = (struct ip_conntrack *)cb->args[1];
                list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
                        h = (struct ip_conntrack_tuple_hash *) i;
                        if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
                                continue;
                        ct = tuplehash_to_ctrack(h);
-                        if (ct->id <= *id)
+                        if (last != NULL) {
-                                continue;
+                                if (ct == last) {
+                                        ip_conntrack_put(last);
+                                        cb->args[1] = 0;
+                                        last = NULL;
+                                } else
+                                        continue;
+                        }
                        if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
                                                cb->nlh->nlmsg_seq,
                                                IPCTNL_MSG_CT_NEW,
-                                                1, ct) < 0)
+                                                1, ct) < 0) {
+                                nf_conntrack_get(&ct->ct_general);
+                                cb->args[1] = (unsigned long)ct;
                                goto out;
-                        *id = ct->id;
+                        }
+                }
+                if (last != NULL) {
+                        ip_conntrack_put(last);
+                        cb->args[1] = 0;
+                        goto restart;
                }
        }
-out:    
+out:
        read_unlock_bh(&ip_conntrack_lock);
        DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
@@ -629,7 +645,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = {
 };
 static inline int
-ctnetlink_parse_nat(struct nfattr *cda[],
+ctnetlink_parse_nat(struct nfattr *nat,
                    const struct ip_conntrack *ct, struct ip_nat_range *range)
 {
        struct nfattr *tb[CTA_NAT_MAX];
@@ -639,7 +655,7 @@ ctnetlink_parse_nat(struct nfattr *cda[],
        memset(range, 0, sizeof(*range));
        
-        nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]);
+        nfattr_parse_nested(tb, CTA_NAT_MAX, nat);
        if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
                return -EINVAL;
@@ -854,39 +870,30 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
                /* ASSURED bit can only be set */
                return -EINVAL;
-        if (cda[CTA_NAT-1]) {
+        if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
 #ifndef CONFIG_IP_NF_NAT_NEEDED
                return -EINVAL;
 #else
-                unsigned int hooknum;
                struct ip_nat_range range;
-                if (ctnetlink_parse_nat(cda, ct, &range) < 0)
+                if (cda[CTA_NAT_DST-1]) {
-                        return -EINVAL;
+                        if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct,
+                                                &range) < 0)
-                DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 
+                                return -EINVAL;
-                       NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
+                        if (ip_nat_initialized(ct,
-                       htons(range.min.all), htons(range.max.all));
+                                               HOOK2MANIP(NF_IP_PRE_ROUTING)))
-                
+                                return -EEXIST;
-                /* This is tricky but it works. ip_nat_setup_info needs the
+                        ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
-                 * hook number as parameter, so let's do the correct 
+                }
-                 * conversion and run away */
+                if (cda[CTA_NAT_SRC-1]) {
-                if (status & IPS_SRC_NAT_DONE)
+                        if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct,
-                        hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
+                                                &range) < 0)
-                else if (status & IPS_DST_NAT_DONE)
+                                return -EINVAL;
-                        hooknum = NF_IP_PRE_ROUTING;  /* IP_NAT_MANIP_DST */
+                        if (ip_nat_initialized(ct,
-                else 
+                                               HOOK2MANIP(NF_IP_POST_ROUTING)))
-                        return -EINVAL; /* Missing NAT flags */
+                                return -EEXIST;
+                        ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
-                DEBUGP("NAT status: %lu\n", 
+                }
-                       status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
-                
-                if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
-                        return -EEXIST;
-                ip_nat_setup_info(ct, &range, hooknum);
-                DEBUGP("NAT status after setup_info: %lu\n",
-                       ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
 #endif
        }
@@ -1106,7 +1113,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
        /* implicit 'else' */
        /* we only allow nat config for new conntracks */
-        if (cda[CTA_NAT-1]) {
+        if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
                err = -EINVAL;
                goto out_unlock;
        }
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 56794797d55..21ee124c046 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -77,10 +77,10 @@ static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km,
 }
 /* look up the source key for a given tuple */
-static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t)
+static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t)
 {
        struct ip_ct_gre_keymap *km;
-        u_int32_t key = 0;
+        __be16 key = 0;
        read_lock_bh(&ip_ct_gre_lock);
        km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn,
@@ -190,7 +190,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb,
                           struct ip_conntrack_tuple *tuple)
 {
        struct gre_hdr_pptp _pgrehdr, *pgrehdr;
-        u_int32_t srckey;
+        __be16 srckey;
        struct gre_hdr _grehdr, *grehdr;
        /* first only delinearize old RFC1701 GRE header */
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index d8b14a9010a..23f1c504586 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
        }
        /* See ip_conntrack_proto_tcp.c */
-        if (hooknum == NF_IP_PRE_ROUTING &&
+        if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
            nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) {
                if (LOG_INVALID(IPPROTO_ICMP))
                        nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 062b252b58a..c5c2ce5cdeb 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb,
         * and moreover root might send raw packets.
         */
        /* FIXME: Source route IP option packets --RR */
-        if (hooknum == NF_IP_PRE_ROUTING &&
+        if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
            nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) {
                if (LOG_INVALID(IPPROTO_TCP))
                        nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 70899868783..9b2c16b4d2f 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
         * because the semantic of CHECKSUM_HW is different there 
         * and moreover root might send raw packets.
         * FIXME: Source route IP option packets --RR */
-        if (hooknum == NF_IP_PRE_ROUTING &&
+        if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
            nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) {
                if (LOG_INVALID(IPPROTO_UDP))
                        nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c
new file mode 100644
index 00000000000..fc87ce0da40
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_sip.c
@@ -0,0 +1,471 @@
+/* SIP extension for IP connection tracking.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_conntrack_ftp.c and other modules.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP connection tracking helper");
+#define MAX_PORTS       8
+static unsigned short ports[MAX_PORTS];
+static int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of sip servers");
+static unsigned int sip_timeout = SIP_TIMEOUT;
+module_param(sip_timeout, uint, 0600);
+MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session");
+unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb,
+                                enum ip_conntrack_info ctinfo,
+                                struct ip_conntrack *ct,
+                                const char **dptr);
+EXPORT_SYMBOL_GPL(ip_nat_sip_hook);
+unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb,
+                                enum ip_conntrack_info ctinfo,
+                                struct ip_conntrack_expect *exp,
+                                const char *dptr);
+EXPORT_SYMBOL_GPL(ip_nat_sdp_hook);
+int ct_sip_get_info(const char *dptr, size_t dlen,
+                                unsigned int *matchoff,
+                                unsigned int *matchlen,
+                                struct sip_header_nfo *hnfo);
+EXPORT_SYMBOL_GPL(ct_sip_get_info);
+static int digits_len(const char *dptr, const char *limit, int *shift);
+static int epaddr_len(const char *dptr, const char *limit, int *shift);
+static int skp_digits_len(const char *dptr, const char *limit, int *shift);
+static int skp_epaddr_len(const char *dptr, const char *limit, int *shift);
+/*
+ * Table of header descriptors consumed by ct_sip_get_info().
+ *
+ * For each entry:
+ *   lname/lnlen      - primary spelling of the header start and its length
+ *   sname/snlen      - alternative spelling that is accepted as well
+ *   ln_str/ln_strlen - token searched for inside the matched line; the
+ *                      value of interest begins right after it
+ *   match_len        - callback returning the length of that value
+ *
+ * Entries are selected by the POS_* indices from ip_conntrack_sip.h
+ * (presumably in the same order as listed here - confirm against the
+ * header before reordering anything).
+ */
+struct sip_header_nfo ct_sip_hdrs[] = {
+        {       /* Via header */
+                .lname          = "Via:",
+                .lnlen          = sizeof("Via:") - 1,
+                .sname          = "\r\nv:",
+                .snlen          = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */
+                .ln_str         = "UDP ",
+                .ln_strlen      = sizeof("UDP ") - 1,
+                .match_len      = epaddr_len,
+        },
+        {       /* Contact header */
+                .lname          = "Contact:",
+                .lnlen          = sizeof("Contact:") - 1,
+                .sname          = "\r\nm:",
+                .snlen          = sizeof("\r\nm:") - 1,
+                .ln_str         = "sip:",
+                .ln_strlen      = sizeof("sip:") - 1,
+                .match_len      = skp_epaddr_len
+        },
+        {       /* Content length header */
+                .lname          = "Content-Length:",
+                .lnlen          = sizeof("Content-Length:") - 1,
+                .sname          = "\r\nl:",
+                .snlen          = sizeof("\r\nl:") - 1,
+                .ln_str         = ":",
+                .ln_strlen      = sizeof(":") - 1,
+                .match_len      = skp_digits_len
+        },
+        {       /* SDP media info */
+                .lname          = "\nm=",
+                .lnlen          = sizeof("\nm=") - 1,
+                .sname          = "\rm=",
+                .snlen          = sizeof("\rm=") - 1,
+                .ln_str         = "audio ",
+                .ln_strlen      = sizeof("audio ") - 1,
+                .match_len      = digits_len
+        },
+        {       /* SDP owner address */
+                .lname          = "\no=",
+                .lnlen          = sizeof("\no=") - 1,
+                .sname          = "\ro=",
+                .snlen          = sizeof("\ro=") - 1,
+                .ln_str         = "IN IP4 ",
+                .ln_strlen      = sizeof("IN IP4 ") - 1,
+                .match_len      = epaddr_len
+        },
+        {       /* SDP connection info */
+                .lname          = "\nc=",
+                .lnlen          = sizeof("\nc=") - 1,
+                .sname          = "\rc=",
+                .snlen          = sizeof("\rc=") - 1,
+                .ln_str         = "IN IP4 ",
+                .ln_strlen      = sizeof("IN IP4 ") - 1,
+                .match_len      = epaddr_len
+        },
+        {       /* Requests headers */
+                .lname          = "sip:",
+                .lnlen          = sizeof("sip:") - 1,
+                .sname          = "sip:",
+                .snlen          = sizeof("sip:") - 1, /* yes, i know.. ;) */
+                .ln_str         = "@",
+                .ln_strlen      = sizeof("@") - 1,
+                .match_len      = epaddr_len
+        },
+        {       /* SDP version header */
+                .lname          = "\nv=",
+                .lnlen          = sizeof("\nv=") - 1,
+                .sname          = "\rv=",
+                .snlen          = sizeof("\rv=") - 1,
+                .ln_str         = "=",
+                .ln_strlen      = sizeof("=") - 1,
+                .match_len      = digits_len
+        }
+};
+EXPORT_SYMBOL_GPL(ct_sip_hdrs);
+/*
+ * Length of the line that starts at 'line'.
+ *
+ * Leading CR/LF characters are counted as part of the line; counting then
+ * continues up to, but not including, the next CR or LF.  'limit' is
+ * inclusive: the byte it points at may still be examined.
+ */
+int ct_sip_lnlen(const char *line, const char *limit)
+{
+        const char *start = line;
+        /* Step over the terminator(s) we may currently be sitting on. */
+        for (; line <= limit; line++)
+                if (*line != '\r' && *line != '\n')
+                        break;
+        /* Advance to the next terminator or just past the limit. */
+        for (; line <= limit; line++)
+                if (*line == '\r' || *line == '\n')
+                        break;
+        return line - start;
+}
+EXPORT_SYMBOL_GPL(ct_sip_lnlen);
+/*
+ * Linear string search, case sensitive.
+ *
+ * Looks for the first occurrence of 'needle' (needle_len bytes) inside
+ * 'haystack' (haystack_len bytes).  Neither buffer needs to be NUL
+ * terminated.  Returns a pointer to the match inside 'haystack', or NULL
+ * when there is none.
+ */
+const char *ct_sip_search(const char *needle, const char *haystack,
+                          size_t needle_len, size_t haystack_len)
+{
+        const char *limit;
+        /* A needle longer than the haystack can never match; bail out
+           before the unsigned subtraction below could wrap around. */
+        if (needle_len > haystack_len)
+                return NULL;
+        /* Last position at which a full needle still fits. */
+        limit = haystack + (haystack_len - needle_len);
+        while (haystack <= limit) {
+                if (memcmp(haystack, needle, needle_len) == 0)
+                        return haystack;
+                haystack++;
+        }
+        return NULL;
+}
+EXPORT_SYMBOL_GPL(ct_sip_search);
+/*
+ * Number of consecutive decimal digits starting at dptr, never looking
+ * beyond 'limit' (inclusive).  'shift' is not used; it is only present so
+ * that the function fits the match_len callback signature.
+ */
+static int digits_len(const char *dptr, const char *limit, int *shift)
+{
+        const char *p;
+        for (p = dptr; p <= limit; p++)
+                if (!isdigit(*p))
+                        break;
+        return p - dptr;
+}
+/*
+ * Get the length of a digit run, skipping leading blanks first.
+ * Every skipped blank is added to *shift so the caller can adjust the
+ * match offset accordingly.
+ */
+static int skp_digits_len(const char *dptr, const char *limit, int *shift)
+{
+        while (dptr <= limit && *dptr == ' ') {
+                (*shift)++;
+                dptr++;
+        }
+        return digits_len(dptr, limit, shift);
+}
+/*
+ * Simple dotted-decimal IPv4 address parser.
+ *
+ * cp     - start of the text to parse
+ * endp   - if non-NULL, receives the position where parsing stopped
+ *          (only written on success)
+ * ipaddr - receives the address; the components are stored byte by byte
+ *          in the order they appear in the text
+ * limit  - a new component is only started while cp <= limit
+ *
+ * Returns 0 on success, -1 if a component exceeds 0xFF or if no digit is
+ * found where a component is expected.  Fewer than four components are
+ * accepted; the remaining bytes of *ipaddr stay zero.
+ */
+static int parse_ipaddr(const char *cp, const char **endp,
+                        u_int32_t *ipaddr, const char *limit)
+{
+        unsigned long int val;
+        int i, digit = 0;
+        for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) {
+                /* Cleared per component so a trailing '.' with nothing
+                   after it is reported as an error below. */
+                digit = 0;
+                if (!isdigit(*cp))
+                        break;
+                /* NOTE(review): simple_strtoul() is not told about 'limit';
+                   it stops at the first non-digit character. */
+                val = simple_strtoul(cp, (char **)&cp, 10);
+                if (val > 0xFF)
+                        return -1;
+                ((u_int8_t *)ipaddr)[i] = val;
+                digit = 1;
+                if (*cp != '.')
+                        break;
+                cp++;
+        }
+        if (!digit)
+                return -1;
+        if (endp)
+                *endp = cp;
+        return 0;
+}
+/*
+ * Skip over an IP address with an optional ":port" suffix and return the
+ * number of characters it occupies, or 0 if no address could be parsed.
+ */
+static int epaddr_len(const char *dptr, const char *limit, int *shift)
+{
+        const char *start = dptr;
+        const char *end;
+        u_int32_t addr;
+        if (parse_ipaddr(dptr, &end, &addr, limit) < 0) {
+                DEBUGP("ip: %s parse failed.!\n", dptr);
+                return 0;
+        }
+        /* Port number: the ':' itself plus any digits that follow it. */
+        if (*end == ':')
+                end += 1 + digits_len(end + 1, limit, shift);
+        return end - start;
+}
+/*
+ * Get the address length, skipping an optional "user@" part.
+ *
+ * When a '@' is found on the current line, parsing of the address starts
+ * right after it and *shift is increased by the number of characters
+ * skipped (including the '@').  When there is no '@', the text at dptr is
+ * taken to be the address itself and *shift is left untouched.
+ *
+ * Returns the value of epaddr_len() for the address, i.e. 0 on failure.
+ */
+static int skp_epaddr_len(const char *dptr, const char *limit, int *shift)
+{
+        const char *start = dptr;
+        int s = *shift;
+        /* Search for '@', but stay inside the buffer and inside the
+           current line: a '@' in a later header is not our user part. */
+        for (; dptr <= limit &&
+               *dptr != '@' && *dptr != '\r' && *dptr != '\n'; dptr++)
+                (*shift)++;
+        /* Only look at *dptr if the scan stopped inside the buffer. */
+        if (dptr <= limit && *dptr == '@') {
+                dptr++;
+                (*shift)++;
+        } else {
+                /* No user info: rewind, the address starts at 'start'. */
+                dptr = start;
+                *shift = s;
+        }
+        return epaddr_len(dptr, limit, shift);
+}
+/*
+ * Locate the value described by 'hnfo' inside the dlen bytes at dptr.
+ *
+ * On success *matchoff is the offset of the value relative to dptr and
+ * *matchlen its length, as computed by hnfo->match_len().
+ *
+ * Returns 1 if found, 0 if the header is not present, -1 on a parse
+ * error (header present but hnfo->ln_str missing from its line, or a
+ * zero-length value).
+ */
+int ct_sip_get_info(const char *dptr, size_t dlen,
+                    unsigned int *matchoff,
+                    unsigned int *matchlen,
+                    struct sip_header_nfo *hnfo)
+{
+        const char *limit, *aux, *k = dptr;
+        int shift = 0;
+        /* Last position at which the long header name still fits. */
+        limit = dptr + (dlen - hnfo->lnlen);
+        while (dptr <= limit) {
+                /* Neither spelling of the header starts here: move on. */
+                if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) &&
+                    (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) {
+                        dptr++;
+                        continue;
+                }
+                /* Header found; look for the marker within this line only. */
+                aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen,
+                                    ct_sip_lnlen(dptr, limit));
+                if (!aux) {
+                        DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str,
+                               hnfo->lname);
+                        return -1;
+                }
+                /* The value starts right behind the marker. */
+                aux += hnfo->ln_strlen;
+                *matchlen = hnfo->match_len(aux, limit, &shift);
+                if (!*matchlen)
+                        return -1;
+                /* 'shift' accounts for characters match_len() skipped
+                   in front of the value. */
+                *matchoff = (aux - k) + shift;
+                DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname,
+                       *matchlen);
+                return 1;
+        }
+        DEBUGP("%s header not found.\n", hnfo->lname);
+        return 0;
+}
+/*
+ * Set up an expectation for the UDP media stream announced in the SDP
+ * body: any source port on the peer's address, towards ipaddr:port.
+ *
+ * If the NAT SDP hook is installed it is handed the expectation (and
+ * decides the verdict); otherwise the expectation is registered here.
+ * Returns an NF_* verdict; NF_DROP when allocation or registration fails.
+ */
+static int set_expected_rtp(struct sk_buff **pskb,
+                            struct ip_conntrack *ct,
+                            enum ip_conntrack_info ctinfo,
+                            u_int32_t ipaddr, u_int16_t port,
+                            const char *dptr)
+{
+        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+        struct ip_conntrack_expect *exp;
+        int ret;
+        exp = ip_conntrack_expect_alloc(ct);
+        if (!exp)
+                return NF_DROP;
+        /* Source: the other side's address, any port. */
+        exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+        exp->mask.src.ip = 0xFFFFFFFF;
+        exp->tuple.src.u.udp.port = 0;
+        exp->mask.src.u.udp.port = 0;
+        /* Destination: exactly the announced address, port and protocol. */
+        exp->tuple.dst.ip = ipaddr;
+        exp->mask.dst.ip = 0xFFFFFFFF;
+        exp->tuple.dst.u.udp.port = htons(port);
+        exp->mask.dst.u.udp.port = 0xFFFF;
+        exp->tuple.dst.protonum = IPPROTO_UDP;
+        exp->mask.dst.protonum = 0xFF;
+        exp->expectfn = NULL;
+        exp->flags = 0;
+        if (ip_nat_sdp_hook)
+                ret = ip_nat_sdp_hook(pskb, ctinfo, exp, dptr);
+        else
+                ret = ip_conntrack_expect_related(exp) != 0 ? NF_DROP
+                                                            : NF_ACCEPT;
+        /* Drop the reference taken by ip_conntrack_expect_alloc(). */
+        ip_conntrack_expect_put(exp);
+        return ret;
+}
+/*
+ * Conntrack helper callback for SIP over UDP.
+ *
+ * Refreshes the master connection's timeout, lets the NAT hook (if any)
+ * rewrite the SIP headers, and - for INVITE requests and "SIP/2.0 200"
+ * responses - reads the connection address and media port from the SDP
+ * body to set up an expectation for the media stream.
+ *
+ * Returns an NF_* verdict.  Packets are dropped when the NAT hook fails,
+ * when the SDP address cannot be parsed, or when the media port is below
+ * 1024.  Non-linear skbs are accepted without being inspected.
+ */
+static int sip_help(struct sk_buff **pskb,
+                    struct ip_conntrack *ct,
+                    enum ip_conntrack_info ctinfo)
+{
+        unsigned int dataoff, datalen;
+        const char *dptr;
+        int ret = NF_ACCEPT;
+        /* Must be unsigned: ct_sip_get_info() takes unsigned int pointers. */
+        unsigned int matchoff, matchlen;
+        u_int32_t ipaddr;
+        u_int16_t port;
+        /* No Data ? */
+        dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+        if (dataoff >= (*pskb)->len) {
+                DEBUGP("skb->len = %u\n", (*pskb)->len);
+                return NF_ACCEPT;
+        }
+        ip_ct_refresh(ct, *pskb, sip_timeout * HZ);
+        if (!skb_is_nonlinear(*pskb))
+                dptr = (*pskb)->data + dataoff;
+        else {
+                DEBUGP("Copy of skbuff not supported yet.\n");
+                goto out;
+        }
+        if (ip_nat_sip_hook) {
+                if (!ip_nat_sip_hook(pskb, ctinfo, ct, &dptr)) {
+                        ret = NF_DROP;
+                        goto out;
+                }
+        }
+        /* After this point NAT could have mangled the skb, so
+           we need to recalculate the payload length. */
+        datalen = (*pskb)->len - dataoff;
+        if (datalen < (sizeof("SIP/2.0 200") - 1))
+                goto out;
+        /* RTP info only in some SDP pkts */
+        if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 &&
+            memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) {
+                goto out;
+        }
+        /* Get ip and port address from SDP packet. */
+        if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
+                            &ct_sip_hdrs[POS_CONNECTION]) > 0) {
+                /* We'll drop only if there are parse problems. */
+                if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr,
+                                 dptr + datalen) < 0) {
+                        ret = NF_DROP;
+                        goto out;
+                }
+                if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
+                                    &ct_sip_hdrs[POS_MEDIA]) > 0) {
+                        /* NOTE(review): the result is truncated to 16 bits
+                           here; values above 65535 are not rejected. */
+                        port = simple_strtoul(dptr + matchoff, NULL, 10);
+                        if (port < 1024) {
+                                ret = NF_DROP;
+                                goto out;
+                        }
+                        ret = set_expected_rtp(pskb, ct, ctinfo,
+                                               ipaddr, port, dptr);
+                }
+        }
+out:
+        return ret;
+}
+static struct ip_conntrack_helper sip[MAX_PORTS];
+static char sip_names[MAX_PORTS][10];
+/* Unregister the conntrack helper of every configured SIP port. */
+static void fini(void)
+{
+        int idx = 0;
+        while (idx < ports_c) {
+                DEBUGP("unregistering helper for port %d\n", ports[idx]);
+                ip_conntrack_helper_unregister(&sip[idx]);
+                idx++;
+        }
+}
+/*
+ * Module init: register one conntrack helper per configured port.
+ * Without a "ports" parameter a single helper for SIP_PORT is set up.
+ *
+ * Returns 0 on success.  If a registration fails, the helpers that were
+ * registered so far are unregistered again and the error is returned.
+ */
+static int __init init(void)
+{
+        int i, ret;
+        char *tmpname;
+        if (ports_c == 0)
+                ports[ports_c++] = SIP_PORT;
+        for (i = 0; i < ports_c; i++) {
+                /* Create helper structure */
+                memset(&sip[i], 0, sizeof(struct ip_conntrack_helper));
+                sip[i].tuple.dst.protonum = IPPROTO_UDP;
+                sip[i].tuple.src.u.udp.port = htons(ports[i]);
+                sip[i].mask.src.u.udp.port = 0xFFFF;
+                sip[i].mask.dst.protonum = 0xFF;
+                sip[i].max_expected = 1;
+                sip[i].timeout = 3 * 60; /* 3 minutes */
+                sip[i].me = THIS_MODULE;
+                sip[i].help = sip_help;
+                /* "sip" for the default port, "sip-<index>" otherwise. */
+                tmpname = &sip_names[i][0];
+                if (ports[i] == SIP_PORT)
+                        sprintf(tmpname, "sip");
+                else
+                        sprintf(tmpname, "sip-%d", i);
+                sip[i].name = tmpname;
+                DEBUGP("port #%d: %d\n", i, ports[i]);
+                ret = ip_conntrack_helper_register(&sip[i]);
+                if (ret) {
+                        printk("ERROR registering helper for port %d\n",
+                                ports[i]);
+                        /* Unwind only what was registered: entries from
+                           index i upwards were never handed to conntrack,
+                           so they must not be unregistered. */
+                        while (i-- > 0)
+                                ip_conntrack_helper_unregister(&sip[i]);
+                        return ret;
+                }
+        }
+        return 0;
+}
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 929d61f7be9..88445aac3f2 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
                return -ENOSPC;
 #endif
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+        if (seq_printf(s, "secmark=%u ", conntrack->secmark))
+                return -ENOSPC;
+#endif
        if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
                return -ENOSPC;
@@ -417,7 +422,7 @@ static unsigned int ip_conntrack_help(unsigned int hooknum,
        /* This is where we call the helper: as the packet goes out. */
        ct = ip_conntrack_get(*pskb, &ctinfo);
-        if (ct && ct->helper) {
+        if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) {
                unsigned int ret;
                ret = ct->helper->help(pskb, ct, ctinfo);
                if (ret != NF_ACCEPT)
@@ -564,6 +569,8 @@ extern unsigned int ip_ct_generic_timeout;
 static int log_invalid_proto_min = 0;
 static int log_invalid_proto_max = 255;
+int ip_conntrack_checksum = 1;
 static struct ctl_table_header *ip_ct_sysctl_header;
 static ctl_table ip_ct_sysctl_table[] = {
@@ -592,6 +599,14 @@ static ctl_table ip_ct_sysctl_table[] = {
                .proc_handler   = &proc_dointvec,
        },
        {
+                .ctl_name       = NET_IPV4_NF_CONNTRACK_CHECKSUM,
+                .procname       = "ip_conntrack_checksum",
+                .data           = &ip_conntrack_checksum,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec,
+        },
+        {
                .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
                .procname       = "ip_conntrack_tcp_timeout_syn_sent",
                .data           = &ip_ct_tcp_timeout_syn_sent,
@@ -946,6 +961,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
 EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
 EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
 EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
+EXPORT_SYMBOL_GPL(ip_conntrack_checksum);
 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
 EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c
index d45663d137a..419b878fb46 100644
--- a/net/ipv4/netfilter/ip_nat_helper_h323.c
+++ b/net/ipv4/netfilter/ip_nat_helper_h323.c
@@ -487,6 +487,80 @@ static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
 }
 /****************************************************************************/
+/*
+ * expectfn installed by nat_callforwarding(): sets up source and
+ * destination NAT on the newly created connection 'new' from the data
+ * saved in the expectation 'this', then passes both on to
+ * ip_conntrack_q931_expect().
+ */
+static void ip_nat_callforwarding_expect(struct ip_conntrack *new,
+                                         struct ip_conntrack_expect *this)
+{
+        struct ip_nat_range range;
+        /* This must be a fresh one. */
+        BUG_ON(new->status & IPS_NAT_DONE_MASK);
+        /* Change src to where master sends to */
+        range.flags = IP_NAT_RANGE_MAP_IPS;
+        range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip;
+        /* hook doesn't matter, but it has to do source manip */
+        ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
+        /* For DST manip, map port here to where it's expected. */
+        range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
+        range.min = range.max = this->saved_proto;
+        range.min_ip = range.max_ip = this->saved_ip;
+        /* hook doesn't matter, but it has to do destination manip */
+        ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
+        ip_conntrack_q931_expect(new, this);
+}
+/****************************************************************************/
+/*
+ * NAT handling for a call forwarding address in a Q.931 signal.
+ *
+ * Points the expectation at our own address, registers it on the first
+ * free port starting from 'port', and rewrites the H.225 transport
+ * address in the packet to match.
+ *
+ * Returns 0 on success and also when no port could be obtained; -1 when
+ * the signal could not be modified (the expectation is removed again).
+ */
+static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct,
+                              enum ip_conntrack_info ctinfo,
+                              unsigned char **data, int dataoff,
+                              TransportAddress * addr, u_int16_t port,
+                              struct ip_conntrack_expect *exp)
+{
+        int dir = CTINFO2DIR(ctinfo);
+        u_int16_t try_port = port;
+        /* Set expectations for NAT: remember the original destination. */
+        exp->saved_ip = exp->tuple.dst.ip;
+        exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+        exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
+        exp->expectfn = ip_nat_callforwarding_expect;
+        exp->dir = !dir;
+        /* Try to get same port: if not, try to change it.  The 16 bit
+           counter wrapping to 0 means every candidate was tried. */
+        while (try_port != 0) {
+                exp->tuple.dst.u.tcp.port = htons(try_port);
+                if (ip_conntrack_expect_related(exp) == 0)
+                        break;
+                try_port++;
+        }
+        if (try_port == 0) {    /* No port available */
+                if (net_ratelimit())
+                        printk("ip_nat_q931: out of TCP ports\n");
+                return 0;
+        }
+        /* Modify signal; any non-zero result is treated as failure. */
+        if (set_h225_addr(pskb, data, dataoff, addr,
+                          ct->tuplehash[!dir].tuple.dst.ip,
+                          try_port) != 0) {
+                ip_conntrack_unexpect_related(exp);
+                return -1;
+        }
+        /* Success */
+        DEBUGP("ip_nat_q931: expect Call Forwarding "
+               "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+               NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
+               NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
+        return 0;
+}
+/****************************************************************************/
 static int __init init(void)
 {
        BUG_ON(set_h245_addr_hook != NULL);
@@ -496,6 +570,7 @@ static int __init init(void)
        BUG_ON(nat_rtp_rtcp_hook != NULL);
        BUG_ON(nat_t120_hook != NULL);
        BUG_ON(nat_h245_hook != NULL);
+        BUG_ON(nat_callforwarding_hook != NULL);
        BUG_ON(nat_q931_hook != NULL);
        set_h245_addr_hook = set_h245_addr;
@@ -505,6 +580,7 @@ static int __init init(void)
        nat_rtp_rtcp_hook = nat_rtp_rtcp;
        nat_t120_hook = nat_t120;
        nat_h245_hook = nat_h245;
+        nat_callforwarding_hook = nat_callforwarding;
        nat_q931_hook = nat_q931;
        DEBUGP("ip_nat_h323: init success\n");
@@ -521,6 +597,7 @@ static void __exit fini(void)
        nat_rtp_rtcp_hook = NULL;
        nat_t120_hook = NULL;
        nat_h245_hook = NULL;
+        nat_callforwarding_hook = NULL;
        nat_q931_hook = NULL;
        synchronize_net();
 }
diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c
new file mode 100644
index 00000000000..6ffba63adca
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_sip.c
@@ -0,0 +1,249 @@
+/* SIP extension for UDP NAT alteration.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_nat_ftp.c and other modules.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP NAT helper");
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+extern struct sip_header_nfo ct_sip_hdrs[];
+/*
+ * Replace the value of the header described by 'hnfo' with the
+ * bufflen bytes at 'buffer'.
+ *
+ * Returns 1 on success, in which case *dptr is refreshed to point at the
+ * UDP payload of the (possibly changed) skb.  Returns 0 when the header
+ * is missing or unparsable, or when the packet could not be mangled.
+ */
+static unsigned int mangle_sip_packet(struct sk_buff **pskb,
+                                      enum ip_conntrack_info ctinfo,
+                                      struct ip_conntrack *ct,
+                                      const char **dptr, size_t dlen,
+                                      char *buffer, int bufflen,
+                                      struct sip_header_nfo *hnfo)
+{
+        unsigned int off, len;
+        if (ct_sip_get_info(*dptr, dlen, &off, &len, hnfo) > 0 &&
+            ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
+                                     off, len, buffer, bufflen)) {
+                /* We need to reload this. Thanks Patrick. */
+                *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 +
+                        sizeof(struct udphdr);
+                return 1;
+        }
+        return 0;
+}
+/*
+ * NAT hook for SIP headers, called from the conntrack helper.
+ *
+ * Rewrites address:port values in the SIP headers to the address and
+ * port taken from the opposite direction's tuple.  Which headers are
+ * touched depends on whether the packet is a response ("SIP/2.0 ...") or
+ * a request and on the direction it travels in:
+ *
+ *   response, original direction: Contact (best effort)
+ *   response, reply direction:    Via, plus Contact if the CSeq line
+ *                                 names REGISTER
+ *   request,  original direction: Via, plus Contact (best effort)
+ *   request,  reply direction:    the request URI on the first line
+ *
+ * Returns 1 on success and 0 on failure; the caller drops the packet on 0.
+ * Note that a REGISTER reply without a Contact header yields 0.
+ */
+static unsigned int ip_nat_sip(struct sk_buff **pskb,
+                               enum ip_conntrack_info ctinfo,
+                               struct ip_conntrack *ct,
+                               const char **dptr)
+{
+        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+        char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
+        unsigned int bufflen, dataoff;
+        u_int32_t ip;
+        u_int16_t port;
+        dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+        ip   = ct->tuplehash[!dir].tuple.dst.ip;
+        port = ct->tuplehash[!dir].tuple.dst.u.udp.port;
+        bufflen = sprintf(buffer, "%u.%u.%u.%u:%u", NIPQUAD(ip), ntohs(port));
+        /* short packet ? */
+        if (((*pskb)->len - dataoff) < (sizeof("SIP/2.0") - 1))
+                return 0;
+        /* Basic rules: requests and responses.  The payload length is
+           recomputed for every call since mangling may change it. */
+        if (memcmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) == 0) {
+                const char *aux;
+                if ((ctinfo) < IP_CT_IS_REPLY) {
+                        mangle_sip_packet(pskb, ctinfo, ct, dptr,
+                                          (*pskb)->len - dataoff,
+                                          buffer, bufflen,
+                                          &ct_sip_hdrs[POS_CONTACT]);
+                        return 1;
+                }
+                if (!mangle_sip_packet(pskb, ctinfo, ct, dptr,
+                                       (*pskb)->len - dataoff,
+                                       buffer, bufflen, &ct_sip_hdrs[POS_VIA]))
+                        return 0;
+                /* This search should ignore case, but later.. */
+                aux = ct_sip_search("CSeq:", *dptr, sizeof("CSeq:") - 1,
+                                    (*pskb)->len - dataoff);
+                if (!aux)
+                        return 0;
+                /* The needle length must not include the terminating NUL,
+                   otherwise the method name can never match. */
+                if (!ct_sip_search("REGISTER", aux, sizeof("REGISTER") - 1,
+                    ct_sip_lnlen(aux, *dptr + (*pskb)->len - dataoff)))
+                        return 1;
+                return mangle_sip_packet(pskb, ctinfo, ct, dptr,
+                                         (*pskb)->len - dataoff,
+                                         buffer, bufflen,
+                                         &ct_sip_hdrs[POS_CONTACT]);
+        }
+        if ((ctinfo) < IP_CT_IS_REPLY) {
+                if (!mangle_sip_packet(pskb, ctinfo, ct, dptr,
+                                       (*pskb)->len - dataoff,
+                                       buffer, bufflen, &ct_sip_hdrs[POS_VIA]))
+                        return 0;
+                /* Mangle Contact if exists only. - watch udp_nat_mangle()! */
+                mangle_sip_packet(pskb, ctinfo, ct, dptr, (*pskb)->len - dataoff,
+                                  buffer, bufflen, &ct_sip_hdrs[POS_CONTACT]);
+                return 1;
+        }
+        /* This mangles the request headers; only the first line of the
+           payload is searched. */
+        return mangle_sip_packet(pskb, ctinfo, ct, dptr,
+                                 ct_sip_lnlen(*dptr,
+                                              *dptr + (*pskb)->len - dataoff),
+                                 buffer, bufflen, &ct_sip_hdrs[POS_REQ_HEADER]);
+}
+/*
+ * Rewrite the Content-Length value so that it matches the size of the
+ * SDP body after mangling.
+ *
+ * Returns the result of ip_nat_mangle_udp_packet() when both the SDP
+ * version line and the Content-Length header are found, 0 otherwise.
+ */
+static int mangle_content_len(struct sk_buff **pskb,
+                              enum ip_conntrack_info ctinfo,
+                              struct ip_conntrack *ct,
+                              const char *dptr)
+{
+        unsigned int dataoff, matchoff, matchlen;
+        char buffer[sizeof("65536")];
+        int bufflen;
+        dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+        /* Get actual SDP length */
+        if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
+                            &matchlen, &ct_sip_hdrs[POS_SDP_HEADER]) > 0) {
+                /* since ct_sip_get_info() gives us a pointer past 'v='
+                   we need to add 2 bytes to this count. */
+                int c_len = (*pskb)->len - dataoff - matchoff + 2;
+                /* Now, update SDP length */
+                if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
+                                    &matchlen, &ct_sip_hdrs[POS_CONTENT]) > 0) {
+                        /* NOTE(review): c_len is an int printed with %u. */
+                        bufflen = sprintf(buffer, "%u", c_len);
+                        return ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
+                                                        matchoff, matchlen,
+                                                        buffer, bufflen);
+                }
+        }
+        return 0;
+}
+/*
+ * Rewrite the SDP body: owner and connection addresses become 'newip',
+ * the media port becomes 'port', and finally Content-Length is updated.
+ *
+ * Returns 0 as soon as one of the three values cannot be mangled,
+ * otherwise the result of mangle_content_len().
+ */
+static unsigned int mangle_sdp(struct sk_buff **pskb,
+                               enum ip_conntrack_info ctinfo,
+                               struct ip_conntrack *ct,
+                               u_int32_t newip, u_int16_t port,
+                               const char *dptr)
+{
+        /* Big enough for the address; the port text is shorter. */
+        char buffer[sizeof("nnn.nnn.nnn.nnn")];
+        unsigned int dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+        unsigned int bufflen;
+        /* Mangle owner and contact info.  The payload length is read
+           again for each call because the previous one may change it. */
+        bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip));
+        if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
+                               buffer, bufflen, &ct_sip_hdrs[POS_OWNER]) ||
+            !mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
+                               buffer, bufflen, &ct_sip_hdrs[POS_CONNECTION]))
+                return 0;
+        /* Mangle media port. */
+        bufflen = sprintf(buffer, "%u", port);
+        if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
+                               buffer, bufflen, &ct_sip_hdrs[POS_MEDIA]))
+                return 0;
+        return mangle_content_len(pskb, ctinfo, ct, dptr);
+}
+/* So, this packet has hit the connection tracking matching code.
+   Mangle it, and change the expectation to match the new version.
+   Returns NF_ACCEPT on success; NF_DROP when no port could be obtained
+   for the expectation or when the SDP body could not be mangled. */
+static unsigned int ip_nat_sdp(struct sk_buff **pskb,
+                               enum ip_conntrack_info ctinfo,
+                               struct ip_conntrack_expect *exp,
+                               const char *dptr)
+{
+        struct ip_conntrack *ct = exp->master;
+        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+        u_int32_t newip;
+        u_int16_t port;
+        DEBUGP("ip_nat_sdp():\n");
+        /* Connection will come from reply */
+        newip = ct->tuplehash[!dir].tuple.dst.ip;
+        exp->tuple.dst.ip = newip;
+        /* Keep the originally announced port before trying others. */
+        exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
+        exp->dir = !dir;
+        /* When we see the expected packet, we need to NAT it the same
+           way as this one. */
+        exp->expectfn = ip_nat_follow_master;
+        /* Try to get same port: if not, try to change it.  The 16 bit
+           counter wrapping to 0 means every candidate was tried. */
+        for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) {
+                exp->tuple.dst.u.udp.port = htons(port);
+                if (ip_conntrack_expect_related(exp) == 0)
+                        break;
+        }
+        if (port == 0)
+                return NF_DROP;
+        if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) {
+                /* Undo the registration done in the loop above. */
+                ip_conntrack_unexpect_related(exp);
+                return NF_DROP;
+        }
+        return NF_ACCEPT;
+}
+/* Module exit: detach both NAT hooks from the SIP conntrack helper. */
+static void __exit fini(void)
+{
+        ip_nat_sdp_hook = NULL;
+        ip_nat_sip_hook = NULL;
+        /* Make sure no one calls them, meanwhile. */
+        synchronize_net();
+}
+/* Module init: install the SIP and SDP NAT hooks.  Always returns 0. */
+static int __init init(void)
+{
+        /* Both hook slots must still be free. */
+        BUG_ON(ip_nat_sip_hook != NULL);
+        BUG_ON(ip_nat_sdp_hook != NULL);
+        ip_nat_sdp_hook = ip_nat_sdp;
+        ip_nat_sip_hook = ip_nat_sip;
+        return 0;
+}
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index c33244263b9..d20d557f915 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -1348,4 +1348,4 @@ static void __exit ip_nat_snmp_basic_fini(void)
 module_init(ip_nat_snmp_basic_init);
 module_exit(ip_nat_snmp_basic_fini);
-module_param(debug, bool, 0600);
+module_param(debug, int, 0600);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index aad9d28c8d7..dbc83c5d7aa 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -241,25 +241,17 @@ clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
        struct iphdr *iph = skb->nh.iph;
        unsigned long hashval;
        u_int16_t sport, dport;
-        struct tcphdr *th;
+        u_int16_t *ports;
-        struct udphdr *uh;
-        struct icmphdr *ih;
        switch (iph->protocol) {
        case IPPROTO_TCP:
-                th = (void *)iph+iph->ihl*4;
-                sport = ntohs(th->source);
-                dport = ntohs(th->dest);
-                break;
        case IPPROTO_UDP:
-                uh = (void *)iph+iph->ihl*4;
+        case IPPROTO_SCTP:
-                sport = ntohs(uh->source);
+        case IPPROTO_DCCP:
-                dport = ntohs(uh->dest);
-                break;
        case IPPROTO_ICMP:
-                ih = (void *)iph+iph->ihl*4;
+                ports = (void *)iph+iph->ihl*4;
-                sport = ntohs(ih->un.echo.id);
+                sport = ports[0];
-                dport = (ih->type<<8)|ih->code;
+                dport = ports[1];
                break;
        default:
                if (net_ratelimit()) {
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 0bba3c2bb78..431a3ce6f7b 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -147,6 +147,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
        /* This packet will not be the same as the other: clear nf fields */
        nf_reset(nskb);
        nskb->nfmark = 0;
+        skb_init_secmark(nskb);
        tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 7c6836c4646..92980ab8ce4 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -28,9 +28,6 @@
 #include <linux/jhash.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
-#include <linux/sctp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/list.h>
@@ -83,6 +80,7 @@ struct ipt_hashlimit_htable {
        /* used internally */
        spinlock_t lock;                /* lock for list_head */
        u_int32_t rnd;                  /* random seed for hash */
+        int rnd_initialized;
        struct timer_list timer;        /* timer for gc */
        atomic_t count;                 /* number entries in table */
@@ -137,8 +135,10 @@ __dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst)
        /* initialize hash with random val at the time we allocate
         * the first hashtable entry */
-        if (!ht->rnd)
+        if (!ht->rnd_initialized) {
                get_random_bytes(&ht->rnd, 4);
+                ht->rnd_initialized = 1;
+        }
        if (ht->cfg.max &&
            atomic_read(&ht->count) >= ht->cfg.max) {
@@ -217,7 +217,7 @@ static int htable_create(struct ipt_hashlimit_info *minfo)
        atomic_set(&hinfo->count, 0);
        atomic_set(&hinfo->use, 1);
-        hinfo->rnd = 0;
+        hinfo->rnd_initialized = 0;
        spin_lock_init(&hinfo->lock);
        hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir);
        if (!hinfo->pde) {
@@ -381,49 +381,6 @@ static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
                dh->rateinfo.credit = dh->rateinfo.credit_cap;
 }
-static inline int get_ports(const struct sk_buff *skb, int offset, 
-                            u16 ports[2])
-{
-        union {
-                struct tcphdr th;
-                struct udphdr uh;
-                sctp_sctphdr_t sctph;
-        } hdr_u, *ptr_u;
-        /* Must not be a fragment. */
-        if (offset)
-                return 1;
-        /* Must be big enough to read ports (both UDP and TCP have
-           them at the start). */
-        ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u); 
-        if (!ptr_u)
-                return 1;
-        switch (skb->nh.iph->protocol) {
-                case IPPROTO_TCP:
-                        ports[0] = ptr_u->th.source;
-                        ports[1] = ptr_u->th.dest;
-                        break;
-                case IPPROTO_UDP:
-                        ports[0] = ptr_u->uh.source;
-                        ports[1] = ptr_u->uh.dest;
-                        break;
-                case IPPROTO_SCTP:
-                        ports[0] = ptr_u->sctph.source;
-                        ports[1] = ptr_u->sctph.dest;
-                        break;
-                default:
-                        /* all other protocols don't supprot per-port hash
-                         * buckets */
-                        ports[0] = ports[1] = 0;
-                        break;
-        }
-        return 0;
-}
 static int
 hashlimit_match(const struct sk_buff *skb,
                const struct net_device *in,
@@ -449,8 +406,22 @@ hashlimit_match(const struct sk_buff *skb,
                dst.src_ip = skb->nh.iph->saddr;
        if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT
            ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) {
-                u_int16_t ports[2];
+                u_int16_t _ports[2], *ports;
-                if (get_ports(skb, offset, ports)) {
+                switch (skb->nh.iph->protocol) {
+                case IPPROTO_TCP:
+                case IPPROTO_UDP:
+                case IPPROTO_SCTP:
+                case IPPROTO_DCCP:
+                        ports = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+                                                   sizeof(_ports), &_ports);
+                        break;
+                default:
+                        _ports[0] = _ports[1] = 0;
+                        ports = _ports;
+                        break;
+                }
+                if (!ports) {
                        /* We've been asked to examine this packet, and we
                          can't.  Hence, no choice but to drop. */
                        *hotdrop = 1;
@@ -561,7 +532,7 @@ static void
 hashlimit_destroy(const struct xt_match *match, void *matchinfo,
                  unsigned int matchsize)
 {
-        struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo;
+        struct ipt_hashlimit_info *r = matchinfo;
        htable_put(r->hinfo);
 }
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index b847ee409ef..61a2139f9cf 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -1,1007 +1,499 @@
-/* Kernel module to check if the source address has been seen recently. */
+/*
-/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */
+ * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
-/* Author: Stephen Frost <sfrost@snowman.net> */
+ *
-/* Project Page: http://snowman.net/projects/ipt_recent/ */
+ * This program is free software; you can redistribute it and/or modify
-/* This software is distributed under the terms of the GPL, Version 2 */
+ * it under the terms of the GNU General Public License version 2 as
-/* This copyright does not cover user programs that use kernel services
+ * published by the Free Software Foundation.
- * by normal system calls. */
+ *
+ * This is a replacement of the old ipt_recent module, which carried the
-#include <linux/module.h>
+ * following copyright notice:
-#include <linux/skbuff.h>
+ *
+ * Author: Stephen Frost <sfrost@snowman.net>
+ * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org
+ */
+#include <linux/init.h>
+#include <linux/moduleparam.h>
 #include <linux/proc_fs.h>
-#include <linux/spinlock.h>
+#include <linux/seq_file.h>
-#include <linux/interrupt.h>
+#include <linux/string.h>
-#include <asm/uaccess.h>
 #include <linux/ctype.h>
-#include <linux/ip.h>
+#include <linux/list.h>
-#include <linux/vmalloc.h>
+#include <linux/random.h>
-#include <linux/moduleparam.h>
+#include <linux/jhash.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ipt_recent.h>
-#undef DEBUG
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-#define HASH_LOG 9
+MODULE_DESCRIPTION("IP tables recently seen matching module");
+MODULE_LICENSE("GPL");
-/* Defaults, these can be overridden on the module command-line. */
 static unsigned int ip_list_tot = 100;
 static unsigned int ip_pkt_list_tot = 20;
 static unsigned int ip_list_hash_size = 0;
 static unsigned int ip_list_perms = 0644;
-#ifdef DEBUG
-static int debug = 1;
-#endif
-static char version[] =
-KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>.  http://snowman.net/projects/ipt_recent/\n";
-MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>");
-MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER);
-MODULE_LICENSE("GPL");
 module_param(ip_list_tot, uint, 0400);
 module_param(ip_pkt_list_tot, uint, 0400);
 module_param(ip_list_hash_size, uint, 0400);
 module_param(ip_list_perms, uint, 0400);
-#ifdef DEBUG
+MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
-module_param(debug, bool, 0600);
+MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)");
-MODULE_PARM_DESC(debug,"enable debugging output");
+MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
-#endif
+MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files");
-MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list");
-MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember");
-MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs");
+struct recent_entry {
-MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files");
+        struct list_head        list;
+        struct list_head        lru_list;
-/* Structure of our list of recently seen addresses. */
+        u_int32_t               addr;           /* address this entry tracks; the two list heads above link it into a table's iphash[] bucket and its lru_list */
-struct recent_ip_list {
+        u_int8_t                ttl;            /* TTL stored with the entry; 0 matches any TTL in recent_entry_lookup() */
-        u_int32_t addr;
+        u_int8_t                index;          /* next stamps[] slot to write */
-        u_int8_t  ttl;
+        u_int16_t               nstamps;        /* number of stamps[] slots filled so far */
-        unsigned long last_seen;
+        unsigned long           stamps[0];      /* jiffies of recorded hits; allocated with ip_pkt_list_tot slots */
-        unsigned long *last_pkts;
-        u_int32_t oldest_pkt;
-        u_int32_t hash_entry;
-        u_int32_t time_pos;
-};
-struct time_info_list {
-        u_int32_t position;
-        u_int32_t time;
 };
-/* Structure of our linked list of tables of recent lists. */
+struct recent_table {   /* one named set of recent_entry records */
-struct recent_ip_tables {
+        struct list_head        list;           /* link in the file-scope "tables" list */
-        char name[IPT_RECENT_NAME_LEN];
+        char                    name[IPT_RECENT_NAME_LEN];      /* key compared with strcmp() in recent_table_lookup() */
-        int count;
-        int time_pos;
-        struct recent_ip_list *table;
-        struct recent_ip_tables *next;
-        spinlock_t list_lock;
-        int *hash_table;
-        struct time_info_list *time_info;
 #ifdef CONFIG_PROC_FS
-        struct proc_dir_entry *status_proc;
+        struct proc_dir_entry   *proc;          /* NOTE(review): presumably this table's /proc entry; it is set up outside this view -- confirm */
-#endif /* CONFIG_PROC_FS */
+#endif
+        unsigned int            refcnt;         /* NOTE(review): reference count; its users are not visible here */
+        unsigned int            entries;        /* current number of entries; recent_entry_init() evicts once it reaches ip_list_tot */
+        struct list_head        lru_list;       /* entries ordered from oldest (head) to most recently added/updated (tail) */
+        struct list_head        iphash[0];      /* hash buckets indexed by recent_entry_hash(); recent_table_flush() walks ip_list_hash_size of them */
 };
-/* Our current list of addresses we have recently seen.
+static LIST_HEAD(tables);
- * Only added to on a --set, and only updated on --set || --update 
- */
-static struct recent_ip_tables *r_tables = NULL;
-/* We protect r_list with this spinlock so two processors are not modifying
- * the list at the same time. 
- */
 static DEFINE_SPINLOCK(recent_lock);
+static DEFINE_MUTEX(recent_mutex);
 #ifdef CONFIG_PROC_FS
-/* Our /proc/net/ipt_recent entry */
+static struct proc_dir_entry    *proc_dir;
-static struct proc_dir_entry *proc_net_ipt_recent = NULL;
+static struct file_operations   recent_fops;
-#endif
-/* Function declaration for later. */
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const struct xt_match *match,
-      const void *matchinfo,
-      int offset,
-      unsigned int protoff,
-      int *hotdrop);
-/* Function to hash a given address into the hash table of table_size size */
-static int hash_func(unsigned int addr, int table_size)
-{
-        int result = 0;
-        unsigned int value = addr;
-        do { result ^= value; } while((value >>= HASH_LOG));
-#ifdef DEBUG
-        if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n",
-                         result & (table_size - 1),
-                         addr,
-                         table_size);
 #endif
-        return(result & (table_size - 1));
+static u_int32_t hash_rnd;      /* random seed mixed into the hash below */
-}
+static int hash_rnd_initted;    /* set to 1 once hash_rnd has been filled */
-#ifdef CONFIG_PROC_FS
+static unsigned int recent_entry_hash(u_int32_t addr)   /* Map addr to a bucket
+ * index: jhash_1word() seeded with hash_rnd, masked with
+ * ip_list_hash_size - 1.
+ * NOTE(review): the mask only reaches every bucket when ip_list_hash_size
+ * is a power of two, and its initialiser in this file is 0 -- presumably it
+ * is set to a power of two before the first lookup; confirm. */
-/* This is the function which produces the output for our /proc output
- * interface which lists each IP address, the last seen time and the 
- * other recent times the address was seen.
- */
-static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
 {
-        int len = 0, count, last_len = 0, pkt_count;
+        if (!hash_rnd_initted) {        /* seed lazily on first use */
-        off_t pos = 0;
+                get_random_bytes(&hash_rnd, 4);
-        off_t begin = 0;
+                hash_rnd_initted = 1;
-        struct recent_ip_tables *curr_table;
-        curr_table = (struct recent_ip_tables*) data;
-        spin_lock_bh(&curr_table->list_lock);
-        for(count = 0; count < ip_list_tot; count++) {
-                if(!curr_table->table[count].addr) continue;
-                last_len = len;
-                len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr));
-                len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl);
-                len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen);
-                len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt);
-                len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]);
-                for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) {
-                        if(!curr_table->table[count].last_pkts[pkt_count]) break;
-                        len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]);
-                }
-                len += sprintf(buffer+len,"\n");
-                pos = begin + len;
-                if(pos < offset) { len = 0; begin = pos; }
-                if(pos > offset + length) { len = last_len; break; }
         }
+        return jhash_1word(addr, hash_rnd) & (ip_list_hash_size - 1);
-        *start = buffer + (offset - begin);
-        len -= (offset - begin);
-        if(len > length) len = length;
-        spin_unlock_bh(&curr_table->list_lock);
-        return len;
 }
-/* ip_recent_ctrl provides an interface for users to modify the table
+static struct recent_entry *
- * directly.  This allows adding entries, removing entries, and
+recent_entry_lookup(const struct recent_table *table, u_int32_t addr, u_int8_t ttl)
- * flushing the entire table.
- * This is done by opening up the appropriate table for writing and
- * sending one of:
- * xx.xx.xx.xx   -- Add entry to table with current time
- * +xx.xx.xx.xx  -- Add entry to table with current time
- * -xx.xx.xx.xx  -- Remove entry from table
- * clear         -- Flush table, remove all entries
- */
-static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data)
 {
-        static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff };
+        struct recent_entry *e; /* Purpose of this function: return the
+                                 * first entry in addr's hash bucket of
+                                 * @table whose address equals @addr and
+                                 * whose TTL is compatible with @ttl, or
+                                 * NULL when the bucket holds none. */
-        u_int32_t val;
+        unsigned int h;
-        int base, used = 0;
-        char c, *cp;
+        h = recent_entry_hash(addr);
-        union iaddr {
+        list_for_each_entry(e, &table->iphash[h], list)
-                uint8_t bytes[4];
+                if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) /* a TTL of 0 on either side acts as a wildcard */
-                uint32_t word;
+                        return e;
-        } res;
+        return NULL;
-        uint8_t *pp = res.bytes;
+}
-        int digit;
-        char buffer[20];
-        int len, check_set = 0, count;
-        u_int32_t addr = 0;
-        struct sk_buff *skb;
-        struct ipt_recent_info *info;
-        struct recent_ip_tables *curr_table;
-        curr_table = (struct recent_ip_tables*) data;
-        if(size > 20) len = 20; else len = size;
-        if(copy_from_user(buffer,input,len)) return -EFAULT;
-        if(len < 20) buffer[len] = '\0';
-#ifdef DEBUG
-        if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer);
-#endif
-        cp = buffer;
+static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
-        while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; }
+{       /* Unlink @e from its hash bucket and from the LRU list, free it,
+         * and decrement the entry count of @t. */
+        list_del(&e->list);
+        list_del(&e->lru_list);
+        kfree(e);               /* e is not dereferenced after this point */
+        t->entries--;
+}
-        /* Check if we are asked to flush the entire table */
+static struct recent_entry *
-        if(!memcmp(cp,"clear",5)) {
+recent_entry_init(struct recent_table *t, u_int32_t addr, u_int8_t ttl)
-                used += 5;
+{       /* Allocate a new entry for @addr/@ttl, stamp it with the current
+         * jiffies and link it into @t (hash bucket tail and LRU tail).
+         * When the table already holds ip_list_tot entries, the entry at
+         * the head of the LRU list is removed first.
+         * Returns the new entry, or NULL when kmalloc() fails. */
-                spin_lock_bh(&curr_table->list_lock);
+        struct recent_entry *e;
-                curr_table->time_pos = 0;
-                for(count = 0; count < ip_list_hash_size; count++) {
-                        curr_table->hash_table[count] = -1;
-                }
-                for(count = 0; count < ip_list_tot; count++) {
-                        curr_table->table[count].last_seen = 0;
-                        curr_table->table[count].addr = 0;
-                        curr_table->table[count].ttl = 0;
-                        memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
-                        curr_table->table[count].oldest_pkt = 0;
-                        curr_table->table[count].time_pos = 0;
-                        curr_table->time_info[count].position = count;
-                        curr_table->time_info[count].time = 0;
-                }
-                spin_unlock_bh(&curr_table->list_lock);
-                return used;
-        }
-        check_set = IPT_RECENT_SET;
+        if (t->entries >= ip_list_tot) { /* table full: evict before allocating, so the evicted entry stays gone even if kmalloc() fails below */
-        switch(*cp) {
+                e = list_entry(t->lru_list.next, struct recent_entry, lru_list); /* NOTE(review): assumes lru_list is non-empty here, i.e. ip_list_tot > 0 -- confirm */
-                case '+': check_set = IPT_RECENT_SET; cp++; used++; break;
+                recent_entry_remove(t, e);
-                case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break;
-                default: if(!isdigit(*cp)) return (used+1); break;
         }
+        e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, /* header plus ip_pkt_list_tot timestamp slots */
+                    GFP_ATOMIC);
+        if (e == NULL)
+                return NULL;
+        e->addr      = addr;
+        e->ttl       = ttl;
+        e->stamps[0] = jiffies;         /* only slot 0 is initialised; nstamps records how many slots are valid */
+        e->nstamps   = 1;
+        e->index     = 1;               /* NOTE(review): slot 1 does not exist when ip_pkt_list_tot is 1, so any user of index must wrap it before subscripting */
+        list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]);
+        list_add_tail(&e->lru_list, &t->lru_list);
+        t->entries++;
+        return e;
+}
-#ifdef DEBUG
+static void recent_entry_update(struct recent_table *t, struct recent_entry *e)
-        if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set);
+{       /* Record a new hit for @e: store the current jiffies in the next
+         * slot of the stamps[] ring (ip_pkt_list_tot slots), grow nstamps
+         * while the ring is still filling, and move @e to the tail of the
+         * LRU list of @t. */
-#endif
+        e->index %= ip_pkt_list_tot;    /* wrap BEFORE subscripting: recent_entry_init() leaves index at 1, which is past the end of stamps[] when ip_pkt_list_tot is 1 */
-        /* Get addr (effectively inet_aton()) */
+        e->stamps[e->index++] = jiffies;
-        /* Shamelessly stolen from libc, a function in the kernel for doing
+        if (e->index > e->nstamps)
-         * this would, of course, be greatly preferred, but our options appear
+                e->nstamps = e->index;
-         * to be rather limited, so we will just do it ourselves here.
+        list_move_tail(&e->lru_list, &t->lru_list);
-         */
+}
-        res.word = 0;
-        c = *cp;
-        for(;;) {
-                if(!isdigit(c)) return used;
-                val = 0; base = 10; digit = 0;
-                if(c == '0') {
-                        c = *++cp;
-                        if(c == 'x' || c == 'X') base = 16, c = *++cp;
-                        else { base = 8; digit = 1; }
-                }
-                for(;;) {
-                        if(isascii(c) && isdigit(c)) {
-                                if(base == 8 && (c == '8' || c == '0')) return used;
-                                val = (val * base) + (c - '0');
-                                c = *++cp;
-                                digit = 1;
-                        } else if(base == 16 && isascii(c) && isxdigit(c)) {
-                                val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A'));
-                                c = *++cp;
-                                digit = 1;
-                        } else break;
-                }
-                if(c == '.') {
-                        if(pp > res.bytes + 2 || val > 0xff) return used;
-                        *pp++ = val;
-                        c = *++cp;
-                } else break;
-        }
-        used = cp - buffer;
-        if(c != '\0' && (!isascii(c) || !isspace(c))) return used;
-        if(c == '\n') used++;
-        if(!digit) return used;
-        if(val > max[pp - res.bytes]) return used;
+static struct recent_table *recent_table_lookup(const char *name)
-        addr = res.word | htonl(val);
+{       /* Return the table on the file-scope "tables" list whose name
+         * equals @name (strcmp()), or NULL when there is none. */
+        struct recent_table *t;
-        if(!addr && check_set == IPT_RECENT_SET) return used;
+        list_for_each_entry(t, &tables, list)
+                if (!strcmp(t->name, name))
+                        return t;
+        return NULL;
+}
-#ifdef DEBUG
+static void recent_table_flush(struct recent_table *t)
-        if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used);
+{       /* Remove (and free, via recent_entry_remove()) every entry in each
+         * of the ip_list_hash_size hash buckets of @t. */
-#endif
+        struct recent_entry *e, *next;
+        unsigned int i;
-        /* Set up and just call match */
+        for (i = 0; i < ip_list_hash_size; i++) {
-        info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL);
+                list_for_each_entry_safe(e, next, &t->iphash[i], list) /* _safe variant: e is unlinked and freed inside the loop body */
-        if(!info) { return -ENOMEM; }
+                        recent_entry_remove(t, e);
-        info->seconds = 0;
-        info->hit_count = 0;
-        info->check_set = check_set;
-        info->invert = 0;
-        info->side = IPT_RECENT_SOURCE;
-        strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN);
-        info->name[IPT_RECENT_NAME_LEN-1] = '\0';
-        skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL);
-        if (!skb) {
-                used = -ENOMEM;
-                goto out_free_info;
-        }
-        skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL);
-        if (!skb->nh.iph) {
-                used = -ENOMEM;
-                goto out_free_skb;
         }
-        skb->nh.iph->saddr = addr;
-        skb->nh.iph->daddr = 0;
-        /* Clear ttl since we have no way of knowing it */
-        skb->nh.iph->ttl = 0;
-        match(skb,NULL,NULL,NULL,info,0,0,NULL);
-        kfree(skb->nh.iph);
-out_free_skb:
-        kfree(skb);
-out_free_info:
-        kfree(info);
-#ifdef DEBUG
-        if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used);
-#endif
-        return used;
 }
-#endif /* CONFIG_PROC_FS */
-/* 'match' is our primary function, called by the kernel whenever a rule is
- * hit with our module as an option to it.
- * What this function does depends on what was specifically asked of it by
- * the user:
- * --set -- Add or update last seen time of the source address of the packet
- *   -- matchinfo->check_set == IPT_RECENT_SET
- * --rcheck -- Just check if the source address is in the list
- *   -- matchinfo->check_set == IPT_RECENT_CHECK
- * --update -- If the source address is in the list, update last_seen
- *   -- matchinfo->check_set == IPT_RECENT_UPDATE
- * --remove -- If the source address is in the list, remove it
- *   -- matchinfo->check_set == IPT_RECENT_REMOVE
- * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds
- *   -- matchinfo->seconds
- * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times
- *   -- matchinfo->hit_count
- * --seconds and --hitcount can be combined
- */
 static int
-match(const struct sk_buff *skb,
+ipt_recent_match(const struct sk_buff *skb,
-      const struct net_device *in,
+                 const struct net_device *in, const struct net_device *out,
-      const struct net_device *out,
+                 const struct xt_match *match, const void *matchinfo,
-      const struct xt_match *match,
+                 int offset, unsigned int protoff, int *hotdrop)
-      const void *matchinfo,
-      int offset,
-      unsigned int protoff,
-      int *hotdrop)
 {
-        int pkt_count, hits_found, ans;
-        unsigned long now;
        const struct ipt_recent_info *info = matchinfo;
-        u_int32_t addr = 0, time_temp;
+        struct recent_table *t;
-        u_int8_t ttl = skb->nh.iph->ttl;
+        struct recent_entry *e;
-        int *hash_table;
+        u_int32_t addr;
-        int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1;
+        u_int8_t ttl;
-        struct time_info_list *time_info;
+        int ret = info->invert;
-        struct recent_ip_tables *curr_table;
-        struct recent_ip_tables *last_table;
-        struct recent_ip_list *r_list;
-#ifdef DEBUG
-        if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n");
-#endif
-        /* Default is false ^ info->invert */
-        ans = info->invert;
-#ifdef DEBUG
+        if (info->side == IPT_RECENT_DEST)
-        if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name);
+                addr = skb->nh.iph->daddr;
-#endif
+        else
+                addr = skb->nh.iph->saddr;
-        /* if out != NULL then routing has been done and TTL changed.
+        ttl = skb->nh.iph->ttl;
-         * We change it back here internally for match what came in before routing. */
+        /* use TTL as seen before forwarding */
-        if(out) ttl++;
+        if (out && !skb->sk)
+                ttl++;
-        /* Find the right table */
        spin_lock_bh(&recent_lock);
-        curr_table = r_tables;
+        t = recent_table_lookup(info->name);
-        while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) );
+        e = recent_entry_lookup(t, addr,
+                                info->check_set & IPT_RECENT_TTL ? ttl : 0);
-#ifdef DEBUG
+        if (e == NULL) {
-        if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name);
+                if (!(info->check_set & IPT_RECENT_SET))
-#endif
+                        goto out;
+                e = recent_entry_init(t, addr, ttl);
-        spin_unlock_bh(&recent_lock);
+                if (e == NULL)
+                        *hotdrop = 1;
-        /* Table with this name not found, match impossible */
+                ret ^= 1;
-        if(!curr_table) { return ans; }
+                goto out;
-        /* Make sure no one is changing the list while we work with it */
-        spin_lock_bh(&curr_table->list_lock);
-        r_list = curr_table->table;
-        if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr;
-        if(!addr) { 
-#ifdef DEBUG
-                if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr);
-#endif
-                spin_unlock_bh(&curr_table->list_lock);
-                return ans;
-        }
-#ifdef DEBUG
-        if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl);
-#endif
-        /* Get jiffies now in case they changed while we were waiting for a lock */
-        now = jiffies;
-        hash_table = curr_table->hash_table;
-        time_info = curr_table->time_info;
-        orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size);
-        /* Hash entry at this result used */
-        /* Check for TTL match if requested.  If TTL is zero then a match would never
-         * happen, so match regardless of existing TTL in that case.  Zero means the
-         * entry was added via the /proc interface anyway, so we will just use the
-         * first TTL we get for that IP address. */
-        if(info->check_set & IPT_RECENT_TTL) {
-                while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr &&
-                        (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) {
-                        /* Collision in hash table */
-                        hash_result = (hash_result + 1) % ip_list_hash_size;
-                }
-        } else {
-                while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) {
-                        /* Collision in hash table */
-                        hash_result = (hash_result + 1) % ip_list_hash_size;
-                }
-        }
-        if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) {
-                /* IP not in list and not asked to SET */
-                spin_unlock_bh(&curr_table->list_lock);
-                return ans;
-        }
-        /* Check if we need to handle the collision, do not need to on REMOVE */
-        if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) {
-#ifdef DEBUG
-                if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n",
-                                 orig_hash_result,
-                                 hash_result,
-                                 r_list[hash_table[orig_hash_result]].addr,
-                                 addr);
-#endif
-                /* We had a collision.
-                 * orig_hash_result is where we started, hash_result is where we ended up.
-                 * So, swap them because we are likely to see the same guy again sooner */
-#ifdef DEBUG
-                if(debug) {
-                  printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]);
-                  printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n",
-                                r_list[hash_table[orig_hash_result]].hash_entry);
-                }
-#endif
-                r_list[hash_table[orig_hash_result]].hash_entry = hash_result;
-                temp = hash_table[orig_hash_result];
-#ifdef DEBUG
-                if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]);
-#endif
-                hash_table[orig_hash_result] = hash_table[hash_result];
-                hash_table[hash_result] = temp;
-                temp = hash_result;
-                hash_result = orig_hash_result;
-                orig_hash_result = temp;
-                time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result];
-                if(hash_table[hash_result] != -1) {
-                        r_list[hash_table[hash_result]].hash_entry = hash_result;
-                        time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
-                }
-#ifdef DEBUG
-                if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n");
-#endif
        }
-        if(hash_table[hash_result] == -1) {
+        if (info->check_set & IPT_RECENT_SET)
-#ifdef DEBUG
+                ret ^= 1;
-                if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n",
+        else if (info->check_set & IPT_RECENT_REMOVE) {
-                                 hash_result, addr);
+                recent_entry_remove(t, e);
-#endif
+                ret ^= 1;
+        } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) {
-                /* New item found and IPT_RECENT_SET, so we need to add it */
+                unsigned long t = jiffies - info->seconds * HZ;
-                location = time_info[curr_table->time_pos].position;
+                unsigned int i, hits = 0;
-                hash_table[r_list[location].hash_entry] = -1;
-                hash_table[hash_result] = location;
+                for (i = 0; i < e->nstamps; i++) {
-                memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
+                        if (info->seconds && time_after(t, e->stamps[i]))
-                r_list[location].time_pos = curr_table->time_pos;
+                                continue;
-                r_list[location].addr = addr;
+                        if (++hits >= info->hit_count) {
-                r_list[location].ttl = ttl;
+                                ret ^= 1;
-                r_list[location].last_seen = now;
+                                break;
-                r_list[location].oldest_pkt = 1;
-                r_list[location].last_pkts[0] = now;
-                r_list[location].hash_entry = hash_result;
-                time_info[curr_table->time_pos].time = r_list[location].last_seen;
-                curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot;
-                ans = !info->invert;
-        } else {
-#ifdef DEBUG
-                if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n",
-                                 hash_result,
-                                 addr);
-#endif
-                /* Existing item found */
-                location = hash_table[hash_result];
-                /* We have a match on address, now to make sure it meets all requirements for a
-                 * full match. */
-                if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) {
-                        if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert;
-                        if(info->seconds && !info->hit_count) {
-                                if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert;
-                        }
-                        if(info->seconds && info->hit_count) {
-                                for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
-                                        if(r_list[location].last_pkts[pkt_count] == 0) break;
-                                        if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++;
-                                }
-                                if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
-                        }
-                        if(info->hit_count && !info->seconds) {
-                                for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
-                                        if(r_list[location].last_pkts[pkt_count] == 0) break;
-                                        hits_found++;
-                                }
-                                if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
                        }
                }
-#ifdef DEBUG
-                if(debug) {
-                        if(ans)
-                                printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr);
-                        else
-                                printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr);
-                }
-#endif
-                /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the
-                 * current timestamp to the last_seen. */
-                if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) {
-#ifdef DEBUG
-                        if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n");
-#endif
-                        /* Have to update our time info */
-                        time_loc = r_list[location].time_pos;
-                        time_info[time_loc].time = now;
-                        time_info[time_loc].position = location;
-                        while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
-                                time_temp = time_info[time_loc].time;
-                                time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
-                                time_info[(time_loc+1)%ip_list_tot].time = time_temp;
-                                time_temp = time_info[time_loc].position;
-                                time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
-                                time_info[(time_loc+1)%ip_list_tot].position = time_temp;
-                                r_list[time_info[time_loc].position].time_pos = time_loc;
-                                r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
-                                time_loc = (time_loc+1) % ip_list_tot;
-                        }
-                        r_list[location].time_pos = time_loc;
-                        r_list[location].ttl = ttl;
-                        r_list[location].last_pkts[r_list[location].oldest_pkt] = now;
-                        r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot;
-                        r_list[location].last_seen = now;
-                }
-                /* If we have been asked to remove the entry from the list, just set it to 0 */
-                if(info->check_set & IPT_RECENT_REMOVE) {
-#ifdef DEBUG
-                        if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result);
-#endif
-                        /* Check if this is part of a collision chain */
-                        while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) {
-                                orig_hash_result++;
-                                if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) {
-                                        /* Found collision chain, how deep does this rabbit hole go? */
-#ifdef DEBUG
-                                        if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n");
-#endif
-                                        end_collision_chain = orig_hash_result;
-                                }
-                        }
-                        if(end_collision_chain != -1) {
-#ifdef DEBUG
-                                if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n");
-#endif
-                                /* Part of a collision chain, swap it with the end of the chain
-                                 * before removing. */
-                                r_list[hash_table[end_collision_chain]].hash_entry = hash_result;
-                                temp = hash_table[end_collision_chain];
-                                hash_table[end_collision_chain] = hash_table[hash_result];
-                                hash_table[hash_result] = temp;
-                                time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
-                                hash_result = end_collision_chain;
-                                r_list[hash_table[hash_result]].hash_entry = hash_result;
-                                time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
-                        }
-                        location = hash_table[hash_result];
-                        hash_table[r_list[location].hash_entry] = -1;
-                        time_loc = r_list[location].time_pos;
-                        time_info[time_loc].time = 0;
-                        time_info[time_loc].position = location;
-                        while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
-                                time_temp = time_info[time_loc].time;
-                                time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
-                                time_info[(time_loc+1)%ip_list_tot].time = time_temp;
-                                time_temp = time_info[time_loc].position;
-                                time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
-                                time_info[(time_loc+1)%ip_list_tot].position = time_temp;
-                                r_list[time_info[time_loc].position].time_pos = time_loc;
-                                r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
-                                time_loc = (time_loc+1) % ip_list_tot;
-                        }
-                        r_list[location].time_pos = time_loc;
-                        r_list[location].last_seen = 0;
-                        r_list[location].addr = 0;
-                        r_list[location].ttl = 0;
-                        memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
-                        r_list[location].oldest_pkt = 0;
-                        ans = !info->invert;
-                }
-                spin_unlock_bh(&curr_table->list_lock);
-                return ans;
        }
-        spin_unlock_bh(&curr_table->list_lock);
+        if (info->check_set & IPT_RECENT_SET ||
-#ifdef DEBUG
+            (info->check_set & IPT_RECENT_UPDATE && ret)) {
-        if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n");
+                recent_entry_update(t, e);
-#endif
+                e->ttl = ttl;
-        return ans;
+        }
+out:
+        spin_unlock_bh(&recent_lock);
+        return ret;
 }
-/* This function is to verify that the rule given during the userspace iptables
- * command is correct.
- * If the command is valid then we check if the table name referred to by the
- * rule exists, if not it is created.
- */
+/*
+ * checkentry hook: validate a rule's ipt_recent_info and take a reference on
+ * the table named by info->name, creating the table if it does not exist yet.
+ *
+ * The rule is rejected (0 is returned) unless exactly one of IPT_RECENT_SET,
+ * IPT_RECENT_REMOVE, IPT_RECENT_CHECK or IPT_RECENT_UPDATE is set, if
+ * seconds/hit_count are combined with SET or REMOVE, or if the name is empty
+ * or not NUL-terminated within IPT_RECENT_NAME_LEN bytes.
+ *
+ * Returns 1 on success and 0 on validation, allocation or /proc failure.
+ */
 static int
-checkentry(const char *tablename,
+ipt_recent_checkentry(const char *tablename, const void *ip,
-           const void *ip,
+                      const struct xt_match *match, void *matchinfo,
-           const struct xt_match *match,
+                      unsigned int matchsize, unsigned int hook_mask)
-           void *matchinfo,
-           unsigned int matchsize,
-           unsigned int hook_mask)
 {
-        int flag = 0, c;
-        unsigned long *hold;
         const struct ipt_recent_info *info = matchinfo;
-        struct recent_ip_tables *curr_table, *find_table, *last_table;
+        struct recent_table *t;
+        unsigned i;
-#ifdef DEBUG
+        int ret = 0;
-        if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n");
-#endif
-        /* seconds and hit_count only valid for CHECK/UPDATE */
-        if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; }
-        if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; }
-        if(info->check_set & IPT_RECENT_CHECK) flag++;
-        if(info->check_set & IPT_RECENT_UPDATE) flag++;
-        /* One and only one of these should ever be set */
-        if(flag != 1) return 0;
-        /* Name must be set to something */
-        if(!info->name || !info->name[0]) return 0;
-        /* Things look good, create a list for this if it does not exist */
+        /* exactly one of SET/REMOVE/CHECK/UPDATE must be requested */
+        if (hweight8(info->check_set &
-        /* Lock the linked list while we play with it */
+                     (IPT_RECENT_SET | IPT_RECENT_REMOVE |
-        spin_lock_bh(&recent_lock);
+                      IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1)
+                return 0;
-        /* Look for an entry with this name already created */
+        /* seconds and hit_count are only accepted together with CHECK/UPDATE */
+        if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) &&
-        /* Finds the end of the list and the entry before the end if current name does not exist */
+            (info->seconds || info->hit_count))
-        find_table = r_tables;
+                return 0;
-        while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
+        /* the name must be non-empty and terminated inside its buffer */
+        if (info->name[0] == '\0' ||
+            strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN)
+                return 0;
-        /* If a table already exists just increment the count on that table and return */
+        /* recent_mutex is also taken by ipt_recent_destroy() around refcnt changes */
+        mutex_lock(&recent_mutex);
-        if(find_table) { 
+        t = recent_table_lookup(info->name);
-#ifdef DEBUG
+        if (t != NULL) {
-                if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name);
+                t->refcnt++;
-#endif
+                ret = 1;
-                find_table->count++;
+                goto out;
-                spin_unlock_bh(&recent_lock);
-                return 1;
         }
-        spin_unlock_bh(&recent_lock);
+        /* the table header and its ip_list_hash_size bucket heads are one allocation */
+        t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size,
+                    GFP_KERNEL);
-        /* Table with this name not found */
+        if (t == NULL)
-        /* Allocate memory for new linked list item */
+                goto out;
+        t->refcnt = 1;
-#ifdef DEBUG
+        /*
+         * The strnlen() check above guarantees a terminator within
+         * IPT_RECENT_NAME_LEN bytes; assumes t->name is at least that large
+         * -- TODO confirm against struct recent_table.
+         */
+        strcpy(t->name, info->name);
-        if(debug) {
+        INIT_LIST_HEAD(&t->lru_list);
-                printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name);
+        for (i = 0; i < ip_list_hash_size; i++)
-                printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables));
+                INIT_LIST_HEAD(&t->iphash[i]);
+#ifdef CONFIG_PROC_FS
+        t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir);
+        if (t->proc == NULL) {
+                kfree(t);
+                goto out;
         }
+        t->proc->proc_fops = &recent_fops;
+        t->proc->data      = t;
 #endif
+        /* link the table into the global list under recent_lock */
+        spin_lock_bh(&recent_lock);
+        list_add_tail(&t->list, &tables);
+        spin_unlock_bh(&recent_lock);
+        ret = 1;
+out:
+        mutex_unlock(&recent_mutex);
+        return ret;
+}
-        curr_table = vmalloc(sizeof(struct recent_ip_tables));
+/*
+ * destroy hook: drop the rule's reference on the table named by info->name.
+ * When the last reference goes away the table is unlinked from the global
+ * list under recent_lock, its entries are flushed, its /proc file is removed
+ * (CONFIG_PROC_FS only) and the table is freed.  Runs under recent_mutex.
+ */
+static void
-        if(curr_table == NULL) return 0;
+ipt_recent_destroy(const struct xt_match *match, void *matchinfo,
+                   unsigned int matchsize)
-        spin_lock_init(&curr_table->list_lock);
+{
-        curr_table->next = NULL;
+        const struct ipt_recent_info *info = matchinfo;
-        curr_table->count = 1;
+        struct recent_table *t;
-        curr_table->time_pos = 0;
-        strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN);
-        curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0';
-        /* Allocate memory for this table and the list of packets in each entry. */
-#ifdef DEBUG
-        if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n",
-                        sizeof(struct recent_ip_list)*ip_list_tot,
-                        info->name);
-#endif
-        curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot);
-        if(curr_table->table == NULL) { vfree(curr_table); return 0; }
-        memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
-#ifdef DEBUG
-        if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
-                        sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
-#endif
-        hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
-#ifdef DEBUG
-        if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
-#endif
-        if(hold == NULL) { 
-                printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n");
-                vfree(curr_table->table); 
-                vfree(curr_table);
-                return 0;
-        }
-        for(c = 0; c < ip_list_tot; c++) {
-                curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot;
-        }
-        /* Allocate memory for the hash table */
+        mutex_lock(&recent_mutex);
-#ifdef DEBUG
+        /*
+         * NOTE(review): t is dereferenced without a NULL check; presumably the
+         * reference taken in ipt_recent_checkentry() guarantees the lookup
+         * succeeds -- confirm.
+         */
+        t = recent_table_lookup(info->name);
-        if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n",
+        if (--t->refcnt == 0) {
-                        sizeof(int)*ip_list_hash_size);
+                /* unlink under recent_lock before tearing the table down */
+                spin_lock_bh(&recent_lock);
+                list_del(&t->list);
+                spin_unlock_bh(&recent_lock);
+                recent_table_flush(t);
+#ifdef CONFIG_PROC_FS
+                remove_proc_entry(t->name, proc_dir);
 #endif
+                kfree(t);
-        curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size);
-        if(!curr_table->hash_table) {
-                printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n");
-                vfree(hold);
-                vfree(curr_table->table); 
-                vfree(curr_table);
-                return 0;
-        }
-        for(c = 0; c < ip_list_hash_size; c++) {
-                curr_table->hash_table[c] = -1;
         }
+        mutex_unlock(&recent_mutex);
+}
-        /* Allocate memory for the time info */
+#ifdef CONFIG_PROC_FS
-#ifdef DEBUG
+/*
+ * Per-open iteration state for a table's /proc seq_file; allocated in
+ * recent_seq_open() and stored in seq->private.
+ */
+struct recent_iter_state {
-        if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n",
+        /* table being listed (taken from the proc entry's ->data) */
+        struct recent_table     *table;
-                        sizeof(struct time_info_list)*ip_list_tot);
+        /* index into table->iphash[] of the bucket currently being walked */
+        unsigned int            bucket;
-#endif
+};
-        curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot);
+/*
+ * seq_file ->start(): take recent_lock and return the entry at position *pos,
+ * counting entries bucket by bucket starting from iphash[0].  Returns NULL
+ * when *pos is past the last entry.  recent_lock is still held on return
+ * (including the NULL case); recent_seq_stop() releases it.
+ */
+static void *recent_seq_start(struct seq_file *seq, loff_t *pos)
-        if(!curr_table->time_info) {
+{
-                printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n");
+        struct recent_iter_state *st = seq->private;
-                vfree(curr_table->hash_table);
+        struct recent_table *t = st->table;
-                vfree(hold);
+        struct recent_entry *e;
-                vfree(curr_table->table); 
+        loff_t p = *pos;
-                vfree(curr_table);
-                return 0;
-        }
-        for(c = 0; c < ip_list_tot; c++) {
-                curr_table->time_info[c].position = c;
-                curr_table->time_info[c].time = 0;
-        }
-        /* Put the new table in place */
         spin_lock_bh(&recent_lock);
-        find_table = r_tables;
-        while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
-        /* If a table already exists just increment the count on that table and return */
-        if(find_table) { 
-                find_table->count++;    
-                spin_unlock_bh(&recent_lock);
-#ifdef DEBUG
-                if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name);
-#endif
-                vfree(curr_table->time_info);
-                vfree(curr_table->hash_table);
-                vfree(hold);
-                vfree(curr_table->table);
-                vfree(curr_table);
-                return 1;
-        }
-        if(!last_table) r_tables = curr_table; else last_table->next = curr_table;
-        spin_unlock_bh(&recent_lock);
-#ifdef CONFIG_PROC_FS
+        /* st->bucket is left at the bucket of the returned entry for ->next() */
+        for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) {
-        /* Create our proc 'status' entry. */
+                list_for_each_entry(e, &t->iphash[st->bucket], list) {
-        curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent);
+                        if (p-- == 0)
-        if (!curr_table->status_proc) {
+                                return e;
-                vfree(hold);
-                printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n");
-                /* Destroy the created table */
-                spin_lock_bh(&recent_lock);
-                last_table = NULL;
-                curr_table = r_tables;
-                if(!curr_table) {
-#ifdef DEBUG
-                        if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n");
-#endif
-                        spin_unlock_bh(&recent_lock);
-                        return 0;
-                }
-                while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
-                if(!curr_table) {
-#ifdef DEBUG
-                        if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n");
-#endif
-                        spin_unlock_bh(&recent_lock);
-                        return 0;
                 }
-                if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
-                spin_unlock_bh(&recent_lock);
-                vfree(curr_table->time_info);
-                vfree(curr_table->hash_table);
-                vfree(curr_table->table);
-                vfree(curr_table);
-                return 0;
         }
-        
+        return NULL;
-        curr_table->status_proc->owner = THIS_MODULE;
+}
-        curr_table->status_proc->data = curr_table;
-        wmb();
-        curr_table->status_proc->read_proc = ip_recent_get_info;
-        curr_table->status_proc->write_proc = ip_recent_ctrl;
-#endif /* CONFIG_PROC_FS */
-#ifdef DEBUG
-        if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n");
-#endif
-        return 1;
+/*
+ * seq_file ->next(): step from entry @v to the following entry, moving on to
+ * later hash buckets once the current bucket's list is exhausted.  Returns
+ * the next entry, or NULL when the last bucket has been walked.  Called with
+ * recent_lock held (taken in recent_seq_start()).
+ */
+static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+        struct recent_iter_state *st = seq->private;
+        struct recent_table *t = st->table;
+        struct recent_entry *e = v;
+        struct list_head *head = e->list.next;
+        /*
+         * Advance the position unconditionally, including when the end of
+         * the table is reached: a ->next() that returns NULL without moving
+         * *pos can make the seq_file core show the last record again.
+         */
+        (*pos)++;
+        /* head pointing back at the bucket head means this bucket is done */
+        while (head == &t->iphash[st->bucket]) {
+                if (++st->bucket >= ip_list_hash_size)
+                        return NULL;
+                head = t->iphash[st->bucket].next;
+        }
+        return list_entry(head, struct recent_entry, list);
 }
-/* This function is called in the event that a rule matching this module is
+/* seq_file ->stop(): release recent_lock, which recent_seq_start() acquired. */
+static void recent_seq_stop(struct seq_file *s, void *v)
- * removed.
- * When this happens we need to check if there are no other rules matching
- * the table given.  If that is the case then we remove the table and clean
- * up its memory.
- */
-static void
-destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize)
 {
-        const struct ipt_recent_info *info = matchinfo;
+        spin_unlock_bh(&recent_lock);
-        struct recent_ip_tables *curr_table, *last_table;
+}
-#ifdef DEBUG
+/*
+ * seq_file ->show(): print one table entry as a single line containing the
+ * source address, TTL, the "last_seen" stamp, e->index (labelled oldest_pkt)
+ * and then every recorded stamp, comma separated.  Always returns 0.
+ */
+static int recent_seq_show(struct seq_file *seq, void *v)
-        if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n");
+{
-#endif
+        struct recent_entry *e = v;
+        unsigned int i;
+        /*
+         * Slot just before e->index, wrapping around the stamps[] ring.
+         * Adding ip_pkt_list_tot before subtracting keeps the value
+         * non-negative when e->index is 0; "e->index - 1" alone would go
+         * negative (or wrap) before the modulo and select the wrong slot.
+         * Presumably e->index stays below ip_pkt_list_tot -- confirm.
+         */
+        i = (e->index + ip_pkt_list_tot - 1) % ip_pkt_list_tot;
+        seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u",
+                   NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index);
+        for (i = 0; i < e->nstamps; i++)
+                seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
+        seq_printf(seq, "\n");
+        return 0;
+}
-        if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return;
+/* seq_file iterator callbacks; handed to seq_open() in recent_seq_open(). */
+static struct seq_operations recent_seq_ops = {
+        .start          = recent_seq_start,
+        .next           = recent_seq_next,
+        .stop           = recent_seq_stop,
+        .show           = recent_seq_show,
+};
-        /* Lock the linked list while we play with it */
+/*
+ * open() handler for a table's /proc file: allocate the per-open iterator
+ * state, start a seq_file session with recent_seq_ops and attach the state,
+ * pointing at the table stored in the proc entry's ->data, to seq->private.
+ *
+ * Returns 0 on success, -ENOMEM if the state cannot be allocated, or the
+ * error returned by seq_open().
+ */
+static int recent_seq_open(struct inode *inode, struct file *file)
-        spin_lock_bh(&recent_lock);
+{
+        struct proc_dir_entry *pde = PDE(inode);
+        struct seq_file *seq;
+        struct recent_iter_state *st;
+        int ret;
+        st = kzalloc(sizeof(*st), GFP_KERNEL);
+        if (st == NULL)
+                return -ENOMEM;
+        ret = seq_open(file, &recent_seq_ops);
+        if (ret) {
+                /*
+                 * Bail out right away: st has just been freed and there is
+                 * no seq_file behind file->private_data to attach it to.
+                 */
+                kfree(st);
+                return ret;
+        }
+        st->table    = pde->data;
+        seq          = file->private_data;
+        seq->private = st;
+        return ret;
+}
-        /* Look for an entry with this name already created */
+/*
+ * write() handler for a table's /proc file.  After optional leading
+ * whitespace the input is one of:
+ *   "clear"                  flush every entry from the table
+ *   "+a.b.c.d" or "a.b.c.d"  add the address, or update it if already present
+ *   "-a.b.c.d"               remove the address if present
+ * At most sizeof(buf) - 1 bytes of the input are examined per call.
+ *
+ * Returns the number of bytes consumed, or -EFAULT if the user buffer cannot
+ * be read.
+ */
+static ssize_t recent_proc_write(struct file *file, const char __user *input,
-        /* Finds the end of the list and the entry before the end if current name does not exist */
+                                 size_t size, loff_t *loff)
-        last_table = NULL;
+{
-        curr_table = r_tables;
+        struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode);
-        if(!curr_table) { 
+        struct recent_table *t = pde->data;
-#ifdef DEBUG
+        struct recent_entry *e;
-                if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n");
+        char buf[sizeof("+255.255.255.255")], *c = buf;
-#endif
+        u_int32_t addr;
+        int add;
+        /*
+         * Keep one byte for a terminator.  The whitespace scan and in_aton()
+         * below treat buf as a string; without the NUL they could run past
+         * the bytes copied from userspace (and past buf itself), and
+         * "size - (c - buf)" could underflow.
+         */
+        if (size > sizeof(buf) - 1)
+                size = sizeof(buf) - 1;
+        if (copy_from_user(buf, input, size))
+                return -EFAULT;
+        buf[size] = '\0';
+        while (isspace(*c))
+                c++;
+        /* too short for "clear" or an address: consume only the whitespace */
+        if (size - (c - buf) < 5)
+                return c - buf;
+        if (!strncmp(c, "clear", 5)) {
+                c += 5;
+                spin_lock_bh(&recent_lock);
+                recent_table_flush(t);
                 spin_unlock_bh(&recent_lock);
-                return;
+                return c - buf;
         }
-        while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
-        /* If a table does not exist then do nothing and return */
+        switch (*c) {
-        if(!curr_table) { 
+        case '-':
-#ifdef DEBUG
+                add = 0;
-                if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n");
+                c++;
-#endif
+                break;
-                spin_unlock_bh(&recent_lock);
+        case '+':
-                return;
+                c++;
+                /* fall through: an explicit '+' means the same as no prefix */
+        default:
+                add = 1;
+                break;
         }
+        addr = in_aton(c);
-        curr_table->count--;
+        spin_lock_bh(&recent_lock);
+        e = recent_entry_lookup(t, addr, 0);
-        /* If count is still non-zero then there are still rules referenceing it so we do nothing */
+        if (e == NULL) {
-        if(curr_table->count) { 
+                if (add)
-#ifdef DEBUG
+                        recent_entry_init(t, addr, 0);
-                if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n");
+        } else {
-#endif
+                if (add)
-                spin_unlock_bh(&recent_lock);
+                        recent_entry_update(t, e);
-                return;
+                else
+                        recent_entry_remove(t, e);
         }
-#ifdef DEBUG
-        if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n");
-#endif
-        /* Count must be zero so we remove this table from the list */
-        if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
         spin_unlock_bh(&recent_lock);
+        return size;
+}
-        /* lock to make sure any late-runners still using this after we removed it from
+/*
+ * File operations for each table's /proc entry (assigned to proc_fops in
+ * ipt_recent_checkentry()): reads go through the seq_file set up by
+ * recent_seq_open(), writes are commands handled by recent_proc_write().
+ * seq_release_private is expected to free the iterator state that
+ * recent_seq_open() stored in seq->private.
+ */
+static struct file_operations recent_fops = {
-         * the list finish up then remove everything */
+        .open           = recent_seq_open,
-        spin_lock_bh(&curr_table->list_lock);
+        .read           = seq_read,
-        spin_unlock_bh(&curr_table->list_lock);
+        .write          = recent_proc_write,
+        .release        = seq_release_private,
-#ifdef CONFIG_PROC_FS
+        .owner          = THIS_MODULE,
-        if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent);
+};
 #endif /* CONFIG_PROC_FS */
-        vfree(curr_table->table[0].last_pkts);
-        vfree(curr_table->table);
-        vfree(curr_table->hash_table);
-        vfree(curr_table->time_info);
-        vfree(curr_table);
-#ifdef DEBUG
-        if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n");
-#endif
-        return;
-}
-/* This is the structure we pass to ipt_register to register our
- * module with iptables.
- */
+/*
+ * Registration record for the "recent" match; registered and unregistered
+ * with ipt_register_match()/ipt_unregister_match() in module init and exit.
+ */
 static struct ipt_match recent_match = {
         .name           = "recent",
-        .match          = match,
+        .match          = ipt_recent_match,
         .matchsize      = sizeof(struct ipt_recent_info),
-        .checkentry     = checkentry,
+        .checkentry     = ipt_recent_checkentry,
-        .destroy        = destroy,
+        .destroy        = ipt_recent_destroy,
-        .me             = THIS_MODULE
+        .me             = THIS_MODULE,
 };
-/* Kernel module initialization. */
+/*
+ * Module init: validate the ip_list_tot / ip_pkt_list_tot settings, derive
+ * the hash size, register the match and (CONFIG_PROC_FS only) create the
+ * "ipt_recent" /proc directory.
+ *
+ * Returns 0 on success, -EINVAL for zero or out-of-range settings, the error
+ * from ipt_register_match(), or -ENOMEM if the /proc directory cannot be
+ * created (the match is unregistered again in that case).
+ */
 static int __init ipt_recent_init(void)
 {
-        int err, count;
+        int err;
-        printk(version);
+        /* NOTE(review): the 255 cap presumably matches an 8-bit per-entry index -- confirm */
+        if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
-#ifdef CONFIG_PROC_FS
+                return -EINVAL;
-        proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net);
+        /* fls() is the 1-based index of the highest set bit: next power of two above ip_list_tot */
+        ip_list_hash_size = 1 << fls(ip_list_tot);
-        if(!proc_net_ipt_recent) return -ENOMEM;
-#endif
-        if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) {
-          printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n");
-          ip_list_hash_size = 0;
-        }
-        if(!ip_list_hash_size) {
-                ip_list_hash_size = ip_list_tot*3;
-                count = 2*2;
-                while(ip_list_hash_size > count) count = count*2;
-                ip_list_hash_size = count;
-        }
-#ifdef DEBUG
-        if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size);
-#endif
         err = ipt_register_match(&recent_match);
+#ifdef CONFIG_PROC_FS
         if (err)
-                remove_proc_entry("ipt_recent", proc_net);
+                return err;
+        /* the match is already registered here, so undo that if /proc setup fails */
+        proc_dir = proc_mkdir("ipt_recent", proc_net);
+        if (proc_dir == NULL) {
+                ipt_unregister_match(&recent_match);
+                err = -ENOMEM;
+        }
+#endif
         return err;
 }
-/* Kernel module destruction. */
+/*
+ * Module exit: unregister the match and (CONFIG_PROC_FS only) remove the
+ * "ipt_recent" /proc directory.  BUG()s if any table is still on the global
+ * tables list at this point.
+ */
+static void __exit ipt_recent_exit(void)
-static void __exit ipt_recent_fini(void)
 {
+        BUG_ON(!list_empty(&tables));
         ipt_unregister_match(&recent_match);
+#ifdef CONFIG_PROC_FS
-        remove_proc_entry("ipt_recent",proc_net);
+        remove_proc_entry("ipt_recent", proc_net);
+#endif
 }
-/* Register our module with the kernel. */
 module_init(ipt_recent_init);
-module_exit(ipt_recent_fini);
+module_exit(ipt_recent_exit);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 77d974443c7..8cc8e1b3677 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -145,7 +145,7 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum,
        /* This is where we call the helper: as the packet goes out. */
        ct = nf_ct_get(*pskb, &ctinfo);
-        if (!ct)
+        if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)
                return NF_ACCEPT;
        help = nfct_help(ct);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 4b0d361cc6e..663a73ee3f2 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
        }
        /* See ip_conntrack_proto_tcp.c */
-        if (hooknum == NF_IP_PRE_ROUTING &&
+        if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
            nf_ip_checksum(skb, hooknum, dataoff, 0)) {
                if (LOG_INVALID(IPPROTO_ICMP))
                        nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index fc256241555..bd221ec3f81 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -103,7 +103,7 @@ static void raw_v4_unhash(struct sock *sk)
 }
 struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
-                             unsigned long raddr, unsigned long laddr,
+                             __be32 raddr, __be32 laddr,
                             int dif)
 {
        struct hlist_node *node;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6b6c3adfcf0..ce4cd5f3551 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -182,14 +182,6 @@ ctl_table ipv4_table[] = {
                .strategy       = &ipv4_doint_and_flush_strategy,
        },
        {
-                .ctl_name       = NET_IPV4_AUTOCONFIG,
-                .procname       = "ip_autoconfig",
-                .data           = &ipv4_config.autoconfig,
-                .maxlen         = sizeof(int),
-                .mode           = 0644,
-                .proc_handler   = &proc_dointvec
-        },
-        {
                .ctl_name       = NET_IPV4_NO_PMTU_DISC,
                .procname       = "ip_no_pmtu_disc",
                .data           = &ipv4_config.no_pmtu_disc,
@@ -688,6 +680,24 @@ ctl_table ipv4_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec
        },
+#ifdef CONFIG_NET_DMA
+        {
+                .ctl_name       = NET_TCP_DMA_COPYBREAK,
+                .procname       = "tcp_dma_copybreak",
+                .data           = &sysctl_tcp_dma_copybreak,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec
+        },
+#endif
+        {
+                .ctl_name       = NET_TCP_SLOW_START_AFTER_IDLE,
+                .procname       = "tcp_slow_start_after_idle",
+                .data           = &sysctl_tcp_slow_start_after_idle,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec
+        },
        { .ctl_name = 0 }
 };
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e2b7b805503..74998f25007 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -263,7 +263,7 @@
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
+#include <net/netdma.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -622,14 +622,10 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
        ssize_t res;
        struct sock *sk = sock->sk;
-#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
        if (!(sk->sk_route_caps & NETIF_F_SG) ||
-            !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
+            !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
                return sock_no_sendpage(sock, page, offset, size, flags);
-#undef TCP_ZC_CSUM_FLAGS
        lock_sock(sk);
        TCP_CHECK_TIMER(sk);
        res = do_tcp_sendpages(sk, &page, offset, size, flags);
@@ -726,9 +722,7 @@ new_segment:
                                /*
                                 * Check whether we can use HW checksum.
                                 */
-                                if (sk->sk_route_caps &
+                                if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
-                                    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
-                                     NETIF_F_HW_CSUM))
                                        skb->ip_summed = CHECKSUM_HW;
                                skb_entail(sk, tp, skb);
@@ -937,7 +931,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo,
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
-static void cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        int time_to_ack = 0;
@@ -1072,11 +1066,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
                                break;
                }
                if (skb->h.th->fin) {
-                        sk_eat_skb(sk, skb);
+                        sk_eat_skb(sk, skb, 0);
                        ++seq;
                        break;
                }
-                sk_eat_skb(sk, skb);
+                sk_eat_skb(sk, skb, 0);
                if (!desc->count)
                        break;
        }
@@ -1086,7 +1080,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
        /* Clean up data we have read: This will do ACK frames. */
        if (copied)
-                cleanup_rbuf(sk, copied);
+                tcp_cleanup_rbuf(sk, copied);
        return copied;
 }
@@ -1110,6 +1104,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
        int target;             /* Read at least this many bytes */
        long timeo;
        struct task_struct *user_recv = NULL;
+        int copied_early = 0;
        lock_sock(sk);
@@ -1133,6 +1128,17 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+#ifdef CONFIG_NET_DMA
+        tp->ucopy.dma_chan = NULL;
+        preempt_disable();
+        if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
+            !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) {
+                preempt_enable_no_resched();
+                tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
+        } else
+                preempt_enable_no_resched();
+#endif
        do {
                struct sk_buff *skb;
                u32 offset;
@@ -1220,7 +1226,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                        }
                }
-                cleanup_rbuf(sk, copied);
+                tcp_cleanup_rbuf(sk, copied);
                if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
                        /* Install new reader */
@@ -1274,6 +1280,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                } else
                        sk_wait_data(sk, &timeo);
+#ifdef CONFIG_NET_DMA
+                tp->ucopy.wakeup = 0;
+#endif
                if (user_recv) {
                        int chunk;
@@ -1329,13 +1339,39 @@ do_prequeue:
                }
                if (!(flags & MSG_TRUNC)) {
-                        err = skb_copy_datagram_iovec(skb, offset,
+#ifdef CONFIG_NET_DMA
-                                                      msg->msg_iov, used);
+                        if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-                        if (err) {
+                                tp->ucopy.dma_chan = get_softnet_dma();
-                                /* Exception. Bailout! */
-                                if (!copied)
+                        if (tp->ucopy.dma_chan) {
-                                        copied = -EFAULT;
+                                tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
-                                break;
+                                        tp->ucopy.dma_chan, skb, offset,
+                                        msg->msg_iov, used,
+                                        tp->ucopy.pinned_list);
+                                if (tp->ucopy.dma_cookie < 0) {
+                                        printk(KERN_ALERT "dma_cookie < 0\n");
+                                        /* Exception. Bailout! */
+                                        if (!copied)
+                                                copied = -EFAULT;
+                                        break;
+                                }
+                                if ((offset + used) == skb->len)
+                                        copied_early = 1;
+                        } else
+#endif
+                        {
+                                err = skb_copy_datagram_iovec(skb, offset,
+                                                msg->msg_iov, used);
+                                if (err) {
+                                        /* Exception. Bailout! */
+                                        if (!copied)
+                                                copied = -EFAULT;
+                                        break;
+                                }
                        }
                }
@@ -1355,15 +1391,19 @@ skip_copy:
                if (skb->h.th->fin)
                        goto found_fin_ok;
-                if (!(flags & MSG_PEEK))
+                if (!(flags & MSG_PEEK)) {
-                        sk_eat_skb(sk, skb);
+                        sk_eat_skb(sk, skb, copied_early);
+                        copied_early = 0;
+                }
                continue;
        found_fin_ok:
                /* Process the FIN. */
                ++*seq;
-                if (!(flags & MSG_PEEK))
+                if (!(flags & MSG_PEEK)) {
-                        sk_eat_skb(sk, skb);
+                        sk_eat_skb(sk, skb, copied_early);
+                        copied_early = 0;
+                }
                break;
        } while (len > 0);
@@ -1386,12 +1426,42 @@ skip_copy:
                tp->ucopy.len = 0;
        }
+#ifdef CONFIG_NET_DMA
+        if (tp->ucopy.dma_chan) {
+                struct sk_buff *skb;
+                dma_cookie_t done, used;
+                dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+                while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+                                                 tp->ucopy.dma_cookie, &done,
+                                                 &used) == DMA_IN_PROGRESS) {
+                        /* do partial cleanup of sk_async_wait_queue */
+                        while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
+                               (dma_async_is_complete(skb->dma_cookie, done,
+                                                      used) == DMA_SUCCESS)) {
+                                __skb_dequeue(&sk->sk_async_wait_queue);
+                                kfree_skb(skb);
+                        }
+                }
+                /* Safe to free early-copied skbs now */
+                __skb_queue_purge(&sk->sk_async_wait_queue);
+                dma_chan_put(tp->ucopy.dma_chan);
+                tp->ucopy.dma_chan = NULL;
+        }
+        if (tp->ucopy.pinned_list) {
+                dma_unpin_iovec_pages(tp->ucopy.pinned_list);
+                tp->ucopy.pinned_list = NULL;
+        }
+#endif
        /* According to UNIX98, msg_name/msg_namelen are ignored
         * on connected socket. I was just happy when found this 8) --ANK
         */
        /* Clean up data we have read: This will do ACK frames. */
-        cleanup_rbuf(sk, copied);
+        tcp_cleanup_rbuf(sk, copied);
        TCP_CHECK_TIMER(sk);
        release_sock(sk);
@@ -1658,6 +1728,9 @@ int tcp_disconnect(struct sock *sk, int flags)
        __skb_queue_purge(&sk->sk_receive_queue);
        sk_stream_writequeue_purge(sk);
        __skb_queue_purge(&tp->out_of_order_queue);
+#ifdef CONFIG_NET_DMA
+        __skb_queue_purge(&sk->sk_async_wait_queue);
+#endif
        inet->dport = 0;
@@ -1858,7 +1931,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                            (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
                            inet_csk_ack_scheduled(sk)) {
                                icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
-                                cleanup_rbuf(sk, 1);
+                                tcp_cleanup_rbuf(sk, 1);
                                if (!(val & 1))
                                        icsk->icsk_ack.pingpong = 1;
                        }
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 035f2092d73..b2d9021ad22 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -198,12 +198,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
        return max(tp->snd_cwnd, ca->last_max_cwnd);
 }
-static u32 bictcp_min_cwnd(struct sock *sk)
-{
-        const struct tcp_sock *tp = tcp_sk(sk);
-        return tp->snd_ssthresh;
-}
 static void bictcp_state(struct sock *sk, u8 new_state)
 {
        if (new_state == TCP_CA_Loss)
@@ -231,7 +225,6 @@ static struct tcp_congestion_ops bictcp = {
        .cong_avoid     = bictcp_cong_avoid,
        .set_state      = bictcp_state,
        .undo_cwnd      = bictcp_undo_cwnd,
-        .min_cwnd       = bictcp_min_cwnd,
        .pkts_acked     = bictcp_acked,
        .owner          = THIS_MODULE,
        .name           = "bic",
diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c
new file mode 100644
index 00000000000..bc54f7e9aea
--- /dev/null
+++ b/net/ipv4/tcp_compound.c
@@ -0,0 +1,448 @@
+/*
+ * TCP Vegas congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ *    Lawrence S. Brakmo and Larry L. Peterson.
+ *    "TCP Vegas: End to end congestion avoidance on a global internet."
+ *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ *    October 1995. Available from:
+ *      ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ *   o We do not change the loss detection or recovery mechanisms of
+ *     Linux in any way. Linux already recovers from losses quite well,
+ *     using fine-grained timers, NewReno, and FACK.
+ *   o To avoid the performance penalty imposed by increasing cwnd
+ *     only every-other RTT during slow start, we increase during
+ *     every RTT during slow start, just like Reno.
+ *   o Largely to allow continuous cwnd growth during slow start,
+ *     we use the rate at which ACKs come back as the "actual"
+ *     rate, rather than the rate at which data is sent.
+ *   o To speed convergence to the right rate, we set the cwnd
+ *     to achieve the right ("actual") rate when we exit slow start.
+ *   o To filter out the noise caused by delayed ACKs, we use the
+ *     minimum RTT sample observed during the last RTT to calculate
+ *     the actual rate.
+ *   o When the sender re-starts from idle, it waits until it has
+ *     received ACKs for an entire flight of new data before making
+ *     a cwnd adjustment decision. The original Vegas implementation
+ *     assumed senders never went idle.
+ *
+ *
+ *   TCP Compound based on TCP Vegas
+ *
+ *   further details can be found here:
+ *      ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <net/tcp.h>
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+#define TCP_COMPOUND_ALPHA          3U
+#define TCP_COMPOUND_BETA           1U
+#define TCP_COMPOUND_GAMMA         30
+#define TCP_COMPOUND_ZETA           1
+/*
+ * TCP compound variables: per-connection state, accessed through
+ * inet_csk_ca(sk). tcp_compound_register() checks that it fits in
+ * ICSK_CA_PRIV_SIZE. tcp_compound_cong_avoid() sets tp->snd_cwnd to
+ * cwnd + dwnd.
+ */
+struct compound {
+        u32 beg_snd_nxt;        /* right edge during last RTT */
+        u32 beg_snd_una;        /* left edge  during last RTT */
+        u32 beg_snd_cwnd;       /* saves the size of the cwnd */
+        u8 doing_vegas_now;     /* if true, do vegas for this RTT */
+        u16 cntRTT;             /* # of RTTs measured within last RTT */
+        u32 minRTT;             /* min of RTTs measured within last RTT (in usec) */
+        u32 baseRTT;            /* the min of all Vegas RTT measurements seen (in usec) */
+        u32 cwnd;               /* window part incremented per ACK / per window in cong_avoid */
+        u32 dwnd;               /* window part recomputed once per RTT from the Vegas diff */
+};
+/* There are several situations when we must "re-start" Vegas:
+ *
+ *  o when a connection is established
+ *  o after an RTO
+ *  o after fast recovery
+ *  o when we send a packet and there is no outstanding
+ *    unacknowledged data (restarting an idle connection)
+ *
+ * In these circumstances we cannot do a Vegas calculation at the
+ * end of the first RTT, because any calculation we do is using
+ * stale info -- both the saved cwnd and congestion feedback are
+ * stale.
+ *
+ * Instead we must wait until the completion of an RTT during
+ * which we actually receive ACKs.
+ */
+static inline void vegas_enable(struct sock *sk)
+{
+        struct compound *ca = inet_csk_ca(sk);
+        /* Throw away the RTT samples collected so far. */
+        ca->minRTT = 0x7fffffff;
+        ca->cntRTT = 0;
+        /* The next send window begins at the current snd_nxt. */
+        ca->beg_snd_nxt = tcp_sk(sk)->snd_nxt;
+        /* Flag that Vegas samples are wanted from the next send on. */
+        ca->doing_vegas_now = 1;
+}
+/* Clear the doing_vegas_now flag; vegas_enable() sets it again. */
+static inline void vegas_disable(struct sock *sk)
+{
+        struct compound *ca = inet_csk_ca(sk);
+        ca->doing_vegas_now = 0;
+}
+/* Reset the whole per-socket Compound state (also used on restart events). */
+static void tcp_compound_init(struct sock *sk)
+{
+        struct compound *ca = inet_csk_ca(sk);
+        /* Start from the current snd_cwnd with an empty dwnd. */
+        ca->cwnd = tcp_sk(sk)->snd_cwnd;
+        ca->dwnd = 0;
+        /* Forget the long-term minimum RTT as well as the per-RTT samples. */
+        ca->baseRTT = 0x7fffffff;
+        vegas_enable(sk);
+}
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ *   o min-filter RTT samples from within an RTT to get the current
+ *     propagation delay + queuing delay (we are min-filtering to try to
+ *     avoid the effects of delayed ACKs)
+ *   o min-filter RTT samples from a much longer window (forever for now)
+ *     to find the propagation delay (baseRTT)
+ */
+static void tcp_compound_rtt_calc(struct sock *sk, u32 usrtt)
+{
+        struct compound *ca = inet_csk_ca(sk);
+        /* Offset the sample by one so a zero usrtt never yields a zero RTT. */
+        u32 sample = usrtt + 1;
+        /* Minimum within the current RTT: propagation + queuing delay. */
+        if (sample < ca->minRTT)
+                ca->minRTT = sample;
+        /* Minimum over every sample seen: propagation delay (baseRTT). */
+        ca->baseRTT = min(ca->baseRTT, sample);
+        ca->cntRTT++;
+}
+/*
+ * set_state hook: entering TCP_CA_Open re-arms sampling through
+ * vegas_enable(); any other state clears doing_vegas_now.
+ */
+static void tcp_compound_state(struct sock *sk, u8 ca_state)
+{
+        if (ca_state != TCP_CA_Open) {
+                vegas_disable(sk);
+                return;
+        }
+        vegas_enable(sk);
+}
+/*
+ * 64bit divisor, dividend and result. dynamic precision:
+ * when the divisor does not fit in 32 bits, both operands are shifted
+ * right by the same amount before dividing, so their low bits are dropped
+ * and the quotient is approximate. The divisor is not checked for zero.
+ */
+static inline u64 div64_64(u64 dividend, u64 divisor)
+{
+        u32 d = divisor;
+        /* Scale both operands down until the divisor fits in 32 bits. */
+        if (divisor > 0xffffffffULL) {
+                unsigned int shift = fls(divisor >> 32);
+                d = divisor >> shift;
+                dividend >>= shift;
+        }
+        /* avoid 64 bit division if possible */
+        if (dividend >> 32)
+                do_div(dividend, d);    /* leaves the quotient in dividend */
+        else
+                dividend = (u32) dividend / d;
+        return dividend;
+}
+/*
+ * calculate the quartic root of "a" using Newton-Raphson.
+ * Returns an integer estimate; the iteration stops as soon as two
+ * successive estimates differ by at most 1.
+ */
+static u32 qroot(u64 a)
+{
+        u32 x, x1;
+        /* Initial estimate is based on:
+         * qrt(x) = exp(log(x) / 4)
+         * i.e. a power of two with a quarter of the bit length of "a".
+         */
+        x = 1u << (fls64(a) >> 2);
+        /*
+         * Iteration based on:
+         *                         3
+         * x    = ( 3 * x  +  a / x  ) / 4
+         *  k+1          k         k
+         */
+        do {
+                u64 x3 = x;     /* becomes x^3, kept in 64 bits */
+                x1 = x;         /* previous estimate, for the exit test */
+                x3 *= x;
+                x3 *= x;
+                x = (3 * x + (u32) div64_64(a, x3)) / 4;
+                /* NOTE(review): x1 - x is unsigned; this relies on abs()
+                 * treating the difference as a signed value - confirm.
+                 */
+        } while (abs(x1 - x) > 1);
+        return x;
+}
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Vegas calculations
+ * until we get fresh RTT samples.  So when we
+ * restart, we reset our Vegas state to a clean
+ * slate. After we get acks for this flight of
+ * packets, _then_ we can make Vegas calculations
+ * again.
+ */
+static void tcp_compound_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+        switch (event) {
+        case CA_EVENT_CWND_RESTART:
+        case CA_EVENT_TX_START:
+                /* Restarting: go back to a clean slate. */
+                tcp_compound_init(sk);
+                break;
+        default:
+                /* Every other event leaves the state untouched. */
+                break;
+        }
+}
+/*
+ * Congestion-avoidance hook (installed as .cong_avoid).
+ *
+ * Maintains two window components in struct compound and publishes their
+ * sum as tp->snd_cwnd:
+ *   o cwnd is incremented by one per call while cwnd <= snd_ssthresh,
+ *     otherwise once every snd_cwnd calls (counted in snd_cwnd_cnt);
+ *   o dwnd is recomputed once per RTT from the Vegas "diff".
+ *
+ * @sk:        socket being updated
+ * @ack:       compared against beg_snd_nxt to detect the end of an RTT
+ * @seq_rtt:   unused here
+ * @in_flight: passed to tcp_is_cwnd_limited(); nothing grows when that
+ *             returns false
+ * @flag:      unused here
+ */
+static void tcp_compound_cong_avoid(struct sock *sk, u32 ack,
+                                    u32 seq_rtt, u32 in_flight, int flag)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct compound *vegas = inet_csk_ca(sk);
+        u8 inc = 0;
+        /* Keep cwnd + dwnd from exceeding the current snd_cwnd. */
+        if (vegas->cwnd + vegas->dwnd > tp->snd_cwnd) {
+                if (vegas->cwnd > tp->snd_cwnd || vegas->dwnd > tp->snd_cwnd) {
+                        vegas->cwnd = tp->snd_cwnd;
+                        vegas->dwnd = 0;
+                } else
+                        vegas->cwnd = tp->snd_cwnd - vegas->dwnd;
+        }
+        if (!tcp_is_cwnd_limited(sk, in_flight))
+                return;
+        if (vegas->cwnd <= tp->snd_ssthresh)
+                inc = 1;
+        else if (tp->snd_cwnd_cnt < tp->snd_cwnd)
+                tp->snd_cwnd_cnt++;
+        if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+                inc = 1;
+                tp->snd_cwnd_cnt = 0;
+        }
+        if (inc && tp->snd_cwnd < tp->snd_cwnd_clamp)
+                vegas->cwnd++;
+        /* The key players are beg_snd_una and beg_snd_nxt.
+         *
+         * These are so named because they represent the approximate values
+         * of snd_una and snd_nxt at the beginning of the current RTT. More
+         * precisely, they represent the amount of data sent during the RTT.
+         * At the end of the RTT, when we receive an ACK for beg_snd_nxt,
+         * we will calculate that (beg_snd_nxt - beg_snd_una) outstanding
+         * bytes of data have been ACKed during the course of the RTT, giving
+         * an "actual" rate of:
+         *
+         *     (beg_snd_nxt - beg_snd_una) / (rtt duration)
+         *
+         * Unfortunately, beg_snd_una is not exactly equal to snd_una,
+         * because delayed ACKs can cover more than one segment, so they
+         * don't line up nicely with the boundaries of RTTs.
+         *
+         * Another unfortunate fact of life is that delayed ACKs delay the
+         * advance of the left edge of our send window, so that the number
+         * of bytes we send in an RTT is often less than our cwnd will allow.
+         * So we keep track of our cwnd separately, in beg_snd_cwnd.
+         */
+        if (after(ack, vegas->beg_snd_nxt)) {
+                /* Do the Vegas once-per-RTT cwnd adjustment. */
+                u32 old_wnd;
+                /* Here old_wnd is essentially the window of data that was
+                 * sent during the previous RTT, and has all
+                 * been acknowledged in the course of the RTT that ended
+                 * with the ACK we just received.
+                 */
+                if (!tp->mss_cache)
+                        return;
+                old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
+                    tp->mss_cache;
+                /* Save the extent of the current window so we can use this
+                 * at the end of the next RTT.
+                 */
+                vegas->beg_snd_una = vegas->beg_snd_nxt;
+                vegas->beg_snd_nxt = tp->snd_nxt;
+                vegas->beg_snd_cwnd = tp->snd_cwnd;
+                /* We do the Vegas calculations only if we got enough RTT
+                 * samples that we can be reasonably sure that we got
+                 * at least one RTT sample that wasn't from a delayed ACK.
+                 * If we only had 2 samples total,
+                 * then that means we're getting only 1 ACK per RTT, which
+                 * means they're almost certainly delayed ACKs.
+                 * If  we have 3 samples, we should be OK.
+                 */
+                if (vegas->cntRTT > 2) {
+                        u32 rtt, target_cwnd, diff;
+                        u32 brtt, dwnd;
+                        u64 target;
+                        /* We have enough RTT samples, so, using the Vegas
+                         * algorithm, we determine if we should increase or
+                         * decrease cwnd, and by how much.
+                         */
+                        /* Pluck out the RTT we are using for the Vegas
+                         * calculations. This is the min RTT seen during the
+                         * last RTT. Taking the min filters out the effects
+                         * of delayed ACKs, at the cost of noticing congestion
+                         * a bit later.
+                         */
+                        rtt = vegas->minRTT;
+                        /* Calculate the cwnd we should have, if we weren't
+                         * going too fast.
+                         *
+                         * This is:
+                         *     (actual rate in segments) * baseRTT
+                         * We keep it as a fixed point number with
+                         * V_PARAM_SHIFT bits to the right of the binary point.
+                         */
+                        if (!rtt)
+                                return;
+                        brtt = vegas->baseRTT;
+                        /* Multiply and shift in 64 bits: with RTTs in usec,
+                         * old_wnd * brtt can exceed 32 bits and would
+                         * silently wrap in u32 arithmetic.
+                         */
+                        target = (u64) old_wnd * brtt;
+                        target <<= V_PARAM_SHIFT;
+                        target_cwnd = (u32) div64_64(target, rtt);
+                        /* Calculate the difference between the window we had,
+                         * and the window we would like to have. This quantity
+                         * is the "Diff" from the Arizona Vegas papers.
+                         *
+                         * Again, this is a fixed point number with
+                         * V_PARAM_SHIFT bits to the right of the binary
+                         * point.
+                         */
+                        diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
+                        dwnd = vegas->dwnd;
+                        if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) {
+                                u64 v;
+                                u32 x;
+                                /*
+                                 * The TCP Compound paper describes the choice
+                                 * of "k" determines the aggressiveness,
+                                 * ie. slope of the response function.
+                                 *
+                                 * For same value as HSTCP would be 0.8
+                                 * but for computational reasons, both the
+                                 * original authors and this implementation
+                                 * use 0.75.
+                                 */
+                                v = old_wnd;
+                                x = qroot(v * v * v) >> TCP_COMPOUND_ALPHA;
+                                if (x > 1)
+                                        dwnd = x - 1;
+                                else
+                                        dwnd = 0;
+                                dwnd += vegas->dwnd;
+                        } else if ((dwnd << V_PARAM_SHIFT) <
+                                   (diff * TCP_COMPOUND_BETA))
+                                dwnd = 0;
+                        else
+                                dwnd =
+                                    ((dwnd << V_PARAM_SHIFT) -
+                                     (diff *
+                                      TCP_COMPOUND_BETA)) >> V_PARAM_SHIFT;
+                        vegas->dwnd = dwnd;
+                }
+                /* Wipe the slate clean for the next RTT. */
+                vegas->cntRTT = 0;
+                vegas->minRTT = 0x7fffffff;
+        }
+        tp->snd_cwnd = vegas->cwnd + vegas->dwnd;
+}
+/*
+ * Extract info for TCP socket info provided via netlink: when the
+ * INET_DIAG_VEGASINFO bit is set in ext, add a struct tcpvegas_info
+ * attribute to skb, filled from this socket's Compound state.
+ */
+static void tcp_compound_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+{
+        const struct compound *ca = inet_csk_ca(sk);
+        if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+                struct tcpvegas_info *info;
+                /* NOTE(review): __RTA_PUT presumably jumps to the
+                 * rtattr_failure label below when the attribute cannot
+                 * be added to skb - confirm against its definition.
+                 */
+                info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
+                                          sizeof(*info)));
+                info->tcpv_enabled = ca->doing_vegas_now;
+                info->tcpv_rttcnt = ca->cntRTT;
+                info->tcpv_rtt = ca->baseRTT;
+                info->tcpv_minrtt = ca->minRTT;
+        rtattr_failure:;
+        }
+}
+/*
+ * Congestion-control operations registered under the name "compound".
+ * ssthresh reuses tcp_reno_ssthresh; every other hook is defined in
+ * this file.
+ */
+static struct tcp_congestion_ops tcp_compound = {
+        .init           = tcp_compound_init,
+        .ssthresh       = tcp_reno_ssthresh,
+        .cong_avoid     = tcp_compound_cong_avoid,
+        .rtt_sample     = tcp_compound_rtt_calc,
+        .set_state      = tcp_compound_state,
+        .cwnd_event     = tcp_compound_cwnd_event,
+        .get_info       = tcp_compound_get_info,
+        .owner          = THIS_MODULE,
+        .name           = "compound",
+};
+/*
+ * Module init: check that struct compound fits in the per-socket
+ * congestion-control private area, then register the operations.
+ * Returns the result of tcp_register_congestion_control(), so a failed
+ * registration also fails the module load instead of being ignored.
+ */
+static int __init tcp_compound_register(void)
+{
+        BUG_ON(sizeof(struct compound) > ICSK_CA_PRIV_SIZE);
+        return tcp_register_congestion_control(&tcp_compound);
+}
+/* Module exit: unregister the tcp_compound operations. */
+static void __exit tcp_compound_unregister(void)
+{
+        tcp_unregister_congestion_control(&tcp_compound);
+}
+module_init(tcp_compound_register);
+module_exit(tcp_compound_unregister);
+MODULE_AUTHOR("Angelo P. Castellani, Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Compound");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 91c2f41c7f5..857eefc52aa 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -38,7 +38,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
        int ret = 0;
        /* all algorithms must implement ssthresh and cong_avoid ops */
-        if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
+        if (!ca->ssthresh || !ca->cong_avoid) {
                printk(KERN_ERR "TCP %s does not implement required ops\n",
                       ca->name);
                return -EINVAL;
@@ -251,8 +251,8 @@ u32 tcp_reno_ssthresh(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
-/* Lower bound on congestion window. */
+/* Lower bound on congestion window with halving. */
-u32 tcp_reno_min_cwnd(struct sock *sk)
+u32 tcp_reno_min_cwnd(const struct sock *sk)
 {
        const struct tcp_sock *tp = tcp_sk(sk);
        return tp->snd_ssthresh/2;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 31a4986dfbf..78b7a6b9e4d 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -325,11 +325,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
        return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
 }
-static u32 bictcp_min_cwnd(struct sock *sk)
-{
-        return tcp_sk(sk)->snd_ssthresh;
-}
 static void bictcp_state(struct sock *sk, u8 new_state)
 {
        if (new_state == TCP_CA_Loss)
@@ -357,7 +352,6 @@ static struct tcp_congestion_ops cubictcp = {
        .cong_avoid     = bictcp_cong_avoid,
        .set_state      = bictcp_state,
        .undo_cwnd      = bictcp_undo_cwnd,
-        .min_cwnd       = bictcp_min_cwnd,
        .pkts_acked     = bictcp_acked,
        .owner          = THIS_MODULE,
        .name           = "cubic",
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index ba7c63ca5bb..1120245b237 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -98,6 +98,10 @@ struct hstcp {
        u32     ai;
 };
+static int max_ssthresh = 100;
+module_param(max_ssthresh, int, 0644);
+MODULE_PARM_DESC(max_ssthresh, "limited slow start threshold (RFC3742)");
 static void hstcp_init(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
@@ -119,9 +123,23 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
        if (!tcp_is_cwnd_limited(sk, in_flight))
                return;
-        if (tp->snd_cwnd <= tp->snd_ssthresh)
+        if (tp->snd_cwnd <= tp->snd_ssthresh) {
-                tcp_slow_start(tp);
+                /* RFC3742: limited slow start
-        else {
+                 * the window is increased by 1/K MSS for each arriving ACK,
+                 * for K = int(cwnd/(0.5 max_ssthresh))
+                 */
+                if (max_ssthresh > 0 && tp->snd_cwnd > max_ssthresh) {
+                        u32 k = max(tp->snd_cwnd / (max_ssthresh >> 1), 1U);
+                        if (++tp->snd_cwnd_cnt >= k) {
+                                if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+                                        tp->snd_cwnd++;
+                                tp->snd_cwnd_cnt = 0;
+                        }
+                } else {
+                        if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+                                tp->snd_cwnd++;
+                }
+        } else {
                /* Update AIMD parameters */
                if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
                        while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 1b2ff53f98e..3d92c185926 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -246,14 +246,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
        }
 }
-/* Lower bound on congestion window. */
-static u32 htcp_min_cwnd(struct sock *sk)
-{
-        const struct tcp_sock *tp = tcp_sk(sk);
-        return tp->snd_ssthresh;
-}
 static void htcp_init(struct sock *sk)
 {
        struct htcp *ca = inet_csk_ca(sk);
@@ -285,7 +277,6 @@ static void htcp_state(struct sock *sk, u8 new_state)
 static struct tcp_congestion_ops htcp = {
        .init           = htcp_init,
        .ssthresh       = htcp_recalc_ssthresh,
-        .min_cwnd       = htcp_min_cwnd,
        .cong_avoid     = htcp_cong_avoid,
        .set_state      = htcp_state,
        .undo_cwnd      = htcp_cwnd_undo,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b5521a9d3dc..e08245bdda3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -71,6 +71,7 @@
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
+#include <net/netdma.h>
 int sysctl_tcp_timestamps = 1;
 int sysctl_tcp_window_scaling = 1;
@@ -1688,17 +1689,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
        tp->snd_cwnd_stamp = tcp_time_stamp;
 }
+/* Lower bound on the congestion window: the slow start threshold,
+ * unless the congestion control algorithm overrides it through its
+ * optional min_cwnd hook.
+ */
+static inline u32 tcp_cwnd_min(const struct sock *sk)
+{
+        const struct tcp_congestion_ops *ops = inet_csk(sk)->icsk_ca_ops;
+        if (ops->min_cwnd)
+                return ops->min_cwnd(sk);
+        return tcp_sk(sk)->snd_ssthresh;
+}
 /* Decrease cwnd each second ack. */
 static void tcp_cwnd_down(struct sock *sk)
 {
-        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int decr = tp->snd_cwnd_cnt + 1;
        tp->snd_cwnd_cnt = decr&1;
        decr >>= 1;
-        if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
+        if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
                tp->snd_cwnd -= decr;
        tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -3785,6 +3795,50 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk
                __tcp_checksum_complete_user(sk, skb);
 }
+#ifdef CONFIG_NET_DMA
+/*
+ * Try to hand the payload of @skb (everything past @hlen bytes of header)
+ * to a DMA channel for copying into the reader's iovec.
+ *
+ * Returns 1 when the copy was submitted (tp->ucopy and tp->copied_seq are
+ * advanced by the payload length), 0 otherwise.
+ */
+static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        int chunk = skb->len - hlen;    /* payload bytes in this skb */
+        int dma_cookie;
+        int copied_early = 0;
+        /* a wakeup of the reader is already pending: do not copy more */
+        if (tp->ucopy.wakeup)
+                return 0;
+        /* pick a DMA channel lazily, only when a pinned list is present */
+        if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+                tp->ucopy.dma_chan = get_softnet_dma();
+        /* only skbs marked CHECKSUM_UNNECESSARY are offloaded */
+        if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) {
+                dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
+                        skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list);
+                /* submission failed: report "not copied" to the caller */
+                if (dma_cookie < 0)
+                        goto out;
+                tp->ucopy.dma_cookie = dma_cookie;
+                copied_early = 1;
+                /* account for the bytes as if they were already delivered */
+                tp->ucopy.len -= chunk;
+                tp->copied_seq += chunk;
+                tcp_rcv_space_adjust(sk);
+                /* wake the reader when the request is satisfied, on PSH,
+                 * or when more than half of the receive buffer is in use
+                 */
+                if ((tp->ucopy.len == 0) ||
+                    (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) ||
+                    (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
+                        tp->ucopy.wakeup = 1;
+                        sk->sk_data_ready(sk, 0);
+                }
+        } else if (chunk > 0) {
+                /* no offload for this skb but it carries data: wake the reader */
+                tp->ucopy.wakeup = 1;
+                sk->sk_data_ready(sk, 0);
+        }
+out:
+        return copied_early;
+}
+#endif /* CONFIG_NET_DMA */
 /*
 *      TCP receive function for the ESTABLISHED state. 
 *
@@ -3886,8 +3940,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                    tp->rcv_nxt == tp->rcv_wup)
                                        tcp_store_ts_recent(tp);
-                                tcp_rcv_rtt_measure_ts(sk, skb);
                                /* We know that such packets are checksummed
                                 * on entry.
                                 */
@@ -3901,14 +3953,23 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                        }
                } else {
                        int eaten = 0;
+                        int copied_early = 0;
-                        if (tp->ucopy.task == current &&
+                        if (tp->copied_seq == tp->rcv_nxt &&
-                            tp->copied_seq == tp->rcv_nxt &&
+                            len - tcp_header_len <= tp->ucopy.len) {
-                            len - tcp_header_len <= tp->ucopy.len &&
+#ifdef CONFIG_NET_DMA
-                            sock_owned_by_user(sk)) {
+                                if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
-                                __set_current_state(TASK_RUNNING);
+                                        copied_early = 1;
+                                        eaten = 1;
+                                }
+#endif
+                                if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) {
+                                        __set_current_state(TASK_RUNNING);
-                                if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
+                                        if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
+                                                eaten = 1;
+                                }
+                                if (eaten) {
                                        /* Predicted packet is in window by definition.
                                         * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                                         * Hence, check seq<=rcv_wup reduces to:
@@ -3924,8 +3985,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                        __skb_pull(skb, tcp_header_len);
                                        tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                                        NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
-                                        eaten = 1;
                                }
+                                if (copied_early)
+                                        tcp_cleanup_rbuf(sk, skb->len);
                        }
                        if (!eaten) {
                                if (tcp_checksum_complete_user(sk, skb))
@@ -3966,6 +4028,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                        __tcp_ack_snd_check(sk, 0);
 no_ack:
+#ifdef CONFIG_NET_DMA
+                        if (copied_early)
+                                __skb_queue_tail(&sk->sk_async_wait_queue, skb);
+                        else
+#endif
                        if (eaten)
                                __kfree_skb(skb);
                        else
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 672950e54c4..25ecc6e2478 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -71,6 +71,7 @@
 #include <net/inet_common.h>
 #include <net/timewait_sock.h>
 #include <net/xfrm.h>
+#include <net/netdma.h>
 #include <linux/inet.h>
 #include <linux/ipv6.h>
@@ -1091,8 +1092,18 @@ process:
        bh_lock_sock(sk);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
-                if (!tcp_prequeue(sk, skb))
+#ifdef CONFIG_NET_DMA
+                struct tcp_sock *tp = tcp_sk(sk);
+                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+                        tp->ucopy.dma_chan = get_softnet_dma();
+                if (tp->ucopy.dma_chan)
                        ret = tcp_v4_do_rcv(sk, skb);
+                else
+#endif
+                {
+                        if (!tcp_prequeue(sk, skb))
+                        ret = tcp_v4_do_rcv(sk, skb);
+                }
        } else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);
@@ -1296,6 +1307,11 @@ int tcp_v4_destroy_sock(struct sock *sk)
        /* Cleans up our, hopefully empty, out_of_order_queue. */
        __skb_queue_purge(&tp->out_of_order_queue);
+#ifdef CONFIG_NET_DMA
+        /* Cleans up our sk_async_wait_queue */
+        __skb_queue_purge(&sk->sk_async_wait_queue);
+#endif
        /* Clean prequeue, it must be empty really */
        __skb_queue_purge(&tp->ucopy.prequeue);
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
new file mode 100644
index 00000000000..1f977b6ee9a
--- /dev/null
+++ b/net/ipv4/tcp_lp.c
@@ -0,0 +1,338 @@
+/*
+ * TCP Low Priority (TCP-LP)
+ *
+ * TCP Low Priority is a distributed algorithm whose goal is to utilize only
+ *   the excess network bandwidth as compared to the ``fair share`` of
+ *   bandwidth as targeted by TCP. Available from:
+ *     http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
+ *
+ * Original Author:
+ *   Aleksandar Kuzmanovic <akuzma@northwestern.edu>
+ *
+ * See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation.
+ * As of 2.6.13, Linux supports pluggable congestion control algorithms.
+ * Due to the limitation of the API, we take the following changes from
+ * the original TCP-LP implementation:
+ *   o We use newReno in most core CA handling. Only add some checking
+ *     within cong_avoid.
+ *   o Error correction of the remote HZ: the remote HZ estimate is
+ *     continuously checked and updated.
+ *   o Calculation of One-Way-Delay (OWD) is handled within rtt_sample, since
+ *     OWD has a similar meaning to RTT. Also corrects the buggy formula.
+ *   o Handle reaction for Early Congestion Indication (ECI) within
+ *     pkts_acked, as mentioned within pseudo code.
+ *   o OWD is handled in relative format, where local time stamp will in
+ *     tcp_time_stamp format.
+ *
+ * Port from 2.4.19 to 2.6.16 as module by:
+ *   Wong Hoi Sing Edison <hswong3i@gmail.com>
+ *   Hung Hing Lun <hlhung3i@gmail.com>
+ *
+ * Version: $Id: tcp_lp.c,v 1.22 2006-05-02 18:18:19 hswong3i Exp $
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+/* resolution of owd */
+#define LP_RESOL       1000
+/**
+ * enum tcp_lp_state
+ * @LP_VALID_RHZ: is remote HZ valid?
+ * @LP_VALID_OWD: is OWD valid?
+ * @LP_WITHIN_THR: are we within threshold?
+ * @LP_WITHIN_INF: are we within inference?
+ *
+ * TCP-LP's state flags, stored as a bit mask in struct lp's flag field.
+ * We create this set of state flags mainly for debugging.
+ * Note that bit 2 is not assigned to any flag.
+ */
+enum tcp_lp_state {
+        LP_VALID_RHZ = (1 << 0),
+        LP_VALID_OWD = (1 << 1),
+        LP_WITHIN_THR = (1 << 3),
+        LP_WITHIN_INF = (1 << 4),
+};
+/**
+ * struct lp
+ * @flag: TCP-LP state flag (mask of enum tcp_lp_state bits)
+ * @sowd: smoothed OWD << 3
+ * @owd_min: min OWD
+ * @owd_max: max OWD
+ * @owd_max_rsv: reserved max OWD
+ * @remote_hz: estimated remote HZ
+ * @remote_ref_time: remote reference time
+ * @local_ref_time: local reference time
+ * @last_drop: time for last active drop
+ * @inference: current inference
+ *
+ * TCP-LP's private struct.
+ * The layout follows the original TCP-LP implementation, keeping only the
+ * fields we found to be really useful.
+ */
+struct lp {
+        u32 flag;
+        u32 sowd;
+        u32 owd_min;
+        u32 owd_max;
+        u32 owd_max_rsv;
+        u32 remote_hz;
+        u32 remote_ref_time;
+        u32 local_ref_time;
+        u32 last_drop;
+        u32 inference;
+};
+/**
+ * tcp_lp_init
+ *
+ * Reset every field of the TCP-LP private state.
+ * owd_min starts at the largest u32 value; all other fields start at zero.
+ */
+static void tcp_lp_init(struct sock *sk)
+{
+        struct lp *state = inet_csk_ca(sk);
+        /* flags, remote HZ estimation and drop bookkeeping */
+        state->flag = 0;
+        state->remote_hz = 0;
+        state->remote_ref_time = 0;
+        state->local_ref_time = 0;
+        state->last_drop = 0;
+        state->inference = 0;
+        /* one-way-delay statistics */
+        state->sowd = 0;
+        state->owd_max = 0;
+        state->owd_max_rsv = 0;
+        state->owd_min = 0xffffffff;
+}
+/**
+ * tcp_lp_cong_avoid
+ *
+ * Implementation of cong_avoid.
+ * While the LP_WITHIN_INF flag is set the window is left untouched;
+ * otherwise the work is delegated to tcp_reno_cong_avoid().
+ */
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
+                              int flag)
+{
+        const struct lp *state = inet_csk_ca(sk);
+        /* inside the inference period: do not grow the window */
+        if (state->flag & LP_WITHIN_INF)
+                return;
+        tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
+}
+/**
+ * tcp_lp_remote_hz_estimator
+ *
+ * Estimate the remote HZ from the ratio between the remote and the local
+ * timestamp deltas since the last reference point.
+ * We keep on updating the estimated value, whereas the original TCP-LP
+ * implementation only guesses it once and uses that forever.
+ *
+ * LP_VALID_RHZ is set only while the returned estimate is non-zero,
+ * because tcp_lp_owd_calculator() divides by it.
+ *
+ * Returns the current estimate (0 when none is available yet).
+ */
+static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct lp *lp = inet_csk_ca(sk);
+        s64 rhz = lp->remote_hz << 6;   /* remote HZ << 6 */
+        s64 m = 0;
+        /* no reference time recorded yet: record it below and
+         * try again on the next sample */
+        if (lp->remote_ref_time == 0 || lp->local_ref_time == 0)
+                goto out;
+        /* we can't calc remote HZ when either clock did not advance
+         * (this also keeps the divisor below non-zero) */
+        if (tp->rx_opt.rcv_tsval == lp->remote_ref_time
+            || tp->rx_opt.rcv_tsecr == lp->local_ref_time)
+                goto out;
+        m = HZ * (tp->rx_opt.rcv_tsval -
+                  lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr -
+                                          lp->local_ref_time);
+        if (m < 0)
+                m = -m;
+        if (rhz != 0) {
+                m -= rhz >> 6;  /* m is now error in remote HZ est */
+                rhz += m;       /* 63/64 old + 1/64 new */
+        } else
+                rhz = m << 6;
+ out:
+        /* the estimate is only usable as a divisor when it is non-zero */
+        if ((rhz >> 6) > 0)
+                lp->flag |= LP_VALID_RHZ;
+        else
+                lp->flag &= ~LP_VALID_RHZ;
+        /* record reference time stamp */
+        lp->remote_ref_time = tp->rx_opt.rcv_tsval;
+        lp->local_ref_time = tp->rx_opt.rcv_tsecr;
+        return rhz >> 6;
+}
+/**
+ * tcp_lp_owd_calculator
+ *
+ * Calculate one way delay (in relative format).
+ * The original implementation computed OWD as the remote time difference
+ * minus the local time difference. As each difference is simply an RTT,
+ * a stable network gives equal remote and local RTTs and therefore an
+ * OWD of zero. It seems to be a bug and so we fixed it.
+ *
+ * Updates lp->remote_hz and the LP_VALID_OWD flag.
+ * Returns the OWD, or 0 when no usable remote HZ estimate exists.
+ */
+static u32 tcp_lp_owd_calculator(struct sock *sk)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct lp *lp = inet_csk_ca(sk);
+        s64 owd = 0;
+        lp->remote_hz = tcp_lp_remote_hz_estimator(sk);
+        /* remote_hz is used as a divisor: never divide by a zero estimate */
+        if ((lp->flag & LP_VALID_RHZ) && lp->remote_hz != 0) {
+                owd =
+                    tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) -
+                    tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ);
+                if (owd < 0)
+                        owd = -owd;
+        }
+        if (owd > 0)
+                lp->flag |= LP_VALID_OWD;
+        else
+                lp->flag &= ~LP_VALID_OWD;
+        return owd;
+}
+/**
+ * tcp_lp_rtt_sample
+ *
+ * Implementation of rtt_sample.
+ * Will take the following actions:
+ *   1. calc OWD,
+ *   2. record the min/max OWD,
+ *   3. calc smoothed OWD (SOWD).
+ * Most ideas come from the original TCP-LP implementation.
+ * The @usrtt argument is not used; the sample is derived from the
+ * timestamps via tcp_lp_owd_calculator().
+ */
+static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt)
+{
+        struct lp *lp = inet_csk_ca(sk);
+        s64 mowd = tcp_lp_owd_calculator(sk);
+        /* no valid remote HZ or OWD yet: nothing to record */
+        if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD))
+                return;
+        /* record the next min owd */
+        if (mowd < lp->owd_min)
+                lp->owd_min = mowd;
+        /* always forget the max of the max:
+         * owd_max_rsv holds the largest sample and owd_max is set
+         * to the one below it */
+        if (mowd > lp->owd_max) {
+                if (mowd > lp->owd_max_rsv) {
+                        if (lp->owd_max_rsv == 0)
+                                lp->owd_max = mowd;
+                        else
+                                lp->owd_max = lp->owd_max_rsv;
+                        lp->owd_max_rsv = mowd;
+                } else
+                        lp->owd_max = mowd;
+        }
+        /* calc for smoothed owd (sowd is stored << 3) */
+        if (lp->sowd != 0) {
+                mowd -= lp->sowd >> 3;  /* m is now error in owd est */
+                lp->sowd += mowd;       /* owd = 7/8 owd + 1/8 new */
+        } else
+                lp->sowd = mowd << 3;   /* take the measured time be owd */
+}
+/**
+ * tcp_lp_pkts_acked
+ *
+ * Implementation of pkts_acked.
+ * Deals with the active drop under Early Congestion Indication.
+ * Only the two reductions (halve the window, or drop it to 1) are handled
+ * here, because window increase is left to newReno.
+ * We work it out by following the idea from TCP-LP's paper directly.
+ * The @num_acked argument is not used.
+ */
+static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct lp *lp = inet_csk_ca(sk);
+        /* calc inference: three times the time since the echoed timestamp */
+        if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
+                lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr);
+        /* test if within inference: less than one inference period has
+         * passed since the last drop */
+        if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference))
+                lp->flag |= LP_WITHIN_INF;
+        else
+                lp->flag &= ~LP_WITHIN_INF;
+        /* test if within threshold: smoothed OWD below
+         * owd_min + 15% of (owd_max - owd_min) */
+        if (lp->sowd >> 3 <
+            lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100)
+                lp->flag |= LP_WITHIN_THR;
+        else
+                lp->flag &= ~LP_WITHIN_THR;
+        pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag,
+                 tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max,
+                 lp->sowd >> 3);
+        /* within threshold: no early congestion indication, nothing to do */
+        if (lp->flag & LP_WITHIN_THR)
+                return;
+        /* FIXME: try to reset owd_min and owd_max here
+         * to decrease the chance that the min/max is no longer suitable
+         * and that we will usually be within threshold when within inference */
+        lp->owd_min = lp->sowd >> 3;
+        lp->owd_max = lp->sowd >> 2;
+        lp->owd_max_rsv = lp->sowd >> 2;
+        /* happened within inference
+         * drop snd_cwnd into 1 */
+        if (lp->flag & LP_WITHIN_INF)
+                tp->snd_cwnd = 1U;
+        /* happened after inference
+         * cut snd_cwnd into half */
+        else
+                tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U);
+        /* record this drop time */
+        lp->last_drop = tcp_time_stamp;
+}
+/* TCP-LP congestion control ops, registered under the name "lp".
+ * ssthresh and min_cwnd reuse the Reno helpers; the remaining hooks
+ * are the TCP-LP functions defined in this file.
+ */
+static struct tcp_congestion_ops tcp_lp = {
+        .init = tcp_lp_init,
+        .ssthresh = tcp_reno_ssthresh,
+        .cong_avoid = tcp_lp_cong_avoid,
+        .min_cwnd = tcp_reno_min_cwnd,
+        .rtt_sample = tcp_lp_rtt_sample,
+        .pkts_acked = tcp_lp_pkts_acked,
+        .owner = THIS_MODULE,
+        .name = "lp"
+};
+/* Module init: check that struct lp fits in the per-socket private area
+ * and register the ops; the registration result is returned to the caller.
+ */
+static int __init tcp_lp_register(void)
+{
+        BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE);
+        return tcp_register_congestion_control(&tcp_lp);
+}
+/* Module exit: unregister the tcp_lp congestion control ops. */
+static void __exit tcp_lp_unregister(void)
+{
+        tcp_unregister_congestion_control(&tcp_lp);
+}
+module_init(tcp_lp_register);
+module_exit(tcp_lp_unregister);
+MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Low Priority");
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f33c9dddaa1..07bb5a2b375 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3;
 int sysctl_tcp_mtu_probing = 0;
 int sysctl_tcp_base_mss = 512;
+/* By default, RFC2861 behavior.  */
+int sysctl_tcp_slow_start_after_idle = 1;
 static void update_send_head(struct sock *sk, struct tcp_sock *tp,
                             struct sk_buff *skb)
 {
@@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
        struct inet_connection_sock *icsk = inet_csk(sk);
        const u32 now = tcp_time_stamp;
-        if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+        if (sysctl_tcp_slow_start_after_idle &&
+            (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
                tcp_cwnd_restart(sk, __sk_dst_get(sk));
        tp->lsndtime = now;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
new file mode 100644
index 00000000000..d7d517a3a23
--- /dev/null
+++ b/net/ipv4/tcp_probe.c
@@ -0,0 +1,181 @@
+/*
+ * tcpprobe - Observe the TCP flow with kprobes.
+ *
+ * The idea for this came from Werner Almesberger's umlsim
+ * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/kfifo.h>
+#include <linux/vmalloc.h>
+#include <net/tcp.h>
+MODULE_AUTHOR("Stephen Hemminger <shemminger@osdl.org>");
+MODULE_DESCRIPTION("TCP cwnd snooper");
+MODULE_LICENSE("GPL");
+/* Only connections whose source or destination port equals this value
+ * are logged; 0 logs every connection. */
+static int port = 0;
+MODULE_PARM_DESC(port, "Port to match (0=all)");
+module_param(port, int, 0);
+/* Size in bytes requested for the log fifo at module load. */
+static int bufsize = 64*1024;
+MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
+module_param(bufsize, int, 0);
+/* Name of the entry created with proc_net_fops_create(). */
+static const char procname[] = "tcpprobe";
+/*
+ * Module-wide logging state.
+ *   fifo:   byte fifo filled by printl() and drained by tcpprobe_read()
+ *   lock:   spinlock handed to kfifo_alloc() for the fifo
+ *   wait:   readers sleep here until the fifo is non-empty
+ *   tstart: time of the last open(); log timestamps are relative to it
+ *
+ * Only referenced from this file, so it is given internal linkage.
+ */
+static struct {
+        struct kfifo  *fifo;
+        spinlock_t    lock;
+        wait_queue_head_t wait;
+        struct timeval tstart;
+} tcpw;
+/*
+ * Format one log line, prefixed with the time elapsed since tcpw.tstart
+ * as "seconds.microseconds ", append it to the fifo and wake up readers.
+ * The formatted line is limited to the 256-byte local buffer.
+ */
+static void printl(const char *fmt, ...)
+{
+        struct timeval elapsed;
+        char line[256];
+        va_list ap;
+        int used;
+        /* time since the proc file was opened */
+        do_gettimeofday(&elapsed);
+        elapsed.tv_sec -= tcpw.tstart.tv_sec;
+        elapsed.tv_usec -= tcpw.tstart.tv_usec;
+        if (elapsed.tv_usec < 0) {
+                /* borrow one second */
+                elapsed.tv_sec -= 1;
+                elapsed.tv_usec += 1000000;
+        }
+        used = sprintf(line, "%lu.%06lu ",
+                       (unsigned long) elapsed.tv_sec,
+                       (unsigned long) elapsed.tv_usec);
+        /* caller's message goes right after the timestamp */
+        va_start(ap, fmt);
+        used += vscnprintf(line + used, sizeof(line) - used, fmt, ap);
+        va_end(ap);
+        kfifo_put(tcpw.fifo, line, used);
+        wake_up(&tcpw.wait);
+}
+/*
+ * jprobe handler installed on tcp_sendmsg() (see tcp_send_probe).
+ * Logs one line per call for sockets matching the "port" filter:
+ * source and destination address:port, request size, snd_nxt, snd_una,
+ * snd_cwnd, current ssthresh and snd_wnd.
+ *
+ * A jprobe handler hands control back with jprobe_return(); the
+ * "return 0" after it only satisfies the function signature.
+ */
+static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk,
+                        struct msghdr *msg, size_t size)
+{
+        const struct tcp_sock *tp = tcp_sk(sk);
+        const struct inet_sock *inet = inet_sk(sk);
+        if (port == 0 || ntohs(inet->dport) == port ||
+            ntohs(inet->sport) == port) {
+                /* size is a size_t, so it is printed with %zu */
+                printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %zu %#x %#x %u %u %u\n",
+                       NIPQUAD(inet->saddr), ntohs(inet->sport),
+                       NIPQUAD(inet->daddr), ntohs(inet->dport),
+                       size, tp->snd_nxt, tp->snd_una,
+                       tp->snd_cwnd, tcp_current_ssthresh(sk),
+                       tp->snd_wnd);
+        }
+        jprobe_return();
+        return 0;
+}
+/* jprobe descriptor: probe address is tcp_sendmsg, entry handler is
+ * jtcp_sendmsg. */
+static struct jprobe tcp_send_probe = {
+        .kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, },
+        .entry = (kprobe_opcode_t *) &jtcp_sendmsg,
+};
+/* open() on the proc file: reset the fifo and restart the time base
+ * that printl() uses for its relative timestamps. Always returns 0.
+ */
+static int tcpprobe_open(struct inode * inode, struct file * file)
+{
+        kfifo_reset(tcpw.fifo);
+        do_gettimeofday(&tcpw.tstart);
+        return 0;
+}
+/*
+ * read() on the proc file: sleep until the fifo holds data, then copy up
+ * to @len bytes of it to the user buffer @buf.
+ *
+ * Returns the number of bytes taken from the fifo, 0 for a zero-length
+ * request, -EINVAL for a NULL buffer, -ENOMEM when the bounce buffer
+ * cannot be allocated, the error from wait_event_interruptible() when
+ * the sleep is interrupted, or -EFAULT when the copy to user space fails.
+ */
+static ssize_t tcpprobe_read(struct file *file, char __user *buf,
+                             size_t len, loff_t *ppos)
+{
+        int error, cnt = 0;
+        unsigned char *tbuf;
+        if (!buf)
+                return -EINVAL;
+        if (len == 0)
+                return 0;
+        tbuf = vmalloc(len);
+        if (!tbuf)
+                return -ENOMEM;
+        error = wait_event_interruptible(tcpw.wait,
+                                         __kfifo_len(tcpw.fifo) != 0);
+        /* interrupted: the bounce buffer must still be released */
+        if (error)
+                goto out_free;
+        cnt = kfifo_get(tcpw.fifo, tbuf, len);
+        /* copy_to_user() returns the number of bytes NOT copied, which
+         * is not an errno value */
+        if (copy_to_user(buf, tbuf, cnt))
+                error = -EFAULT;
+ out_free:
+        vfree(tbuf);
+        return error ? error : cnt;
+}
+/* File operations of the proc entry: only open and read are provided. */
+static struct file_operations tcpprobe_fops = {
+        .owner   = THIS_MODULE,
+        .open    = tcpprobe_open,
+        .read    = tcpprobe_read,
+};
+/*
+ * Module init: allocate the log fifo, create the proc entry and register
+ * the jprobe on tcp_sendmsg().
+ *
+ * Returns 0 on success. On failure everything set up so far is undone
+ * and the error is returned: the kfifo_alloc() error, -ENOMEM when the
+ * proc entry cannot be created, or the register_jprobe() error.
+ *
+ * NOTE(review): kfifo implementations of this era appear to require a
+ * power-of-two size; bufsize is not validated here -- confirm.
+ */
+static __init int tcpprobe_init(void)
+{
+        int ret = -ENOMEM;
+        init_waitqueue_head(&tcpw.wait);
+        spin_lock_init(&tcpw.lock);
+        tcpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &tcpw.lock);
+        /* kfifo_alloc() reports failure through an ERR_PTR value, which
+         * must neither be used nor passed to kfifo_free() */
+        if (IS_ERR(tcpw.fifo))
+                return PTR_ERR(tcpw.fifo);
+        if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops))
+                goto err0;
+        ret = register_jprobe(&tcp_send_probe);
+        if (ret)
+                goto err1;
+        pr_info("TCP watch registered (port=%d)\n", port);
+        return 0;
+ err1:
+        proc_net_remove(procname);
+ err0:
+        kfifo_free(tcpw.fifo);
+        return ret;
+}
+module_init(tcpprobe_init);
+/*
+ * Module exit: tear down in the reverse order of tcpprobe_init().
+ * The jprobe is removed first so that jtcp_sendmsg() can no longer write
+ * into the fifo, then the proc entry is removed, and only then is the
+ * fifo freed.
+ */
+static __exit void tcpprobe_exit(void)
+{
+        unregister_jprobe(&tcp_send_probe);
+        proc_net_remove(procname);
+        kfifo_free(tcpw.fifo);
+}
+module_exit(tcpprobe_exit);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
new file mode 100644
index 00000000000..11b42a7135c
--- /dev/null
+++ b/net/ipv4/tcp_veno.c
@@ -0,0 +1,231 @@
+/*
+ * TCP Veno congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ *    C. P. Fu, S. C. Liew.
+ *    "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
+ *    IEEE Journal on Selected Areas in Communication,
+ *    Feb. 2003.
+ *      See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <net/tcp.h>
+/* Default values of the Veno variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+static const int beta = 3 << V_PARAM_SHIFT;
+/* Veno variables: per-connection state, accessed through inet_csk_ca(). */
+struct veno {
+        u8 doing_veno_now;      /* if true, do veno for this rtt */
+        u16 cntrtt;             /* # of rtt samples counted by tcp_veno_rtt_calc() */
+        u32 minrtt;             /* min of rtts measured within last rtt (in usec) */
+        u32 basertt;            /* the min of all Veno rtt measurements seen (in usec) */
+        u32 inc;                /* decide whether to increase cwnd */
+        u32 diff;               /* calculate the diff rate */
+};
+/* There are several situations when we must "re-start" Veno:
+ *
+ *  o when a connection is established
+ *  o after an RTO
+ *  o after fast recovery
+ *  o when we send a packet and there is no outstanding
+ *    unacknowledged data (restarting an idle connection)
+ *
+ * veno_enable() switches the Veno calculations on and resets minrtt to
+ * its "no sample yet" value of 0x7fffffff.
+ */
+static inline void veno_enable(struct sock *sk)
+{
+        struct veno *veno = inet_csk_ca(sk);
+        /* turn on Veno */
+        veno->doing_veno_now = 1;
+        veno->minrtt = 0x7fffffff;
+}
+/* Switch the Veno calculations off; tcp_veno_cong_avoid() then defers
+ * to tcp_reno_cong_avoid(). */
+static inline void veno_disable(struct sock *sk)
+{
+        struct veno *veno = inet_csk_ca(sk);
+        /* turn off Veno */
+        veno->doing_veno_now = 0;
+}
+/* Reset the Veno state: forget the base rtt, allow the next window
+ * increase (inc = 1) and enable Veno. cntrtt and diff are not touched. */
+static void tcp_veno_init(struct sock *sk)
+{
+        struct veno *veno = inet_csk_ca(sk);
+        veno->basertt = 0x7fffffff;
+        veno->inc = 1;
+        veno_enable(sk);
+}
+/* rtt_sample hook: fold one rtt measurement into the Veno state. */
+static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt)
+{
+        struct veno *veno = inet_csk_ca(sk);
+        /* bias by one so that neither minrtt nor basertt can become zero */
+        const u32 sample = usrtt + 1;
+        /* propagation delay filter: smallest sample seen so far */
+        veno->basertt = min(veno->basertt, sample);
+        /* smallest sample of the current rtt, i.e. the current
+         * propagation delay + queuing delay
+         */
+        if (sample < veno->minrtt)
+                veno->minrtt = sample;
+        veno->cntrtt++;
+}
+/* set_state hook: Veno is only active while the connection is in the
+ * TCP_CA_Open state. */
+static void tcp_veno_state(struct sock *sk, u8 ca_state)
+{
+        if (ca_state != TCP_CA_Open) {
+                veno_disable(sk);
+                return;
+        }
+        veno_enable(sk);
+}
+/*
+ * When an idle connection restarts we do not want to do any Veno
+ * calculations until fresh rtt samples have arrived, so on a restart
+ * the Veno state is reset to a clean state. Once the acks for this
+ * flight of packets are in, Veno calculations can be made again.
+ */
+static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+        switch (event) {
+        case CA_EVENT_CWND_RESTART:
+        case CA_EVENT_TX_START:
+                tcp_veno_init(sk);
+                break;
+        default:
+                break;
+        }
+}
+/*
+ * cong_avoid hook.
+ * Falls back to tcp_reno_cong_avoid() while Veno is disabled or while at
+ * most two rtt samples have been counted. Otherwise veno->diff is
+ * computed from snd_cwnd, basertt and minrtt and compared with beta to
+ * choose between increasing cwnd every rtt or every other rtt.
+ */
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack,
+                                u32 seq_rtt, u32 in_flight, int flag)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct veno *veno = inet_csk_ca(sk);
+        /* Veno switched off: plain Reno; note that this path returns
+         * before minrtt is reset at the bottom of the function */
+        if (!veno->doing_veno_now)
+                return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
+        /* limited by applications */
+        if (!tcp_is_cwnd_limited(sk, in_flight))
+                return;
+        /* We do the Veno calculations only if we got enough rtt samples */
+        if (veno->cntrtt <= 2) {
+                /* We don't have enough rtt samples to do the Veno
+                 * calculation, so we'll behave like Reno.
+                 */
+                tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
+        } else {
+                u32 rtt, target_cwnd;
+                /* We have enough rtt samples, so, using the Veno
+                 * algorithm, we determine the state of the network.
+                 */
+                rtt = veno->minrtt;
+                /* NOTE(review): the product is computed in 32 bits and
+                 * looks like it can wrap for large cwnd * basertt -- confirm */
+                target_cwnd = ((tp->snd_cwnd * veno->basertt)
+                               << V_PARAM_SHIFT) / rtt;
+                veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
+                if (tp->snd_cwnd <= tp->snd_ssthresh) {
+                        /* Slow start.  */
+                        tcp_slow_start(tp);
+                } else {
+                        /* Congestion avoidance. */
+                        if (veno->diff < beta) {
+                                /* In the "non-congestive state", increase cwnd
+                                 *  every rtt.
+                                 */
+                                if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+                                        if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+                                                tp->snd_cwnd++;
+                                        tp->snd_cwnd_cnt = 0;
+                                } else
+                                        tp->snd_cwnd_cnt++;
+                        } else {
+                                /* In the "congestive state", increase cwnd
+                                 * every other rtt.
+                                 */
+                                if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+                                        /* veno->inc toggles so that only every
+                                         * second opportunity grows the window */
+                                        if (veno->inc
+                                            && tp->snd_cwnd <
+                                            tp->snd_cwnd_clamp) {
+                                                tp->snd_cwnd++;
+                                                veno->inc = 0;
+                                        } else
+                                                veno->inc = 1;
+                                        tp->snd_cwnd_cnt = 0;
+                                } else
+                                        tp->snd_cwnd_cnt++;
+                        }
+                }
+                /* keep cwnd within [2, snd_cwnd_clamp] */
+                if (tp->snd_cwnd < 2)
+                        tp->snd_cwnd = 2;
+                else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+                        tp->snd_cwnd = tp->snd_cwnd_clamp;
+        }
+        /* Wipe the slate clean for the next rtt. */
+        /* The cntrtt reset below is disabled, so cntrtt keeps counting. */
+        /* veno->cntrtt = 0; */
+        veno->minrtt = 0x7fffffff;
+}
+/* Veno multiplicative decrease: the new ssthresh is 4/5 of cwnd in the
+ * "non-congestive state" (diff < beta) and 1/2 of cwnd in the
+ * "congestive state", and never less than 2.
+ */
+static u32 tcp_veno_ssthresh(struct sock *sk)
+{
+        const struct tcp_sock *tp = tcp_sk(sk);
+        const struct veno *veno = inet_csk_ca(sk);
+        u32 reduced;
+        if (veno->diff < beta)
+                reduced = tp->snd_cwnd * 4 / 5;         /* cut cwnd by 1/5 */
+        else
+                reduced = tp->snd_cwnd >> 1U;           /* cut cwnd by 1/2 */
+        return max(reduced, 2U);
+}
+/* Veno congestion control ops, registered under the name "veno".
+ * No min_cwnd, undo_cwnd or pkts_acked hook is provided.
+ */
+static struct tcp_congestion_ops tcp_veno = {
+        .init           = tcp_veno_init,
+        .ssthresh       = tcp_veno_ssthresh,
+        .cong_avoid     = tcp_veno_cong_avoid,
+        .rtt_sample     = tcp_veno_rtt_calc,
+        .set_state      = tcp_veno_state,
+        .cwnd_event     = tcp_veno_cwnd_event,
+        .owner          = THIS_MODULE,
+        .name           = "veno",
+};
+/*
+ * Module init: verify that struct veno fits in the per-socket
+ * congestion-control private area, then register the algorithm.
+ *
+ * Returns 0 on success, or the error returned by
+ * tcp_register_congestion_control() so that a failed registration
+ * fails the module load instead of being silently ignored.
+ */
+static int __init tcp_veno_register(void)
+{
+        BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE);
+        return tcp_register_congestion_control(&tcp_veno);
+}
+/* Module exit: unregister the tcp_veno congestion control ops. */
+static void __exit tcp_veno_unregister(void)
+{
+        tcp_unregister_congestion_control(&tcp_veno);
+}
+module_init(tcp_veno_register);
+module_exit(tcp_veno_unregister);
+MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Veno");
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 0c340c3756c..4247da1384b 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -1,7 +1,24 @@
 /*
- * TCP Westwood+
+ * TCP Westwood+: end-to-end bandwidth estimation for TCP
 *
- *      Angelo Dell'Aera:       TCP Westwood+ support
+ *      Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4
+ *
+ * Support at http://c3lab.poliba.it/index.php/Westwood
+ * Main references in literature:
+ *
+ * - Mascolo S, Casetti, M. Gerla et al.
+ *   "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001
+ *
+ * - A. Grieco, S. Mascolo
+ *   "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer
+ *     Comm. Review, 2004
+ *
+ * - A. Dell'Aera, L. Grieco, S. Mascolo.
+ *   "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving :
+ *    A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004
+ *
+ * Westwood+ employs end-to-end bandwidth measurement to set cwnd and
+ * ssthresh after packet loss. The probing phase is as the original Reno.
 */
 #include <linux/config.h>
@@ -22,6 +39,8 @@ struct westwood {
        u32    accounted;
        u32    rtt;
        u32    rtt_min;          /* minimum observed RTT */
+        u8     first_ack;        /* flag which indicates that this is the first ack */
+        u8     reset_rtt_min;    /* Reset RTT min to next RTT sample */
 };
@@ -49,9 +68,11 @@ static void tcp_westwood_init(struct sock *sk)
        w->bw_est = 0;
        w->accounted = 0;
        w->cumul_ack = 0;
+        w->reset_rtt_min = 1;
        w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
        w->rtt_win_sx = tcp_time_stamp;
        w->snd_una = tcp_sk(sk)->snd_una;
+        w->first_ack = 1;
 }
 /*
@@ -63,10 +84,16 @@ static inline u32 westwood_do_filter(u32 a, u32 b)
        return (((7 * a) + b) >> 3);
 }
-static inline void westwood_filter(struct westwood *w, u32 delta)
+static void westwood_filter(struct westwood *w, u32 delta)
 {
-        w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
+        /* If the filter is empty, fill it with the first sample of bandwidth */
-        w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
+        if (w->bw_ns_est == 0 && w->bw_est == 0) {
+                w->bw_ns_est = w->bk / delta;
+                w->bw_est = w->bw_ns_est;
+        } else {
+                w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
+                w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
+        }
 }
 /*
@@ -91,6 +118,15 @@ static void westwood_update_window(struct sock *sk)
        struct westwood *w = inet_csk_ca(sk);
        s32 delta = tcp_time_stamp - w->rtt_win_sx;
+        /* Initialize w->snd_una with the first acked sequence number in order
+         * to fix mismatch between tp->snd_una and w->snd_una for the first
+         * bandwidth sample
+         */
+        if (w->first_ack) {
+                w->snd_una = tcp_sk(sk)->snd_una;
+                w->first_ack = 0;
+        }
        /*
         * See if a RTT-window has passed.
         * Be careful since if RTT is less than
@@ -108,6 +144,16 @@ static void westwood_update_window(struct sock *sk)
        }
 }
+static inline void update_rtt_min(struct westwood *w)
+{
+        if (w->reset_rtt_min) {
+                w->rtt_min = w->rtt;
+                w->reset_rtt_min = 0;   
+        } else
+                w->rtt_min = min(w->rtt, w->rtt_min);
+}
 /*
 * @westwood_fast_bw
 * It is called when we are in fast path. In particular it is called when
@@ -123,7 +169,7 @@ static inline void westwood_fast_bw(struct sock *sk)
        w->bk += tp->snd_una - w->snd_una;
        w->snd_una = tp->snd_una;
-        w->rtt_min = min(w->rtt, w->rtt_min);
+        update_rtt_min(w);
 }
 /*
@@ -162,12 +208,6 @@ static inline u32 westwood_acked_count(struct sock *sk)
        return w->cumul_ack;
 }
-static inline u32 westwood_bw_rttmin(const struct sock *sk)
-{
-        const struct tcp_sock *tp = tcp_sk(sk);
-        const struct westwood *w = inet_csk_ca(sk);
-        return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
-}
 /*
 * TCP Westwood
@@ -175,9 +215,11 @@ static inline u32 westwood_bw_rttmin(const struct sock *sk)
 * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
 * so avoids ever returning 0.
 */
-static u32 tcp_westwood_cwnd_min(struct sock *sk)
+static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
 {
-        return westwood_bw_rttmin(sk);
+        const struct tcp_sock *tp = tcp_sk(sk);
+        const struct westwood *w = inet_csk_ca(sk);
+        return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
 }
 static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
@@ -191,17 +233,19 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
                break;
        case CA_EVENT_COMPLETE_CWR:
-                tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk);
+                tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
                break;
        case CA_EVENT_FRTO:
-                tp->snd_ssthresh = westwood_bw_rttmin(sk);
+                tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+                /* Update RTT_min when next ack arrives */
+                w->reset_rtt_min = 1;
                break;
        case CA_EVENT_SLOW_ACK:
                westwood_update_window(sk);
                w->bk += westwood_acked_count(sk);
-                w->rtt_min = min(w->rtt, w->rtt_min);
+                update_rtt_min(w);
                break;
        default:
@@ -235,7 +279,7 @@ static struct tcp_congestion_ops tcp_westwood = {
        .init           = tcp_westwood_init,
        .ssthresh       = tcp_reno_ssthresh,
        .cong_avoid     = tcp_reno_cong_avoid,
-        .min_cwnd       = tcp_westwood_cwnd_min,
+        .min_cwnd       = tcp_westwood_bw_rttmin,
        .cwnd_event     = tcp_westwood_event,
        .get_info       = tcp_westwood_info,
        .pkts_acked     = tcp_westwood_pkts_acked,
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 3e174c83bfe..817ed84511a 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -13,7 +13,6 @@
 #include <linux/string.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
-#include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
@@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb)
 EXPORT_SYMBOL(xfrm4_rcv);
-static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
-{
-        struct iphdr *outer_iph = skb->nh.iph;
-        struct iphdr *inner_iph = skb->h.ipiph;
-        if (INET_ECN_is_ce(outer_iph->tos))
-                IP_ECN_set_ce(inner_iph);
-}
 static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
 {
        switch (nexthdr) {
@@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
                xfrm_vec[xfrm_nr++] = x;
-                iph = skb->nh.iph;
+                if (x->mode->input(x, skb))
+                        goto drop;
                if (x->props.mode) {
-                        if (iph->protocol != IPPROTO_IPIP)
-                                goto drop;
-                        if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-                                goto drop;
-                        if (skb_cloned(skb) &&
-                            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-                                goto drop;
-                        if (x->props.flags & XFRM_STATE_DECAP_DSCP)
-                                ipv4_copy_dscp(iph, skb->h.ipiph);
-                        if (!(x->props.flags & XFRM_STATE_NOECN))
-                                ipip_ecn_decapsulate(skb);
-                        skb->mac.raw = memmove(skb->data - skb->mac_len,
-                                               skb->mac.raw, skb->mac_len);
-                        skb->nh.raw = skb->data;
-                        memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
                        decaps = 1;
                        break;
                }
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
new file mode 100644
index 00000000000..a9e6b3dd19c
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -0,0 +1,83 @@
+/*
+ * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+/* Add encapsulation header.
+ *
+ * The IP header will be moved forward to make space for the encapsulation
+ * header.
+ *
+ * On exit, skb->h will be set to the start of the payload to be processed
+ * by x->type->output and skb->nh will be set to the top IP header.
+ */
+static int xfrm4_transport_output(struct sk_buff *skb)
+{
+        struct xfrm_state *x;
+        struct iphdr *iph;
+        int ihl;
+        iph = skb->nh.iph;
+        skb->h.ipiph = iph;
+        ihl = iph->ihl * 4;
+        skb->h.raw += ihl;
+        x = skb->dst->xfrm;
+        skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl);
+        return 0;
+}
+/* Remove encapsulation header.
+ *
+ * The IP header will be moved over the top of the encapsulation header.
+ *
+ * On entry, skb->h shall point to where the IP header should be and skb->nh
+ * shall be set to where the IP header currently is.  skb->data shall point
+ * to the start of the payload.
+ */
+static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+        int ihl = skb->data - skb->h.raw;
+        if (skb->h.raw != skb->nh.raw)
+                skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl);
+        skb->nh.iph->tot_len = htons(skb->len + ihl);
+        skb->h.raw = skb->data;
+        return 0;
+}
+static struct xfrm_mode xfrm4_transport_mode = {
+        .input = xfrm4_transport_input,
+        .output = xfrm4_transport_output,
+        .owner = THIS_MODULE,
+        .encap = XFRM_MODE_TRANSPORT,
+};
+static int __init xfrm4_transport_init(void)
+{
+        return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);
+}
+static void __exit xfrm4_transport_exit(void)
+{
+        int err;
+        err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
+        BUG_ON(err);
+}
+module_init(xfrm4_transport_init);
+module_exit(xfrm4_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
new file mode 100644
index 00000000000..f8d880beb12
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -0,0 +1,125 @@
+/*
+ * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
+{
+        struct iphdr *outer_iph = skb->nh.iph;
+        struct iphdr *inner_iph = skb->h.ipiph;
+        if (INET_ECN_is_ce(outer_iph->tos))
+                IP_ECN_set_ce(inner_iph);
+}
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.  The following fields
+ * in it shall be filled in by x->type->output:
+ *      tot_len
+ *      check
+ *
+ * On exit, skb->h will be set to the start of the payload to be processed
+ * by x->type->output and skb->nh will be set to the top IP header.
+ */
+static int xfrm4_tunnel_output(struct sk_buff *skb)
+{
+        struct dst_entry *dst = skb->dst;
+        struct xfrm_state *x = dst->xfrm;
+        struct iphdr *iph, *top_iph;
+        int flags;
+        iph = skb->nh.iph;
+        skb->h.ipiph = iph;
+        skb->nh.raw = skb_push(skb, x->props.header_len);
+        top_iph = skb->nh.iph;
+        top_iph->ihl = 5;
+        top_iph->version = 4;
+        /* DS disclosed */
+        top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
+        flags = x->props.flags;
+        if (flags & XFRM_STATE_NOECN)
+                IP_ECN_clear(top_iph);
+        top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+                0 : (iph->frag_off & htons(IP_DF));
+        if (!top_iph->frag_off)
+                __ip_select_ident(top_iph, dst->child, 0);
+        top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
+        top_iph->saddr = x->props.saddr.a4;
+        top_iph->daddr = x->id.daddr.a4;
+        top_iph->protocol = IPPROTO_IPIP;
+        memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+        return 0;
+}
+static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+        struct iphdr *iph = skb->nh.iph;
+        int err = -EINVAL;
+        if (iph->protocol != IPPROTO_IPIP)
+                goto out;
+        if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+                goto out;
+        if (skb_cloned(skb) &&
+            (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+                goto out;
+        if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+                ipv4_copy_dscp(iph, skb->h.ipiph);
+        if (!(x->props.flags & XFRM_STATE_NOECN))
+                ipip_ecn_decapsulate(skb);
+        skb->mac.raw = memmove(skb->data - skb->mac_len,
+                               skb->mac.raw, skb->mac_len);
+        skb->nh.raw = skb->data;
+        memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+        err = 0;
+out:
+        return err;
+}
+static struct xfrm_mode xfrm4_tunnel_mode = {
+        .input = xfrm4_tunnel_input,
+        .output = xfrm4_tunnel_output,
+        .owner = THIS_MODULE,
+        .encap = XFRM_MODE_TUNNEL,
+};
+static int __init xfrm4_tunnel_init(void)
+{
+        return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
+}
+static void __exit xfrm4_tunnel_exit(void)
+{
+        int err;
+        err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
+        BUG_ON(err);
+}
+module_init(xfrm4_tunnel_init);
+module_exit(xfrm4_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 4ef8efaf6a6..ac9d91d4bb0 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -12,67 +12,10 @@
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/netfilter_ipv4.h>
-#include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
 #include <net/icmp.h>
-/* Add encapsulation header.
- *
- * In transport mode, the IP header will be moved forward to make space
- * for the encapsulation header.
- *
- * In tunnel mode, the top IP header will be constructed per RFC 2401.
- * The following fields in it shall be filled in by x->type->output:
- *      tot_len
- *      check
- *
- * On exit, skb->h will be set to the start of the payload to be processed
- * by x->type->output and skb->nh will be set to the top IP header.
- */
-static void xfrm4_encap(struct sk_buff *skb)
-{
-        struct dst_entry *dst = skb->dst;
-        struct xfrm_state *x = dst->xfrm;
-        struct iphdr *iph, *top_iph;
-        int flags;
-        iph = skb->nh.iph;
-        skb->h.ipiph = iph;
-        skb->nh.raw = skb_push(skb, x->props.header_len);
-        top_iph = skb->nh.iph;
-        if (!x->props.mode) {
-                skb->h.raw += iph->ihl*4;
-                memmove(top_iph, iph, iph->ihl*4);
-                return;
-        }
-        top_iph->ihl = 5;
-        top_iph->version = 4;
-        /* DS disclosed */
-        top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
-        flags = x->props.flags;
-        if (flags & XFRM_STATE_NOECN)
-                IP_ECN_clear(top_iph);
-        top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
-                0 : (iph->frag_off & htons(IP_DF));
-        if (!top_iph->frag_off)
-                __ip_select_ident(top_iph, dst->child, 0);
-        top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
-        top_iph->saddr = x->props.saddr.a4;
-        top_iph->daddr = x->id.daddr.a4;
-        top_iph->protocol = IPPROTO_IPIP;
-        memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
-}
 static int xfrm4_tunnel_check_size(struct sk_buff *skb)
 {
        int mtu, ret = 0;
@@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_buff *skb)
                if (err)
                        goto error;
-                xfrm4_encap(skb);
+                err = x->mode->output(skb);
+                if (err)
+                        goto error;
                err = x->type->output(x, skb);
                if (err)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 8604c747bca..c0465284dfa 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -17,8 +17,6 @@
 static struct dst_ops xfrm4_dst_ops;
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
-static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED };
 static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
 {
        return __ip_route_output_key((struct rtable**)dst, fl);
@@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl)
 static inline int xfrm4_garbage_collect(void)
 {
-        read_lock(&xfrm4_policy_afinfo.lock);
        xfrm4_policy_afinfo.garbage_collect();
-        read_unlock(&xfrm4_policy_afinfo.lock);
        return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
 }
@@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = {
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
        .family =               AF_INET,
-        .lock =                 RW_LOCK_UNLOCKED,
-        .type_map =             &xfrm4_type_map,
        .dst_ops =              &xfrm4_dst_ops,
        .dst_lookup =           xfrm4_dst_lookup,
        .find_bundle =          __xfrm4_find_bundle,
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index dbabf81a9b7..81e1751c966 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
 static struct xfrm_state_afinfo xfrm4_state_afinfo = {
        .family                 = AF_INET,
-        .lock                   = RW_LOCK_UNLOCKED,
        .init_flags             = xfrm4_init_flags,
        .init_tempsel           = __xfrm4_init_tempsel,
        .state_lookup           = __xfrm4_state_lookup,
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index f8a107ab559..e923d4dea41 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -106,6 +106,26 @@ config INET6_TUNNEL
        tristate
        default n
+config INET6_XFRM_MODE_TRANSPORT
+        tristate "IPv6: IPsec transport mode"
+        depends on IPV6
+        default IPV6
+        select XFRM
+        ---help---
+          Support for IPsec transport mode.
+          If unsure, say Y.
+config INET6_XFRM_MODE_TUNNEL
+        tristate "IPv6: IPsec tunnel mode"
+        depends on IPV6
+        default IPV6
+        select XFRM
+        ---help---
+          Support for IPsec tunnel mode.
+          If unsure, say Y.
 config IPV6_TUNNEL
        tristate "IPv6: IPv6-in-IPv6 tunnel"
        select INET6_TUNNEL
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index a760b0988fb..386e0a62694 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -20,6 +20,8 @@ obj-$(CONFIG_INET6_ESP) += esp6.o
 obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
 obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
 obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
+obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o
+obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
 obj-$(CONFIG_NETFILTER) += netfilter/
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 445006ee452..c2c26fa0943 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2860,6 +2860,11 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
        return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen);
 }
+/* Maximum length of ifaddrmsg attributes */
+#define INET6_IFADDR_RTA_SPACE \
+                RTA_SPACE(16) /* IFA_ADDRESS */ + \
+                RTA_SPACE(sizeof(struct ifa_cacheinfo)) /* CACHEINFO */
 static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
                             u32 pid, u32 seq, int event, unsigned int flags)
 {
@@ -3092,7 +3097,7 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb)
 static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
 {
        struct sk_buff *skb;
-        int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128);
+        int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + INET6_IFADDR_RTA_SPACE);
        skb = alloc_skb(size, GFP_ATOMIC);
        if (!skb) {
@@ -3142,6 +3147,17 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
 #endif
 }
+/* Maximum length of ifinfomsg attributes */
+#define INET6_IFINFO_RTA_SPACE \
+                RTA_SPACE(IFNAMSIZ) /* IFNAME */ + \
+                RTA_SPACE(MAX_ADDR_LEN) /* ADDRESS */ + \
+                RTA_SPACE(sizeof(u32)) /* MTU */ + \
+                RTA_SPACE(sizeof(int)) /* LINK */ + \
+                RTA_SPACE(0) /* PROTINFO */ + \
+                RTA_SPACE(sizeof(u32)) /* FLAGS */ + \
+                RTA_SPACE(sizeof(struct ifla_cacheinfo)) /* CACHEINFO */ + \
+                RTA_SPACE(sizeof(__s32[DEVCONF_MAX])) /* CONF */
 static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, 
                             u32 pid, u32 seq, int event, unsigned int flags)
 {
@@ -3235,8 +3251,7 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
 {
        struct sk_buff *skb;
-        /* 128 bytes ?? */
+        int size = NLMSG_SPACE(sizeof(struct ifinfomsg) + INET6_IFINFO_RTA_SPACE);
-        int size = NLMSG_SPACE(sizeof(struct ifinfomsg)+128);
        
        skb = alloc_skb(size, GFP_ATOMIC);
        if (!skb) {
@@ -3252,6 +3267,11 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
        netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC);
 }
+/* Maximum length of prefixmsg attributes */
+#define INET6_PREFIX_RTA_SPACE \
+                RTA_SPACE(sizeof(((struct prefix_info *)NULL)->prefix)) /* ADDRESS */ + \
+                RTA_SPACE(sizeof(struct prefix_cacheinfo)) /* CACHEINFO */
 static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
                        struct prefix_info *pinfo, u32 pid, u32 seq, 
                        int event, unsigned int flags)
@@ -3296,7 +3316,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
                         struct prefix_info *pinfo)
 {
        struct sk_buff *skb;
-        int size = NLMSG_SPACE(sizeof(struct prefixmsg)+128);
+        int size = NLMSG_SPACE(sizeof(struct prefixmsg) + INET6_PREFIX_RTA_SPACE);
        skb = alloc_skb(size, GFP_ATOMIC);
        if (!skb) {
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 6778173a3dd..d31c0d6c044 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -292,7 +292,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
                memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
                memset(ah->auth_data, 0, ahp->icv_trunc_len);
-                skb_push(skb, skb->data - skb->nh.raw);
+                skb_push(skb, hdr_len);
                ahp->icv(ahp, skb, ah->auth_data);
                if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
                        LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n");
@@ -301,12 +301,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
                }
        }
-        skb->nh.raw = skb_pull(skb, ah_hlen);
+        skb->h.raw = memcpy(skb->nh.raw += ah_hlen, tmp_hdr, hdr_len);
-        memcpy(skb->nh.raw, tmp_hdr, hdr_len);
+        __skb_pull(skb, ah_hlen + hdr_len);
-        skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
-        skb_pull(skb, hdr_len);
-        skb->h.raw = skb->data;
        kfree(tmp_hdr);
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 22f04607903..a15a6f320f7 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -142,25 +142,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
        int hdr_len = skb->h.raw - skb->nh.raw;
        int nfrags;
-        unsigned char *tmp_hdr = NULL;
        int ret = 0;
        if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) {
                ret = -EINVAL;
-                goto out_nofree;
+                goto out;
        }
        if (elen <= 0 || (elen & (blksize-1))) {
                ret = -EINVAL;
-                goto out_nofree;
+                goto out;
-        }
-        tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC);
-        if (!tmp_hdr) {
-                ret = -ENOMEM;
-                goto out_nofree;
        }
-        memcpy(tmp_hdr, skb->nh.raw, hdr_len);
        /* If integrity check is required, do this. */
        if (esp->auth.icv_full_len) {
@@ -222,16 +214,12 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
                /* ... check padding bits here. Silly. :-) */ 
                pskb_trim(skb, skb->len - alen - padlen - 2);
-                skb->h.raw = skb_pull(skb, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen);
-                skb->nh.raw += sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen;
-                memcpy(skb->nh.raw, tmp_hdr, hdr_len);
-                skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
                ret = nexthdr[1];
        }
+        skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - hdr_len;
 out:
-        kfree(tmp_hdr);
-out_nofree:
        return ret;
 }
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e46048974f3..d29620f4910 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@
 #include <linux/in6.h>
 #include <linux/tcp.h>
 #include <linux/route.h>
+#include <linux/module.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
@@ -458,6 +459,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
        nf_bridge_get(to->nf_bridge);
 #endif
 #endif
+        skb_copy_secmark(to, from);
 }
 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
@@ -488,6 +490,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
        return offset;
 }
+EXPORT_SYMBOL_GPL(ip6_find_1stfragopt);
 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 48636436028..f28cd37feed 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -65,38 +65,25 @@ static LIST_HEAD(ipcomp6_tfms_list);
 static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb)
 {
-        int err = 0;
+        int err = -ENOMEM;
-        u8 nexthdr = 0;
-        int hdr_len = skb->h.raw - skb->nh.raw;
-        unsigned char *tmp_hdr = NULL;
        struct ipv6hdr *iph;
+        struct ipv6_comp_hdr *ipch;
        int plen, dlen;
        struct ipcomp_data *ipcd = x->data;
        u8 *start, *scratch;
        struct crypto_tfm *tfm;
        int cpu;
-        if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
+        if (skb_linearize_cow(skb))
-                skb_linearize(skb, GFP_ATOMIC) != 0) {
-                err = -ENOMEM;
                goto out;
-        }
        skb->ip_summed = CHECKSUM_NONE;
        /* Remove ipcomp header and decompress original payload */
        iph = skb->nh.ipv6h;
-        tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC);
+        ipch = (void *)skb->data;
-        if (!tmp_hdr)
+        skb->h.raw = skb->nh.raw + sizeof(*ipch);
-                goto out;
+        __skb_pull(skb, sizeof(*ipch));
-        memcpy(tmp_hdr, iph, hdr_len);
-        nexthdr = *(u8 *)skb->data;
-        skb_pull(skb, sizeof(struct ipv6_comp_hdr)); 
-        skb->nh.raw += sizeof(struct ipv6_comp_hdr);
-        memcpy(skb->nh.raw, tmp_hdr, hdr_len);
-        iph = skb->nh.ipv6h;
-        iph->payload_len = htons(ntohs(iph->payload_len) - sizeof(struct ipv6_comp_hdr));
-        skb->h.raw = skb->data;
        /* decompression */
        plen = skb->len;
@@ -125,18 +112,11 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb)
        skb_put(skb, dlen - plen);
        memcpy(skb->data, scratch, dlen);
+        err = ipch->nexthdr;
-        iph = skb->nh.ipv6h;
-        iph->payload_len = htons(skb->len);
-        
 out_put_cpu:
        put_cpu();
 out:
-        kfree(tmp_hdr);
-        if (err)
-                goto error_out;
-        return nexthdr;
-error_out:
        return err;
 }
@@ -159,10 +139,8 @@ static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb)
                goto out_ok;
        }
-        if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
+        if (skb_linearize_cow(skb))
-                skb_linearize(skb, GFP_ATOMIC) != 0) {
                goto out_ok;
-        }
        /* compression */
        plen = skb->len - hdr_len;
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 93bae36f266..2a71c3b669f 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -189,7 +189,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
        /* This is where we call the helper: as the packet goes out. */
        ct = nf_ct_get(*pskb, &ctinfo);
-        if (!ct)
+        if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)
                goto out;
        help = nfct_help(ct);
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 86c6703265d..ef18a7b7014 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -233,7 +233,7 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff,
                return -NF_ACCEPT;
        }
-        if (hooknum == NF_IP6_PRE_ROUTING &&
+        if (nf_conntrack_checksum && hooknum == NF_IP6_PRE_ROUTING &&
            nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
                nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
                              "nf_ct_icmpv6: ICMPv6 checksum failed\n");
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 3e319035f82..c32a029e43f 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -456,13 +456,9 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
                DEBUGP("queue: message is too short.\n");
                goto err;
        }
-        if (end-offset < skb->len) {
+        if (pskb_trim_rcsum(skb, end - offset)) {
-                if (pskb_trim(skb, end - offset)) {
+                DEBUGP("Can't trim\n");
-                        DEBUGP("Can't trim\n");
+                goto err;
-                        goto err;
-                }
-                if (skb->ip_summed != CHECKSUM_UNNECESSARY)
-                        skb->ip_summed = CHECKSUM_NONE;
        }
        /* Find out which fragments are in front and at the back of us
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 301eee726b0..a50eb306e9e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1218,8 +1218,16 @@ process:
        bh_lock_sock(sk);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
-                if (!tcp_prequeue(sk, skb))
+#ifdef CONFIG_NET_DMA
-                        ret = tcp_v6_do_rcv(sk, skb);
+                struct tcp_sock *tp = tcp_sk(sk);
+                if (tp->ucopy.dma_chan)
+                        ret = tcp_v6_do_rcv(sk, skb);
+                else
+#endif
+                {
+                        if (!tcp_prequeue(sk, skb))
+                                ret = tcp_v6_do_rcv(sk, skb);
+                }
        } else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 00cfdee18dc..0405d74ff91 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -13,21 +13,9 @@
 #include <linux/string.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/xfrm.h>
-static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
-{
-        struct ipv6hdr *outer_iph = skb->nh.ipv6h;
-        struct ipv6hdr *inner_iph = skb->h.ipv6h;
-        if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
-                IP6_ECN_set_ce(inner_iph);
-}
 int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi)
 {
        int err;
@@ -81,21 +69,10 @@ int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi)
                xfrm_vec[xfrm_nr++] = x;
+                if (x->mode->input(x, skb))
+                        goto drop;
                if (x->props.mode) { /* XXX */
-                        if (nexthdr != IPPROTO_IPV6)
-                                goto drop;
-                        if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-                                goto drop;
-                        if (skb_cloned(skb) &&
-                            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-                                goto drop;
-                        if (x->props.flags & XFRM_STATE_DECAP_DSCP)
-                                ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h);
-                        if (!(x->props.flags & XFRM_STATE_NOECN))
-                                ipip6_ecn_decapsulate(skb);
-                        skb->mac.raw = memmove(skb->data - skb->mac_len,
-                                               skb->mac.raw, skb->mac_len);
-                        skb->nh.raw = skb->data;
                        decaps = 1;
                        break;
                }
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
new file mode 100644
index 00000000000..711d713e36d
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -0,0 +1,88 @@
+/*
+ * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6.
+ *
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+/* Add encapsulation header.
+ *
+ * The IP header and mutable extension headers will be moved forward to make
+ * space for the encapsulation header.
+ *
+ * The amount of space opened up is x->props.header_len, where x is the xfrm
+ * state attached to skb->dst.
+ *
+ * On exit, skb->h will be set to the start of the encapsulation header to be
+ * filled in by x->type->output and skb->nh will be set to the nextheader field
+ * of the extension header directly preceding the encapsulation header, or in
+ * its absence, that of the top IP header.  The value of skb->data will always
+ * point to the top IP header.
+ *
+ * Always returns 0.
+ */
+static int xfrm6_transport_output(struct sk_buff *skb)
+{
+        struct xfrm_state *x = skb->dst->xfrm;
+        struct ipv6hdr *iph;
+        u8 *prevhdr;
+        int hdr_len;
+        skb_push(skb, x->props.header_len); /* grow the front of the packet by header_len */
+        iph = skb->nh.ipv6h; /* remember where the header currently sits */
+        hdr_len = ip6_find_1stfragopt(skb, &prevhdr); /* NOTE(review): presumably the length of the headers to relocate - confirm */
+        skb->nh.raw = prevhdr - x->props.header_len; /* prevhdr shifted by the same distance as the headers */
+        skb->h.raw = skb->data + hdr_len; /* encapsulation header goes right after the moved headers */
+        memmove(skb->data, iph, hdr_len); /* memmove: source and destination may overlap */
+        return 0;
+}
+/* Remove encapsulation header.
+ *
+ * The IP header will be moved over the top of the encapsulation header.
+ *
+ * On entry, skb->h shall point to where the IP header should be and skb->nh
+ * shall be set to where the IP header currently is.  skb->data shall point
+ * to the start of the payload.
+ *
+ * On exit, skb->nh points at the (possibly relocated) IP header, its
+ * payload_len field has been recomputed and skb->h equals skb->data.
+ * The x argument is not used here.  Always returns 0.
+ */
+static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+        int ihl = skb->data - skb->h.raw; /* distance from skb->h to skb->data */
+        if (skb->h.raw != skb->nh.raw) /* nothing to move when already in place */
+                skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl);
+        skb->nh.ipv6h->payload_len = htons(skb->len + ihl -
+                                           sizeof(struct ipv6hdr)); /* length excludes the fixed IPv6 header */
+        skb->h.raw = skb->data;
+        return 0;
+}
+static struct xfrm_mode xfrm6_transport_mode = { /* handlers registered for AF_INET6 by xfrm6_transport_init() */
+        .input = xfrm6_transport_input,
+        .output = xfrm6_transport_output,
+        .owner = THIS_MODULE,
+        .encap = XFRM_MODE_TRANSPORT, /* same mode value as the MODULE_ALIAS_XFRM_MODE() line in this file */
+};
+static int __init xfrm6_transport_init(void) /* module init: register the transport mode for AF_INET6 */
+{
+        return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6); /* registration result is the init status */
+}
+static void __exit xfrm6_transport_exit(void) /* module exit: unregister the transport mode */
+{
+        int err;
+        err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6);
+        BUG_ON(err); /* a failed unregister is treated as a fatal bug */
+}
+module_init(xfrm6_transport_init);
+module_exit(xfrm6_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
new file mode 100644
index 00000000000..8af79be2edc
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -0,0 +1,121 @@
+/*
+ * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6.
+ *
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dsfield.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) /* copy an ECN CE mark from the outer to the inner header */
+{
+        struct ipv6hdr *outer_iph = skb->nh.ipv6h; /* outer header: at skb->nh */
+        struct ipv6hdr *inner_iph = skb->h.ipv6h; /* inner header: at skb->h */
+        if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
+                IP6_ECN_set_ce(inner_iph); /* inner header is only touched when the outer one carries CE */
+}
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.  The following fields
+ * in it shall be filled in by x->type->output:
+ *      payload_len
+ *
+ * The fields written here are version, priority, flow label, DS field,
+ * nexthdr, hop_limit and the source/destination addresses; the addresses
+ * come from the xfrm state (x->props.saddr and x->id.daddr).
+ *
+ * On exit, skb->h will be set to the start of the encapsulation header to be
+ * filled in by x->type->output and skb->nh will be set to the nextheader field
+ * of the extension header directly preceding the encapsulation header, or in
+ * its absence, that of the top IP header.  The value of skb->data will always
+ * point to the top IP header.
+ *
+ * Always returns 0.
+ */
+static int xfrm6_tunnel_output(struct sk_buff *skb)
+{
+        struct dst_entry *dst = skb->dst;
+        struct xfrm_state *x = dst->xfrm;
+        struct ipv6hdr *iph, *top_iph;
+        int dsfield;
+        skb_push(skb, x->props.header_len); /* grow the front of the packet by header_len */
+        iph = skb->nh.ipv6h; /* original (inner) header, saved before skb->nh is moved */
+        skb->nh.raw = skb->data; /* temporarily aim skb->nh at the new top header */
+        top_iph = skb->nh.ipv6h;
+        skb->nh.raw = &top_iph->nexthdr; /* final skb->nh: the top header's nexthdr field */
+        skb->h.ipv6h = top_iph + 1; /* directly after the fixed-size top header */
+        top_iph->version = 6;
+        top_iph->priority = iph->priority;
+        top_iph->flow_lbl[0] = iph->flow_lbl[0];
+        top_iph->flow_lbl[1] = iph->flow_lbl[1];
+        top_iph->flow_lbl[2] = iph->flow_lbl[2];
+        dsfield = ipv6_get_dsfield(top_iph); /* NOTE(review): presumably reads back the bits just copied from iph - confirm */
+        dsfield = INET_ECN_encapsulate(dsfield, dsfield);
+        if (x->props.flags & XFRM_STATE_NOECN)
+                dsfield &= ~INET_ECN_MASK; /* NOECN state: clear the bits covered by INET_ECN_MASK */
+        ipv6_change_dsfield(top_iph, 0, dsfield);
+        top_iph->nexthdr = IPPROTO_IPV6; /* tunnel_input below accepts only this value */
+        top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT);
+        ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
+        ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
+        return 0;
+}
+static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) /* strip the outer header; 0 on success, non-zero on failure */
+{
+        int err = -EINVAL; /* result for the first two checks below */
+        if (skb->nh.raw[IP6CB(skb)->nhoff] != IPPROTO_IPV6) /* next-header byte must say IPv6 */
+                goto out;
+        if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) /* NOTE(review): presumably ensures a full inner header is accessible - confirm */
+                goto out;
+        if (skb_cloned(skb) &&
+            (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) /* on failure err carries pskb_expand_head()'s result */
+                goto out;
+        if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+                ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); /* only when the state asks for DSCP decapsulation */
+        if (!(x->props.flags & XFRM_STATE_NOECN))
+                ipip6_ecn_decapsulate(skb); /* skipped for NOECN states */
+        skb->mac.raw = memmove(skb->data - skb->mac_len,
+                               skb->mac.raw, skb->mac_len); /* MAC header now ends where skb->data begins */
+        skb->nh.raw = skb->data; /* network header is now at skb->data */
+        err = 0;
+out:
+        return err;
+}
+static struct xfrm_mode xfrm6_tunnel_mode = { /* handlers registered for AF_INET6 by xfrm6_tunnel_init() */
+        .input = xfrm6_tunnel_input,
+        .output = xfrm6_tunnel_output,
+        .owner = THIS_MODULE,
+        .encap = XFRM_MODE_TUNNEL, /* same mode value as the MODULE_ALIAS_XFRM_MODE() line in this file */
+};
+static int __init xfrm6_tunnel_init(void) /* module init: register the tunnel mode for AF_INET6 */
+{
+        return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6); /* registration result is the init status */
+}
+static void __exit xfrm6_tunnel_exit(void) /* module exit: unregister the tunnel mode */
+{
+        int err;
+        err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6);
+        BUG_ON(err); /* a failed unregister is treated as a fatal bug */
+}
+module_init(xfrm6_tunnel_init);
+module_exit(xfrm6_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 80242172a5d..16e84254a25 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -14,68 +14,9 @@
 #include <linux/spinlock.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter_ipv6.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
 #include <net/ipv6.h>
 #include <net/xfrm.h>
-/* Add encapsulation header.
- *
- * In transport mode, the IP header and mutable extension headers will be moved
- * forward to make space for the encapsulation header.
- *
- * In tunnel mode, the top IP header will be constructed per RFC 2401.
- * The following fields in it shall be filled in by x->type->output:
- *      payload_len
- *
- * On exit, skb->h will be set to the start of the encapsulation header to be
- * filled in by x->type->output and skb->nh will be set to the nextheader field
- * of the extension header directly preceding the encapsulation header, or in
- * its absence, that of the top IP header.  The value of skb->data will always
- * point to the top IP header.
- */
-static void xfrm6_encap(struct sk_buff *skb)
-{
-        struct dst_entry *dst = skb->dst;
-        struct xfrm_state *x = dst->xfrm;
-        struct ipv6hdr *iph, *top_iph;
-        int dsfield;
-        skb_push(skb, x->props.header_len);
-        iph = skb->nh.ipv6h;
-        if (!x->props.mode) {
-                u8 *prevhdr;
-                int hdr_len;
-                hdr_len = ip6_find_1stfragopt(skb, &prevhdr);
-                skb->nh.raw = prevhdr - x->props.header_len;
-                skb->h.raw = skb->data + hdr_len;
-                memmove(skb->data, iph, hdr_len);
-                return;
-        }
-        skb->nh.raw = skb->data;
-        top_iph = skb->nh.ipv6h;
-        skb->nh.raw = &top_iph->nexthdr;
-        skb->h.ipv6h = top_iph + 1;
-        top_iph->version = 6;
-        top_iph->priority = iph->priority;
-        top_iph->flow_lbl[0] = iph->flow_lbl[0];
-        top_iph->flow_lbl[1] = iph->flow_lbl[1];
-        top_iph->flow_lbl[2] = iph->flow_lbl[2];
-        dsfield = ipv6_get_dsfield(top_iph);
-        dsfield = INET_ECN_encapsulate(dsfield, dsfield);
-        if (x->props.flags & XFRM_STATE_NOECN)
-                dsfield &= ~INET_ECN_MASK;
-        ipv6_change_dsfield(top_iph, 0, dsfield);
-        top_iph->nexthdr = IPPROTO_IPV6; 
-        top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT);
-        ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
-        ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
-}
 static int xfrm6_tunnel_check_size(struct sk_buff *skb)
 {
        int mtu, ret = 0;
@@ -118,7 +59,9 @@ static int xfrm6_output_one(struct sk_buff *skb)
                if (err)
                        goto error;
-                xfrm6_encap(skb);
+                err = x->mode->output(skb);
+                if (err)
+                        goto error;
                err = x->type->output(x, skb);
                if (err)
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 88c840f1beb..ee715f2691e 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -23,8 +23,6 @@
 static struct dst_ops xfrm6_dst_ops;
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
-static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED };
 static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
 {
        int err = 0;
@@ -249,9 +247,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl)
 static inline int xfrm6_garbage_collect(void)
 {
-        read_lock(&xfrm6_policy_afinfo.lock);
        xfrm6_policy_afinfo.garbage_collect();
-        read_unlock(&xfrm6_policy_afinfo.lock);
        return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2);
 }
@@ -311,8 +307,6 @@ static struct dst_ops xfrm6_dst_ops = {
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
        .family =               AF_INET6,
-        .lock =                 RW_LOCK_UNLOCKED,
-        .type_map =             &xfrm6_type_map,
        .dst_ops =              &xfrm6_dst_ops,
        .dst_lookup =           xfrm6_dst_lookup,
        .find_bundle =          __xfrm6_find_bundle,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index a5723024d3b..b33296b3f6d 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -135,7 +135,6 @@ __xfrm6_find_acq(u8 mode, u32 reqid, u8 proto,
 static struct xfrm_state_afinfo xfrm6_state_afinfo = {
        .family                 = AF_INET6,
-        .lock                   = RW_LOCK_UNLOCKED,
        .init_tempsel           = __xfrm6_init_tempsel,
        .state_lookup           = __xfrm6_state_lookup,
        .find_acq               = __xfrm6_find_acq,
diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c
index a394c6fe19a..bba3431cd9a 100644
--- a/net/ipx/ipx_route.c
+++ b/net/ipx/ipx_route.c
@@ -238,7 +238,7 @@ int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx,
        }       
        /* Apply checksum. Not allowed on 802.3 links. */
-        if (sk->sk_no_check || intrfc->if_dlink_type == IPX_FRAME_8023)
+        if (sk->sk_no_check || intrfc->if_dlink_type == htons(IPX_FRAME_8023))
                ipx->ipx_checksum = 0xFFFF;
        else
                ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr));
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
index c19e9ce05a3..57ea160f470 100644
--- a/net/irda/irlmp.c
+++ b/net/irda/irlmp.c
@@ -44,6 +44,8 @@
 #include <net/irda/irlmp.h>
 #include <net/irda/irlmp_frame.h>
+#include <asm/unaligned.h>
 static __u8 irlmp_find_free_slsap(void);
 static int irlmp_slsap_inuse(__u8 slsap_sel);
@@ -840,6 +842,7 @@ void irlmp_do_expiry(void)
 void irlmp_do_discovery(int nslots)
 {
        struct lap_cb *lap;
+        __u16 *data_hintsp;
        /* Make sure the value is sane */
        if ((nslots != 1) && (nslots != 6) && (nslots != 8) && (nslots != 16)){
@@ -849,7 +852,8 @@ void irlmp_do_discovery(int nslots)
        }
        /* Construct new discovery info to be used by IrLAP, */
-        u16ho(irlmp->discovery_cmd.data.hints) = irlmp->hints.word;
+        data_hintsp = (__u16 *) irlmp->discovery_cmd.data.hints;
+        put_unaligned(irlmp->hints.word, data_hintsp);
        /*
         *  Set character set for device name (we use ASCII), and
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 859582275ca..d5e2121ea20 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1454,21 +1454,23 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
        if (x == NULL)
                return -ESRCH;
+        if ((err = security_xfrm_state_delete(x)))
+                goto out;
        if (xfrm_state_kern(x)) {
-                xfrm_state_put(x);
+                err = -EPERM;
-                return -EPERM;
+                goto out;
        }
        
        err = xfrm_state_delete(x);
-        if (err < 0) {
+        if (err < 0)
-                xfrm_state_put(x);
+                goto out;
-                return err;
-        }
        c.seq = hdr->sadb_msg_seq;
        c.pid = hdr->sadb_msg_pid;
        c.event = XFRM_MSG_DELSA;
        km_state_notify(x, &c);
+out:
        xfrm_state_put(x);
        return err;
@@ -2274,11 +2276,14 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
        err = 0;
+        if ((err = security_xfrm_policy_delete(xp)))
+                goto out;
        c.seq = hdr->sadb_msg_seq;
        c.pid = hdr->sadb_msg_pid;
        c.event = XFRM_MSG_DELPOLICY;
        km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
+out:
        xfrm_pol_put(xp);
        return err;
 }
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 5a04db745c8..75c9b148080 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -674,7 +674,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
        lock_sock(sk);
        copied = -ENOTCONN;
-        if (sk->sk_state == TCP_LISTEN)
+        if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN))
                goto out;
        timeo = sock_rcvtimeo(sk, nonblock);
@@ -733,7 +733,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
                        if (sk->sk_shutdown & RCV_SHUTDOWN)
                                break;
-                        if (sk->sk_state == TCP_CLOSE) {
+                        if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) {
                                if (!sock_flag(sk, SOCK_DONE)) {
                                        /*
                                         * This occurs when user tries to read
@@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
                        continue;
                if (!(flags & MSG_PEEK)) {
-                        sk_eat_skb(sk, skb);
+                        sk_eat_skb(sk, skb, 0);
                        *seq = 0;
                }
        } while (len > 0);
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index ba90f7f0801..5ae47be7dde 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -26,8 +26,6 @@
 #include <net/llc_c_st.h>
 #include <net/tcp_states.h>
-u8 llc_mac_null_var[IFHWADDRLEN];
 /**
 *      llc_build_and_send_pkt - Connection data sending for upper layers.
 *      @sk: connection
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
index d62e0f9b9da..94d2368ade9 100644
--- a/net/llc/llc_input.c
+++ b/net/llc/llc_input.c
@@ -142,6 +142,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
        struct llc_sap *sap;
        struct llc_pdu_sn *pdu;
        int dest;
+        int (*rcv)(struct sk_buff *, struct net_device *,
+                   struct packet_type *, struct net_device *);
        /*
         * When the interface is in promisc. mode, drop all the crap that it
@@ -169,9 +171,11 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
         * First the upper layer protocols that don't need the full
         * LLC functionality
         */
-        if (sap->rcv_func) {
+        rcv = rcu_dereference(sap->rcv_func);
-                sap->rcv_func(skb, dev, pt, orig_dev);
+        if (rcv) {
-                goto out_put;
+                struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC);
+                if (cskb)
+                        rcv(cskb, dev, pt, orig_dev);
        }
        dest = llc_pdu_type(skb);
        if (unlikely(!dest || !llc_type_handlers[dest - 1]))
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 4029ceee9b9..20c4eb5c1ac 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -282,7 +282,7 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb)
 *      mac, and local sap. Returns pointer for socket found, %NULL otherwise.
 */
 static struct sock *llc_lookup_dgram(struct llc_sap *sap,
-                                     struct llc_addr *laddr)
+                                     const struct llc_addr *laddr)
 {
        struct sock *rc;
        struct hlist_node *node;
@@ -304,19 +304,62 @@ found:
        return rc;
 }
+/**
+ *      llc_sap_mcast - Deliver multicast PDUs to all matching datagram sockets.
+ *      @sap: SAP
+ *      @laddr: address of local LLC (MAC + SAP)
+ *      @skb: received PDU; it is only cloned here, never consumed
+ *
+ *      Walks the socket list of the SAP under its read lock and, for every
+ *      SOCK_DGRAM socket whose local SAP equals @laddr->lsap, delivers a
+ *      clone of @skb through llc_sap_rcv().  The MAC part of @laddr is not
+ *      compared.  Delivery stops at the first clone that cannot be allocated.
+ */
+static void llc_sap_mcast(struct llc_sap *sap,
+                          const struct llc_addr *laddr,
+                          struct sk_buff *skb)
+{
+        struct sock *sk;
+        struct hlist_node *node;
+        read_lock_bh(&sap->sk_list.lock);
+        sk_for_each(sk, node, &sap->sk_list.list) {
+                struct llc_sock *llc = llc_sk(sk);
+                struct sk_buff *skb1;
+                if (sk->sk_type != SOCK_DGRAM) /* datagram sockets only */
+                        continue;
+                if (llc->laddr.lsap != laddr->lsap) /* different local SAP */
+                        continue;
+                skb1 = skb_clone(skb, GFP_ATOMIC);
+                if (!skb1)
+                        break; /* clone failed: remaining sockets get nothing */
+                sock_hold(sk); /* reference held across the delivery, dropped below */
+                skb_set_owner_r(skb1, sk);
+                llc_sap_rcv(sap, skb1);
+                sock_put(sk);
+        }
+        read_unlock_bh(&sap->sk_list.lock);
+}
 void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb)
 {
        struct llc_addr laddr;
-        struct sock *sk;
        llc_pdu_decode_da(skb, laddr.mac);
        llc_pdu_decode_dsap(skb, &laddr.lsap);
-        sk = llc_lookup_dgram(sap, &laddr);
+        if (llc_mac_multicast(laddr.mac)) {
-        if (sk) {
+                llc_sap_mcast(sap, &laddr, skb);
-                skb_set_owner_r(skb, sk);
-                llc_sap_rcv(sap, skb);
-                sock_put(sk);
-        } else
                kfree_skb(skb);
+        } else {
+                struct sock *sk = llc_lookup_dgram(sap, &laddr);
+                if (sk) {
+                        skb_set_owner_r(skb, sk);
+                        llc_sap_rcv(sap, skb);
+                        sock_put(sk);
+                } else
+                        kfree_skb(skb);
+        }
 }
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e2893effdfa..b1622b7de1c 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -60,6 +60,18 @@ config NF_CONNTRACK_MARK
          of packets, but this mark value is kept in the conntrack session
          instead of the individual packets.
+config NF_CONNTRACK_SECMARK
+        bool  'Connection tracking security mark support'
+        depends on NF_CONNTRACK && NETWORK_SECMARK
+        help
+          This option enables security markings to be applied to
+          connections.  Typically they are copied to connections from
+          packets using the CONNSECMARK target and copied back from
+          connections to packets with the same target, with the packets
+          being originally labeled via SECMARK.
+          If unsure, say 'N'.
 config NF_CONNTRACK_EVENTS
        bool "Connection tracking events (EXPERIMENTAL)"
        depends on EXPERIMENTAL && NF_CONNTRACK
@@ -174,6 +186,26 @@ config NETFILTER_XT_TARGET_NOTRACK
          If you want to compile it as a module, say M here and read
          <file:Documentation/modules.txt>.  If unsure, say `N'.
+config NETFILTER_XT_TARGET_SECMARK
+        tristate '"SECMARK" target support'
+        depends on NETFILTER_XTABLES && NETWORK_SECMARK
+        help
+          The SECMARK target allows security marking of network
+          packets, for use with security subsystems.
+          To compile it as a module, choose M here.  If unsure, say N.
+config NETFILTER_XT_TARGET_CONNSECMARK
+        tristate '"CONNSECMARK" target support'
+        depends on NETFILTER_XTABLES && (NF_CONNTRACK_SECMARK || IP_NF_CONNTRACK_SECMARK)
+        help
+          The CONNSECMARK target copies security markings from packets
+          to connections, and restores security markings from connections
+          to packets (if the packets are not already marked).  This would
+          normally be used in conjunction with the SECMARK target.
+          To compile it as a module, choose M here.  If unsure, say N.
 config NETFILTER_XT_MATCH_COMMENT
        tristate  '"comment" match support'
        depends on NETFILTER_XTABLES
@@ -329,6 +361,16 @@ config NETFILTER_XT_MATCH_PKTTYPE
          To compile it as a module, choose M here.  If unsure, say N.
+config NETFILTER_XT_MATCH_QUOTA
+        tristate '"quota" match support'
+        depends on NETFILTER_XTABLES
+        help
+          This option adds a `quota' match, which allows matching on a
+          byte counter.
+          If you want to compile it as a module, say M here and read
+          <file:Documentation/modules.txt>.  If unsure, say `N'.
 config NETFILTER_XT_MATCH_REALM
        tristate  '"realm" match support'
        depends on NETFILTER_XTABLES
@@ -365,6 +407,12 @@ config NETFILTER_XT_MATCH_STATE
          To compile it as a module, choose M here.  If unsure, say N.
+config NETFILTER_XT_MATCH_STATISTIC
+        tristate '"statistic" match support'
+        depends on NETFILTER_XTABLES
+        help
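+          This option adds a `statistic' match, which allows you to match
+          on packets periodically or randomly with a given percentage.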
 config NETFILTER_XT_MATCH_STRING
        tristate  '"string" match support'
        depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 95b7e416512..6fa4b758045 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -28,6 +28,8 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
 # matches
 obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
@@ -44,9 +46,11 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f9b83f91371..cd299f4b7db 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -990,6 +990,9 @@ init_conntrack(const struct nf_conntrack_tuple *tuple,
 #ifdef CONFIG_NF_CONNTRACK_MARK
                conntrack->mark = exp->master->mark;
 #endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+                conntrack->secmark = exp->master->secmark;
+#endif
                nf_conntrack_get(&conntrack->master->ct_general);
                NF_CT_STAT_INC(expect_new);
        } else
@@ -1396,6 +1399,12 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
        write_lock_bh(&nf_conntrack_lock);
+        /* Only update if this is not a fixed timeout */
+        if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
+                write_unlock_bh(&nf_conntrack_lock);
+                return;
+        }
        /* If not in hash table, timer will not be active yet */
        if (!nf_ct_is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index e38a4b5a308..11d3be24353 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -67,37 +67,48 @@ static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *,
                             char);
 static struct ftp_search {
-        enum ip_conntrack_dir dir;
        const char *pattern;
        size_t plen;
        char skip;
        char term;
        enum ip_ct_ftp_type ftptype;
        int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char);
-} search[] = {
+} search[IP_CT_DIR_MAX][2] = {
-        {
+        [IP_CT_DIR_ORIGINAL] = {
-                IP_CT_DIR_ORIGINAL,
+                {
-                "PORT", sizeof("PORT") - 1, ' ', '\r',
+                        .pattern        = "PORT",
-                IP_CT_FTP_PORT,
+                        .plen           = sizeof("PORT") - 1,
-                try_rfc959,
+                        .skip           = ' ',
+                        .term           = '\r',
+                        .ftptype        = IP_CT_FTP_PORT,
+                        .getnum         = try_rfc959,
+                },
+                {
+                        .pattern        = "EPRT",
+                        .plen           = sizeof("EPRT") - 1,
+                        .skip           = ' ',
+                        .term           = '\r',
+                        .ftptype        = IP_CT_FTP_EPRT,
+                        .getnum         = try_eprt,
+                },
        },
-        {
+        [IP_CT_DIR_REPLY] = {
-                IP_CT_DIR_REPLY,
+                {
-                "227 ", sizeof("227 ") - 1, '(', ')',
+                        .pattern        = "227 ",
-                IP_CT_FTP_PASV,
+                        .plen           = sizeof("227 ") - 1,
-                try_rfc959,
+                        .skip           = '(',
-        },
+                        .term           = ')',
-        {
+                        .ftptype        = IP_CT_FTP_PASV,
-                IP_CT_DIR_ORIGINAL,
+                        .getnum         = try_rfc959,
-                "EPRT", sizeof("EPRT") - 1, ' ', '\r',
+                },
-                IP_CT_FTP_EPRT,
+                {
-                try_eprt,
+                        .pattern        = "229 ",
-        },
+                        .plen           = sizeof("229 ") - 1,
-        {
+                        .skip           = '(',
-                IP_CT_DIR_REPLY,
+                        .term           = ')',
-                "229 ", sizeof("229 ") - 1, '(', ')',
+                        .ftptype        = IP_CT_FTP_EPSV,
-                IP_CT_FTP_EPSV,
+                        .getnum         = try_epsv_response,
-                try_epsv_response,
+                },
        },
 };
@@ -492,17 +503,15 @@ static int help(struct sk_buff **pskb,
        memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
               sizeof(cmd.u3.all));
-        for (i = 0; i < ARRAY_SIZE(search); i++) {
+        for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
-                if (search[i].dir != dir) continue;
                found = find_pattern(fb_ptr, datalen,
-                                     search[i].pattern,
+                                     search[dir][i].pattern,
-                                     search[i].plen,
+                                     search[dir][i].plen,
-                                     search[i].skip,
+                                     search[dir][i].skip,
-                                     search[i].term,
+                                     search[dir][i].term,
                                     &matchoff, &matchlen,
                                     &cmd,
-                                     search[i].getnum);
+                                     search[dir][i].getnum);
                if (found) break;
        }
        if (found == -1) {
@@ -512,7 +521,7 @@ static int help(struct sk_buff **pskb,
                   this case. */
                if (net_ratelimit())
                        printk("conntrack_ftp: partial %s %u+%u\n",
-                               search[i].pattern,
+                               search[dir][i].pattern,
                               ntohl(th->seq), datalen);
                ret = NF_DROP;
                goto out;
@@ -597,7 +606,7 @@ static int help(struct sk_buff **pskb,
        /* Now, NAT might want to mangle the packet, and register the
         * (possibly changed) expectation itself. */
        if (nf_nat_ftp_hook)
-                ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
+                ret = nf_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype,
                                      matchoff, matchlen, exp, &seq);
        else {
                /* Can't expect this?  Best to drop packet now. */
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index bd10eb944b6..b8c7c567c9d 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -407,6 +407,8 @@ nfattr_failure:
 static int ctnetlink_done(struct netlink_callback *cb)
 {
+        if (cb->args[1])
+                nf_ct_put((struct nf_conn *)cb->args[1]);
        DEBUGP("entered %s\n", __FUNCTION__);
        return 0;
 }
@@ -416,10 +418,9 @@ static int ctnetlink_done(struct netlink_callback *cb)
 static int
 ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 {
-        struct nf_conn *ct = NULL;
+        struct nf_conn *ct, *last;
        struct nf_conntrack_tuple_hash *h;
        struct list_head *i;
-        u_int32_t *id = (u_int32_t *) &cb->args[1];
        struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh);
        u_int8_t l3proto = nfmsg->nfgen_family;
@@ -427,7 +428,9 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
                        cb->args[0], *id);
        read_lock_bh(&nf_conntrack_lock);
-        for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) {
+        for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
+restart:
+                last = (struct nf_conn *)cb->args[1];
                list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) {
                        h = (struct nf_conntrack_tuple_hash *) i;
                        if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
@@ -438,17 +441,30 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
                         * then dump everything. */
                        if (l3proto && L3PROTO(ct) != l3proto)
                                continue;
-                        if (ct->id <= *id)
+                        if (last != NULL) {
-                                continue;
+                                if (ct == last) {
+                                        nf_ct_put(last);
+                                        cb->args[1] = 0;
+                                        last = NULL;
+                                } else
+                                        continue;
+                        }
                        if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
                                                cb->nlh->nlmsg_seq,
                                                IPCTNL_MSG_CT_NEW,
-                                                1, ct) < 0)
+                                                1, ct) < 0) {
+                                nf_conntrack_get(&ct->ct_general);
+                                cb->args[1] = (unsigned long)ct;
                                goto out;
-                        *id = ct->id;
+                        }
+                }
+                if (last != NULL) {
+                        nf_ct_put(last);
+                        cb->args[1] = 0;
+                        goto restart;
                }
        }
-out:    
+out:
        read_unlock_bh(&nf_conntrack_lock);
        DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
@@ -641,7 +657,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = {
 };
 static inline int
-ctnetlink_parse_nat(struct nfattr *cda[],
+ctnetlink_parse_nat(struct nfattr *nat,
                    const struct nf_conn *ct, struct ip_nat_range *range)
 {
        struct nfattr *tb[CTA_NAT_MAX];
@@ -651,7 +667,7 @@ ctnetlink_parse_nat(struct nfattr *cda[],
        memset(range, 0, sizeof(*range));
        
-        nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]);
+        nfattr_parse_nested(tb, CTA_NAT_MAX, nat);
        if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
                return -EINVAL;
@@ -866,39 +882,30 @@ ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[])
                /* ASSURED bit can only be set */
                return -EINVAL;
-        if (cda[CTA_NAT-1]) {
+        if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
 #ifndef CONFIG_IP_NF_NAT_NEEDED
                return -EINVAL;
 #else
-                unsigned int hooknum;
                struct ip_nat_range range;
-                if (ctnetlink_parse_nat(cda, ct, &range) < 0)
+                if (cda[CTA_NAT_DST-1]) {       /* IP_NAT_MANIP_DST */
-                        return -EINVAL;
+                        if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct,
+                                                &range) < 0)
-                DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 
+                                return -EINVAL;
-                       NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
+                        if (ip_nat_initialized(ct,
-                       htons(range.min.all), htons(range.max.all));
+                                               HOOK2MANIP(NF_IP_PRE_ROUTING)))
-                
+                                return -EEXIST;
-                /* This is tricky but it works. ip_nat_setup_info needs the
+                        ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
-                 * hook number as parameter, so let's do the correct 
+                }
-                 * conversion and run away */
+                if (cda[CTA_NAT_SRC-1]) {       /* IP_NAT_MANIP_SRC */
-                if (status & IPS_SRC_NAT_DONE)
+                        if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct,
-                        hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
+                                                &range) < 0)
-                else if (status & IPS_DST_NAT_DONE)
+                                return -EINVAL;
-                        hooknum = NF_IP_PRE_ROUTING;  /* IP_NAT_MANIP_DST */
+                        if (ip_nat_initialized(ct,
-                else 
+                                               HOOK2MANIP(NF_IP_POST_ROUTING)))
-                        return -EINVAL; /* Missing NAT flags */
+                                return -EEXIST;
+                        ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
-                DEBUGP("NAT status: %lu\n", 
+                }
-                       status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
-                
-                if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
-                        return -EEXIST;
-                ip_nat_setup_info(ct, &range, hooknum);
-                DEBUGP("NAT status after setup_info: %lu\n",
-                       ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
 #endif
        }
@@ -1122,7 +1129,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
        /* implicit 'else' */
        /* we only allow nat config for new conntracks */
-        if (cda[CTA_NAT-1]) {
+        if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
                err = -EINVAL;
                goto out_unlock;
        }
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 69899f27d26..12fb7c0a150 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -828,8 +828,9 @@ static int tcp_error(struct sk_buff *skb,
         * and moreover root might send raw packets.
         */
        /* FIXME: Source route IP option packets --RR */
-        if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
+        if (nf_conntrack_checksum &&
-             (pf == PF_INET6 && hooknum  == NF_IP6_PRE_ROUTING)) &&
+            ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
+             (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) &&
            nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
                if (LOG_INVALID(IPPROTO_TCP))
                        nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index d93edbfde9e..ae07ebe3ab3 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -134,7 +134,8 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff,
         * because the semantic of CHECKSUM_HW is different there
         * and moreover root might send raw packets.
         * FIXME: Source route IP option packets --RR */
-        if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
+        if (nf_conntrack_checksum &&
+            ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
             (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) &&
            nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
                if (LOG_INVALID(IPPROTO_UDP))
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 408960c6a54..e34c574f035 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -213,6 +213,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
                return -ENOSPC;
 #endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+        if (seq_printf(s, "secmark=%u ", conntrack->secmark))
+                return -ENOSPC;
+#endif
        if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
                return -ENOSPC;
        
@@ -455,6 +460,8 @@ extern unsigned int nf_ct_generic_timeout;
 static int log_invalid_proto_min = 0;
 static int log_invalid_proto_max = 255;
+int nf_conntrack_checksum = 1;
 static struct ctl_table_header *nf_ct_sysctl_header;
 static ctl_table nf_ct_sysctl_table[] = {
@@ -483,6 +490,14 @@ static ctl_table nf_ct_sysctl_table[] = {
                .proc_handler   = &proc_dointvec,
        },
        {
+                .ctl_name       = NET_NF_CONNTRACK_CHECKSUM,
+                .procname       = "nf_conntrack_checksum",
+                .data           = &nf_conntrack_checksum,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec,
+        },
+        {
                .ctl_name       = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
                .procname       = "nf_conntrack_tcp_timeout_syn_sent",
                .data           = &nf_ct_tcp_timeout_syn_sent,
@@ -851,6 +866,7 @@ EXPORT_SYMBOL(nf_ct_proto_put);
 EXPORT_SYMBOL(nf_ct_l3proto_find_get);
 EXPORT_SYMBOL(nf_ct_l3proto_put);
 EXPORT_SYMBOL(nf_ct_l3protos);
+EXPORT_SYMBOL_GPL(nf_conntrack_checksum);
 EXPORT_SYMBOL(nf_conntrack_expect_alloc);
 EXPORT_SYMBOL(nf_conntrack_expect_put);
 EXPORT_SYMBOL(nf_conntrack_expect_related);
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
new file mode 100644
index 00000000000..8c011e02076
--- /dev/null
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -0,0 +1,155 @@
+/*
+ * This module is used to copy security markings from packets
+ * to connections, and restore security markings from connections
+ * back to packets.  This would normally be performed in conjunction
+ * with the SECMARK target and state match.
+ *
+ * Based somewhat on CONNMARK:
+ *   Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ *    by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * (C) 2006 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CONNSECMARK.h>
+#include <net/netfilter/nf_conntrack_compat.h>
+#define PFX "CONNSECMARK: "
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@redhat.com>");
+MODULE_DESCRIPTION("ip[6]tables CONNSECMARK module");
+MODULE_ALIAS("ipt_CONNSECMARK");
+MODULE_ALIAS("ip6t_CONNSECMARK");
+/*
+ * Save direction: propagate the packet's security mark to its connection,
+ * but only when the packet is marked and the connection is still unmarked.
+ */
+static void secmark_save(struct sk_buff *skb)
+{
+        enum ip_conntrack_info ctinfo;
+        u32 *ct_mark;
+        /* An unmarked packet carries nothing worth saving. */
+        if (!skb->secmark)
+                return;
+        ct_mark = nf_ct_get_secmark(skb, &ctinfo);
+        /* No mark pointer returned, or the connection is already marked. */
+        if (!ct_mark || *ct_mark)
+                return;
+        if (*ct_mark != skb->secmark)
+                *ct_mark = skb->secmark;
+}
+/*
+ * Restore direction: copy the connection's security mark onto the packet,
+ * but only when the packet is unmarked and the connection carries a mark.
+ */
+static void secmark_restore(struct sk_buff *skb)
+{
+        enum ip_conntrack_info ctinfo;
+        u32 *ct_mark;
+        /* A packet that already has a mark is left untouched. */
+        if (skb->secmark)
+                return;
+        ct_mark = nf_ct_get_secmark(skb, &ctinfo);
+        /* No mark pointer returned, or the connection has no mark either. */
+        if (!ct_mark || !*ct_mark)
+                return;
+        if (skb->secmark != *ct_mark)
+                skb->secmark = *ct_mark;
+}
+/*
+ * Target hook: dispatch on the rule's mode to save or restore the security
+ * mark, then let rule traversal continue.  Any other mode is a BUG().
+ */
+static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
+                           const struct net_device *out, unsigned int hooknum,
+                           const struct xt_target *target,
+                           const void *targinfo, void *userinfo)
+{
+        const struct xt_connsecmark_target_info *info = targinfo;
+        if (info->mode == CONNSECMARK_SAVE)
+                secmark_save(*pskb);
+        else if (info->mode == CONNSECMARK_RESTORE)
+                secmark_restore(*pskb);
+        else
+                BUG();
+        return XT_CONTINUE;
+}
+/*
+ * Rule validation: accept only the SAVE and RESTORE modes.
+ * Returns 1 when the rule is acceptable, 0 (after logging) otherwise.
+ */
+static int checkentry(const char *tablename, const void *entry,
+                      const struct xt_target *target, void *targinfo,
+                      unsigned int targinfosize, unsigned int hook_mask)
+{
+        struct xt_connsecmark_target_info *info = targinfo;
+        if (info->mode == CONNSECMARK_SAVE ||
+            info->mode == CONNSECMARK_RESTORE)
+                return 1;
+        printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode);
+        return 0;
+}
+/* CONNSECMARK target registration for AF_INET, restricted to "mangle". */
+static struct xt_target ipt_connsecmark_reg = {
+        .name           = "CONNSECMARK",
+        .family         = AF_INET,
+        .revision       = 0,
+        .table          = "mangle",
+        .target         = target,
+        .targetsize     = sizeof(struct xt_connsecmark_target_info),
+        .checkentry     = checkentry,
+        .me             = THIS_MODULE,
+};
+/* CONNSECMARK target registration for AF_INET6, restricted to "mangle". */
+static struct xt_target ip6t_connsecmark_reg = {
+        .name           = "CONNSECMARK",
+        .family         = AF_INET6,
+        .revision       = 0,
+        .table          = "mangle",
+        .target         = target,
+        .targetsize     = sizeof(struct xt_connsecmark_target_info),
+        .checkentry     = checkentry,
+        .me             = THIS_MODULE,
+};
+/*
+ * Module init: call need_conntrack(), then register the AF_INET and
+ * AF_INET6 targets.  If the second registration fails the first is undone.
+ */
+static int __init xt_connsecmark_init(void)
+{
+        int rc;
+        need_conntrack();
+        rc = xt_register_target(&ipt_connsecmark_reg);
+        if (rc)
+                goto out;
+        rc = xt_register_target(&ip6t_connsecmark_reg);
+        if (rc)
+                goto undo_ipv4;
+        return 0;
+undo_ipv4:
+        xt_unregister_target(&ipt_connsecmark_reg);
+out:
+        return rc;
+}
+/* Module exit: unregister both targets, in reverse order of registration. */
+static void __exit xt_connsecmark_fini(void)
+{
+        xt_unregister_target(&ip6t_connsecmark_reg);
+        xt_unregister_target(&ipt_connsecmark_reg);
+}
+module_init(xt_connsecmark_init);
+module_exit(xt_connsecmark_fini);
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
new file mode 100644
index 00000000000..c2ce9c4011c
--- /dev/null
+++ b/net/netfilter/xt_SECMARK.c
@@ -0,0 +1,156 @@
+/*
+ * Module for modifying the secmark field of the skb, for use by
+ * security subsystems.
+ *
+ * Based on the nfmark match by:
+ * (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ *
+ * (C) 2006 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/selinux.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SECMARK.h>
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@redhat.com>");
+MODULE_DESCRIPTION("ip[6]tables SECMARK modification module");
+MODULE_ALIAS("ipt_SECMARK");
+MODULE_ALIAS("ip6t_SECMARK");
+/* Prefix prepended to every printk() message emitted by this module. */
+#define PFX "SECMARK: "
+/*
+ * Mode shared by all SECMARK rules.  Zero until checkentry() accepts the
+ * first rule; from then on rules with a different mode are refused.
+ * Nothing in this file ever resets it.
+ */
+static u8 mode;
+/*
+ * Target hook: write the rule's security mark into the packet and let rule
+ * traversal continue.  The rule's mode must equal the module-wide mode,
+ * and only SECMARK_MODE_SEL is handled; anything else is a BUG().
+ */
+static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
+                           const struct net_device *out, unsigned int hooknum,
+                           const struct xt_target *target,
+                           const void *targinfo, void *userinfo)
+{
+        const struct xt_secmark_target_info *info = targinfo;
+        struct sk_buff *skb = *pskb;
+        u32 new_mark = 0;
+        BUG_ON(info->mode != mode);
+        if (mode == SECMARK_MODE_SEL)
+                new_mark = info->u.sel.selsid;
+        else
+                BUG();
+        /* Only store when the value actually changes. */
+        if (skb->secmark != new_mark)
+                skb->secmark = new_mark;
+        return XT_CONTINUE;
+}
+/*
+ * Validate a SELinux-mode rule: translate the context string into a SID
+ * (stored back into the rule), reject a zero SID, and ask for relabeling
+ * permission on it.  Returns 1 on success, 0 on any failure.
+ */
+static int checkentry_selinux(struct xt_secmark_target_info *info)
+{
+        struct xt_secmark_target_selinux_info *sel = &info->u.sel;
+        int rc = selinux_string_to_sid(sel->selctx, &sel->selsid);
+        /* Only -EINVAL gets a message; every error rejects the rule. */
+        if (rc == -EINVAL)
+                printk(KERN_INFO PFX "invalid SELinux context \'%s\'\n",
+                       sel->selctx);
+        if (rc)
+                return 0;
+        if (!sel->selsid) {
+                printk(KERN_INFO PFX "unable to map SELinux context \'%s\'\n",
+                       sel->selctx);
+                return 0;
+        }
+        if (selinux_relabel_packet_permission(sel->selsid)) {
+                printk(KERN_INFO PFX "unable to obtain relabeling permission\n");
+                return 0;
+        }
+        return 1;
+}
+/*
+ * Rule validation.  Rejects a rule whose mode differs from the mode already
+ * latched by an earlier rule, rejects unknown modes, runs the SELinux
+ * checks, and latches the mode on first success.  Returns 1 to accept the
+ * rule, 0 to reject it.
+ */
+static int checkentry(const char *tablename, const void *entry,
+                      const struct xt_target *target, void *targinfo,
+                      unsigned int targinfosize, unsigned int hook_mask)
+{
+        struct xt_secmark_target_info *info = targinfo;
+        if (mode != 0 && info->mode != mode) {
+                printk(KERN_INFO PFX "mode already set to %hu cannot mix with "
+                       "rules for mode %hu\n", mode, info->mode);
+                return 0;
+        }
+        if (info->mode != SECMARK_MODE_SEL) {
+                printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode);
+                return 0;
+        }
+        if (!checkentry_selinux(info))
+                return 0;
+        if (mode == 0)
+                mode = info->mode;
+        return 1;
+}
+/* SECMARK target registration for AF_INET, restricted to "mangle". */
+static struct xt_target ipt_secmark_reg = {
+        .name           = "SECMARK",
+        .family         = AF_INET,
+        .revision       = 0,
+        .table          = "mangle",
+        .target         = target,
+        .targetsize     = sizeof(struct xt_secmark_target_info),
+        .checkentry     = checkentry,
+        .me             = THIS_MODULE,
+};
+/* SECMARK target registration for AF_INET6, restricted to "mangle". */
+static struct xt_target ip6t_secmark_reg = {
+        .name           = "SECMARK",
+        .family         = AF_INET6,
+        .revision       = 0,
+        .table          = "mangle",
+        .target         = target,
+        .targetsize     = sizeof(struct xt_secmark_target_info),
+        .checkentry     = checkentry,
+        .me             = THIS_MODULE,
+};
+/*
+ * Module init: register the AF_INET and AF_INET6 targets.  If the second
+ * registration fails the first is undone.
+ */
+static int __init xt_secmark_init(void)
+{
+        int rc = xt_register_target(&ipt_secmark_reg);
+        if (rc)
+                goto out;
+        rc = xt_register_target(&ip6t_secmark_reg);
+        if (rc)
+                goto undo_ipv4;
+        return 0;
+undo_ipv4:
+        xt_unregister_target(&ipt_secmark_reg);
+out:
+        return rc;
+}
+/* Module exit: unregister both targets, in reverse order of registration. */
+static void __exit xt_secmark_fini(void)
+{
+        xt_unregister_target(&ip6t_secmark_reg);
+        xt_unregister_target(&ipt_secmark_reg);
+}
+module_init(xt_secmark_init);
+module_exit(xt_secmark_fini);
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index dc26a27cbca..56324c8aff0 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -58,7 +58,7 @@ checkentry(const char *tablename,
           unsigned int matchsize,
           unsigned int hook_mask)
 {
-        struct xt_connmark_info *cm = (struct xt_connmark_info *)matchinfo;
+        struct xt_connmark_info *cm = matchinfo;
        if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
                printk(KERN_WARNING "connmark: only support 32bit mark\n");
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
index dfb10b648e5..2e2f825dad4 100644
--- a/net/netfilter/xt_dccp.c
+++ b/net/netfilter/xt_dccp.c
@@ -101,8 +101,7 @@ match(const struct sk_buff *skb,
      unsigned int protoff,
      int *hotdrop)
 {
-        const struct xt_dccp_info *info = 
+        const struct xt_dccp_info *info = matchinfo;
-                                (const struct xt_dccp_info *)matchinfo;
        struct dccp_hdr _dh, *dh;
        if (offset)
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 8b385a34886..876bc579773 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -42,7 +42,7 @@ checkentry(const char *tablename,
           unsigned int matchsize,
           unsigned int hook_mask)
 {
-        struct xt_mark_info *minfo = (struct xt_mark_info *) matchinfo;
+        const struct xt_mark_info *minfo = matchinfo;
        if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
                printk(KERN_WARNING "mark: only supports 32bit mark\n");
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index b56cd2baaac..1ff0a25396e 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -1,4 +1,4 @@
-/* Kernel module to match one of a list of TCP/UDP ports: ports are in
+/* Kernel module to match one of a list of TCP/UDP/SCTP/DCCP ports: ports are in
   the same place so we can treat them as equal. */
 /* (C) 1999-2001 Paul `Rusty' Russell
@@ -160,8 +160,9 @@ check(u_int16_t proto,
      u_int8_t match_flags,
      u_int8_t count)
 {
-        /* Must specify proto == TCP/UDP, no unknown flags or bad count */
+        /* Must specify supported protocol, no unknown flags or bad count */
-        return (proto == IPPROTO_TCP || proto == IPPROTO_UDP)
+        return (proto == IPPROTO_TCP || proto == IPPROTO_UDP
+                || proto == IPPROTO_SCTP || proto == IPPROTO_DCCP)
                && !(ip_invflags & XT_INV_PROTO)
                && (match_flags == XT_MULTIPORT_SOURCE
                    || match_flags == XT_MULTIPORT_DESTINATION
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
new file mode 100644
index 00000000000..4cdba7469dc
--- /dev/null
+++ b/net/netfilter/xt_quota.c
@@ -0,0 +1,96 @@
+/*
+ * netfilter module to enforce network quotas
+ *
+ * Sam Johnston <samj@samj.net>
+ */
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_quota.h>
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
+MODULE_DESCRIPTION("xtables network quota match module");
+/* ipt_/ip6t_ aliases, matching the other new xt_* modules in this patch. */
+MODULE_ALIAS("ipt_quota");
+MODULE_ALIAS("ip6t_quota");
+/* Serialises every read-modify-write of a rule's quota counter in match(). */
+static DEFINE_SPINLOCK(quota_lock);
+/*
+ * Match hook: charge the packet length against the rule's master counter.
+ * While the remaining quota covers the packet it is deducted; the first
+ * packet that does not fit zeroes the quota, so nothing fits afterwards.
+ * Returns "quota covered the packet", flipped when XT_QUOTA_INVERT is set.
+ */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in, const struct net_device *out,
+      const struct xt_match *match, const void *matchinfo,
+      int offset, unsigned int protoff, int *hotdrop)
+{
+        const struct xt_quota_info *rule = matchinfo;
+        struct xt_quota_info *master = rule->master;
+        int inverted = (master->flags & XT_QUOTA_INVERT) != 0;
+        int within_quota;
+        spin_lock_bh(&quota_lock);
+        within_quota = master->quota >= skb->len;
+        if (within_quota)
+                master->quota -= skb->len;
+        else
+                master->quota = 0;      /* no more packets, however small */
+        spin_unlock_bh(&quota_lock);
+        return inverted ^ within_quota;
+}
+/*
+ * Rule validation: reject flag bits outside XT_QUOTA_MASK; otherwise point
+ * the rule's master pointer at itself so match() uses one set of counters.
+ * Returns 1 to accept the rule, 0 to reject it.
+ */
+static int
+checkentry(const char *tablename, const void *entry,
+           const struct xt_match *match, void *matchinfo,
+           unsigned int matchsize, unsigned int hook_mask)
+{
+        struct xt_quota_info *info = matchinfo;
+        int flags_ok = (info->flags & ~XT_QUOTA_MASK) == 0;
+        if (flags_ok)
+                info->master = info;
+        return flags_ok;
+}
+/* "quota" match registration for AF_INET. */
+static struct xt_match quota_match = {
+        .name           = "quota",
+        .family         = AF_INET,
+        .matchsize      = sizeof(struct xt_quota_info),
+        .checkentry     = checkentry,
+        .match          = match,
+        .me             = THIS_MODULE,
+};
+/* "quota" match registration for AF_INET6. */
+static struct xt_match quota_match6 = {
+        .name           = "quota",
+        .family         = AF_INET6,
+        .matchsize      = sizeof(struct xt_quota_info),
+        .checkentry     = checkentry,
+        .match          = match,
+        .me             = THIS_MODULE,
+};
+/*
+ * Module init: register the AF_INET and AF_INET6 matches.  If the second
+ * registration fails the first is undone.
+ */
+static int __init xt_quota_init(void)
+{
+        int err = xt_register_match(&quota_match);
+        if (err)
+                return err;
+        err = xt_register_match(&quota_match6);
+        if (err)
+                xt_unregister_match(&quota_match);
+        return err;
+}
+/* Module exit: unregister both matches, in reverse order of registration. */
+static void __exit xt_quota_fini(void)
+{
+        xt_unregister_match(&quota_match6);
+        xt_unregister_match(&quota_match);
+}
+module_init(xt_quota_init);
+module_exit(xt_quota_fini);
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index 34bd87259a0..b5110e5b54b 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -129,11 +129,9 @@ match(const struct sk_buff *skb,
      unsigned int protoff,
      int *hotdrop)
 {
-        const struct xt_sctp_info *info;
+        const struct xt_sctp_info *info = matchinfo;
        sctp_sctphdr_t _sh, *sh;
-        info = (const struct xt_sctp_info *)matchinfo;
        if (offset) {
                duprintf("Dropping non-first fragment.. FIXME\n");
                return 0;
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
new file mode 100644
index 00000000000..de1037f5859
--- /dev/null
+++ b/net/netfilter/xt_statistic.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on ipt_random and ipt_nth by Fabrice MARIE <fabrice@netfilter.org>.
+ */
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/netfilter/xt_statistic.h>
+#include <linux/netfilter/x_tables.h>
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("xtables statistical match module");
+MODULE_ALIAS("ipt_statistic");
+MODULE_ALIAS("ip6t_statistic");
+/* Serialises the nth-mode packet counter update performed in match(). */
+static DEFINE_SPINLOCK(nth_lock);
+/*
+ * Match hook.  In random mode the packet hits when a 31-bit random value
+ * is below the rule's probability.  In nth mode the master's counter is
+ * post-incremented under nth_lock; the packet hits, and the counter is
+ * reset, when the pre-increment value equals "every".  Any other mode never
+ * hits.  The result is flipped when XT_STATISTIC_INVERT is set on the rule.
+ */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in, const struct net_device *out,
+      const struct xt_match *match, const void *matchinfo,
+      int offset, unsigned int protoff, int *hotdrop)
+{
+        struct xt_statistic_info *rule = (struct xt_statistic_info *)matchinfo;
+        struct xt_statistic_info *shared;
+        int hit = 0;
+        if (rule->mode == XT_STATISTIC_MODE_RANDOM) {
+                hit = (net_random() & 0x7FFFFFFF) < rule->u.random.probability;
+        } else if (rule->mode == XT_STATISTIC_MODE_NTH) {
+                shared = rule->master;
+                spin_lock_bh(&nth_lock);
+                hit = shared->u.nth.count++ == shared->u.nth.every;
+                if (hit)
+                        shared->u.nth.count = 0;
+                spin_unlock_bh(&nth_lock);
+        }
+        /* The invert flag is read from the rule itself, not from master. */
+        return ((rule->flags & XT_STATISTIC_INVERT) ? 1 : 0) ^ hit;
+}
+/*
+ * Rule validation: reject modes above XT_STATISTIC_MODE_MAX and flag bits
+ * outside XT_STATISTIC_MASK; otherwise point the rule's master pointer at
+ * itself.  Returns 1 to accept the rule, 0 to reject it.
+ */
+static int
+checkentry(const char *tablename, const void *entry,
+           const struct xt_match *match, void *matchinfo,
+           unsigned int matchsize, unsigned int hook_mask)
+{
+        struct xt_statistic_info *rule = matchinfo;
+        if (rule->mode > XT_STATISTIC_MODE_MAX)
+                return 0;
+        if (rule->flags & ~XT_STATISTIC_MASK)
+                return 0;
+        rule->master = rule;
+        return 1;
+}
+/* "statistic" match registration for AF_INET. */
+static struct xt_match statistic_match = {
+        .name           = "statistic",
+        .family         = AF_INET,
+        .matchsize      = sizeof(struct xt_statistic_info),
+        .checkentry     = checkentry,
+        .match          = match,
+        .me             = THIS_MODULE,
+};
+/* "statistic" match registration for AF_INET6. */
+static struct xt_match statistic_match6 = {
+        .name           = "statistic",
+        .family         = AF_INET6,
+        .matchsize      = sizeof(struct xt_statistic_info),
+        .checkentry     = checkentry,
+        .match          = match,
+        .me             = THIS_MODULE,
+};
+/*
+ * Module init: register the AF_INET and AF_INET6 matches.  If the second
+ * registration fails the first is undone.
+ */
+static int __init xt_statistic_init(void)
+{
+        int err = xt_register_match(&statistic_match);
+        if (err)
+                return err;
+        err = xt_register_match(&statistic_match6);
+        if (err)
+                xt_unregister_match(&statistic_match);
+        return err;
+}
+/* Module exit: unregister both matches, in reverse order of registration. */
+static void __exit xt_statistic_fini(void)
+{
+        xt_unregister_match(&statistic_match6);
+        xt_unregister_match(&statistic_match);
+}
+module_init(xt_statistic_init);
+module_exit(xt_statistic_fini);
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 79d9ea6964b..0ebb6ac2c8c 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -30,8 +30,8 @@ static int match(const struct sk_buff *skb,
                 unsigned int protoff,
                 int *hotdrop)
 {
+        const struct xt_string_info *conf = matchinfo;
        struct ts_state state;
-        struct xt_string_info *conf = (struct xt_string_info *) matchinfo;
        memset(&state, 0, sizeof(struct ts_state));
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 138ea92ed26..b1e4c5e20ac 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -72,9 +72,9 @@ void qdisc_unlock_tree(struct net_device *dev)
   dev->queue_lock serializes queue accesses for this device
   AND dev->qdisc pointer itself.
-   dev->xmit_lock serializes accesses to device driver.
+   netif_tx_lock serializes accesses to device driver.
-   dev->queue_lock and dev->xmit_lock are mutually exclusive,
+   dev->queue_lock and netif_tx_lock are mutually exclusive,
   if one is grabbed, another must be free.
 */
@@ -108,7 +108,7 @@ int qdisc_restart(struct net_device *dev)
                 * will be requeued.
                 */
                if (!nolock) {
-                        if (!spin_trylock(&dev->xmit_lock)) {
+                        if (!netif_tx_trylock(dev)) {
                        collision:
                                /* So, someone grabbed the driver. */
                                
@@ -126,8 +126,6 @@ int qdisc_restart(struct net_device *dev)
                                __get_cpu_var(netdev_rx_stat).cpu_collision++;
                                goto requeue;
                        }
-                        /* Remember that the driver is grabbed by us. */
-                        dev->xmit_lock_owner = smp_processor_id();
                }
                
                {
@@ -142,8 +140,7 @@ int qdisc_restart(struct net_device *dev)
                                ret = dev->hard_start_xmit(skb, dev);
                                if (ret == NETDEV_TX_OK) { 
                                        if (!nolock) {
-                                                dev->xmit_lock_owner = -1;
+                                                netif_tx_unlock(dev);
-                                                spin_unlock(&dev->xmit_lock);
                                        }
                                        spin_lock(&dev->queue_lock);
                                        return -1;
@@ -157,8 +154,7 @@ int qdisc_restart(struct net_device *dev)
                        /* NETDEV_TX_BUSY - we need to requeue */
                        /* Release the driver */
                        if (!nolock) { 
-                                dev->xmit_lock_owner = -1;
+                                netif_tx_unlock(dev);
-                                spin_unlock(&dev->xmit_lock);
                        } 
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
@@ -187,7 +183,7 @@ static void dev_watchdog(unsigned long arg)
 {
        struct net_device *dev = (struct net_device *)arg;
-        spin_lock(&dev->xmit_lock);
+        netif_tx_lock(dev);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
@@ -203,7 +199,7 @@ static void dev_watchdog(unsigned long arg)
                                dev_hold(dev);
                }
        }
-        spin_unlock(&dev->xmit_lock);
+        netif_tx_unlock(dev);
        dev_put(dev);
 }
@@ -227,17 +223,17 @@ void __netdev_watchdog_up(struct net_device *dev)
 static void dev_watchdog_up(struct net_device *dev)
 {
-        spin_lock_bh(&dev->xmit_lock);
+        netif_tx_lock_bh(dev);
        __netdev_watchdog_up(dev);
-        spin_unlock_bh(&dev->xmit_lock);
+        netif_tx_unlock_bh(dev);
 }
 static void dev_watchdog_down(struct net_device *dev)
 {
-        spin_lock_bh(&dev->xmit_lock);
+        netif_tx_lock_bh(dev);
        if (del_timer(&dev->watchdog_timer))
                dev_put(dev);
-        spin_unlock_bh(&dev->xmit_lock);
+        netif_tx_unlock_bh(dev);
 }
 void netif_carrier_on(struct net_device *dev)
@@ -582,7 +578,7 @@ void dev_deactivate(struct net_device *dev)
        while (test_bit(__LINK_STATE_SCHED, &dev->state))
                yield();
-        spin_unlock_wait(&dev->xmit_lock);
+        spin_unlock_wait(&dev->_xmit_lock);
 }
 void dev_init_scheduler(struct net_device *dev)
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 79b8ef34c6e..4c16ad57a3e 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -302,20 +302,17 @@ restart:
                switch (teql_resolve(skb, skb_res, slave)) {
                case 0:
-                        if (spin_trylock(&slave->xmit_lock)) {
+                        if (netif_tx_trylock(slave)) {
-                                slave->xmit_lock_owner = smp_processor_id();
                                if (!netif_queue_stopped(slave) &&
                                    slave->hard_start_xmit(skb, slave) == 0) {
-                                        slave->xmit_lock_owner = -1;
+                                        netif_tx_unlock(slave);
-                                        spin_unlock(&slave->xmit_lock);
                                        master->slaves = NEXT_SLAVE(q);
                                        netif_wake_queue(dev);
                                        master->stats.tx_packets++;
                                        master->stats.tx_bytes += len;
                                        return 0;
                                }
-                                slave->xmit_lock_owner = -1;
+                                netif_tx_unlock(slave);
-                                spin_unlock(&slave->xmit_lock);
                        }
                        if (netif_queue_stopped(dev))
                                busy = 1;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 1662f9cc869..42b66e74bbb 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -141,7 +141,8 @@ int sctp_rcv(struct sk_buff *skb)
        __skb_pull(skb, skb->h.raw - skb->data);
        if (skb->len < sizeof(struct sctphdr))
                goto discard_it;
-        if (sctp_rcv_checksum(skb) < 0)
+        if ((skb->ip_summed != CHECKSUM_UNNECESSARY) &&
+            (sctp_rcv_checksum(skb) < 0))
                goto discard_it;
        skb_pull(skb, sizeof(struct sctphdr));
@@ -170,7 +171,8 @@ int sctp_rcv(struct sk_buff *skb)
         * IP broadcast addresses cannot be used in an SCTP transport
         * address."
         */
-        if (!af->addr_valid(&src, NULL) || !af->addr_valid(&dest, NULL))
+        if (!af->addr_valid(&src, NULL, skb) ||
+            !af->addr_valid(&dest, NULL, skb))
                goto discard_it;
        asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index c20d282fac0..8ef08070c8b 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -523,7 +523,9 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
 * Return 0 - If the address is a non-unicast or an illegal address.
 * Return 1 - If the address is a unicast.
 */
-static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp)
+static int sctp_v6_addr_valid(union sctp_addr *addr,
+                              struct sctp_sock *sp,
+                              const struct sk_buff *skb)
 {
        int ret = ipv6_addr_type(&addr->v6.sin6_addr);
@@ -537,7 +539,7 @@ static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp)
                if (sp && ipv6_only_sock(sctp_opt2sk(sp)))
                        return 0;
                sctp_v6_map_v4(addr);
-                return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp);
+                return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb);
        }
        /* Is this a non-unicast address */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 437cba7260a..cdc5a393676 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -295,14 +295,14 @@ int sctp_packet_transmit(struct sctp_packet *packet)
        struct sctp_transport *tp = packet->transport;
        struct sctp_association *asoc = tp->asoc;
        struct sctphdr *sh;
-        __u32 crc32;
+        __u32 crc32 = 0;
        struct sk_buff *nskb;
        struct sctp_chunk *chunk, *tmp;
        struct sock *sk;
        int err = 0;
        int padding;            /* How much padding do we need?  */
        __u8 has_data = 0;
-        struct dst_entry *dst;
+        struct dst_entry *dst = tp->dst;
        SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
@@ -327,6 +327,19 @@ int sctp_packet_transmit(struct sctp_packet *packet)
         */
        skb_set_owner_w(nskb, sk);
+        /* The 'obsolete' field of dst is set to 2 when a dst is freed. */
+        if (!dst || (dst->obsolete > 1)) {
+                dst_release(dst);
+                sctp_transport_route(tp, NULL, sctp_sk(sk));
+                if (asoc && (asoc->param_flags & SPP_PMTUD_ENABLE)) {
+                        sctp_assoc_sync_pmtu(asoc);
+                }
+        }
+        nskb->dst = dst_clone(tp->dst);
+        if (!nskb->dst)
+                goto no_route;
+        dst = nskb->dst;
        /* Build the SCTP header.  */
        sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr));
        sh->source = htons(packet->source_port);
@@ -350,7 +363,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
         * Note: Adler-32 is no longer applicable, as has been replaced
         * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
         */
-        crc32 = sctp_start_cksum((__u8 *)sh, sizeof(struct sctphdr));
+        if (!(dst->dev->features & NETIF_F_NO_CSUM))
+                crc32 = sctp_start_cksum((__u8 *)sh, sizeof(struct sctphdr));
        /**
         * 6.10 Bundling
@@ -402,9 +416,14 @@ int sctp_packet_transmit(struct sctp_packet *packet)
                if (padding)
                        memset(skb_put(chunk->skb, padding), 0, padding);
-                crc32 = sctp_update_copy_cksum(skb_put(nskb, chunk->skb->len),
+                if (dst->dev->features & NETIF_F_NO_CSUM)
-                                               chunk->skb->data,
+                        memcpy(skb_put(nskb, chunk->skb->len),
-                                               chunk->skb->len, crc32);
+                               chunk->skb->data, chunk->skb->len);
+                else
+                        crc32 = sctp_update_copy_cksum(skb_put(nskb,
+                                                        chunk->skb->len),
+                                                chunk->skb->data,
+                                                chunk->skb->len, crc32);
                SCTP_DEBUG_PRINTK("%s %p[%s] %s 0x%x, %s %d, %s %d, %s %d\n",
                                  "*** Chunk", chunk,
@@ -427,7 +446,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
        }
        /* Perform final transformation on checksum. */
-        crc32 = sctp_end_cksum(crc32);
+        if (!(dst->dev->features & NETIF_F_NO_CSUM))
+                crc32 = sctp_end_cksum(crc32);
        /* 3) Put the resultant value into the checksum field in the
         *    common header, and leave the rest of the bits unchanged.
@@ -477,20 +497,6 @@ int sctp_packet_transmit(struct sctp_packet *packet)
                }
        }
-        dst = tp->dst;
-        /* The 'obsolete' field of dst is set to 2 when a dst is freed. */
-        if (!dst || (dst->obsolete > 1)) {
-                dst_release(dst);
-                sctp_transport_route(tp, NULL, sctp_sk(sk));
-                if (asoc->param_flags & SPP_PMTUD_ENABLE) {
-                        sctp_assoc_sync_pmtu(asoc);
-                }
-        }
-        nskb->dst = dst_clone(tp->dst);
-        if (!nskb->dst)
-                goto no_route;
        SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n",
                          nskb->len);
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index f148f9576dd..e5faa351aaa 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1262,6 +1262,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
                                if (!tchunk->tsn_gap_acked &&
                                    !tchunk->resent &&
                                    tchunk->rtt_in_progress) {
+                                        tchunk->rtt_in_progress = 0;
                                        rtt = jiffies - tchunk->sent_at;
                                        sctp_transport_update_rto(transport,
                                                                  rtt);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2088aa992b7..816c033d788 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -365,12 +365,18 @@ static int sctp_v4_is_any(const union sctp_addr *addr)
 * Return 0 - If the address is a non-unicast or an illegal address.
 * Return 1 - If the address is a unicast.
 */
-static int sctp_v4_addr_valid(union sctp_addr *addr, struct sctp_sock *sp)
+static int sctp_v4_addr_valid(union sctp_addr *addr,
+                              struct sctp_sock *sp,
+                              const struct sk_buff *skb)
 {
        /* Is this a non-unicast address or a unusable SCTP address? */
        if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr))
                return 0;
+        /* Is this a broadcast address? */
+        if (skb && ((struct rtable *)skb->dst)->rt_flags & RTCF_BROADCAST)
+                return 0;
        return 1;
 }
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8bc279219a7..9e58144f485 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -5293,10 +5293,18 @@ static int sctp_eat_data(const struct sctp_association *asoc,
         * seems a bit troublesome in that frag_point varies based on
         * PMTU.  In cases, such as loopback, this might be a rather
         * large spill over.
+         * NOTE: If we have a full receive buffer here, we only renege if
+         * our receiver can still make progress without the tsn being
+         * received. We do this because in the event that the association's
+         * receive queue is empty we are filling a leading gap, and since
+         * reneging moves the gap to the end of the tsn stream, we are likely
+         * to stall again very shortly. Avoiding the renege when we fill a
+         * leading gap is a good heuristic for avoiding such steady state
+         * stalls.
         */
        if (!asoc->rwnd || asoc->rwnd_over ||
            (datalen > asoc->rwnd + asoc->frag_point) ||
-            rcvbuf_over) {
+            (rcvbuf_over && (!skb_queue_len(&sk->sk_receive_queue)))) {
                /* If this is the next TSN, consider reneging to make
                 * room.   Note: Playing nice with a confused sender.  A
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 174d4d35e95..b811691c35b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -172,7 +172,7 @@ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
                return -EINVAL;
        /* Is this a valid SCTP address?  */
-        if (!af->addr_valid(addr, sctp_sk(sk)))
+        if (!af->addr_valid(addr, sctp_sk(sk), NULL))
                return -EINVAL;
        if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr)))
@@ -2530,8 +2530,32 @@ static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, int o
        /* Set the values to the specific association */
        if (asoc) {
-                if (assocparams.sasoc_asocmaxrxt != 0)
+                if (assocparams.sasoc_asocmaxrxt != 0) {
+                        __u32 path_sum = 0;
+                        int   paths = 0;
+                        struct list_head *pos;
+                        struct sctp_transport *peer_addr;
+                        list_for_each(pos, &asoc->peer.transport_addr_list) {
+                                peer_addr = list_entry(pos,
+                                                struct sctp_transport,
+                                                transports);
+                                path_sum += peer_addr->pathmaxrxt;
+                                paths++;
+                        }
+                        /* Only validate asocmaxrxt if we have more than
+                         * one path/transport.  We do this because path
+                         * retransmissions are only counted when we have more
+                         * than one path.
+                         */
+                        if (paths > 1 &&
+                            assocparams.sasoc_asocmaxrxt > path_sum)
+                                return -EINVAL;
                        asoc->max_retrans = assocparams.sasoc_asocmaxrxt;
+                }
                if (assocparams.sasoc_cookie_life != 0) {
                        asoc->cookie_life.tv_sec =
                                        assocparams.sasoc_cookie_life / 1000;
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index ba97f974f57..ee236784a6b 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -51,6 +51,8 @@
 static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
                                       struct sctp_association *asoc);
 static void sctp_ulpevent_release_data(struct sctp_ulpevent *event);
+static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event);
 /* Initialize an ULP event from an given skb.  */
 SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
@@ -883,6 +885,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
 static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
 {
        struct sk_buff *skb, *frag;
+        unsigned int    len;
        /* Current stack structures assume that the rcv buffer is
         * per socket.   For UDP style sockets this is not true as
@@ -892,7 +895,30 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
         */
        skb = sctp_event2skb(event);
-        sctp_assoc_rwnd_increase(event->asoc, skb_headlen(skb));
+        len = skb->len;
+        if (!skb->data_len)
+                goto done;
+        /* Don't forget the fragments. */
+        for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
+                /* NOTE:  skb_shinfos are recursive. Although IP returns
+                 * skb's with only 1 level of fragments, SCTP reassembly can
+                 * increase the levels.
+                 */
+                sctp_ulpevent_release_frag_data(sctp_skb2event(frag));
+        }
+done:
+        sctp_assoc_rwnd_increase(event->asoc, len);
+        sctp_ulpevent_release_owner(event);
+}
+static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event)
+{
+        struct sk_buff *skb, *frag;
+        skb = sctp_event2skb(event);
        if (!skb->data_len)
                goto done;
@@ -903,7 +929,7 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
                 * skb's with only 1 level of fragments, SCTP reassembly can
                 * increase the levels.
                 */
-                sctp_ulpevent_release_data(sctp_skb2event(frag));
+                sctp_ulpevent_release_frag_data(sctp_skb2event(frag));
        }
 done:
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b469c8b5461..b8936926c24 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -46,45 +46,43 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
+static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family);
+static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo);
 int xfrm_register_type(struct xfrm_type *type, unsigned short family)
 {
-        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+        struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family);
-        struct xfrm_type_map *typemap;
+        struct xfrm_type **typemap;
        int err = 0;
        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;
        typemap = afinfo->type_map;
-        write_lock_bh(&typemap->lock);
+        if (likely(typemap[type->proto] == NULL))
-        if (likely(typemap->map[type->proto] == NULL))
+                typemap[type->proto] = type;
-                typemap->map[type->proto] = type;
        else
                err = -EEXIST;
-        write_unlock_bh(&typemap->lock);
+        xfrm_policy_unlock_afinfo(afinfo);
-        xfrm_policy_put_afinfo(afinfo);
        return err;
 }
 EXPORT_SYMBOL(xfrm_register_type);
 int xfrm_unregister_type(struct xfrm_type *type, unsigned short family)
 {
-        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+        struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family);
-        struct xfrm_type_map *typemap;
+        struct xfrm_type **typemap;
        int err = 0;
        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;
        typemap = afinfo->type_map;
-        write_lock_bh(&typemap->lock);
+        if (unlikely(typemap[type->proto] != type))
-        if (unlikely(typemap->map[type->proto] != type))
                err = -ENOENT;
        else
-                typemap->map[type->proto] = NULL;
+                typemap[type->proto] = NULL;
-        write_unlock_bh(&typemap->lock);
+        xfrm_policy_unlock_afinfo(afinfo);
-        xfrm_policy_put_afinfo(afinfo);
        return err;
 }
 EXPORT_SYMBOL(xfrm_unregister_type);
@@ -92,7 +90,7 @@ EXPORT_SYMBOL(xfrm_unregister_type);
 struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
 {
        struct xfrm_policy_afinfo *afinfo;
-        struct xfrm_type_map *typemap;
+        struct xfrm_type **typemap;
        struct xfrm_type *type;
        int modload_attempted = 0;
@@ -102,11 +100,9 @@ retry:
                return NULL;
        typemap = afinfo->type_map;
-        read_lock(&typemap->lock);
+        type = typemap[proto];
-        type = typemap->map[proto];
        if (unlikely(type && !try_module_get(type->owner)))
                type = NULL;
-        read_unlock(&typemap->lock);
        if (!type && !modload_attempted) {
                xfrm_policy_put_afinfo(afinfo);
                request_module("xfrm-type-%d-%d",
@@ -142,6 +138,89 @@ void xfrm_put_type(struct xfrm_type *type)
        module_put(type->owner);
 }
+int xfrm_register_mode(struct xfrm_mode *mode, int family)
+{       /* Install @mode in @family's mode_map slot mode->encap; 0 or -errno. */
+        struct xfrm_policy_afinfo *afinfo;
+        struct xfrm_mode **modemap;
+        int err;
+        if (unlikely(mode->encap >= XFRM_MODE_MAX))
+                return -EINVAL; /* encap is used as the mode_map index below */
+        afinfo = xfrm_policy_lock_afinfo(family); /* non-NULL => afinfo write lock held */
+        if (unlikely(afinfo == NULL))
+                return -EAFNOSUPPORT; /* helper holds no lock when it returns NULL */
+        err = -EEXIST; /* result if the slot is already occupied */
+        modemap = afinfo->mode_map;
+        if (likely(modemap[mode->encap] == NULL)) {
+                modemap[mode->encap] = mode;
+                err = 0;
+        }
+        xfrm_policy_unlock_afinfo(afinfo); /* drops the write lock taken above */
+        return err;
+}
+EXPORT_SYMBOL(xfrm_register_mode);
+int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
+{       /* Clear @family's mode_map slot mode->encap if it holds @mode; 0 or -errno. */
+        struct xfrm_policy_afinfo *afinfo;
+        struct xfrm_mode **modemap;
+        int err;
+        if (unlikely(mode->encap >= XFRM_MODE_MAX))
+                return -EINVAL; /* encap is used as the mode_map index below */
+        afinfo = xfrm_policy_lock_afinfo(family); /* non-NULL => afinfo write lock held */
+        if (unlikely(afinfo == NULL))
+                return -EAFNOSUPPORT; /* helper holds no lock when it returns NULL */
+        err = -ENOENT; /* result if the slot does not point at @mode */
+        modemap = afinfo->mode_map;
+        if (likely(modemap[mode->encap] == mode)) {
+                modemap[mode->encap] = NULL;
+                err = 0;
+        }
+        xfrm_policy_unlock_afinfo(afinfo); /* drops the write lock taken above */
+        return err;
+}
+EXPORT_SYMBOL(xfrm_unregister_mode);
+struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
+{       /* Look up the mode for (@encap, @family) and pin its owner module; NULL if none. */
+        struct xfrm_policy_afinfo *afinfo;
+        struct xfrm_mode *mode;
+        int modload_attempted = 0; /* limits the request_module() retry to one pass */
+        if (unlikely(encap >= XFRM_MODE_MAX))
+                return NULL; /* encap is used as the mode_map index below */
+retry:
+        afinfo = xfrm_policy_get_afinfo(family); /* non-NULL => afinfo read lock held */
+        if (unlikely(afinfo == NULL))
+                return NULL;
+        mode = afinfo->mode_map[encap];
+        if (unlikely(mode && !try_module_get(mode->owner)))
+                mode = NULL; /* could not take a module reference: treat as absent */
+        if (!mode && !modload_attempted) {
+                xfrm_policy_put_afinfo(afinfo); /* read lock released before the module request */
+                request_module("xfrm-mode-%d-%d", family, encap);
+                modload_attempted = 1;
+                goto retry;
+        }
+        xfrm_policy_put_afinfo(afinfo);
+        return mode; /* on success the caller owns a reference; see xfrm_put_mode() */
+}
+void xfrm_put_mode(struct xfrm_mode *mode)
+{       /* Counterpart of xfrm_get_mode(): drop the owner-module reference it took. */
+        module_put(mode->owner);
+}
 static inline unsigned long make_jiffies(long secs)
 {
        if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
@@ -1306,17 +1385,31 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
                return NULL;
        read_lock(&xfrm_policy_afinfo_lock);
        afinfo = xfrm_policy_afinfo[family];
-        if (likely(afinfo != NULL))
+        if (unlikely(!afinfo))
-                read_lock(&afinfo->lock);
+                read_unlock(&xfrm_policy_afinfo_lock);
-        read_unlock(&xfrm_policy_afinfo_lock);
        return afinfo;
 }
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
 {
-        if (unlikely(afinfo == NULL))
+        read_unlock(&xfrm_policy_afinfo_lock);
-                return;
+}
-        read_unlock(&afinfo->lock);
+static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family)
+{       /* Fetch @family's afinfo; a non-NULL return leaves the afinfo write lock held. */
+        struct xfrm_policy_afinfo *afinfo;
+        if (unlikely(family >= NPROTO))
+                return NULL; /* out-of-range family: lock never taken */
+        write_lock_bh(&xfrm_policy_afinfo_lock);
+        afinfo = xfrm_policy_afinfo[family];
+        if (unlikely(!afinfo))
+                write_unlock_bh(&xfrm_policy_afinfo_lock); /* nothing registered: unlock before returning NULL */
+        return afinfo; /* non-NULL: release with xfrm_policy_unlock_afinfo() */
+}
+static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo)
+{       /* Release the write lock left held by xfrm_policy_lock_afinfo(). */
+        write_unlock_bh(&xfrm_policy_afinfo_lock); /* lock is global; @afinfo itself is not used */
 }
 static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 93a2f36ad3d..17b29ec3c41 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -77,6 +77,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
        kfree(x->ealg);
        kfree(x->calg);
        kfree(x->encap);
+        if (x->mode)
+                xfrm_put_mode(x->mode);
        if (x->type) {
                x->type->destructor(x);
                xfrm_put_type(x->type);
@@ -1103,17 +1105,14 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family)
                return NULL;
        read_lock(&xfrm_state_afinfo_lock);
        afinfo = xfrm_state_afinfo[family];
-        if (likely(afinfo != NULL))
+        if (unlikely(!afinfo))
-                read_lock(&afinfo->lock);
+                read_unlock(&xfrm_state_afinfo_lock);
-        read_unlock(&xfrm_state_afinfo_lock);
        return afinfo;
 }
 static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
 {
-        if (unlikely(afinfo == NULL))
+        read_unlock(&xfrm_state_afinfo_lock);
-                return;
-        read_unlock(&afinfo->lock);
 }
 /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
@@ -1196,6 +1195,10 @@ int xfrm_init_state(struct xfrm_state *x)
        if (err)
                goto error;
+        x->mode = xfrm_get_mode(x->props.mode, family);
+        if (x->mode == NULL)
+                goto error;
        x->km.state = XFRM_STATE_VALID;
 error:
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 81d1005830f..c21dc26141e 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -427,23 +427,25 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
        if (x == NULL)
                return -ESRCH;
+        if ((err = security_xfrm_state_delete(x)) != 0)
+                goto out;
        if (xfrm_state_kern(x)) {
-                xfrm_state_put(x);
+                err = -EPERM;
-                return -EPERM;
+                goto out;
        }
        err = xfrm_state_delete(x);
-        if (err < 0) {
+        if (err < 0)
-                xfrm_state_put(x);
+                goto out;
-                return err;
-        }
        c.seq = nlh->nlmsg_seq;
        c.pid = nlh->nlmsg_pid;
        c.event = nlh->nlmsg_type;
        km_state_notify(x, &c);
-        xfrm_state_put(x);
+out:
+        xfrm_state_put(x);
        return err;
 }
@@ -1055,6 +1057,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
                                              MSG_DONTWAIT);
                }
        } else {
+                if ((err = security_xfrm_policy_delete(xp)) != 0)
+                        goto out;
                c.data.byid = p->index;
                c.event = nlh->nlmsg_type;
                c.seq = nlh->nlmsg_seq;
@@ -1064,6 +1068,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
        xfrm_pol_put(xp);
+out:
        return err;
 }