diff options
author | Jeff Garzik <jgarzik@pobox.com> | 2005-09-01 18:02:27 -0400 |
---|---|---|
committer | Jeff Garzik <jgarzik@pobox.com> | 2005-09-01 18:02:27 -0400 |
commit | ceeec3dc375e3b0618f16b34efc56fe093918f8b (patch) | |
tree | 2293d02721ee05131aaf1c60e4fba7e281585eec /net | |
parent | fbff868db3a4cc6a89d51da9a6d49b26c29d04fb (diff) | |
parent | e3ee3b78f83688a0ae4315e8be71b2eac559904a (diff) |
/spare/repo/netdev-2.6 branch 'ieee80211'
Diffstat (limited to 'net')
292 files changed, 21640 insertions, 6397 deletions
diff --git a/net/802/fc.c b/net/802/fc.c index 640d34e026c2..282c4ab1abe6 100644 --- a/net/802/fc.c +++ b/net/802/fc.c | |||
@@ -87,7 +87,7 @@ static int fc_rebuild_header(struct sk_buff *skb) | |||
87 | struct fch_hdr *fch=(struct fch_hdr *)skb->data; | 87 | struct fch_hdr *fch=(struct fch_hdr *)skb->data; |
88 | struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr)); | 88 | struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr)); |
89 | if(fcllc->ethertype != htons(ETH_P_IP)) { | 89 | if(fcllc->ethertype != htons(ETH_P_IP)) { |
90 | printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n",(unsigned int)htons(fcllc->ethertype)); | 90 | printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(fcllc->ethertype)); |
91 | return 0; | 91 | return 0; |
92 | } | 92 | } |
93 | #ifdef CONFIG_INET | 93 | #ifdef CONFIG_INET |
diff --git a/net/802/fddi.c b/net/802/fddi.c index 5ce24c4bb840..ac242a4bc346 100644 --- a/net/802/fddi.c +++ b/net/802/fddi.c | |||
@@ -108,8 +108,8 @@ static int fddi_rebuild_header(struct sk_buff *skb) | |||
108 | else | 108 | else |
109 | #endif | 109 | #endif |
110 | { | 110 | { |
111 | printk("%s: Don't know how to resolve type %02X addresses.\n", | 111 | printk("%s: Don't know how to resolve type %04X addresses.\n", |
112 | skb->dev->name, htons(fddi->hdr.llc_snap.ethertype)); | 112 | skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype)); |
113 | return(0); | 113 | return(0); |
114 | } | 114 | } |
115 | } | 115 | } |
diff --git a/net/802/hippi.c b/net/802/hippi.c index 051e8af56a77..6d7fed3dd99a 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c | |||
@@ -51,6 +51,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev, | |||
51 | unsigned len) | 51 | unsigned len) |
52 | { | 52 | { |
53 | struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN); | 53 | struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN); |
54 | struct hippi_cb *hcb = (struct hippi_cb *) skb->cb; | ||
54 | 55 | ||
55 | if (!len){ | 56 | if (!len){ |
56 | len = skb->len - HIPPI_HLEN; | 57 | len = skb->len - HIPPI_HLEN; |
@@ -84,9 +85,10 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev, | |||
84 | if (daddr) | 85 | if (daddr) |
85 | { | 86 | { |
86 | memcpy(hip->le.dest_switch_addr, daddr + 3, 3); | 87 | memcpy(hip->le.dest_switch_addr, daddr + 3, 3); |
87 | memcpy(&skb->private.ifield, daddr + 2, 4); | 88 | memcpy(&hcb->ifield, daddr + 2, 4); |
88 | return HIPPI_HLEN; | 89 | return HIPPI_HLEN; |
89 | } | 90 | } |
91 | hcb->ifield = 0; | ||
90 | return -((int)HIPPI_HLEN); | 92 | return -((int)HIPPI_HLEN); |
91 | } | 93 | } |
92 | 94 | ||
@@ -122,7 +124,7 @@ static int hippi_rebuild_header(struct sk_buff *skb) | |||
122 | * Determine the packet's protocol ID. | 124 | * Determine the packet's protocol ID. |
123 | */ | 125 | */ |
124 | 126 | ||
125 | unsigned short hippi_type_trans(struct sk_buff *skb, struct net_device *dev) | 127 | __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev) |
126 | { | 128 | { |
127 | struct hippi_hdr *hip; | 129 | struct hippi_hdr *hip; |
128 | 130 | ||
diff --git a/net/802/p8022.c b/net/802/p8022.c index 5ae63416df6d..b24817c63ca8 100644 --- a/net/802/p8022.c +++ b/net/802/p8022.c | |||
@@ -35,7 +35,8 @@ static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb, | |||
35 | struct datalink_proto *register_8022_client(unsigned char type, | 35 | struct datalink_proto *register_8022_client(unsigned char type, |
36 | int (*func)(struct sk_buff *skb, | 36 | int (*func)(struct sk_buff *skb, |
37 | struct net_device *dev, | 37 | struct net_device *dev, |
38 | struct packet_type *pt)) | 38 | struct packet_type *pt, |
39 | struct net_device *orig_dev)) | ||
39 | { | 40 | { |
40 | struct datalink_proto *proto; | 41 | struct datalink_proto *proto; |
41 | 42 | ||
diff --git a/net/802/p8023.c b/net/802/p8023.c index a0b61b40225f..6368d3dce444 100644 --- a/net/802/p8023.c +++ b/net/802/p8023.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/skbuff.h> | 20 | #include <linux/skbuff.h> |
21 | 21 | ||
22 | #include <net/datalink.h> | 22 | #include <net/datalink.h> |
23 | #include <net/p8022.h> | ||
23 | 24 | ||
24 | /* | 25 | /* |
25 | * Place an 802.3 header on a packet. The driver will do the mac | 26 | * Place an 802.3 header on a packet. The driver will do the mac |
diff --git a/net/802/psnap.c b/net/802/psnap.c index 1053821ddf93..ab80b1fab53c 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c | |||
@@ -47,7 +47,7 @@ static struct datalink_proto *find_snap_client(unsigned char *desc) | |||
47 | * A SNAP packet has arrived | 47 | * A SNAP packet has arrived |
48 | */ | 48 | */ |
49 | static int snap_rcv(struct sk_buff *skb, struct net_device *dev, | 49 | static int snap_rcv(struct sk_buff *skb, struct net_device *dev, |
50 | struct packet_type *pt) | 50 | struct packet_type *pt, struct net_device *orig_dev) |
51 | { | 51 | { |
52 | int rc = 1; | 52 | int rc = 1; |
53 | struct datalink_proto *proto; | 53 | struct datalink_proto *proto; |
@@ -61,7 +61,7 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, | |||
61 | /* Pass the frame on. */ | 61 | /* Pass the frame on. */ |
62 | skb->h.raw += 5; | 62 | skb->h.raw += 5; |
63 | skb_pull(skb, 5); | 63 | skb_pull(skb, 5); |
64 | rc = proto->rcvfunc(skb, dev, &snap_packet_type); | 64 | rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); |
65 | } else { | 65 | } else { |
66 | skb->sk = NULL; | 66 | skb->sk = NULL; |
67 | kfree_skb(skb); | 67 | kfree_skb(skb); |
@@ -118,7 +118,8 @@ module_exit(snap_exit); | |||
118 | struct datalink_proto *register_snap_client(unsigned char *desc, | 118 | struct datalink_proto *register_snap_client(unsigned char *desc, |
119 | int (*rcvfunc)(struct sk_buff *, | 119 | int (*rcvfunc)(struct sk_buff *, |
120 | struct net_device *, | 120 | struct net_device *, |
121 | struct packet_type *)) | 121 | struct packet_type *, |
122 | struct net_device *)) | ||
122 | { | 123 | { |
123 | struct datalink_proto *proto = NULL; | 124 | struct datalink_proto *proto = NULL; |
124 | 125 | ||
diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c index 36079630c49f..700129556c13 100644 --- a/net/802/sysctl_net_802.c +++ b/net/802/sysctl_net_802.c | |||
@@ -10,9 +10,10 @@ | |||
10 | * 2 of the License, or (at your option) any later version. | 10 | * 2 of the License, or (at your option) any later version. |
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/config.h> | ||
13 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
15 | #include <linux/if_tr.h> | ||
14 | #include <linux/sysctl.h> | 16 | #include <linux/sysctl.h> |
15 | #include <linux/config.h> | ||
16 | 17 | ||
17 | #ifdef CONFIG_TR | 18 | #ifdef CONFIG_TR |
18 | extern int sysctl_tr_rif_timeout; | 19 | extern int sysctl_tr_rif_timeout; |
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 508b1fa14546..9ae3a14dd016 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h | |||
@@ -51,7 +51,7 @@ struct net_device *__find_vlan_dev(struct net_device* real_dev, | |||
51 | /* found in vlan_dev.c */ | 51 | /* found in vlan_dev.c */ |
52 | int vlan_dev_rebuild_header(struct sk_buff *skb); | 52 | int vlan_dev_rebuild_header(struct sk_buff *skb); |
53 | int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, | 53 | int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, |
54 | struct packet_type* ptype); | 54 | struct packet_type *ptype, struct net_device *orig_dev); |
55 | int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, | 55 | int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, |
56 | unsigned short type, void *daddr, void *saddr, | 56 | unsigned short type, void *daddr, void *saddr, |
57 | unsigned len); | 57 | unsigned len); |
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 49c487413518..145f5cde96cf 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c | |||
@@ -113,7 +113,7 @@ static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb) | |||
113 | * | 113 | * |
114 | */ | 114 | */ |
115 | int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, | 115 | int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, |
116 | struct packet_type* ptype) | 116 | struct packet_type* ptype, struct net_device *orig_dev) |
117 | { | 117 | { |
118 | unsigned char *rawp = NULL; | 118 | unsigned char *rawp = NULL; |
119 | struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data); | 119 | struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data); |
diff --git a/net/Kconfig b/net/Kconfig index 32327d0a56ad..2bdd5623fdd5 100644 --- a/net/Kconfig +++ b/net/Kconfig | |||
@@ -147,6 +147,7 @@ source "net/bridge/netfilter/Kconfig" | |||
147 | 147 | ||
148 | endif | 148 | endif |
149 | 149 | ||
150 | source "net/dccp/Kconfig" | ||
150 | source "net/sctp/Kconfig" | 151 | source "net/sctp/Kconfig" |
151 | source "net/atm/Kconfig" | 152 | source "net/atm/Kconfig" |
152 | source "net/bridge/Kconfig" | 153 | source "net/bridge/Kconfig" |
@@ -205,6 +206,8 @@ config NET_PKTGEN | |||
205 | To compile this code as a module, choose M here: the | 206 | To compile this code as a module, choose M here: the |
206 | module will be called pktgen. | 207 | module will be called pktgen. |
207 | 208 | ||
209 | source "net/netfilter/Kconfig" | ||
210 | |||
208 | endmenu | 211 | endmenu |
209 | 212 | ||
210 | endmenu | 213 | endmenu |
diff --git a/net/Makefile b/net/Makefile index 83bc52d87bae..4aa2f46d2a56 100644 --- a/net/Makefile +++ b/net/Makefile | |||
@@ -16,6 +16,7 @@ obj-$(CONFIG_NET) += $(tmp-y) | |||
16 | obj-$(CONFIG_LLC) += llc/ | 16 | obj-$(CONFIG_LLC) += llc/ |
17 | obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ | 17 | obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ |
18 | obj-$(CONFIG_INET) += ipv4/ | 18 | obj-$(CONFIG_INET) += ipv4/ |
19 | obj-$(CONFIG_NETFILTER) += netfilter/ | ||
19 | obj-$(CONFIG_XFRM) += xfrm/ | 20 | obj-$(CONFIG_XFRM) += xfrm/ |
20 | obj-$(CONFIG_UNIX) += unix/ | 21 | obj-$(CONFIG_UNIX) += unix/ |
21 | ifneq ($(CONFIG_IPV6),) | 22 | ifneq ($(CONFIG_IPV6),) |
@@ -41,6 +42,7 @@ obj-$(CONFIG_ATM) += atm/ | |||
41 | obj-$(CONFIG_DECNET) += decnet/ | 42 | obj-$(CONFIG_DECNET) += decnet/ |
42 | obj-$(CONFIG_ECONET) += econet/ | 43 | obj-$(CONFIG_ECONET) += econet/ |
43 | obj-$(CONFIG_VLAN_8021Q) += 8021q/ | 44 | obj-$(CONFIG_VLAN_8021Q) += 8021q/ |
45 | obj-$(CONFIG_IP_DCCP) += dccp/ | ||
44 | obj-$(CONFIG_IP_SCTP) += sctp/ | 46 | obj-$(CONFIG_IP_SCTP) += sctp/ |
45 | obj-$(CONFIG_IEEE80211) += ieee80211/ | 47 | obj-$(CONFIG_IEEE80211) += ieee80211/ |
46 | 48 | ||
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index c34614ea5fce..7076097debc2 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c | |||
@@ -698,7 +698,7 @@ static void __aarp_resolved(struct aarp_entry **list, struct aarp_entry *a, | |||
698 | * frame. We currently only support Ethernet. | 698 | * frame. We currently only support Ethernet. |
699 | */ | 699 | */ |
700 | static int aarp_rcv(struct sk_buff *skb, struct net_device *dev, | 700 | static int aarp_rcv(struct sk_buff *skb, struct net_device *dev, |
701 | struct packet_type *pt) | 701 | struct packet_type *pt, struct net_device *orig_dev) |
702 | { | 702 | { |
703 | struct elapaarp *ea = aarp_hdr(skb); | 703 | struct elapaarp *ea = aarp_hdr(skb); |
704 | int hash, ret = 0; | 704 | int hash, ret = 0; |
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 192b529f86a4..1d31b3a3f1e5 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c | |||
@@ -53,12 +53,12 @@ | |||
53 | 53 | ||
54 | #include <linux/config.h> | 54 | #include <linux/config.h> |
55 | #include <linux/module.h> | 55 | #include <linux/module.h> |
56 | #include <linux/tcp.h> | ||
57 | #include <linux/if_arp.h> | 56 | #include <linux/if_arp.h> |
58 | #include <linux/termios.h> /* For TIOCOUTQ/INQ */ | 57 | #include <linux/termios.h> /* For TIOCOUTQ/INQ */ |
59 | #include <net/datalink.h> | 58 | #include <net/datalink.h> |
60 | #include <net/psnap.h> | 59 | #include <net/psnap.h> |
61 | #include <net/sock.h> | 60 | #include <net/sock.h> |
61 | #include <net/tcp_states.h> | ||
62 | #include <net/route.h> | 62 | #include <net/route.h> |
63 | #include <linux/atalk.h> | 63 | #include <linux/atalk.h> |
64 | 64 | ||
@@ -1390,7 +1390,7 @@ free_it: | |||
1390 | * [ie ARPHRD_ETHERTALK] | 1390 | * [ie ARPHRD_ETHERTALK] |
1391 | */ | 1391 | */ |
1392 | static int atalk_rcv(struct sk_buff *skb, struct net_device *dev, | 1392 | static int atalk_rcv(struct sk_buff *skb, struct net_device *dev, |
1393 | struct packet_type *pt) | 1393 | struct packet_type *pt, struct net_device *orig_dev) |
1394 | { | 1394 | { |
1395 | struct ddpehdr *ddp; | 1395 | struct ddpehdr *ddp; |
1396 | struct sock *sock; | 1396 | struct sock *sock; |
@@ -1482,7 +1482,7 @@ freeit: | |||
1482 | * header and append a long one. | 1482 | * header and append a long one. |
1483 | */ | 1483 | */ |
1484 | static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, | 1484 | static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, |
1485 | struct packet_type *pt) | 1485 | struct packet_type *pt, struct net_device *orig_dev) |
1486 | { | 1486 | { |
1487 | /* Expand any short form frames */ | 1487 | /* Expand any short form frames */ |
1488 | if (skb->mac.raw[2] == 1) { | 1488 | if (skb->mac.raw[2] == 1) { |
@@ -1528,7 +1528,7 @@ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1528 | } | 1528 | } |
1529 | skb->h.raw = skb->data; | 1529 | skb->h.raw = skb->data; |
1530 | 1530 | ||
1531 | return atalk_rcv(skb, dev, pt); | 1531 | return atalk_rcv(skb, dev, pt, orig_dev); |
1532 | freeit: | 1532 | freeit: |
1533 | kfree_skb(skb); | 1533 | kfree_skb(skb); |
1534 | return 0; | 1534 | return 0; |
diff --git a/net/atm/ipcommon.c b/net/atm/ipcommon.c index 181a3002d8ad..4b1faca5013f 100644 --- a/net/atm/ipcommon.c +++ b/net/atm/ipcommon.c | |||
@@ -34,7 +34,6 @@ | |||
34 | 34 | ||
35 | void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to) | 35 | void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to) |
36 | { | 36 | { |
37 | struct sk_buff *skb; | ||
38 | unsigned long flags; | 37 | unsigned long flags; |
39 | struct sk_buff *skb_from = (struct sk_buff *) from; | 38 | struct sk_buff *skb_from = (struct sk_buff *) from; |
40 | struct sk_buff *skb_to = (struct sk_buff *) to; | 39 | struct sk_buff *skb_to = (struct sk_buff *) to; |
@@ -47,8 +46,6 @@ void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to) | |||
47 | prev->next = skb_to; | 46 | prev->next = skb_to; |
48 | to->prev->next = from->next; | 47 | to->prev->next = from->next; |
49 | to->prev = from->prev; | 48 | to->prev = from->prev; |
50 | for (skb = from->next; skb != skb_to; skb = skb->next) | ||
51 | skb->list = to; | ||
52 | to->qlen += from->qlen; | 49 | to->qlen += from->qlen; |
53 | spin_unlock(&to->lock); | 50 | spin_unlock(&to->lock); |
54 | from->prev = skb_from; | 51 | from->prev = skb_from; |
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index a5c94f11547c..ea43dfb774e2 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c | |||
@@ -45,7 +45,7 @@ | |||
45 | #include <linux/sysctl.h> | 45 | #include <linux/sysctl.h> |
46 | #include <linux/init.h> | 46 | #include <linux/init.h> |
47 | #include <linux/spinlock.h> | 47 | #include <linux/spinlock.h> |
48 | #include <net/tcp.h> | 48 | #include <net/tcp_states.h> |
49 | #include <net/ip.h> | 49 | #include <net/ip.h> |
50 | #include <net/arp.h> | 50 | #include <net/arp.h> |
51 | 51 | ||
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c index 8adc0022cf58..edcaa897027c 100644 --- a/net/ax25/ax25_ds_in.c +++ b/net/ax25/ax25_ds_in.c | |||
@@ -22,8 +22,7 @@ | |||
22 | #include <linux/netdevice.h> | 22 | #include <linux/netdevice.h> |
23 | #include <linux/skbuff.h> | 23 | #include <linux/skbuff.h> |
24 | #include <net/sock.h> | 24 | #include <net/sock.h> |
25 | #include <net/ip.h> /* For ip_rcv */ | 25 | #include <net/tcp_states.h> |
26 | #include <net/tcp.h> | ||
27 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
28 | #include <asm/system.h> | 27 | #include <asm/system.h> |
29 | #include <linux/fcntl.h> | 28 | #include <linux/fcntl.h> |
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c index 3a8b67316fc3..061083efc1dc 100644 --- a/net/ax25/ax25_ds_timer.c +++ b/net/ax25/ax25_ds_timer.c | |||
@@ -18,7 +18,7 @@ | |||
18 | #include <linux/string.h> | 18 | #include <linux/string.h> |
19 | #include <linux/sockios.h> | 19 | #include <linux/sockios.h> |
20 | #include <linux/net.h> | 20 | #include <linux/net.h> |
21 | #include <net/tcp.h> | 21 | #include <net/tcp_states.h> |
22 | #include <net/ax25.h> | 22 | #include <net/ax25.h> |
23 | #include <linux/inet.h> | 23 | #include <linux/inet.h> |
24 | #include <linux/netdevice.h> | 24 | #include <linux/netdevice.h> |
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 3dc808fde33f..810c9c76c2e0 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c | |||
@@ -9,7 +9,6 @@ | |||
9 | * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) | 9 | * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) |
10 | * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) | 10 | * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) |
11 | */ | 11 | */ |
12 | #include <linux/config.h> | ||
13 | #include <linux/errno.h> | 12 | #include <linux/errno.h> |
14 | #include <linux/types.h> | 13 | #include <linux/types.h> |
15 | #include <linux/socket.h> | 14 | #include <linux/socket.h> |
@@ -26,9 +25,7 @@ | |||
26 | #include <linux/skbuff.h> | 25 | #include <linux/skbuff.h> |
27 | #include <linux/netfilter.h> | 26 | #include <linux/netfilter.h> |
28 | #include <net/sock.h> | 27 | #include <net/sock.h> |
29 | #include <net/ip.h> /* For ip_rcv */ | 28 | #include <net/tcp_states.h> |
30 | #include <net/tcp.h> | ||
31 | #include <net/arp.h> /* For arp_rcv */ | ||
32 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
33 | #include <asm/system.h> | 30 | #include <asm/system.h> |
34 | #include <linux/fcntl.h> | 31 | #include <linux/fcntl.h> |
@@ -114,7 +111,6 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb) | |||
114 | 111 | ||
115 | pid = *skb->data; | 112 | pid = *skb->data; |
116 | 113 | ||
117 | #ifdef CONFIG_INET | ||
118 | if (pid == AX25_P_IP) { | 114 | if (pid == AX25_P_IP) { |
119 | /* working around a TCP bug to keep additional listeners | 115 | /* working around a TCP bug to keep additional listeners |
120 | * happy. TCP re-uses the buffer and destroys the original | 116 | * happy. TCP re-uses the buffer and destroys the original |
@@ -132,10 +128,9 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb) | |||
132 | skb->dev = ax25->ax25_dev->dev; | 128 | skb->dev = ax25->ax25_dev->dev; |
133 | skb->pkt_type = PACKET_HOST; | 129 | skb->pkt_type = PACKET_HOST; |
134 | skb->protocol = htons(ETH_P_IP); | 130 | skb->protocol = htons(ETH_P_IP); |
135 | ip_rcv(skb, skb->dev, NULL); /* Wrong ptype */ | 131 | netif_rx(skb); |
136 | return 1; | 132 | return 1; |
137 | } | 133 | } |
138 | #endif | ||
139 | if (pid == AX25_P_SEGMENT) { | 134 | if (pid == AX25_P_SEGMENT) { |
140 | skb_pull(skb, 1); /* Remove PID */ | 135 | skb_pull(skb, 1); /* Remove PID */ |
141 | return ax25_rx_fragment(ax25, skb); | 136 | return ax25_rx_fragment(ax25, skb); |
@@ -250,7 +245,6 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, | |||
250 | 245 | ||
251 | /* Now we are pointing at the pid byte */ | 246 | /* Now we are pointing at the pid byte */ |
252 | switch (skb->data[1]) { | 247 | switch (skb->data[1]) { |
253 | #ifdef CONFIG_INET | ||
254 | case AX25_P_IP: | 248 | case AX25_P_IP: |
255 | skb_pull(skb,2); /* drop PID/CTRL */ | 249 | skb_pull(skb,2); /* drop PID/CTRL */ |
256 | skb->h.raw = skb->data; | 250 | skb->h.raw = skb->data; |
@@ -258,7 +252,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, | |||
258 | skb->dev = dev; | 252 | skb->dev = dev; |
259 | skb->pkt_type = PACKET_HOST; | 253 | skb->pkt_type = PACKET_HOST; |
260 | skb->protocol = htons(ETH_P_IP); | 254 | skb->protocol = htons(ETH_P_IP); |
261 | ip_rcv(skb, dev, ptype); /* Note ptype here is the wrong one, fix me later */ | 255 | netif_rx(skb); |
262 | break; | 256 | break; |
263 | 257 | ||
264 | case AX25_P_ARP: | 258 | case AX25_P_ARP: |
@@ -268,9 +262,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, | |||
268 | skb->dev = dev; | 262 | skb->dev = dev; |
269 | skb->pkt_type = PACKET_HOST; | 263 | skb->pkt_type = PACKET_HOST; |
270 | skb->protocol = htons(ETH_P_ARP); | 264 | skb->protocol = htons(ETH_P_ARP); |
271 | arp_rcv(skb, dev, ptype); /* Note ptype here is wrong... */ | 265 | netif_rx(skb); |
272 | break; | 266 | break; |
273 | #endif | ||
274 | case AX25_P_TEXT: | 267 | case AX25_P_TEXT: |
275 | /* Now find a suitable dgram socket */ | 268 | /* Now find a suitable dgram socket */ |
276 | sk = ax25_get_socket(&dest, &src, SOCK_DGRAM); | 269 | sk = ax25_get_socket(&dest, &src, SOCK_DGRAM); |
@@ -454,7 +447,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, | |||
454 | * Receive an AX.25 frame via a SLIP interface. | 447 | * Receive an AX.25 frame via a SLIP interface. |
455 | */ | 448 | */ |
456 | int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, | 449 | int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, |
457 | struct packet_type *ptype) | 450 | struct packet_type *ptype, struct net_device *orig_dev) |
458 | { | 451 | { |
459 | skb->sk = NULL; /* Initially we don't know who it's for */ | 452 | skb->sk = NULL; /* Initially we don't know who it's for */ |
460 | skb->destructor = NULL; /* Who initializes this, dammit?! */ | 453 | skb->destructor = NULL; /* Who initializes this, dammit?! */ |
diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c index 7131873322c4..f6ed283e9de8 100644 --- a/net/ax25/ax25_std_in.c +++ b/net/ax25/ax25_std_in.c | |||
@@ -29,8 +29,7 @@ | |||
29 | #include <linux/netdevice.h> | 29 | #include <linux/netdevice.h> |
30 | #include <linux/skbuff.h> | 30 | #include <linux/skbuff.h> |
31 | #include <net/sock.h> | 31 | #include <net/sock.h> |
32 | #include <net/ip.h> /* For ip_rcv */ | 32 | #include <net/tcp_states.h> |
33 | #include <net/tcp.h> | ||
34 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
35 | #include <asm/system.h> | 34 | #include <asm/system.h> |
36 | #include <linux/fcntl.h> | 35 | #include <linux/fcntl.h> |
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c index 066897bc0749..a29c480a4dc1 100644 --- a/net/ax25/ax25_std_timer.c +++ b/net/ax25/ax25_std_timer.c | |||
@@ -24,7 +24,7 @@ | |||
24 | #include <linux/netdevice.h> | 24 | #include <linux/netdevice.h> |
25 | #include <linux/skbuff.h> | 25 | #include <linux/skbuff.h> |
26 | #include <net/sock.h> | 26 | #include <net/sock.h> |
27 | #include <net/tcp.h> | 27 | #include <net/tcp_states.h> |
28 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
29 | #include <asm/system.h> | 29 | #include <asm/system.h> |
30 | #include <linux/fcntl.h> | 30 | #include <linux/fcntl.h> |
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 99694b57f6f5..c41dbe5fadee 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c | |||
@@ -24,7 +24,7 @@ | |||
24 | #include <linux/netdevice.h> | 24 | #include <linux/netdevice.h> |
25 | #include <linux/skbuff.h> | 25 | #include <linux/skbuff.h> |
26 | #include <net/sock.h> | 26 | #include <net/sock.h> |
27 | #include <net/tcp.h> | 27 | #include <net/tcp_states.h> |
28 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
29 | #include <asm/system.h> | 29 | #include <asm/system.h> |
30 | #include <linux/fcntl.h> | 30 | #include <linux/fcntl.h> |
@@ -76,7 +76,7 @@ void ax25_requeue_frames(ax25_cb *ax25) | |||
76 | if (skb_prev == NULL) | 76 | if (skb_prev == NULL) |
77 | skb_queue_head(&ax25->write_queue, skb); | 77 | skb_queue_head(&ax25->write_queue, skb); |
78 | else | 78 | else |
79 | skb_append(skb_prev, skb); | 79 | skb_append(skb_prev, skb, &ax25->write_queue); |
80 | skb_prev = skb; | 80 | skb_prev = skb; |
81 | } | 81 | } |
82 | } | 82 | } |
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ffa26c10bfe8..55dc42eac92c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c | |||
@@ -191,7 +191,7 @@ static void hci_init_req(struct hci_dev *hdev, unsigned long opt) | |||
191 | 191 | ||
192 | /* Special commands */ | 192 | /* Special commands */ |
193 | while ((skb = skb_dequeue(&hdev->driver_init))) { | 193 | while ((skb = skb_dequeue(&hdev->driver_init))) { |
194 | skb->pkt_type = HCI_COMMAND_PKT; | 194 | bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; |
195 | skb->dev = (void *) hdev; | 195 | skb->dev = (void *) hdev; |
196 | skb_queue_tail(&hdev->cmd_q, skb); | 196 | skb_queue_tail(&hdev->cmd_q, skb); |
197 | hci_sched_cmd(hdev); | 197 | hci_sched_cmd(hdev); |
@@ -995,11 +995,11 @@ static int hci_send_frame(struct sk_buff *skb) | |||
995 | return -ENODEV; | 995 | return -ENODEV; |
996 | } | 996 | } |
997 | 997 | ||
998 | BT_DBG("%s type %d len %d", hdev->name, skb->pkt_type, skb->len); | 998 | BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len); |
999 | 999 | ||
1000 | if (atomic_read(&hdev->promisc)) { | 1000 | if (atomic_read(&hdev->promisc)) { |
1001 | /* Time stamp */ | 1001 | /* Time stamp */ |
1002 | do_gettimeofday(&skb->stamp); | 1002 | __net_timestamp(skb); |
1003 | 1003 | ||
1004 | hci_send_to_sock(hdev, skb); | 1004 | hci_send_to_sock(hdev, skb); |
1005 | } | 1005 | } |
@@ -1034,7 +1034,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *p | |||
1034 | 1034 | ||
1035 | BT_DBG("skb len %d", skb->len); | 1035 | BT_DBG("skb len %d", skb->len); |
1036 | 1036 | ||
1037 | skb->pkt_type = HCI_COMMAND_PKT; | 1037 | bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; |
1038 | skb->dev = (void *) hdev; | 1038 | skb->dev = (void *) hdev; |
1039 | skb_queue_tail(&hdev->cmd_q, skb); | 1039 | skb_queue_tail(&hdev->cmd_q, skb); |
1040 | hci_sched_cmd(hdev); | 1040 | hci_sched_cmd(hdev); |
@@ -1081,7 +1081,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags) | |||
1081 | BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags); | 1081 | BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags); |
1082 | 1082 | ||
1083 | skb->dev = (void *) hdev; | 1083 | skb->dev = (void *) hdev; |
1084 | skb->pkt_type = HCI_ACLDATA_PKT; | 1084 | bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT; |
1085 | hci_add_acl_hdr(skb, conn->handle, flags | ACL_START); | 1085 | hci_add_acl_hdr(skb, conn->handle, flags | ACL_START); |
1086 | 1086 | ||
1087 | if (!(list = skb_shinfo(skb)->frag_list)) { | 1087 | if (!(list = skb_shinfo(skb)->frag_list)) { |
@@ -1103,7 +1103,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags) | |||
1103 | skb = list; list = list->next; | 1103 | skb = list; list = list->next; |
1104 | 1104 | ||
1105 | skb->dev = (void *) hdev; | 1105 | skb->dev = (void *) hdev; |
1106 | skb->pkt_type = HCI_ACLDATA_PKT; | 1106 | bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT; |
1107 | hci_add_acl_hdr(skb, conn->handle, flags | ACL_CONT); | 1107 | hci_add_acl_hdr(skb, conn->handle, flags | ACL_CONT); |
1108 | 1108 | ||
1109 | BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); | 1109 | BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); |
@@ -1139,7 +1139,7 @@ int hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) | |||
1139 | memcpy(skb->h.raw, &hdr, HCI_SCO_HDR_SIZE); | 1139 | memcpy(skb->h.raw, &hdr, HCI_SCO_HDR_SIZE); |
1140 | 1140 | ||
1141 | skb->dev = (void *) hdev; | 1141 | skb->dev = (void *) hdev; |
1142 | skb->pkt_type = HCI_SCODATA_PKT; | 1142 | bt_cb(skb)->pkt_type = HCI_SCODATA_PKT; |
1143 | skb_queue_tail(&conn->data_q, skb); | 1143 | skb_queue_tail(&conn->data_q, skb); |
1144 | hci_sched_tx(hdev); | 1144 | hci_sched_tx(hdev); |
1145 | return 0; | 1145 | return 0; |
@@ -1369,7 +1369,7 @@ void hci_rx_task(unsigned long arg) | |||
1369 | 1369 | ||
1370 | if (test_bit(HCI_INIT, &hdev->flags)) { | 1370 | if (test_bit(HCI_INIT, &hdev->flags)) { |
1371 | /* Don't process data packets in this states. */ | 1371 | /* Don't process data packets in this states. */ |
1372 | switch (skb->pkt_type) { | 1372 | switch (bt_cb(skb)->pkt_type) { |
1373 | case HCI_ACLDATA_PKT: | 1373 | case HCI_ACLDATA_PKT: |
1374 | case HCI_SCODATA_PKT: | 1374 | case HCI_SCODATA_PKT: |
1375 | kfree_skb(skb); | 1375 | kfree_skb(skb); |
@@ -1378,7 +1378,7 @@ void hci_rx_task(unsigned long arg) | |||
1378 | } | 1378 | } |
1379 | 1379 | ||
1380 | /* Process frame */ | 1380 | /* Process frame */ |
1381 | switch (skb->pkt_type) { | 1381 | switch (bt_cb(skb)->pkt_type) { |
1382 | case HCI_EVENT_PKT: | 1382 | case HCI_EVENT_PKT: |
1383 | hci_event_packet(hdev, skb); | 1383 | hci_event_packet(hdev, skb); |
1384 | break; | 1384 | break; |
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 46367bd129c3..d6da0939216d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c | |||
@@ -484,14 +484,18 @@ static inline void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff | |||
484 | /* Inquiry Result */ | 484 | /* Inquiry Result */ |
485 | static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) | 485 | static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) |
486 | { | 486 | { |
487 | struct inquiry_data data; | ||
487 | struct inquiry_info *info = (struct inquiry_info *) (skb->data + 1); | 488 | struct inquiry_info *info = (struct inquiry_info *) (skb->data + 1); |
488 | int num_rsp = *((__u8 *) skb->data); | 489 | int num_rsp = *((__u8 *) skb->data); |
489 | 490 | ||
490 | BT_DBG("%s num_rsp %d", hdev->name, num_rsp); | 491 | BT_DBG("%s num_rsp %d", hdev->name, num_rsp); |
491 | 492 | ||
493 | if (!num_rsp) | ||
494 | return; | ||
495 | |||
492 | hci_dev_lock(hdev); | 496 | hci_dev_lock(hdev); |
497 | |||
493 | for (; num_rsp; num_rsp--) { | 498 | for (; num_rsp; num_rsp--) { |
494 | struct inquiry_data data; | ||
495 | bacpy(&data.bdaddr, &info->bdaddr); | 499 | bacpy(&data.bdaddr, &info->bdaddr); |
496 | data.pscan_rep_mode = info->pscan_rep_mode; | 500 | data.pscan_rep_mode = info->pscan_rep_mode; |
497 | data.pscan_period_mode = info->pscan_period_mode; | 501 | data.pscan_period_mode = info->pscan_period_mode; |
@@ -502,30 +506,55 @@ static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff * | |||
502 | info++; | 506 | info++; |
503 | hci_inquiry_cache_update(hdev, &data); | 507 | hci_inquiry_cache_update(hdev, &data); |
504 | } | 508 | } |
509 | |||
505 | hci_dev_unlock(hdev); | 510 | hci_dev_unlock(hdev); |
506 | } | 511 | } |
507 | 512 | ||
508 | /* Inquiry Result With RSSI */ | 513 | /* Inquiry Result With RSSI */ |
509 | static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb) | 514 | static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb) |
510 | { | 515 | { |
511 | struct inquiry_info_with_rssi *info = (struct inquiry_info_with_rssi *) (skb->data + 1); | 516 | struct inquiry_data data; |
512 | int num_rsp = *((__u8 *) skb->data); | 517 | int num_rsp = *((__u8 *) skb->data); |
513 | 518 | ||
514 | BT_DBG("%s num_rsp %d", hdev->name, num_rsp); | 519 | BT_DBG("%s num_rsp %d", hdev->name, num_rsp); |
515 | 520 | ||
521 | if (!num_rsp) | ||
522 | return; | ||
523 | |||
516 | hci_dev_lock(hdev); | 524 | hci_dev_lock(hdev); |
517 | for (; num_rsp; num_rsp--) { | 525 | |
518 | struct inquiry_data data; | 526 | if ((skb->len - 1) / num_rsp != sizeof(struct inquiry_info_with_rssi)) { |
519 | bacpy(&data.bdaddr, &info->bdaddr); | 527 | struct inquiry_info_with_rssi_and_pscan_mode *info = |
520 | data.pscan_rep_mode = info->pscan_rep_mode; | 528 | (struct inquiry_info_with_rssi_and_pscan_mode *) (skb->data + 1); |
521 | data.pscan_period_mode = info->pscan_period_mode; | 529 | |
522 | data.pscan_mode = 0x00; | 530 | for (; num_rsp; num_rsp--) { |
523 | memcpy(data.dev_class, info->dev_class, 3); | 531 | bacpy(&data.bdaddr, &info->bdaddr); |
524 | data.clock_offset = info->clock_offset; | 532 | data.pscan_rep_mode = info->pscan_rep_mode; |
525 | data.rssi = info->rssi; | 533 | data.pscan_period_mode = info->pscan_period_mode; |
526 | info++; | 534 | data.pscan_mode = info->pscan_mode; |
527 | hci_inquiry_cache_update(hdev, &data); | 535 | memcpy(data.dev_class, info->dev_class, 3); |
536 | data.clock_offset = info->clock_offset; | ||
537 | data.rssi = info->rssi; | ||
538 | info++; | ||
539 | hci_inquiry_cache_update(hdev, &data); | ||
540 | } | ||
541 | } else { | ||
542 | struct inquiry_info_with_rssi *info = | ||
543 | (struct inquiry_info_with_rssi *) (skb->data + 1); | ||
544 | |||
545 | for (; num_rsp; num_rsp--) { | ||
546 | bacpy(&data.bdaddr, &info->bdaddr); | ||
547 | data.pscan_rep_mode = info->pscan_rep_mode; | ||
548 | data.pscan_period_mode = info->pscan_period_mode; | ||
549 | data.pscan_mode = 0x00; | ||
550 | memcpy(data.dev_class, info->dev_class, 3); | ||
551 | data.clock_offset = info->clock_offset; | ||
552 | data.rssi = info->rssi; | ||
553 | info++; | ||
554 | hci_inquiry_cache_update(hdev, &data); | ||
555 | } | ||
528 | } | 556 | } |
557 | |||
529 | hci_dev_unlock(hdev); | 558 | hci_dev_unlock(hdev); |
530 | } | 559 | } |
531 | 560 | ||
@@ -865,6 +894,24 @@ static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *sk | |||
865 | hci_dev_unlock(hdev); | 894 | hci_dev_unlock(hdev); |
866 | } | 895 | } |
867 | 896 | ||
897 | /* Page Scan Repetition Mode */ | ||
898 | static inline void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb) | ||
899 | { | ||
900 | struct hci_ev_pscan_rep_mode *ev = (struct hci_ev_pscan_rep_mode *) skb->data; | ||
901 | struct inquiry_entry *ie; | ||
902 | |||
903 | BT_DBG("%s", hdev->name); | ||
904 | |||
905 | hci_dev_lock(hdev); | ||
906 | |||
907 | if ((ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr))) { | ||
908 | ie->data.pscan_rep_mode = ev->pscan_rep_mode; | ||
909 | ie->timestamp = jiffies; | ||
910 | } | ||
911 | |||
912 | hci_dev_unlock(hdev); | ||
913 | } | ||
914 | |||
868 | void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) | 915 | void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) |
869 | { | 916 | { |
870 | struct hci_event_hdr *hdr = (struct hci_event_hdr *) skb->data; | 917 | struct hci_event_hdr *hdr = (struct hci_event_hdr *) skb->data; |
@@ -937,6 +984,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) | |||
937 | hci_clock_offset_evt(hdev, skb); | 984 | hci_clock_offset_evt(hdev, skb); |
938 | break; | 985 | break; |
939 | 986 | ||
987 | case HCI_EV_PSCAN_REP_MODE: | ||
988 | hci_pscan_rep_mode_evt(hdev, skb); | ||
989 | break; | ||
990 | |||
940 | case HCI_EV_CMD_STATUS: | 991 | case HCI_EV_CMD_STATUS: |
941 | cs = (struct hci_ev_cmd_status *) skb->data; | 992 | cs = (struct hci_ev_cmd_status *) skb->data; |
942 | skb_pull(skb, sizeof(cs)); | 993 | skb_pull(skb, sizeof(cs)); |
@@ -1036,9 +1087,9 @@ void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) | |||
1036 | memcpy(ev->data, data, dlen); | 1087 | memcpy(ev->data, data, dlen); |
1037 | 1088 | ||
1038 | bt_cb(skb)->incoming = 1; | 1089 | bt_cb(skb)->incoming = 1; |
1039 | do_gettimeofday(&skb->stamp); | 1090 | __net_timestamp(skb); |
1040 | 1091 | ||
1041 | skb->pkt_type = HCI_EVENT_PKT; | 1092 | bt_cb(skb)->pkt_type = HCI_EVENT_PKT; |
1042 | skb->dev = (void *) hdev; | 1093 | skb->dev = (void *) hdev; |
1043 | hci_send_to_sock(hdev, skb); | 1094 | hci_send_to_sock(hdev, skb); |
1044 | kfree_skb(skb); | 1095 | kfree_skb(skb); |
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index ebdcce5e7ca0..32ef7975a139 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c | |||
@@ -110,11 +110,11 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) | |||
110 | /* Apply filter */ | 110 | /* Apply filter */ |
111 | flt = &hci_pi(sk)->filter; | 111 | flt = &hci_pi(sk)->filter; |
112 | 112 | ||
113 | if (!test_bit((skb->pkt_type == HCI_VENDOR_PKT) ? | 113 | if (!test_bit((bt_cb(skb)->pkt_type == HCI_VENDOR_PKT) ? |
114 | 0 : (skb->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask)) | 114 | 0 : (bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask)) |
115 | continue; | 115 | continue; |
116 | 116 | ||
117 | if (skb->pkt_type == HCI_EVENT_PKT) { | 117 | if (bt_cb(skb)->pkt_type == HCI_EVENT_PKT) { |
118 | register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS); | 118 | register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS); |
119 | 119 | ||
120 | if (!hci_test_bit(evt, &flt->event_mask)) | 120 | if (!hci_test_bit(evt, &flt->event_mask)) |
@@ -131,7 +131,7 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) | |||
131 | continue; | 131 | continue; |
132 | 132 | ||
133 | /* Put type byte before the data */ | 133 | /* Put type byte before the data */ |
134 | memcpy(skb_push(nskb, 1), &nskb->pkt_type, 1); | 134 | memcpy(skb_push(nskb, 1), &bt_cb(nskb)->pkt_type, 1); |
135 | 135 | ||
136 | if (sock_queue_rcv_skb(sk, nskb)) | 136 | if (sock_queue_rcv_skb(sk, nskb)) |
137 | kfree_skb(nskb); | 137 | kfree_skb(nskb); |
@@ -327,11 +327,17 @@ static inline void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_ | |||
327 | { | 327 | { |
328 | __u32 mask = hci_pi(sk)->cmsg_mask; | 328 | __u32 mask = hci_pi(sk)->cmsg_mask; |
329 | 329 | ||
330 | if (mask & HCI_CMSG_DIR) | 330 | if (mask & HCI_CMSG_DIR) { |
331 | put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(int), &bt_cb(skb)->incoming); | 331 | int incoming = bt_cb(skb)->incoming; |
332 | put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming); | ||
333 | } | ||
334 | |||
335 | if (mask & HCI_CMSG_TSTAMP) { | ||
336 | struct timeval tv; | ||
332 | 337 | ||
333 | if (mask & HCI_CMSG_TSTAMP) | 338 | skb_get_timestamp(skb, &tv); |
334 | put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(skb->stamp), &skb->stamp); | 339 | put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(tv), &tv); |
340 | } | ||
335 | } | 341 | } |
336 | 342 | ||
337 | static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, | 343 | static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, |
@@ -405,11 +411,11 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, | |||
405 | goto drop; | 411 | goto drop; |
406 | } | 412 | } |
407 | 413 | ||
408 | skb->pkt_type = *((unsigned char *) skb->data); | 414 | bt_cb(skb)->pkt_type = *((unsigned char *) skb->data); |
409 | skb_pull(skb, 1); | 415 | skb_pull(skb, 1); |
410 | skb->dev = (void *) hdev; | 416 | skb->dev = (void *) hdev; |
411 | 417 | ||
412 | if (skb->pkt_type == HCI_COMMAND_PKT) { | 418 | if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) { |
413 | u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data)); | 419 | u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data)); |
414 | u16 ogf = hci_opcode_ogf(opcode); | 420 | u16 ogf = hci_opcode_ogf(opcode); |
415 | u16 ocf = hci_opcode_ocf(opcode); | 421 | u16 ocf = hci_opcode_ocf(opcode); |
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 32fccfb5bfa5..d3d6bc547212 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c | |||
@@ -372,7 +372,7 @@ static struct proto l2cap_proto = { | |||
372 | .obj_size = sizeof(struct l2cap_pinfo) | 372 | .obj_size = sizeof(struct l2cap_pinfo) |
373 | }; | 373 | }; |
374 | 374 | ||
375 | static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, int prio) | 375 | static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio) |
376 | { | 376 | { |
377 | struct sock *sk; | 377 | struct sock *sk; |
378 | 378 | ||
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 27bf5047cd33..173f46e8cdae 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c | |||
@@ -21,10 +21,6 @@ | |||
21 | SOFTWARE IS DISCLAIMED. | 21 | SOFTWARE IS DISCLAIMED. |
22 | */ | 22 | */ |
23 | 23 | ||
24 | /* | ||
25 | RPN support - Dirk Husemann <hud@zurich.ibm.com> | ||
26 | */ | ||
27 | |||
28 | /* | 24 | /* |
29 | * Bluetooth RFCOMM core. | 25 | * Bluetooth RFCOMM core. |
30 | * | 26 | * |
@@ -115,10 +111,10 @@ static void rfcomm_session_del(struct rfcomm_session *s); | |||
115 | #define __get_mcc_len(b) ((b & 0xfe) >> 1) | 111 | #define __get_mcc_len(b) ((b & 0xfe) >> 1) |
116 | 112 | ||
117 | /* RPN macros */ | 113 | /* RPN macros */ |
118 | #define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x3) << 3)) | 114 | #define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x7) << 3)) |
119 | #define __get_rpn_data_bits(line) ((line) & 0x3) | 115 | #define __get_rpn_data_bits(line) ((line) & 0x3) |
120 | #define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1) | 116 | #define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1) |
121 | #define __get_rpn_parity(line) (((line) >> 3) & 0x3) | 117 | #define __get_rpn_parity(line) (((line) >> 3) & 0x7) |
122 | 118 | ||
123 | static inline void rfcomm_schedule(uint event) | 119 | static inline void rfcomm_schedule(uint event) |
124 | { | 120 | { |
@@ -233,7 +229,7 @@ static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d) | |||
233 | d->rx_credits = RFCOMM_DEFAULT_CREDITS; | 229 | d->rx_credits = RFCOMM_DEFAULT_CREDITS; |
234 | } | 230 | } |
235 | 231 | ||
236 | struct rfcomm_dlc *rfcomm_dlc_alloc(int prio) | 232 | struct rfcomm_dlc *rfcomm_dlc_alloc(unsigned int __nocast prio) |
237 | { | 233 | { |
238 | struct rfcomm_dlc *d = kmalloc(sizeof(*d), prio); | 234 | struct rfcomm_dlc *d = kmalloc(sizeof(*d), prio); |
239 | if (!d) | 235 | if (!d) |
@@ -780,10 +776,10 @@ static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d | |||
780 | return rfcomm_send_frame(s, buf, ptr - buf); | 776 | return rfcomm_send_frame(s, buf, ptr - buf); |
781 | } | 777 | } |
782 | 778 | ||
783 | static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, | 779 | int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, |
784 | u8 bit_rate, u8 data_bits, u8 stop_bits, | 780 | u8 bit_rate, u8 data_bits, u8 stop_bits, |
785 | u8 parity, u8 flow_ctrl_settings, | 781 | u8 parity, u8 flow_ctrl_settings, |
786 | u8 xon_char, u8 xoff_char, u16 param_mask) | 782 | u8 xon_char, u8 xoff_char, u16 param_mask) |
787 | { | 783 | { |
788 | struct rfcomm_hdr *hdr; | 784 | struct rfcomm_hdr *hdr; |
789 | struct rfcomm_mcc *mcc; | 785 | struct rfcomm_mcc *mcc; |
@@ -791,9 +787,9 @@ static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, | |||
791 | u8 buf[16], *ptr = buf; | 787 | u8 buf[16], *ptr = buf; |
792 | 788 | ||
793 | BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x" | 789 | BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x" |
794 | "flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x", | 790 | " flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x", |
795 | s, cr, dlci, bit_rate, data_bits, stop_bits, parity, | 791 | s, cr, dlci, bit_rate, data_bits, stop_bits, parity, |
796 | flow_ctrl_settings, xon_char, xoff_char, param_mask); | 792 | flow_ctrl_settings, xon_char, xoff_char, param_mask); |
797 | 793 | ||
798 | hdr = (void *) ptr; ptr += sizeof(*hdr); | 794 | hdr = (void *) ptr; ptr += sizeof(*hdr); |
799 | hdr->addr = __addr(s->initiator, 0); | 795 | hdr->addr = __addr(s->initiator, 0); |
@@ -1265,16 +1261,16 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ | |||
1265 | u8 xon_char = 0; | 1261 | u8 xon_char = 0; |
1266 | u8 xoff_char = 0; | 1262 | u8 xoff_char = 0; |
1267 | u16 rpn_mask = RFCOMM_RPN_PM_ALL; | 1263 | u16 rpn_mask = RFCOMM_RPN_PM_ALL; |
1268 | 1264 | ||
1269 | BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", | 1265 | BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", |
1270 | dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl, | 1266 | dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl, |
1271 | rpn->xon_char, rpn->xoff_char, rpn->param_mask); | 1267 | rpn->xon_char, rpn->xoff_char, rpn->param_mask); |
1272 | 1268 | ||
1273 | if (!cr) | 1269 | if (!cr) |
1274 | return 0; | 1270 | return 0; |
1275 | 1271 | ||
1276 | if (len == 1) { | 1272 | if (len == 1) { |
1277 | /* request: return default setting */ | 1273 | /* This is a request, return default settings */ |
1278 | bit_rate = RFCOMM_RPN_BR_115200; | 1274 | bit_rate = RFCOMM_RPN_BR_115200; |
1279 | data_bits = RFCOMM_RPN_DATA_8; | 1275 | data_bits = RFCOMM_RPN_DATA_8; |
1280 | stop_bits = RFCOMM_RPN_STOP_1; | 1276 | stop_bits = RFCOMM_RPN_STOP_1; |
@@ -1282,11 +1278,12 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ | |||
1282 | flow_ctrl = RFCOMM_RPN_FLOW_NONE; | 1278 | flow_ctrl = RFCOMM_RPN_FLOW_NONE; |
1283 | xon_char = RFCOMM_RPN_XON_CHAR; | 1279 | xon_char = RFCOMM_RPN_XON_CHAR; |
1284 | xoff_char = RFCOMM_RPN_XOFF_CHAR; | 1280 | xoff_char = RFCOMM_RPN_XOFF_CHAR; |
1285 | |||
1286 | goto rpn_out; | 1281 | goto rpn_out; |
1287 | } | 1282 | } |
1288 | /* check for sane values: ignore/accept bit_rate, 8 bits, 1 stop bit, no parity, | 1283 | |
1289 | no flow control lines, normal XON/XOFF chars */ | 1284 | /* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit, |
1285 | * no parity, no flow control lines, normal XON/XOFF chars */ | ||
1286 | |||
1290 | if (rpn->param_mask & RFCOMM_RPN_PM_BITRATE) { | 1287 | if (rpn->param_mask & RFCOMM_RPN_PM_BITRATE) { |
1291 | bit_rate = rpn->bit_rate; | 1288 | bit_rate = rpn->bit_rate; |
1292 | if (bit_rate != RFCOMM_RPN_BR_115200) { | 1289 | if (bit_rate != RFCOMM_RPN_BR_115200) { |
@@ -1295,6 +1292,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ | |||
1295 | rpn_mask ^= RFCOMM_RPN_PM_BITRATE; | 1292 | rpn_mask ^= RFCOMM_RPN_PM_BITRATE; |
1296 | } | 1293 | } |
1297 | } | 1294 | } |
1295 | |||
1298 | if (rpn->param_mask & RFCOMM_RPN_PM_DATA) { | 1296 | if (rpn->param_mask & RFCOMM_RPN_PM_DATA) { |
1299 | data_bits = __get_rpn_data_bits(rpn->line_settings); | 1297 | data_bits = __get_rpn_data_bits(rpn->line_settings); |
1300 | if (data_bits != RFCOMM_RPN_DATA_8) { | 1298 | if (data_bits != RFCOMM_RPN_DATA_8) { |
@@ -1303,6 +1301,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ | |||
1303 | rpn_mask ^= RFCOMM_RPN_PM_DATA; | 1301 | rpn_mask ^= RFCOMM_RPN_PM_DATA; |
1304 | } | 1302 | } |
1305 | } | 1303 | } |
1304 | |||
1306 | if (rpn->param_mask & RFCOMM_RPN_PM_STOP) { | 1305 | if (rpn->param_mask & RFCOMM_RPN_PM_STOP) { |
1307 | stop_bits = __get_rpn_stop_bits(rpn->line_settings); | 1306 | stop_bits = __get_rpn_stop_bits(rpn->line_settings); |
1308 | if (stop_bits != RFCOMM_RPN_STOP_1) { | 1307 | if (stop_bits != RFCOMM_RPN_STOP_1) { |
@@ -1311,6 +1310,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ | |||
1311 | rpn_mask ^= RFCOMM_RPN_PM_STOP; | 1310 | rpn_mask ^= RFCOMM_RPN_PM_STOP; |
1312 | } | 1311 | } |
1313 | } | 1312 | } |
1313 | |||
1314 | if (rpn->param_mask & RFCOMM_RPN_PM_PARITY) { | 1314 | if (rpn->param_mask & RFCOMM_RPN_PM_PARITY) { |
1315 | parity = __get_rpn_parity(rpn->line_settings); | 1315 | parity = __get_rpn_parity(rpn->line_settings); |
1316 | if (parity != RFCOMM_RPN_PARITY_NONE) { | 1316 | if (parity != RFCOMM_RPN_PARITY_NONE) { |
@@ -1319,6 +1319,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ | |||
1319 | rpn_mask ^= RFCOMM_RPN_PM_PARITY; | 1319 | rpn_mask ^= RFCOMM_RPN_PM_PARITY; |
1320 | } | 1320 | } |
1321 | } | 1321 | } |
1322 | |||
1322 | if (rpn->param_mask & RFCOMM_RPN_PM_FLOW) { | 1323 | if (rpn->param_mask & RFCOMM_RPN_PM_FLOW) { |
1323 | flow_ctrl = rpn->flow_ctrl; | 1324 | flow_ctrl = rpn->flow_ctrl; |
1324 | if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) { | 1325 | if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) { |
@@ -1327,6 +1328,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ | |||
1327 | rpn_mask ^= RFCOMM_RPN_PM_FLOW; | 1328 | rpn_mask ^= RFCOMM_RPN_PM_FLOW; |
1328 | } | 1329 | } |
1329 | } | 1330 | } |
1331 | |||
1330 | if (rpn->param_mask & RFCOMM_RPN_PM_XON) { | 1332 | if (rpn->param_mask & RFCOMM_RPN_PM_XON) { |
1331 | xon_char = rpn->xon_char; | 1333 | xon_char = rpn->xon_char; |
1332 | if (xon_char != RFCOMM_RPN_XON_CHAR) { | 1334 | if (xon_char != RFCOMM_RPN_XON_CHAR) { |
@@ -1335,6 +1337,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ | |||
1335 | rpn_mask ^= RFCOMM_RPN_PM_XON; | 1337 | rpn_mask ^= RFCOMM_RPN_PM_XON; |
1336 | } | 1338 | } |
1337 | } | 1339 | } |
1340 | |||
1338 | if (rpn->param_mask & RFCOMM_RPN_PM_XOFF) { | 1341 | if (rpn->param_mask & RFCOMM_RPN_PM_XOFF) { |
1339 | xoff_char = rpn->xoff_char; | 1342 | xoff_char = rpn->xoff_char; |
1340 | if (xoff_char != RFCOMM_RPN_XOFF_CHAR) { | 1343 | if (xoff_char != RFCOMM_RPN_XOFF_CHAR) { |
@@ -1345,9 +1348,8 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ | |||
1345 | } | 1348 | } |
1346 | 1349 | ||
1347 | rpn_out: | 1350 | rpn_out: |
1348 | rfcomm_send_rpn(s, 0, dlci, | 1351 | rfcomm_send_rpn(s, 0, dlci, bit_rate, data_bits, stop_bits, |
1349 | bit_rate, data_bits, stop_bits, parity, flow_ctrl, | 1352 | parity, flow_ctrl, xon_char, xoff_char, rpn_mask); |
1350 | xon_char, xoff_char, rpn_mask); | ||
1351 | 1353 | ||
1352 | return 0; | 1354 | return 0; |
1353 | } | 1355 | } |
@@ -1358,14 +1360,13 @@ static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb | |||
1358 | u8 dlci = __get_dlci(rls->dlci); | 1360 | u8 dlci = __get_dlci(rls->dlci); |
1359 | 1361 | ||
1360 | BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status); | 1362 | BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status); |
1361 | 1363 | ||
1362 | if (!cr) | 1364 | if (!cr) |
1363 | return 0; | 1365 | return 0; |
1364 | 1366 | ||
1365 | /* FIXME: We should probably do something with this | 1367 | /* We should probably do something with this information here. But |
1366 | information here. But for now it's sufficient just | 1368 | * for now it's sufficient just to reply -- Bluetooth 1.1 says it's |
1367 | to reply -- Bluetooth 1.1 says it's mandatory to | 1369 | * mandatory to recognise and respond to RLS */ |
1368 | recognise and respond to RLS */ | ||
1369 | 1370 | ||
1370 | rfcomm_send_rls(s, 0, dlci, rls->status); | 1371 | rfcomm_send_rls(s, 0, dlci, rls->status); |
1371 | 1372 | ||
@@ -1381,7 +1382,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb | |||
1381 | BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig); | 1382 | BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig); |
1382 | 1383 | ||
1383 | d = rfcomm_dlc_get(s, dlci); | 1384 | d = rfcomm_dlc_get(s, dlci); |
1384 | if (!d) | 1385 | if (!d) |
1385 | return 0; | 1386 | return 0; |
1386 | 1387 | ||
1387 | if (cr) { | 1388 | if (cr) { |
@@ -1389,7 +1390,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb | |||
1389 | set_bit(RFCOMM_TX_THROTTLED, &d->flags); | 1390 | set_bit(RFCOMM_TX_THROTTLED, &d->flags); |
1390 | else | 1391 | else |
1391 | clear_bit(RFCOMM_TX_THROTTLED, &d->flags); | 1392 | clear_bit(RFCOMM_TX_THROTTLED, &d->flags); |
1392 | 1393 | ||
1393 | rfcomm_dlc_lock(d); | 1394 | rfcomm_dlc_lock(d); |
1394 | if (d->modem_status) | 1395 | if (d->modem_status) |
1395 | d->modem_status(d, msc->v24_sig); | 1396 | d->modem_status(d, msc->v24_sig); |
@@ -1398,7 +1399,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb | |||
1398 | rfcomm_send_msc(s, 0, dlci, msc->v24_sig); | 1399 | rfcomm_send_msc(s, 0, dlci, msc->v24_sig); |
1399 | 1400 | ||
1400 | d->mscex |= RFCOMM_MSCEX_RX; | 1401 | d->mscex |= RFCOMM_MSCEX_RX; |
1401 | } else | 1402 | } else |
1402 | d->mscex |= RFCOMM_MSCEX_TX; | 1403 | d->mscex |= RFCOMM_MSCEX_TX; |
1403 | 1404 | ||
1404 | return 0; | 1405 | return 0; |
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 63a123c5c41b..90e19eb6d3cc 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c | |||
@@ -284,7 +284,7 @@ static struct proto rfcomm_proto = { | |||
284 | .obj_size = sizeof(struct rfcomm_pinfo) | 284 | .obj_size = sizeof(struct rfcomm_pinfo) |
285 | }; | 285 | }; |
286 | 286 | ||
287 | static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, int prio) | 287 | static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio) |
288 | { | 288 | { |
289 | struct rfcomm_dlc *d; | 289 | struct rfcomm_dlc *d; |
290 | struct sock *sk; | 290 | struct sock *sk; |
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 6304590fd36a..1bca860a6109 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c | |||
@@ -286,7 +286,7 @@ static inline void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *de | |||
286 | skb->destructor = rfcomm_wfree; | 286 | skb->destructor = rfcomm_wfree; |
287 | } | 287 | } |
288 | 288 | ||
289 | static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, int priority) | 289 | static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, unsigned int __nocast priority) |
290 | { | 290 | { |
291 | if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) { | 291 | if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) { |
292 | struct sk_buff *skb = alloc_skb(size, priority); | 292 | struct sk_buff *skb = alloc_skb(size, priority); |
@@ -528,9 +528,14 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig) | |||
528 | struct rfcomm_dev *dev = dlc->owner; | 528 | struct rfcomm_dev *dev = dlc->owner; |
529 | if (!dev) | 529 | if (!dev) |
530 | return; | 530 | return; |
531 | 531 | ||
532 | BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig); | 532 | BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig); |
533 | 533 | ||
534 | if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) { | ||
535 | if (dev->tty && !C_CLOCAL(dev->tty)) | ||
536 | tty_hangup(dev->tty); | ||
537 | } | ||
538 | |||
534 | dev->modem_status = | 539 | dev->modem_status = |
535 | ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) | | 540 | ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) | |
536 | ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) | | 541 | ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) | |
@@ -740,20 +745,143 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned | |||
740 | return -ENOIOCTLCMD; | 745 | return -ENOIOCTLCMD; |
741 | } | 746 | } |
742 | 747 | ||
743 | #define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK)) | ||
744 | |||
745 | static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old) | 748 | static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old) |
746 | { | 749 | { |
747 | BT_DBG("tty %p", tty); | 750 | struct termios *new = (struct termios *) tty->termios; |
751 | int old_baud_rate = tty_termios_baud_rate(old); | ||
752 | int new_baud_rate = tty_termios_baud_rate(new); | ||
748 | 753 | ||
749 | if ((tty->termios->c_cflag == old->c_cflag) && | 754 | u8 baud, data_bits, stop_bits, parity, x_on, x_off; |
750 | (RELEVANT_IFLAG(tty->termios->c_iflag) == RELEVANT_IFLAG(old->c_iflag))) | 755 | u16 changes = 0; |
751 | return; | 756 | |
757 | struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; | ||
758 | |||
759 | BT_DBG("tty %p termios %p", tty, old); | ||
760 | |||
761 | /* Handle turning off CRTSCTS */ | ||
762 | if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS)) | ||
763 | BT_DBG("Turning off CRTSCTS unsupported"); | ||
764 | |||
765 | /* Parity on/off and when on, odd/even */ | ||
766 | if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) || | ||
767 | ((old->c_cflag & PARODD) != (new->c_cflag & PARODD)) ) { | ||
768 | changes |= RFCOMM_RPN_PM_PARITY; | ||
769 | BT_DBG("Parity change detected."); | ||
770 | } | ||
771 | |||
772 | /* Mark and space parity are not supported! */ | ||
773 | if (new->c_cflag & PARENB) { | ||
774 | if (new->c_cflag & PARODD) { | ||
775 | BT_DBG("Parity is ODD"); | ||
776 | parity = RFCOMM_RPN_PARITY_ODD; | ||
777 | } else { | ||
778 | BT_DBG("Parity is EVEN"); | ||
779 | parity = RFCOMM_RPN_PARITY_EVEN; | ||
780 | } | ||
781 | } else { | ||
782 | BT_DBG("Parity is OFF"); | ||
783 | parity = RFCOMM_RPN_PARITY_NONE; | ||
784 | } | ||
785 | |||
786 | /* Setting the x_on / x_off characters */ | ||
787 | if (old->c_cc[VSTOP] != new->c_cc[VSTOP]) { | ||
788 | BT_DBG("XOFF custom"); | ||
789 | x_on = new->c_cc[VSTOP]; | ||
790 | changes |= RFCOMM_RPN_PM_XON; | ||
791 | } else { | ||
792 | BT_DBG("XOFF default"); | ||
793 | x_on = RFCOMM_RPN_XON_CHAR; | ||
794 | } | ||
795 | |||
796 | if (old->c_cc[VSTART] != new->c_cc[VSTART]) { | ||
797 | BT_DBG("XON custom"); | ||
798 | x_off = new->c_cc[VSTART]; | ||
799 | changes |= RFCOMM_RPN_PM_XOFF; | ||
800 | } else { | ||
801 | BT_DBG("XON default"); | ||
802 | x_off = RFCOMM_RPN_XOFF_CHAR; | ||
803 | } | ||
804 | |||
805 | /* Handle setting of stop bits */ | ||
806 | if ((old->c_cflag & CSTOPB) != (new->c_cflag & CSTOPB)) | ||
807 | changes |= RFCOMM_RPN_PM_STOP; | ||
808 | |||
809 | /* POSIX does not support 1.5 stop bits and RFCOMM does not | ||
810 | * support 2 stop bits. So a request for 2 stop bits gets | ||
811 | * translated to 1.5 stop bits */ | ||
812 | if (new->c_cflag & CSTOPB) { | ||
813 | stop_bits = RFCOMM_RPN_STOP_15; | ||
814 | } else { | ||
815 | stop_bits = RFCOMM_RPN_STOP_1; | ||
816 | } | ||
817 | |||
818 | /* Handle number of data bits [5-8] */ | ||
819 | if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE)) | ||
820 | changes |= RFCOMM_RPN_PM_DATA; | ||
821 | |||
822 | switch (new->c_cflag & CSIZE) { | ||
823 | case CS5: | ||
824 | data_bits = RFCOMM_RPN_DATA_5; | ||
825 | break; | ||
826 | case CS6: | ||
827 | data_bits = RFCOMM_RPN_DATA_6; | ||
828 | break; | ||
829 | case CS7: | ||
830 | data_bits = RFCOMM_RPN_DATA_7; | ||
831 | break; | ||
832 | case CS8: | ||
833 | data_bits = RFCOMM_RPN_DATA_8; | ||
834 | break; | ||
835 | default: | ||
836 | data_bits = RFCOMM_RPN_DATA_8; | ||
837 | break; | ||
838 | } | ||
839 | |||
840 | /* Handle baudrate settings */ | ||
841 | if (old_baud_rate != new_baud_rate) | ||
842 | changes |= RFCOMM_RPN_PM_BITRATE; | ||
752 | 843 | ||
753 | /* handle turning off CRTSCTS */ | 844 | switch (new_baud_rate) { |
754 | if ((old->c_cflag & CRTSCTS) && !(tty->termios->c_cflag & CRTSCTS)) { | 845 | case 2400: |
755 | BT_DBG("turning off CRTSCTS"); | 846 | baud = RFCOMM_RPN_BR_2400; |
847 | break; | ||
848 | case 4800: | ||
849 | baud = RFCOMM_RPN_BR_4800; | ||
850 | break; | ||
851 | case 7200: | ||
852 | baud = RFCOMM_RPN_BR_7200; | ||
853 | break; | ||
854 | case 9600: | ||
855 | baud = RFCOMM_RPN_BR_9600; | ||
856 | break; | ||
857 | case 19200: | ||
858 | baud = RFCOMM_RPN_BR_19200; | ||
859 | break; | ||
860 | case 38400: | ||
861 | baud = RFCOMM_RPN_BR_38400; | ||
862 | break; | ||
863 | case 57600: | ||
864 | baud = RFCOMM_RPN_BR_57600; | ||
865 | break; | ||
866 | case 115200: | ||
867 | baud = RFCOMM_RPN_BR_115200; | ||
868 | break; | ||
869 | case 230400: | ||
870 | baud = RFCOMM_RPN_BR_230400; | ||
871 | break; | ||
872 | default: | ||
873 | /* 9600 is standard accordinag to the RFCOMM specification */ | ||
874 | baud = RFCOMM_RPN_BR_9600; | ||
875 | break; | ||
876 | |||
756 | } | 877 | } |
878 | |||
879 | if (changes) | ||
880 | rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud, | ||
881 | data_bits, stop_bits, parity, | ||
882 | RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes); | ||
883 | |||
884 | return; | ||
757 | } | 885 | } |
758 | 886 | ||
759 | static void rfcomm_tty_throttle(struct tty_struct *tty) | 887 | static void rfcomm_tty_throttle(struct tty_struct *tty) |
@@ -761,7 +889,7 @@ static void rfcomm_tty_throttle(struct tty_struct *tty) | |||
761 | struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; | 889 | struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; |
762 | 890 | ||
763 | BT_DBG("tty %p dev %p", tty, dev); | 891 | BT_DBG("tty %p dev %p", tty, dev); |
764 | 892 | ||
765 | rfcomm_dlc_throttle(dev->dlc); | 893 | rfcomm_dlc_throttle(dev->dlc); |
766 | } | 894 | } |
767 | 895 | ||
@@ -770,7 +898,7 @@ static void rfcomm_tty_unthrottle(struct tty_struct *tty) | |||
770 | struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; | 898 | struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; |
771 | 899 | ||
772 | BT_DBG("tty %p dev %p", tty, dev); | 900 | BT_DBG("tty %p dev %p", tty, dev); |
773 | 901 | ||
774 | rfcomm_dlc_unthrottle(dev->dlc); | 902 | rfcomm_dlc_unthrottle(dev->dlc); |
775 | } | 903 | } |
776 | 904 | ||
@@ -841,35 +969,35 @@ static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp) | |||
841 | 969 | ||
842 | static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsigned int set, unsigned int clear) | 970 | static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsigned int set, unsigned int clear) |
843 | { | 971 | { |
844 | struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; | 972 | struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; |
845 | struct rfcomm_dlc *dlc = dev->dlc; | 973 | struct rfcomm_dlc *dlc = dev->dlc; |
846 | u8 v24_sig; | 974 | u8 v24_sig; |
847 | 975 | ||
848 | BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear); | 976 | BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear); |
849 | 977 | ||
850 | rfcomm_dlc_get_modem_status(dlc, &v24_sig); | 978 | rfcomm_dlc_get_modem_status(dlc, &v24_sig); |
851 | 979 | ||
852 | if (set & TIOCM_DSR || set & TIOCM_DTR) | 980 | if (set & TIOCM_DSR || set & TIOCM_DTR) |
853 | v24_sig |= RFCOMM_V24_RTC; | 981 | v24_sig |= RFCOMM_V24_RTC; |
854 | if (set & TIOCM_RTS || set & TIOCM_CTS) | 982 | if (set & TIOCM_RTS || set & TIOCM_CTS) |
855 | v24_sig |= RFCOMM_V24_RTR; | 983 | v24_sig |= RFCOMM_V24_RTR; |
856 | if (set & TIOCM_RI) | 984 | if (set & TIOCM_RI) |
857 | v24_sig |= RFCOMM_V24_IC; | 985 | v24_sig |= RFCOMM_V24_IC; |
858 | if (set & TIOCM_CD) | 986 | if (set & TIOCM_CD) |
859 | v24_sig |= RFCOMM_V24_DV; | 987 | v24_sig |= RFCOMM_V24_DV; |
860 | 988 | ||
861 | if (clear & TIOCM_DSR || clear & TIOCM_DTR) | 989 | if (clear & TIOCM_DSR || clear & TIOCM_DTR) |
862 | v24_sig &= ~RFCOMM_V24_RTC; | 990 | v24_sig &= ~RFCOMM_V24_RTC; |
863 | if (clear & TIOCM_RTS || clear & TIOCM_CTS) | 991 | if (clear & TIOCM_RTS || clear & TIOCM_CTS) |
864 | v24_sig &= ~RFCOMM_V24_RTR; | 992 | v24_sig &= ~RFCOMM_V24_RTR; |
865 | if (clear & TIOCM_RI) | 993 | if (clear & TIOCM_RI) |
866 | v24_sig &= ~RFCOMM_V24_IC; | 994 | v24_sig &= ~RFCOMM_V24_IC; |
867 | if (clear & TIOCM_CD) | 995 | if (clear & TIOCM_CD) |
868 | v24_sig &= ~RFCOMM_V24_DV; | 996 | v24_sig &= ~RFCOMM_V24_DV; |
869 | 997 | ||
870 | rfcomm_dlc_set_modem_status(dlc, v24_sig); | 998 | rfcomm_dlc_set_modem_status(dlc, v24_sig); |
871 | 999 | ||
872 | return 0; | 1000 | return 0; |
873 | } | 1001 | } |
874 | 1002 | ||
875 | /* ---- TTY structure ---- */ | 1003 | /* ---- TTY structure ---- */ |
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 746c11fc017e..ce7ab7dfa0b2 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c | |||
@@ -418,7 +418,7 @@ static struct proto sco_proto = { | |||
418 | .obj_size = sizeof(struct sco_pinfo) | 418 | .obj_size = sizeof(struct sco_pinfo) |
419 | }; | 419 | }; |
420 | 420 | ||
421 | static struct sock *sco_sock_alloc(struct socket *sock, int proto, int prio) | 421 | static struct sock *sco_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio) |
422 | { | 422 | { |
423 | struct sock *sk; | 423 | struct sock *sk; |
424 | 424 | ||
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e6c2200b7ca3..24396b914d11 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c | |||
@@ -23,7 +23,7 @@ | |||
23 | #include <asm/atomic.h> | 23 | #include <asm/atomic.h> |
24 | #include "br_private.h" | 24 | #include "br_private.h" |
25 | 25 | ||
26 | static kmem_cache_t *br_fdb_cache; | 26 | static kmem_cache_t *br_fdb_cache __read_mostly; |
27 | static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, | 27 | static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, |
28 | const unsigned char *addr); | 28 | const unsigned char *addr); |
29 | 29 | ||
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c index 02c632b4d325..c93d35ab95c0 100644 --- a/net/bridge/netfilter/ebt_mark.c +++ b/net/bridge/netfilter/ebt_mark.c | |||
@@ -23,10 +23,9 @@ static int ebt_target_mark(struct sk_buff **pskb, unsigned int hooknr, | |||
23 | { | 23 | { |
24 | struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data; | 24 | struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data; |
25 | 25 | ||
26 | if ((*pskb)->nfmark != info->mark) { | 26 | if ((*pskb)->nfmark != info->mark) |
27 | (*pskb)->nfmark = info->mark; | 27 | (*pskb)->nfmark = info->mark; |
28 | (*pskb)->nfcache |= NFC_ALTERED; | 28 | |
29 | } | ||
30 | return info->target; | 29 | return info->target; |
31 | } | 30 | } |
32 | 31 | ||
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 01af4fcef26d..aae26ae2e61f 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c | |||
@@ -78,8 +78,8 @@ static void ulog_send(unsigned int nlgroup) | |||
78 | if (ub->qlen > 1) | 78 | if (ub->qlen > 1) |
79 | ub->lastnlh->nlmsg_type = NLMSG_DONE; | 79 | ub->lastnlh->nlmsg_type = NLMSG_DONE; |
80 | 80 | ||
81 | NETLINK_CB(ub->skb).dst_groups = 1 << nlgroup; | 81 | NETLINK_CB(ub->skb).dst_group = nlgroup + 1; |
82 | netlink_broadcast(ebtulognl, ub->skb, 0, 1 << nlgroup, GFP_ATOMIC); | 82 | netlink_broadcast(ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC); |
83 | 83 | ||
84 | ub->qlen = 0; | 84 | ub->qlen = 0; |
85 | ub->skb = NULL; | 85 | ub->skb = NULL; |
@@ -162,7 +162,7 @@ static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, | |||
162 | pm->version = EBT_ULOG_VERSION; | 162 | pm->version = EBT_ULOG_VERSION; |
163 | do_gettimeofday(&pm->stamp); | 163 | do_gettimeofday(&pm->stamp); |
164 | if (ub->qlen == 1) | 164 | if (ub->qlen == 1) |
165 | ub->skb->stamp = pm->stamp; | 165 | skb_set_timestamp(ub->skb, &pm->stamp); |
166 | pm->data_len = copy_len; | 166 | pm->data_len = copy_len; |
167 | pm->mark = skb->nfmark; | 167 | pm->mark = skb->nfmark; |
168 | pm->hook = hooknr; | 168 | pm->hook = hooknr; |
@@ -258,7 +258,8 @@ static int __init init(void) | |||
258 | spin_lock_init(&ulog_buffers[i].lock); | 258 | spin_lock_init(&ulog_buffers[i].lock); |
259 | } | 259 | } |
260 | 260 | ||
261 | ebtulognl = netlink_kernel_create(NETLINK_NFLOG, NULL); | 261 | ebtulognl = netlink_kernel_create(NETLINK_NFLOG, EBT_ULOG_MAXNLGROUPS, |
262 | NULL, THIS_MODULE); | ||
262 | if (!ebtulognl) | 263 | if (!ebtulognl) |
263 | ret = -ENOMEM; | 264 | ret = -ENOMEM; |
264 | else if ((ret = ebt_register_watcher(&ulog))) | 265 | else if ((ret = ebt_register_watcher(&ulog))) |
diff --git a/net/core/Makefile b/net/core/Makefile index f5f5e58943e8..630da0f0579e 100644 --- a/net/core/Makefile +++ b/net/core/Makefile | |||
@@ -12,7 +12,6 @@ obj-y += dev.o ethtool.o dev_mcast.o dst.o \ | |||
12 | 12 | ||
13 | obj-$(CONFIG_XFRM) += flow.o | 13 | obj-$(CONFIG_XFRM) += flow.o |
14 | obj-$(CONFIG_SYSFS) += net-sysfs.o | 14 | obj-$(CONFIG_SYSFS) += net-sysfs.o |
15 | obj-$(CONFIG_NETFILTER) += netfilter.o | ||
16 | obj-$(CONFIG_NET_DIVERT) += dv.o | 15 | obj-$(CONFIG_NET_DIVERT) += dv.o |
17 | obj-$(CONFIG_NET_PKTGEN) += pktgen.o | 16 | obj-$(CONFIG_NET_PKTGEN) += pktgen.o |
18 | obj-$(CONFIG_NET_RADIO) += wireless.o | 17 | obj-$(CONFIG_NET_RADIO) += wireless.o |
diff --git a/net/core/datagram.c b/net/core/datagram.c index fcee054b6f75..da9bf71421a7 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c | |||
@@ -43,7 +43,6 @@ | |||
43 | #include <linux/errno.h> | 43 | #include <linux/errno.h> |
44 | #include <linux/sched.h> | 44 | #include <linux/sched.h> |
45 | #include <linux/inet.h> | 45 | #include <linux/inet.h> |
46 | #include <linux/tcp.h> | ||
47 | #include <linux/netdevice.h> | 46 | #include <linux/netdevice.h> |
48 | #include <linux/rtnetlink.h> | 47 | #include <linux/rtnetlink.h> |
49 | #include <linux/poll.h> | 48 | #include <linux/poll.h> |
@@ -51,9 +50,10 @@ | |||
51 | 50 | ||
52 | #include <net/protocol.h> | 51 | #include <net/protocol.h> |
53 | #include <linux/skbuff.h> | 52 | #include <linux/skbuff.h> |
54 | #include <net/sock.h> | ||
55 | #include <net/checksum.h> | ||
56 | 53 | ||
54 | #include <net/checksum.h> | ||
55 | #include <net/sock.h> | ||
56 | #include <net/tcp_states.h> | ||
57 | 57 | ||
58 | /* | 58 | /* |
59 | * Is a socket 'connection oriented' ? | 59 | * Is a socket 'connection oriented' ? |
diff --git a/net/core/dev.c b/net/core/dev.c index faf59b02c4bf..c01511e3d0c1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -267,10 +267,6 @@ void dev_add_pack(struct packet_type *pt) | |||
267 | spin_unlock_bh(&ptype_lock); | 267 | spin_unlock_bh(&ptype_lock); |
268 | } | 268 | } |
269 | 269 | ||
270 | extern void linkwatch_run_queue(void); | ||
271 | |||
272 | |||
273 | |||
274 | /** | 270 | /** |
275 | * __dev_remove_pack - remove packet handler | 271 | * __dev_remove_pack - remove packet handler |
276 | * @pt: packet type declaration | 272 | * @pt: packet type declaration |
@@ -1009,13 +1005,22 @@ void net_disable_timestamp(void) | |||
1009 | atomic_dec(&netstamp_needed); | 1005 | atomic_dec(&netstamp_needed); |
1010 | } | 1006 | } |
1011 | 1007 | ||
1012 | static inline void net_timestamp(struct timeval *stamp) | 1008 | void __net_timestamp(struct sk_buff *skb) |
1009 | { | ||
1010 | struct timeval tv; | ||
1011 | |||
1012 | do_gettimeofday(&tv); | ||
1013 | skb_set_timestamp(skb, &tv); | ||
1014 | } | ||
1015 | EXPORT_SYMBOL(__net_timestamp); | ||
1016 | |||
1017 | static inline void net_timestamp(struct sk_buff *skb) | ||
1013 | { | 1018 | { |
1014 | if (atomic_read(&netstamp_needed)) | 1019 | if (atomic_read(&netstamp_needed)) |
1015 | do_gettimeofday(stamp); | 1020 | __net_timestamp(skb); |
1016 | else { | 1021 | else { |
1017 | stamp->tv_sec = 0; | 1022 | skb->tstamp.off_sec = 0; |
1018 | stamp->tv_usec = 0; | 1023 | skb->tstamp.off_usec = 0; |
1019 | } | 1024 | } |
1020 | } | 1025 | } |
1021 | 1026 | ||
@@ -1027,7 +1032,8 @@ static inline void net_timestamp(struct timeval *stamp) | |||
1027 | void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) | 1032 | void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) |
1028 | { | 1033 | { |
1029 | struct packet_type *ptype; | 1034 | struct packet_type *ptype; |
1030 | net_timestamp(&skb->stamp); | 1035 | |
1036 | net_timestamp(skb); | ||
1031 | 1037 | ||
1032 | rcu_read_lock(); | 1038 | rcu_read_lock(); |
1033 | list_for_each_entry_rcu(ptype, &ptype_all, list) { | 1039 | list_for_each_entry_rcu(ptype, &ptype_all, list) { |
@@ -1058,7 +1064,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) | |||
1058 | 1064 | ||
1059 | skb2->h.raw = skb2->nh.raw; | 1065 | skb2->h.raw = skb2->nh.raw; |
1060 | skb2->pkt_type = PACKET_OUTGOING; | 1066 | skb2->pkt_type = PACKET_OUTGOING; |
1061 | ptype->func(skb2, skb->dev, ptype); | 1067 | ptype->func(skb2, skb->dev, ptype, skb->dev); |
1062 | } | 1068 | } |
1063 | } | 1069 | } |
1064 | rcu_read_unlock(); | 1070 | rcu_read_unlock(); |
@@ -1123,8 +1129,6 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) | |||
1123 | #define illegal_highdma(dev, skb) (0) | 1129 | #define illegal_highdma(dev, skb) (0) |
1124 | #endif | 1130 | #endif |
1125 | 1131 | ||
1126 | extern void skb_release_data(struct sk_buff *); | ||
1127 | |||
1128 | /* Keep head the same: replace data */ | 1132 | /* Keep head the same: replace data */ |
1129 | int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask) | 1133 | int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask) |
1130 | { | 1134 | { |
@@ -1379,8 +1383,8 @@ int netif_rx(struct sk_buff *skb) | |||
1379 | if (netpoll_rx(skb)) | 1383 | if (netpoll_rx(skb)) |
1380 | return NET_RX_DROP; | 1384 | return NET_RX_DROP; |
1381 | 1385 | ||
1382 | if (!skb->stamp.tv_sec) | 1386 | if (!skb->tstamp.off_sec) |
1383 | net_timestamp(&skb->stamp); | 1387 | net_timestamp(skb); |
1384 | 1388 | ||
1385 | /* | 1389 | /* |
1386 | * The code is rearranged so that the path is the most | 1390 | * The code is rearranged so that the path is the most |
@@ -1425,14 +1429,14 @@ int netif_rx_ni(struct sk_buff *skb) | |||
1425 | 1429 | ||
1426 | EXPORT_SYMBOL(netif_rx_ni); | 1430 | EXPORT_SYMBOL(netif_rx_ni); |
1427 | 1431 | ||
1428 | static __inline__ void skb_bond(struct sk_buff *skb) | 1432 | static inline struct net_device *skb_bond(struct sk_buff *skb) |
1429 | { | 1433 | { |
1430 | struct net_device *dev = skb->dev; | 1434 | struct net_device *dev = skb->dev; |
1431 | 1435 | ||
1432 | if (dev->master) { | 1436 | if (dev->master) |
1433 | skb->real_dev = skb->dev; | ||
1434 | skb->dev = dev->master; | 1437 | skb->dev = dev->master; |
1435 | } | 1438 | |
1439 | return dev; | ||
1436 | } | 1440 | } |
1437 | 1441 | ||
1438 | static void net_tx_action(struct softirq_action *h) | 1442 | static void net_tx_action(struct softirq_action *h) |
@@ -1482,10 +1486,11 @@ static void net_tx_action(struct softirq_action *h) | |||
1482 | } | 1486 | } |
1483 | 1487 | ||
1484 | static __inline__ int deliver_skb(struct sk_buff *skb, | 1488 | static __inline__ int deliver_skb(struct sk_buff *skb, |
1485 | struct packet_type *pt_prev) | 1489 | struct packet_type *pt_prev, |
1490 | struct net_device *orig_dev) | ||
1486 | { | 1491 | { |
1487 | atomic_inc(&skb->users); | 1492 | atomic_inc(&skb->users); |
1488 | return pt_prev->func(skb, skb->dev, pt_prev); | 1493 | return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); |
1489 | } | 1494 | } |
1490 | 1495 | ||
1491 | #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) | 1496 | #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) |
@@ -1496,7 +1501,8 @@ struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, | |||
1496 | void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); | 1501 | void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); |
1497 | 1502 | ||
1498 | static __inline__ int handle_bridge(struct sk_buff **pskb, | 1503 | static __inline__ int handle_bridge(struct sk_buff **pskb, |
1499 | struct packet_type **pt_prev, int *ret) | 1504 | struct packet_type **pt_prev, int *ret, |
1505 | struct net_device *orig_dev) | ||
1500 | { | 1506 | { |
1501 | struct net_bridge_port *port; | 1507 | struct net_bridge_port *port; |
1502 | 1508 | ||
@@ -1505,14 +1511,14 @@ static __inline__ int handle_bridge(struct sk_buff **pskb, | |||
1505 | return 0; | 1511 | return 0; |
1506 | 1512 | ||
1507 | if (*pt_prev) { | 1513 | if (*pt_prev) { |
1508 | *ret = deliver_skb(*pskb, *pt_prev); | 1514 | *ret = deliver_skb(*pskb, *pt_prev, orig_dev); |
1509 | *pt_prev = NULL; | 1515 | *pt_prev = NULL; |
1510 | } | 1516 | } |
1511 | 1517 | ||
1512 | return br_handle_frame_hook(port, pskb); | 1518 | return br_handle_frame_hook(port, pskb); |
1513 | } | 1519 | } |
1514 | #else | 1520 | #else |
1515 | #define handle_bridge(skb, pt_prev, ret) (0) | 1521 | #define handle_bridge(skb, pt_prev, ret, orig_dev) (0) |
1516 | #endif | 1522 | #endif |
1517 | 1523 | ||
1518 | #ifdef CONFIG_NET_CLS_ACT | 1524 | #ifdef CONFIG_NET_CLS_ACT |
@@ -1534,17 +1540,14 @@ static int ing_filter(struct sk_buff *skb) | |||
1534 | __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd); | 1540 | __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd); |
1535 | if (MAX_RED_LOOP < ttl++) { | 1541 | if (MAX_RED_LOOP < ttl++) { |
1536 | printk("Redir loop detected Dropping packet (%s->%s)\n", | 1542 | printk("Redir loop detected Dropping packet (%s->%s)\n", |
1537 | skb->input_dev?skb->input_dev->name:"??",skb->dev->name); | 1543 | skb->input_dev->name, skb->dev->name); |
1538 | return TC_ACT_SHOT; | 1544 | return TC_ACT_SHOT; |
1539 | } | 1545 | } |
1540 | 1546 | ||
1541 | skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl); | 1547 | skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl); |
1542 | 1548 | ||
1543 | skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); | 1549 | skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); |
1544 | if (NULL == skb->input_dev) { | 1550 | |
1545 | skb->input_dev = skb->dev; | ||
1546 | printk("ing_filter: fixed %s out %s\n",skb->input_dev->name,skb->dev->name); | ||
1547 | } | ||
1548 | spin_lock(&dev->ingress_lock); | 1551 | spin_lock(&dev->ingress_lock); |
1549 | if ((q = dev->qdisc_ingress) != NULL) | 1552 | if ((q = dev->qdisc_ingress) != NULL) |
1550 | result = q->enqueue(skb, q); | 1553 | result = q->enqueue(skb, q); |
@@ -1559,6 +1562,7 @@ static int ing_filter(struct sk_buff *skb) | |||
1559 | int netif_receive_skb(struct sk_buff *skb) | 1562 | int netif_receive_skb(struct sk_buff *skb) |
1560 | { | 1563 | { |
1561 | struct packet_type *ptype, *pt_prev; | 1564 | struct packet_type *ptype, *pt_prev; |
1565 | struct net_device *orig_dev; | ||
1562 | int ret = NET_RX_DROP; | 1566 | int ret = NET_RX_DROP; |
1563 | unsigned short type; | 1567 | unsigned short type; |
1564 | 1568 | ||
@@ -1566,10 +1570,13 @@ int netif_receive_skb(struct sk_buff *skb) | |||
1566 | if (skb->dev->poll && netpoll_rx(skb)) | 1570 | if (skb->dev->poll && netpoll_rx(skb)) |
1567 | return NET_RX_DROP; | 1571 | return NET_RX_DROP; |
1568 | 1572 | ||
1569 | if (!skb->stamp.tv_sec) | 1573 | if (!skb->tstamp.off_sec) |
1570 | net_timestamp(&skb->stamp); | 1574 | net_timestamp(skb); |
1575 | |||
1576 | if (!skb->input_dev) | ||
1577 | skb->input_dev = skb->dev; | ||
1571 | 1578 | ||
1572 | skb_bond(skb); | 1579 | orig_dev = skb_bond(skb); |
1573 | 1580 | ||
1574 | __get_cpu_var(netdev_rx_stat).total++; | 1581 | __get_cpu_var(netdev_rx_stat).total++; |
1575 | 1582 | ||
@@ -1590,14 +1597,14 @@ int netif_receive_skb(struct sk_buff *skb) | |||
1590 | list_for_each_entry_rcu(ptype, &ptype_all, list) { | 1597 | list_for_each_entry_rcu(ptype, &ptype_all, list) { |
1591 | if (!ptype->dev || ptype->dev == skb->dev) { | 1598 | if (!ptype->dev || ptype->dev == skb->dev) { |
1592 | if (pt_prev) | 1599 | if (pt_prev) |
1593 | ret = deliver_skb(skb, pt_prev); | 1600 | ret = deliver_skb(skb, pt_prev, orig_dev); |
1594 | pt_prev = ptype; | 1601 | pt_prev = ptype; |
1595 | } | 1602 | } |
1596 | } | 1603 | } |
1597 | 1604 | ||
1598 | #ifdef CONFIG_NET_CLS_ACT | 1605 | #ifdef CONFIG_NET_CLS_ACT |
1599 | if (pt_prev) { | 1606 | if (pt_prev) { |
1600 | ret = deliver_skb(skb, pt_prev); | 1607 | ret = deliver_skb(skb, pt_prev, orig_dev); |
1601 | pt_prev = NULL; /* no one else should process this after*/ | 1608 | pt_prev = NULL; /* no one else should process this after*/
1602 | } else { | 1609 | } else { |
1603 | skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); | 1610 | skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); |
@@ -1616,7 +1623,7 @@ ncls: | |||
1616 | 1623 | ||
1617 | handle_diverter(skb); | 1624 | handle_diverter(skb); |
1618 | 1625 | ||
1619 | if (handle_bridge(&skb, &pt_prev, &ret)) | 1626 | if (handle_bridge(&skb, &pt_prev, &ret, orig_dev)) |
1620 | goto out; | 1627 | goto out; |
1621 | 1628 | ||
1622 | type = skb->protocol; | 1629 | type = skb->protocol; |
@@ -1624,13 +1631,13 @@ ncls: | |||
1624 | if (ptype->type == type && | 1631 | if (ptype->type == type && |
1625 | (!ptype->dev || ptype->dev == skb->dev)) { | 1632 | (!ptype->dev || ptype->dev == skb->dev)) { |
1626 | if (pt_prev) | 1633 | if (pt_prev) |
1627 | ret = deliver_skb(skb, pt_prev); | 1634 | ret = deliver_skb(skb, pt_prev, orig_dev); |
1628 | pt_prev = ptype; | 1635 | pt_prev = ptype; |
1629 | } | 1636 | } |
1630 | } | 1637 | } |
1631 | 1638 | ||
1632 | if (pt_prev) { | 1639 | if (pt_prev) { |
1633 | ret = pt_prev->func(skb, skb->dev, pt_prev); | 1640 | ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); |
1634 | } else { | 1641 | } else { |
1635 | kfree_skb(skb); | 1642 | kfree_skb(skb); |
1636 | /* Jamal, now you will not able to escape explaining | 1643 | /* Jamal, now you will not able to escape explaining |
diff --git a/net/core/ethtool.c b/net/core/ethtool.c index a3eeb88e1c81..289c1b5a8e4a 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c | |||
@@ -81,6 +81,18 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data) | |||
81 | return 0; | 81 | return 0; |
82 | } | 82 | } |
83 | 83 | ||
84 | int ethtool_op_get_perm_addr(struct net_device *dev, struct ethtool_perm_addr *addr, u8 *data) | ||
85 | { | ||
86 | unsigned char len = dev->addr_len; | ||
87 | if ( addr->size < len ) | ||
88 | return -ETOOSMALL; | ||
89 | |||
90 | addr->size = len; | ||
91 | memcpy(data, dev->perm_addr, len); | ||
92 | return 0; | ||
93 | } | ||
94 | |||
95 | |||
84 | /* Handlers for each ethtool command */ | 96 | /* Handlers for each ethtool command */ |
85 | 97 | ||
86 | static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) | 98 | static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) |
@@ -683,6 +695,39 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) | |||
683 | return ret; | 695 | return ret; |
684 | } | 696 | } |
685 | 697 | ||
698 | static int ethtool_get_perm_addr(struct net_device *dev, void *useraddr) | ||
699 | { | ||
700 | struct ethtool_perm_addr epaddr; | ||
701 | u8 *data; | ||
702 | int ret; | ||
703 | |||
704 | if (!dev->ethtool_ops->get_perm_addr) | ||
705 | return -EOPNOTSUPP; | ||
706 | |||
707 | if (copy_from_user(&epaddr,useraddr,sizeof(epaddr))) | ||
708 | return -EFAULT; | ||
709 | |||
710 | data = kmalloc(epaddr.size, GFP_USER); | ||
711 | if (!data) | ||
712 | return -ENOMEM; | ||
713 | |||
714 | ret = dev->ethtool_ops->get_perm_addr(dev,&epaddr,data); | ||
715 | if (ret) | ||
716 | return ret; | ||
717 | |||
718 | ret = -EFAULT; | ||
719 | if (copy_to_user(useraddr, &epaddr, sizeof(epaddr))) | ||
720 | goto out; | ||
721 | useraddr += sizeof(epaddr); | ||
722 | if (copy_to_user(useraddr, data, epaddr.size)) | ||
723 | goto out; | ||
724 | ret = 0; | ||
725 | |||
726 | out: | ||
727 | kfree(data); | ||
728 | return ret; | ||
729 | } | ||
730 | |||
686 | /* The main entry point in this file. Called from net/core/dev.c */ | 731 | /* The main entry point in this file. Called from net/core/dev.c */ |
687 | 732 | ||
688 | int dev_ethtool(struct ifreq *ifr) | 733 | int dev_ethtool(struct ifreq *ifr) |
@@ -806,6 +851,9 @@ int dev_ethtool(struct ifreq *ifr) | |||
806 | case ETHTOOL_GSTATS: | 851 | case ETHTOOL_GSTATS: |
807 | rc = ethtool_get_stats(dev, useraddr); | 852 | rc = ethtool_get_stats(dev, useraddr); |
808 | break; | 853 | break; |
854 | case ETHTOOL_GPERMADDR: | ||
855 | rc = ethtool_get_perm_addr(dev, useraddr); | ||
856 | break; | ||
809 | default: | 857 | default: |
810 | rc = -EOPNOTSUPP; | 858 | rc = -EOPNOTSUPP; |
811 | } | 859 | } |
@@ -826,6 +874,7 @@ int dev_ethtool(struct ifreq *ifr) | |||
826 | 874 | ||
827 | EXPORT_SYMBOL(dev_ethtool); | 875 | EXPORT_SYMBOL(dev_ethtool); |
828 | EXPORT_SYMBOL(ethtool_op_get_link); | 876 | EXPORT_SYMBOL(ethtool_op_get_link); |
877 | EXPORT_SYMBOL_GPL(ethtool_op_get_perm_addr); | ||
829 | EXPORT_SYMBOL(ethtool_op_get_sg); | 878 | EXPORT_SYMBOL(ethtool_op_get_sg); |
830 | EXPORT_SYMBOL(ethtool_op_get_tso); | 879 | EXPORT_SYMBOL(ethtool_op_get_tso); |
831 | EXPORT_SYMBOL(ethtool_op_get_tx_csum); | 880 | EXPORT_SYMBOL(ethtool_op_get_tx_csum); |
diff --git a/net/core/flow.c b/net/core/flow.c index f289570b15a3..7e95b39de9fd 100644 --- a/net/core/flow.c +++ b/net/core/flow.c | |||
@@ -42,7 +42,7 @@ static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; | |||
42 | 42 | ||
43 | #define flow_table(cpu) (per_cpu(flow_tables, cpu)) | 43 | #define flow_table(cpu) (per_cpu(flow_tables, cpu)) |
44 | 44 | ||
45 | static kmem_cache_t *flow_cachep; | 45 | static kmem_cache_t *flow_cachep __read_mostly; |
46 | 46 | ||
47 | static int flow_lwm, flow_hwm; | 47 | static int flow_lwm, flow_hwm; |
48 | 48 | ||
diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 1beb782ac41b..39fc55edf691 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c | |||
@@ -1217,7 +1217,7 @@ static void neigh_proxy_process(unsigned long arg) | |||
1217 | 1217 | ||
1218 | while (skb != (struct sk_buff *)&tbl->proxy_queue) { | 1218 | while (skb != (struct sk_buff *)&tbl->proxy_queue) { |
1219 | struct sk_buff *back = skb; | 1219 | struct sk_buff *back = skb; |
1220 | long tdif = back->stamp.tv_usec - now; | 1220 | long tdif = NEIGH_CB(back)->sched_next - now; |
1221 | 1221 | ||
1222 | skb = skb->next; | 1222 | skb = skb->next; |
1223 | if (tdif <= 0) { | 1223 | if (tdif <= 0) { |
@@ -1248,8 +1248,9 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, | |||
1248 | kfree_skb(skb); | 1248 | kfree_skb(skb); |
1249 | return; | 1249 | return; |
1250 | } | 1250 | } |
1251 | skb->stamp.tv_sec = LOCALLY_ENQUEUED; | 1251 | |
1252 | skb->stamp.tv_usec = sched_next; | 1252 | NEIGH_CB(skb)->sched_next = sched_next; |
1253 | NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; | ||
1253 | 1254 | ||
1254 | spin_lock(&tbl->proxy_queue.lock); | 1255 | spin_lock(&tbl->proxy_queue.lock); |
1255 | if (del_timer(&tbl->proxy_timer)) { | 1256 | if (del_timer(&tbl->proxy_timer)) { |
@@ -2342,8 +2343,8 @@ void neigh_app_ns(struct neighbour *n) | |||
2342 | } | 2343 | } |
2343 | nlh = (struct nlmsghdr *)skb->data; | 2344 | nlh = (struct nlmsghdr *)skb->data; |
2344 | nlh->nlmsg_flags = NLM_F_REQUEST; | 2345 | nlh->nlmsg_flags = NLM_F_REQUEST; |
2345 | NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; | 2346 | NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH; |
2346 | netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); | 2347 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC); |
2347 | } | 2348 | } |
2348 | 2349 | ||
2349 | static void neigh_app_notify(struct neighbour *n) | 2350 | static void neigh_app_notify(struct neighbour *n) |
@@ -2360,8 +2361,8 @@ static void neigh_app_notify(struct neighbour *n) | |||
2360 | return; | 2361 | return; |
2361 | } | 2362 | } |
2362 | nlh = (struct nlmsghdr *)skb->data; | 2363 | nlh = (struct nlmsghdr *)skb->data; |
2363 | NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; | 2364 | NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH; |
2364 | netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); | 2365 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC); |
2365 | } | 2366 | } |
2366 | 2367 | ||
2367 | #endif /* CONFIG_ARPD */ | 2368 | #endif /* CONFIG_ARPD */ |
diff --git a/net/core/netfilter.c b/net/core/netfilter.c deleted file mode 100644 index 076c156d5eda..000000000000 --- a/net/core/netfilter.c +++ /dev/null | |||
@@ -1,648 +0,0 @@ | |||
1 | /* netfilter.c: look after the filters for various protocols. | ||
2 | * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. | ||
3 | * | ||
4 | * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any | ||
5 | * way. | ||
6 | * | ||
7 | * Rusty Russell (C)2000 -- This code is GPL. | ||
8 | * | ||
9 | * February 2000: Modified by James Morris to have 1 queue per protocol. | ||
10 | * 15-Mar-2000: Added NF_REPEAT --RR. | ||
11 | * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik. | ||
12 | */ | ||
13 | #include <linux/config.h> | ||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/netfilter.h> | ||
16 | #include <net/protocol.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <linux/skbuff.h> | ||
19 | #include <linux/wait.h> | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/interrupt.h> | ||
22 | #include <linux/if.h> | ||
23 | #include <linux/netdevice.h> | ||
24 | #include <linux/inetdevice.h> | ||
25 | #include <linux/tcp.h> | ||
26 | #include <linux/udp.h> | ||
27 | #include <linux/icmp.h> | ||
28 | #include <net/sock.h> | ||
29 | #include <net/route.h> | ||
30 | #include <linux/ip.h> | ||
31 | |||
32 | /* In this code, we can be waiting indefinitely for userspace to | ||
33 | * service a packet if a hook returns NF_QUEUE. We could keep a count | ||
34 | * of skbuffs queued for userspace, and not deregister a hook unless | ||
35 | * this is zero, but that sucks. Now, we simply check when the | ||
36 | * packets come back: if the hook is gone, the packet is discarded. */ | ||
37 | #ifdef CONFIG_NETFILTER_DEBUG | ||
38 | #define NFDEBUG(format, args...) printk(format , ## args) | ||
39 | #else | ||
40 | #define NFDEBUG(format, args...) | ||
41 | #endif | ||
42 | |||
43 | /* Sockopts only registered and called from user context, so | ||
44 | net locking would be overkill. Also, [gs]etsockopt calls may | ||
45 | sleep. */ | ||
46 | static DECLARE_MUTEX(nf_sockopt_mutex); | ||
47 | |||
48 | struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; | ||
49 | static LIST_HEAD(nf_sockopts); | ||
50 | static DEFINE_SPINLOCK(nf_hook_lock); | ||
51 | |||
52 | /* | ||
53 | * A queue handler may be registered for each protocol. Each is protected by | ||
54 | * long term mutex. The handler must provide an outfn() to accept packets | ||
55 | * for queueing and must reinject all packets it receives, no matter what. | ||
56 | */ | ||
57 | static struct nf_queue_handler_t { | ||
58 | nf_queue_outfn_t outfn; | ||
59 | void *data; | ||
60 | } queue_handler[NPROTO]; | ||
61 | static DEFINE_RWLOCK(queue_handler_lock); | ||
62 | |||
63 | int nf_register_hook(struct nf_hook_ops *reg) | ||
64 | { | ||
65 | struct list_head *i; | ||
66 | |||
67 | spin_lock_bh(&nf_hook_lock); | ||
68 | list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { | ||
69 | if (reg->priority < ((struct nf_hook_ops *)i)->priority) | ||
70 | break; | ||
71 | } | ||
72 | list_add_rcu(®->list, i->prev); | ||
73 | spin_unlock_bh(&nf_hook_lock); | ||
74 | |||
75 | synchronize_net(); | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | void nf_unregister_hook(struct nf_hook_ops *reg) | ||
80 | { | ||
81 | spin_lock_bh(&nf_hook_lock); | ||
82 | list_del_rcu(®->list); | ||
83 | spin_unlock_bh(&nf_hook_lock); | ||
84 | |||
85 | synchronize_net(); | ||
86 | } | ||
87 | |||
88 | /* Do exclusive ranges overlap? */ | ||
89 | static inline int overlap(int min1, int max1, int min2, int max2) | ||
90 | { | ||
91 | return max1 > min2 && min1 < max2; | ||
92 | } | ||
93 | |||
94 | /* Functions to register sockopt ranges (exclusive). */ | ||
95 | int nf_register_sockopt(struct nf_sockopt_ops *reg) | ||
96 | { | ||
97 | struct list_head *i; | ||
98 | int ret = 0; | ||
99 | |||
100 | if (down_interruptible(&nf_sockopt_mutex) != 0) | ||
101 | return -EINTR; | ||
102 | |||
103 | list_for_each(i, &nf_sockopts) { | ||
104 | struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i; | ||
105 | if (ops->pf == reg->pf | ||
106 | && (overlap(ops->set_optmin, ops->set_optmax, | ||
107 | reg->set_optmin, reg->set_optmax) | ||
108 | || overlap(ops->get_optmin, ops->get_optmax, | ||
109 | reg->get_optmin, reg->get_optmax))) { | ||
110 | NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", | ||
111 | ops->set_optmin, ops->set_optmax, | ||
112 | ops->get_optmin, ops->get_optmax, | ||
113 | reg->set_optmin, reg->set_optmax, | ||
114 | reg->get_optmin, reg->get_optmax); | ||
115 | ret = -EBUSY; | ||
116 | goto out; | ||
117 | } | ||
118 | } | ||
119 | |||
120 | list_add(®->list, &nf_sockopts); | ||
121 | out: | ||
122 | up(&nf_sockopt_mutex); | ||
123 | return ret; | ||
124 | } | ||
125 | |||
126 | void nf_unregister_sockopt(struct nf_sockopt_ops *reg) | ||
127 | { | ||
128 | /* No point being interruptible: we're probably in cleanup_module() */ | ||
129 | restart: | ||
130 | down(&nf_sockopt_mutex); | ||
131 | if (reg->use != 0) { | ||
132 | /* To be woken by nf_sockopt call... */ | ||
133 | /* FIXME: Stuart Young's name appears gratuitously. */ | ||
134 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
135 | reg->cleanup_task = current; | ||
136 | up(&nf_sockopt_mutex); | ||
137 | schedule(); | ||
138 | goto restart; | ||
139 | } | ||
140 | list_del(®->list); | ||
141 | up(&nf_sockopt_mutex); | ||
142 | } | ||
143 | |||
/*
 * nf_sockopt - dispatch a get/setsockopt call to a registered handler
 * @sk:  socket the option call arrived on
 * @pf:  protocol family to match against registered nf_sockopt_ops
 * @val: option number
 * @opt: userspace option buffer
 * @len: in/out option length (dereferenced for set, passed through for get)
 * @get: non-zero for getsockopt, zero for setsockopt
 *
 * Walks nf_sockopts for an ops entry whose family matches and whose
 * [optmin, optmax) range contains @val.  The handler itself is invoked
 * WITHOUT nf_sockopt_mutex held (it may sleep / copy to userspace);
 * instead ops->use is bumped so nf_unregister_sockopt() waits for us.
 *
 * Returns the handler's result, -EINTR if interrupted while waiting for
 * the mutex, or -ENOPROTOOPT if no handler claims the option.
 */
static int nf_sockopt(struct sock *sk, int pf, int val,
		      char __user *opt, int *len, int get)
{
	struct list_head *i;
	struct nf_sockopt_ops *ops;
	int ret;

	if (down_interruptible(&nf_sockopt_mutex) != 0)
		return -EINTR;

	list_for_each(i, &nf_sockopts) {
		/* list_head is the first member of nf_sockopt_ops, so the
		 * node pointer doubles as the ops pointer. */
		ops = (struct nf_sockopt_ops *)i;
		if (ops->pf == pf) {
			if (get) {
				if (val >= ops->get_optmin
				    && val < ops->get_optmax) {
					/* Pin ops across the (sleepable)
					 * handler call, then drop the mutex. */
					ops->use++;
					up(&nf_sockopt_mutex);
					ret = ops->get(sk, val, opt, len);
					goto out;
				}
			} else {
				if (val >= ops->set_optmin
				    && val < ops->set_optmax) {
					ops->use++;
					up(&nf_sockopt_mutex);
					ret = ops->set(sk, val, opt, *len);
					goto out;
				}
			}
		}
	}
	up(&nf_sockopt_mutex);
	return -ENOPROTOOPT;

 out:
	/* Drop our pin and wake an unregister waiting for use == 0. */
	down(&nf_sockopt_mutex);
	ops->use--;
	if (ops->cleanup_task)
		wake_up_process(ops->cleanup_task);
	up(&nf_sockopt_mutex);
	return ret;
}
188 | |||
189 | int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt, | ||
190 | int len) | ||
191 | { | ||
192 | return nf_sockopt(sk, pf, val, opt, &len, 0); | ||
193 | } | ||
194 | |||
195 | int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len) | ||
196 | { | ||
197 | return nf_sockopt(sk, pf, val, opt, len, 1); | ||
198 | } | ||
199 | |||
/*
 * nf_iterate - run a packet through the hooks registered on one chain
 * @head:        head of the hook chain (nf_hooks[pf][hook])
 * @skb:         packet, passed by reference so hooks may replace it
 * @hook:        hook number, forwarded to each hook function
 * @indev:       input device, may be NULL
 * @outdev:      output device, may be NULL
 * @i:           in/out cursor; iteration CONTINUES from *i, and on return
 *               *i names the element whose verdict stopped us (used by
 *               nf_queue()/nf_reinject() to resume later)
 * @okfn:        the okfn forwarded to each hook
 * @hook_thresh: skip hooks whose priority is below this threshold
 *
 * Returns the first verdict that is not NF_ACCEPT/NF_REPEAT, or
 * NF_ACCEPT if the whole chain accepts.  Runs under rcu_read_lock().
 */
static unsigned int nf_iterate(struct list_head *head,
			       struct sk_buff **skb,
			       int hook,
			       const struct net_device *indev,
			       const struct net_device *outdev,
			       struct list_head **i,
			       int (*okfn)(struct sk_buff *),
			       int hook_thresh)
{
	unsigned int verdict;

	/*
	 * The caller must not block between calls to this
	 * function because of risk of continuing from deleted element.
	 */
	list_for_each_continue_rcu(*i, head) {
		struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;

		if (hook_thresh > elem->priority)
			continue;

		/* Optimization: we don't need to hold module
		   reference here, since function can't sleep. --RR */
		verdict = elem->hook(hook, skb, indev, outdev, okfn);
		if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
			if (unlikely(verdict > NF_MAX_VERDICT)) {
				NFDEBUG("Evil return from %p(%u).\n",
					elem->hook, hook);
				continue;
			}
#endif
			if (verdict != NF_REPEAT)
				return verdict;
			/* NF_REPEAT: step the cursor back so the loop's
			 * advance re-visits the same hook. */
			*i = (*i)->prev;
		}
	}
	return NF_ACCEPT;
}
239 | |||
240 | int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data) | ||
241 | { | ||
242 | int ret; | ||
243 | |||
244 | write_lock_bh(&queue_handler_lock); | ||
245 | if (queue_handler[pf].outfn) | ||
246 | ret = -EBUSY; | ||
247 | else { | ||
248 | queue_handler[pf].outfn = outfn; | ||
249 | queue_handler[pf].data = data; | ||
250 | ret = 0; | ||
251 | } | ||
252 | write_unlock_bh(&queue_handler_lock); | ||
253 | |||
254 | return ret; | ||
255 | } | ||
256 | |||
257 | /* The caller must flush their queue before this */ | ||
258 | int nf_unregister_queue_handler(int pf) | ||
259 | { | ||
260 | write_lock_bh(&queue_handler_lock); | ||
261 | queue_handler[pf].outfn = NULL; | ||
262 | queue_handler[pf].data = NULL; | ||
263 | write_unlock_bh(&queue_handler_lock); | ||
264 | |||
265 | return 0; | ||
266 | } | ||
267 | |||
/*
 * Any packet that leaves via this function must come back
 * through nf_reinject().
 *
 * Hands @skb (plus an nf_info describing where in hook traversal it was)
 * to the queue handler registered for @pf.  References are taken on the
 * hook's owning module and on every device the info records, so none of
 * them can vanish while the packet sits in userspace.
 *
 * Return value is interpreted by nf_hook_slow()/nf_reinject():
 *   1 - packet consumed (queued, or dropped because queueing failed)
 *   0 - hook module is going away; caller should continue traversal
 */
static int nf_queue(struct sk_buff *skb,
		    struct list_head *elem,
		    int pf, unsigned int hook,
		    struct net_device *indev,
		    struct net_device *outdev,
		    int (*okfn)(struct sk_buff *))
{
	int status;
	struct nf_info *info;
#ifdef CONFIG_BRIDGE_NETFILTER
	struct net_device *physindev = NULL;
	struct net_device *physoutdev = NULL;
#endif

	/* QUEUE == DROP if noone is waiting, to be safe. */
	read_lock(&queue_handler_lock);
	if (!queue_handler[pf].outfn) {
		read_unlock(&queue_handler_lock);
		kfree_skb(skb);
		return 1;
	}

	/* Atomic context: we may be called from softirq. */
	info = kmalloc(sizeof(*info), GFP_ATOMIC);
	if (!info) {
		if (net_ratelimit())
			printk(KERN_ERR "OOM queueing packet %p\n",
			       skb);
		read_unlock(&queue_handler_lock);
		kfree_skb(skb);
		return 1;
	}

	/* Snapshot the traversal state so nf_reinject() can resume. */
	*info = (struct nf_info) {
		(struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };

	/* If it's going away, ignore hook. */
	if (!try_module_get(info->elem->owner)) {
		read_unlock(&queue_handler_lock);
		kfree(info);
		return 0;
	}

	/* Bump dev refs so they don't vanish while packet is out */
	if (indev) dev_hold(indev);
	if (outdev) dev_hold(outdev);

#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge) {
		physindev = skb->nf_bridge->physindev;
		if (physindev) dev_hold(physindev);
		physoutdev = skb->nf_bridge->physoutdev;
		if (physoutdev) dev_hold(physoutdev);
	}
#endif

	/* Handler is called under the read lock so it cannot be
	 * unregistered out from under us mid-call. */
	status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data);
	read_unlock(&queue_handler_lock);

	if (status < 0) {
		/* Queueing failed: unwind every reference taken above
		 * and drop the packet. */
		if (indev) dev_put(indev);
		if (outdev) dev_put(outdev);
#ifdef CONFIG_BRIDGE_NETFILTER
		if (physindev) dev_put(physindev);
		if (physoutdev) dev_put(physoutdev);
#endif
		module_put(info->elem->owner);
		kfree(info);
		kfree_skb(skb);
		return 1;
	}
	return 1;
}
345 | |||
/* Returns 1 if okfn() needs to be executed by the caller,
 * -EPERM for NF_DROP, 0 otherwise. */
/*
 * nf_hook_slow - run *pskb through nf_hooks[pf][hook] under RCU
 *
 * Drives nf_iterate() over the chain and maps the resulting verdict:
 *   NF_ACCEPT / NF_STOP -> return 1 (caller invokes okfn)
 *   NF_DROP             -> free the skb, return -EPERM
 *   NF_QUEUE            -> hand off via nf_queue(); if the hook module
 *                          was going away (nf_queue() returned 0),
 *                          resume traversal from the same element
 */
int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
		 struct net_device *indev,
		 struct net_device *outdev,
		 int (*okfn)(struct sk_buff *),
		 int hook_thresh)
{
	struct list_head *elem;
	unsigned int verdict;
	int ret = 0;

	/* We may already have this, but read-locks nest anyway */
	rcu_read_lock();

	/* elem is the traversal cursor; nf_iterate() continues FROM it. */
	elem = &nf_hooks[pf][hook];
next_hook:
	verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
			     outdev, &elem, okfn, hook_thresh);
	if (verdict == NF_ACCEPT || verdict == NF_STOP) {
		ret = 1;
		goto unlock;
	} else if (verdict == NF_DROP) {
		kfree_skb(*pskb);
		ret = -EPERM;
	} else if (verdict == NF_QUEUE) {
		NFDEBUG("nf_hook: Verdict = QUEUE.\n");
		if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn))
			goto next_hook;
	}
unlock:
	rcu_read_unlock();
	return ret;
}
380 | |||
/*
 * nf_reinject - re-enter hook traversal for a packet returned by the
 * queue handler
 * @skb:     the packet coming back from userspace
 * @info:    traversal snapshot created by nf_queue(); freed here
 * @verdict: userspace's verdict (NF_ACCEPT/NF_DROP/NF_REPEAT/NF_QUEUE)
 *
 * Releases the device and module references nf_queue() took, validates
 * that the originating hook is still registered (otherwise forces
 * NF_DROP), then continues iterating the chain from where it stopped.
 */
void nf_reinject(struct sk_buff *skb, struct nf_info *info,
		 unsigned int verdict)
{
	struct list_head *elem = &info->elem->list;
	struct list_head *i;

	rcu_read_lock();

	/* Release those devices we held, or Alexey will kill me. */
	if (info->indev) dev_put(info->indev);
	if (info->outdev) dev_put(info->outdev);
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge) {
		if (skb->nf_bridge->physindev)
			dev_put(skb->nf_bridge->physindev);
		if (skb->nf_bridge->physoutdev)
			dev_put(skb->nf_bridge->physoutdev);
	}
#endif

	/* Drop reference to owner of hook which queued us. */
	module_put(info->elem->owner);

	/* Scan the chain for our element; if the loop falls off the end,
	 * i (and the check below, on elem) equals the list head. */
	list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
		if (i == elem)
			break;
	}

	if (elem == &nf_hooks[info->pf][info->hook]) {
		/* The module which sent it to userspace is gone. */
		NFDEBUG("%s: module disappeared, dropping packet.\n",
			__FUNCTION__);
		verdict = NF_DROP;
	}

	/* Continue traversal iff userspace said ok... */
	if (verdict == NF_REPEAT) {
		/* Step back one so nf_iterate() re-runs the same hook. */
		elem = elem->prev;
		verdict = NF_ACCEPT;
	}

	if (verdict == NF_ACCEPT) {
	next_hook:
		/* INT_MIN threshold: never skip a hook on priority. */
		verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
				     &skb, info->hook,
				     info->indev, info->outdev, &elem,
				     info->okfn, INT_MIN);
	}

	switch (verdict) {
	case NF_ACCEPT:
		info->okfn(skb);
		break;

	case NF_QUEUE:
		/* nf_queue() == 0 means the hook module vanished:
		 * resume traversal instead. */
		if (!nf_queue(skb, elem, info->pf, info->hook,
			      info->indev, info->outdev, info->okfn))
			goto next_hook;
		break;
	}
	rcu_read_unlock();

	if (verdict == NF_DROP)
		kfree_skb(skb);

	kfree(info);
	return;
}
449 | |||
450 | #ifdef CONFIG_INET | ||
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
/*
 * Re-route *pskb after netfilter mangled its addresses.  Replaces the
 * skb's dst with a freshly looked-up route and, if the new output device
 * needs more headroom, reallocates the skb (updating *pskb).
 *
 * Returns 0 on success, -1 on any routing or allocation failure.
 */
int ip_route_me_harder(struct sk_buff **pskb)
{
	struct iphdr *iph = (*pskb)->nh.iph;
	struct rtable *rt;
	struct flowi fl = {};
	struct dst_entry *odst;
	unsigned int hh_len;

	/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
	 * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
	 */
	if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
		/* Locally-sourced: plain output route lookup keyed on the
		 * (possibly rewritten) header fields. */
		fl.nl_u.ip4_u.daddr = iph->daddr;
		fl.nl_u.ip4_u.saddr = iph->saddr;
		fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
		fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
#ifdef CONFIG_IP_ROUTE_FWMARK
		fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
#endif
		fl.proto = iph->protocol;
		if (ip_route_output_key(&rt, &fl) != 0)
			return -1;

		/* Drop old route. */
		dst_release((*pskb)->dst);
		(*pskb)->dst = &rt->u.dst;
	} else {
		/* non-local src, find valid iif to satisfy
		 * rp-filter when calling ip_route_input. */
		fl.nl_u.ip4_u.daddr = iph->saddr;
		if (ip_route_output_key(&rt, &fl) != 0)
			return -1;

		odst = (*pskb)->dst;
		/* ip_route_input() installs the new dst on the skb itself;
		 * the helper route rt was only needed for its device. */
		if (ip_route_input(*pskb, iph->daddr, iph->saddr,
				   RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
			dst_release(&rt->u.dst);
			return -1;
		}
		dst_release(&rt->u.dst);
		dst_release(odst);
	}

	if ((*pskb)->dst->error)
		return -1;

	/* Change in oif may mean change in hh_len. */
	hh_len = (*pskb)->dst->dev->hard_header_len;
	if (skb_headroom(*pskb) < hh_len) {
		struct sk_buff *nskb;

		nskb = skb_realloc_headroom(*pskb, hh_len);
		if (!nskb)
			return -1;
		/* Keep socket accounting intact on the replacement skb. */
		if ((*pskb)->sk)
			skb_set_owner_w(nskb, (*pskb)->sk);
		kfree_skb(*pskb);
		*pskb = nskb;
	}

	return 0;
}
514 | EXPORT_SYMBOL(ip_route_me_harder); | ||
515 | |||
516 | int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len) | ||
517 | { | ||
518 | struct sk_buff *nskb; | ||
519 | |||
520 | if (writable_len > (*pskb)->len) | ||
521 | return 0; | ||
522 | |||
523 | /* Not exclusive use of packet? Must copy. */ | ||
524 | if (skb_shared(*pskb) || skb_cloned(*pskb)) | ||
525 | goto copy_skb; | ||
526 | |||
527 | return pskb_may_pull(*pskb, writable_len); | ||
528 | |||
529 | copy_skb: | ||
530 | nskb = skb_copy(*pskb, GFP_ATOMIC); | ||
531 | if (!nskb) | ||
532 | return 0; | ||
533 | BUG_ON(skb_is_nonlinear(nskb)); | ||
534 | |||
535 | /* Rest of kernel will get very unhappy if we pass it a | ||
536 | suddenly-orphaned skbuff */ | ||
537 | if ((*pskb)->sk) | ||
538 | skb_set_owner_w(nskb, (*pskb)->sk); | ||
539 | kfree_skb(*pskb); | ||
540 | *pskb = nskb; | ||
541 | return 1; | ||
542 | } | ||
543 | EXPORT_SYMBOL(skb_ip_make_writable); | ||
544 | #endif /*CONFIG_INET*/ | ||
545 | |||
546 | /* Internal logging interface, which relies on the real | ||
547 | LOG target modules */ | ||
548 | |||
549 | #define NF_LOG_PREFIXLEN 128 | ||
550 | |||
551 | static nf_logfn *nf_logging[NPROTO]; /* = NULL */ | ||
552 | static int reported = 0; | ||
553 | static DEFINE_SPINLOCK(nf_log_lock); | ||
554 | |||
555 | int nf_log_register(int pf, nf_logfn *logfn) | ||
556 | { | ||
557 | int ret = -EBUSY; | ||
558 | |||
559 | /* Any setup of logging members must be done before | ||
560 | * substituting pointer. */ | ||
561 | spin_lock(&nf_log_lock); | ||
562 | if (!nf_logging[pf]) { | ||
563 | rcu_assign_pointer(nf_logging[pf], logfn); | ||
564 | ret = 0; | ||
565 | } | ||
566 | spin_unlock(&nf_log_lock); | ||
567 | return ret; | ||
568 | } | ||
569 | |||
570 | void nf_log_unregister(int pf, nf_logfn *logfn) | ||
571 | { | ||
572 | spin_lock(&nf_log_lock); | ||
573 | if (nf_logging[pf] == logfn) | ||
574 | nf_logging[pf] = NULL; | ||
575 | spin_unlock(&nf_log_lock); | ||
576 | |||
577 | /* Give time to concurrent readers. */ | ||
578 | synchronize_net(); | ||
579 | } | ||
580 | |||
581 | void nf_log_packet(int pf, | ||
582 | unsigned int hooknum, | ||
583 | const struct sk_buff *skb, | ||
584 | const struct net_device *in, | ||
585 | const struct net_device *out, | ||
586 | const char *fmt, ...) | ||
587 | { | ||
588 | va_list args; | ||
589 | char prefix[NF_LOG_PREFIXLEN]; | ||
590 | nf_logfn *logfn; | ||
591 | |||
592 | rcu_read_lock(); | ||
593 | logfn = rcu_dereference(nf_logging[pf]); | ||
594 | if (logfn) { | ||
595 | va_start(args, fmt); | ||
596 | vsnprintf(prefix, sizeof(prefix), fmt, args); | ||
597 | va_end(args); | ||
598 | /* We must read logging before nf_logfn[pf] */ | ||
599 | logfn(hooknum, skb, in, out, prefix); | ||
600 | } else if (!reported) { | ||
601 | printk(KERN_WARNING "nf_log_packet: can\'t log yet, " | ||
602 | "no backend logging module loaded in!\n"); | ||
603 | reported++; | ||
604 | } | ||
605 | rcu_read_unlock(); | ||
606 | } | ||
607 | EXPORT_SYMBOL(nf_log_register); | ||
608 | EXPORT_SYMBOL(nf_log_unregister); | ||
609 | EXPORT_SYMBOL(nf_log_packet); | ||
610 | |||
611 | /* This does not belong here, but locally generated errors need it if connection | ||
612 | tracking in use: without this, connection may not be in hash table, and hence | ||
613 | manufactured ICMP or RST packets will not be associated with it. */ | ||
614 | void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); | ||
615 | |||
/*
 * nf_ct_attach - copy conntrack association from @skb to @new.
 * Used so locally manufactured packets (ICMP errors, RSTs) share the
 * originating packet's connection.  No-op unless @skb is tracked and a
 * conntrack module has installed ip_ct_attach.
 */
void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
{
	void (*attach)(struct sk_buff *, struct sk_buff *);

	/* Snapshot the hook pointer once so a concurrent module unload
	 * cannot change it between the test and the call. */
	if (skb->nfct && (attach = ip_ct_attach) != NULL) {
		mb(); /* Just to be sure: must be read before executing this */
		attach(new, skb);
	}
}
625 | |||
626 | void __init netfilter_init(void) | ||
627 | { | ||
628 | int i, h; | ||
629 | |||
630 | for (i = 0; i < NPROTO; i++) { | ||
631 | for (h = 0; h < NF_MAX_HOOKS; h++) | ||
632 | INIT_LIST_HEAD(&nf_hooks[i][h]); | ||
633 | } | ||
634 | } | ||
635 | |||
636 | EXPORT_SYMBOL(ip_ct_attach); | ||
637 | EXPORT_SYMBOL(nf_ct_attach); | ||
638 | EXPORT_SYMBOL(nf_getsockopt); | ||
639 | EXPORT_SYMBOL(nf_hook_slow); | ||
640 | EXPORT_SYMBOL(nf_hooks); | ||
641 | EXPORT_SYMBOL(nf_register_hook); | ||
642 | EXPORT_SYMBOL(nf_register_queue_handler); | ||
643 | EXPORT_SYMBOL(nf_register_sockopt); | ||
644 | EXPORT_SYMBOL(nf_reinject); | ||
645 | EXPORT_SYMBOL(nf_setsockopt); | ||
646 | EXPORT_SYMBOL(nf_unregister_hook); | ||
647 | EXPORT_SYMBOL(nf_unregister_queue_handler); | ||
648 | EXPORT_SYMBOL(nf_unregister_sockopt); | ||
diff --git a/net/core/request_sock.c b/net/core/request_sock.c index bb55675f0685..b8203de5ff07 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c | |||
@@ -32,7 +32,6 @@ | |||
32 | * Further increasing requires to change hash table size. | 32 | * Further increasing requires to change hash table size. |
33 | */ | 33 | */ |
34 | int sysctl_max_syn_backlog = 256; | 34 | int sysctl_max_syn_backlog = 256; |
35 | EXPORT_SYMBOL(sysctl_max_syn_backlog); | ||
36 | 35 | ||
37 | int reqsk_queue_alloc(struct request_sock_queue *queue, | 36 | int reqsk_queue_alloc(struct request_sock_queue *queue, |
38 | const int nr_table_entries) | 37 | const int nr_table_entries) |
@@ -53,6 +52,8 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, | |||
53 | get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); | 52 | get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); |
54 | rwlock_init(&queue->syn_wait_lock); | 53 | rwlock_init(&queue->syn_wait_lock); |
55 | queue->rskq_accept_head = queue->rskq_accept_head = NULL; | 54 | queue->rskq_accept_head = queue->rskq_accept_head = NULL; |
55 | queue->rskq_defer_accept = 0; | ||
56 | lopt->nr_table_entries = nr_table_entries; | ||
56 | 57 | ||
57 | write_lock_bh(&queue->syn_wait_lock); | 58 | write_lock_bh(&queue->syn_wait_lock); |
58 | queue->listen_opt = lopt; | 59 | queue->listen_opt = lopt; |
@@ -62,3 +63,28 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, | |||
62 | } | 63 | } |
63 | 64 | ||
64 | EXPORT_SYMBOL(reqsk_queue_alloc); | 65 | EXPORT_SYMBOL(reqsk_queue_alloc); |
66 | |||
67 | void reqsk_queue_destroy(struct request_sock_queue *queue) | ||
68 | { | ||
69 | /* make all the listen_opt local to us */ | ||
70 | struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); | ||
71 | |||
72 | if (lopt->qlen != 0) { | ||
73 | int i; | ||
74 | |||
75 | for (i = 0; i < lopt->nr_table_entries; i++) { | ||
76 | struct request_sock *req; | ||
77 | |||
78 | while ((req = lopt->syn_table[i]) != NULL) { | ||
79 | lopt->syn_table[i] = req->dl_next; | ||
80 | lopt->qlen--; | ||
81 | reqsk_free(req); | ||
82 | } | ||
83 | } | ||
84 | } | ||
85 | |||
86 | BUG_TRAP(lopt->qlen == 0); | ||
87 | kfree(lopt); | ||
88 | } | ||
89 | |||
90 | EXPORT_SYMBOL(reqsk_queue_destroy); | ||
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4b1bb30e6381..9bed7569ce3f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c | |||
@@ -148,7 +148,7 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) | |||
148 | { | 148 | { |
149 | int err = 0; | 149 | int err = 0; |
150 | 150 | ||
151 | NETLINK_CB(skb).dst_groups = group; | 151 | NETLINK_CB(skb).dst_group = group; |
152 | if (echo) | 152 | if (echo) |
153 | atomic_inc(&skb->users); | 153 | atomic_inc(&skb->users); |
154 | netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); | 154 | netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); |
@@ -458,8 +458,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) | |||
458 | kfree_skb(skb); | 458 | kfree_skb(skb); |
459 | return; | 459 | return; |
460 | } | 460 | } |
461 | NETLINK_CB(skb).dst_groups = RTMGRP_LINK; | 461 | NETLINK_CB(skb).dst_group = RTNLGRP_LINK; |
462 | netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL); | 462 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL); |
463 | } | 463 | } |
464 | 464 | ||
465 | static int rtnetlink_done(struct netlink_callback *cb) | 465 | static int rtnetlink_done(struct netlink_callback *cb) |
@@ -708,7 +708,8 @@ void __init rtnetlink_init(void) | |||
708 | if (!rta_buf) | 708 | if (!rta_buf) |
709 | panic("rtnetlink_init: cannot allocate rta_buf\n"); | 709 | panic("rtnetlink_init: cannot allocate rta_buf\n"); |
710 | 710 | ||
711 | rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv); | 711 | rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv, |
712 | THIS_MODULE); | ||
712 | if (rtnl == NULL) | 713 | if (rtnl == NULL) |
713 | panic("rtnetlink_init: cannot initialize rtnetlink\n"); | 714 | panic("rtnetlink_init: cannot initialize rtnetlink\n"); |
714 | netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); | 715 | netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); |
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7eab867ede59..f80a28785610 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c | |||
@@ -68,7 +68,10 @@ | |||
68 | #include <asm/uaccess.h> | 68 | #include <asm/uaccess.h> |
69 | #include <asm/system.h> | 69 | #include <asm/system.h> |
70 | 70 | ||
71 | static kmem_cache_t *skbuff_head_cache; | 71 | static kmem_cache_t *skbuff_head_cache __read_mostly; |
72 | static kmem_cache_t *skbuff_fclone_cache __read_mostly; | ||
73 | |||
74 | struct timeval __read_mostly skb_tv_base; | ||
72 | 75 | ||
73 | /* | 76 | /* |
74 | * Keep out-of-line to prevent kernel bloat. | 77 | * Keep out-of-line to prevent kernel bloat. |
@@ -118,7 +121,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) | |||
118 | */ | 121 | */ |
119 | 122 | ||
120 | /** | 123 | /** |
121 | * alloc_skb - allocate a network buffer | 124 | * __alloc_skb - allocate a network buffer |
122 | * @size: size to allocate | 125 | * @size: size to allocate |
123 | * @gfp_mask: allocation mask | 126 | * @gfp_mask: allocation mask |
124 | * | 127 | * |
@@ -129,14 +132,20 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) | |||
129 | * Buffers may only be allocated from interrupts using a @gfp_mask of | 132 | * Buffers may only be allocated from interrupts using a @gfp_mask of |
130 | * %GFP_ATOMIC. | 133 | * %GFP_ATOMIC. |
131 | */ | 134 | */ |
132 | struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask) | 135 | struct sk_buff *__alloc_skb(unsigned int size, unsigned int __nocast gfp_mask, |
136 | int fclone) | ||
133 | { | 137 | { |
134 | struct sk_buff *skb; | 138 | struct sk_buff *skb; |
135 | u8 *data; | 139 | u8 *data; |
136 | 140 | ||
137 | /* Get the HEAD */ | 141 | /* Get the HEAD */ |
138 | skb = kmem_cache_alloc(skbuff_head_cache, | 142 | if (fclone) |
139 | gfp_mask & ~__GFP_DMA); | 143 | skb = kmem_cache_alloc(skbuff_fclone_cache, |
144 | gfp_mask & ~__GFP_DMA); | ||
145 | else | ||
146 | skb = kmem_cache_alloc(skbuff_head_cache, | ||
147 | gfp_mask & ~__GFP_DMA); | ||
148 | |||
140 | if (!skb) | 149 | if (!skb) |
141 | goto out; | 150 | goto out; |
142 | 151 | ||
@@ -153,7 +162,15 @@ struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask) | |||
153 | skb->data = data; | 162 | skb->data = data; |
154 | skb->tail = data; | 163 | skb->tail = data; |
155 | skb->end = data + size; | 164 | skb->end = data + size; |
165 | if (fclone) { | ||
166 | struct sk_buff *child = skb + 1; | ||
167 | atomic_t *fclone_ref = (atomic_t *) (child + 1); | ||
156 | 168 | ||
169 | skb->fclone = SKB_FCLONE_ORIG; | ||
170 | atomic_set(fclone_ref, 1); | ||
171 | |||
172 | child->fclone = SKB_FCLONE_UNAVAILABLE; | ||
173 | } | ||
157 | atomic_set(&(skb_shinfo(skb)->dataref), 1); | 174 | atomic_set(&(skb_shinfo(skb)->dataref), 1); |
158 | skb_shinfo(skb)->nr_frags = 0; | 175 | skb_shinfo(skb)->nr_frags = 0; |
159 | skb_shinfo(skb)->tso_size = 0; | 176 | skb_shinfo(skb)->tso_size = 0; |
@@ -266,8 +283,34 @@ void skb_release_data(struct sk_buff *skb) | |||
266 | */ | 283 | */ |
267 | void kfree_skbmem(struct sk_buff *skb) | 284 | void kfree_skbmem(struct sk_buff *skb) |
268 | { | 285 | { |
286 | struct sk_buff *other; | ||
287 | atomic_t *fclone_ref; | ||
288 | |||
269 | skb_release_data(skb); | 289 | skb_release_data(skb); |
270 | kmem_cache_free(skbuff_head_cache, skb); | 290 | switch (skb->fclone) { |
291 | case SKB_FCLONE_UNAVAILABLE: | ||
292 | kmem_cache_free(skbuff_head_cache, skb); | ||
293 | break; | ||
294 | |||
295 | case SKB_FCLONE_ORIG: | ||
296 | fclone_ref = (atomic_t *) (skb + 2); | ||
297 | if (atomic_dec_and_test(fclone_ref)) | ||
298 | kmem_cache_free(skbuff_fclone_cache, skb); | ||
299 | break; | ||
300 | |||
301 | case SKB_FCLONE_CLONE: | ||
302 | fclone_ref = (atomic_t *) (skb + 1); | ||
303 | other = skb - 1; | ||
304 | |||
305 | /* The clone portion is available for | ||
306 | * fast-cloning again. | ||
307 | */ | ||
308 | skb->fclone = SKB_FCLONE_UNAVAILABLE; | ||
309 | |||
310 | if (atomic_dec_and_test(fclone_ref)) | ||
311 | kmem_cache_free(skbuff_fclone_cache, other); | ||
312 | break; | ||
313 | }; | ||
271 | } | 314 | } |
272 | 315 | ||
273 | /** | 316 | /** |
@@ -281,8 +324,6 @@ void kfree_skbmem(struct sk_buff *skb) | |||
281 | 324 | ||
282 | void __kfree_skb(struct sk_buff *skb) | 325 | void __kfree_skb(struct sk_buff *skb) |
283 | { | 326 | { |
284 | BUG_ON(skb->list != NULL); | ||
285 | |||
286 | dst_release(skb->dst); | 327 | dst_release(skb->dst); |
287 | #ifdef CONFIG_XFRM | 328 | #ifdef CONFIG_XFRM |
288 | secpath_put(skb->sp); | 329 | secpath_put(skb->sp); |
@@ -302,7 +343,6 @@ void __kfree_skb(struct sk_buff *skb) | |||
302 | skb->tc_index = 0; | 343 | skb->tc_index = 0; |
303 | #ifdef CONFIG_NET_CLS_ACT | 344 | #ifdef CONFIG_NET_CLS_ACT |
304 | skb->tc_verd = 0; | 345 | skb->tc_verd = 0; |
305 | skb->tc_classid = 0; | ||
306 | #endif | 346 | #endif |
307 | #endif | 347 | #endif |
308 | 348 | ||
@@ -325,19 +365,27 @@ void __kfree_skb(struct sk_buff *skb) | |||
325 | 365 | ||
326 | struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) | 366 | struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) |
327 | { | 367 | { |
328 | struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); | 368 | struct sk_buff *n; |
329 | 369 | ||
330 | if (!n) | 370 | n = skb + 1; |
331 | return NULL; | 371 | if (skb->fclone == SKB_FCLONE_ORIG && |
372 | n->fclone == SKB_FCLONE_UNAVAILABLE) { | ||
373 | atomic_t *fclone_ref = (atomic_t *) (n + 1); | ||
374 | n->fclone = SKB_FCLONE_CLONE; | ||
375 | atomic_inc(fclone_ref); | ||
376 | } else { | ||
377 | n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); | ||
378 | if (!n) | ||
379 | return NULL; | ||
380 | n->fclone = SKB_FCLONE_UNAVAILABLE; | ||
381 | } | ||
332 | 382 | ||
333 | #define C(x) n->x = skb->x | 383 | #define C(x) n->x = skb->x |
334 | 384 | ||
335 | n->next = n->prev = NULL; | 385 | n->next = n->prev = NULL; |
336 | n->list = NULL; | ||
337 | n->sk = NULL; | 386 | n->sk = NULL; |
338 | C(stamp); | 387 | C(tstamp); |
339 | C(dev); | 388 | C(dev); |
340 | C(real_dev); | ||
341 | C(h); | 389 | C(h); |
342 | C(nh); | 390 | C(nh); |
343 | C(mac); | 391 | C(mac); |
@@ -361,7 +409,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) | |||
361 | n->destructor = NULL; | 409 | n->destructor = NULL; |
362 | #ifdef CONFIG_NETFILTER | 410 | #ifdef CONFIG_NETFILTER |
363 | C(nfmark); | 411 | C(nfmark); |
364 | C(nfcache); | ||
365 | C(nfct); | 412 | C(nfct); |
366 | nf_conntrack_get(skb->nfct); | 413 | nf_conntrack_get(skb->nfct); |
367 | C(nfctinfo); | 414 | C(nfctinfo); |
@@ -370,9 +417,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) | |||
370 | nf_bridge_get(skb->nf_bridge); | 417 | nf_bridge_get(skb->nf_bridge); |
371 | #endif | 418 | #endif |
372 | #endif /*CONFIG_NETFILTER*/ | 419 | #endif /*CONFIG_NETFILTER*/ |
373 | #if defined(CONFIG_HIPPI) | ||
374 | C(private); | ||
375 | #endif | ||
376 | #ifdef CONFIG_NET_SCHED | 420 | #ifdef CONFIG_NET_SCHED |
377 | C(tc_index); | 421 | C(tc_index); |
378 | #ifdef CONFIG_NET_CLS_ACT | 422 | #ifdef CONFIG_NET_CLS_ACT |
@@ -380,7 +424,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) | |||
380 | n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd); | 424 | n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd); |
381 | n->tc_verd = CLR_TC_MUNGED(n->tc_verd); | 425 | n->tc_verd = CLR_TC_MUNGED(n->tc_verd); |
382 | C(input_dev); | 426 | C(input_dev); |
383 | C(tc_classid); | ||
384 | #endif | 427 | #endif |
385 | 428 | ||
386 | #endif | 429 | #endif |
@@ -404,10 +447,8 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) | |||
404 | */ | 447 | */ |
405 | unsigned long offset = new->data - old->data; | 448 | unsigned long offset = new->data - old->data; |
406 | 449 | ||
407 | new->list = NULL; | ||
408 | new->sk = NULL; | 450 | new->sk = NULL; |
409 | new->dev = old->dev; | 451 | new->dev = old->dev; |
410 | new->real_dev = old->real_dev; | ||
411 | new->priority = old->priority; | 452 | new->priority = old->priority; |
412 | new->protocol = old->protocol; | 453 | new->protocol = old->protocol; |
413 | new->dst = dst_clone(old->dst); | 454 | new->dst = dst_clone(old->dst); |
@@ -419,12 +460,12 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) | |||
419 | new->mac.raw = old->mac.raw + offset; | 460 | new->mac.raw = old->mac.raw + offset; |
420 | memcpy(new->cb, old->cb, sizeof(old->cb)); | 461 | memcpy(new->cb, old->cb, sizeof(old->cb)); |
421 | new->local_df = old->local_df; | 462 | new->local_df = old->local_df; |
463 | new->fclone = SKB_FCLONE_UNAVAILABLE; | ||
422 | new->pkt_type = old->pkt_type; | 464 | new->pkt_type = old->pkt_type; |
423 | new->stamp = old->stamp; | 465 | new->tstamp = old->tstamp; |
424 | new->destructor = NULL; | 466 | new->destructor = NULL; |
425 | #ifdef CONFIG_NETFILTER | 467 | #ifdef CONFIG_NETFILTER |
426 | new->nfmark = old->nfmark; | 468 | new->nfmark = old->nfmark; |
427 | new->nfcache = old->nfcache; | ||
428 | new->nfct = old->nfct; | 469 | new->nfct = old->nfct; |
429 | nf_conntrack_get(old->nfct); | 470 | nf_conntrack_get(old->nfct); |
430 | new->nfctinfo = old->nfctinfo; | 471 | new->nfctinfo = old->nfctinfo; |
@@ -1344,50 +1385,43 @@ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) | |||
1344 | __skb_queue_tail(list, newsk); | 1385 | __skb_queue_tail(list, newsk); |
1345 | spin_unlock_irqrestore(&list->lock, flags); | 1386 | spin_unlock_irqrestore(&list->lock, flags); |
1346 | } | 1387 | } |
1388 | |||
1347 | /** | 1389 | /** |
1348 | * skb_unlink - remove a buffer from a list | 1390 | * skb_unlink - remove a buffer from a list |
1349 | * @skb: buffer to remove | 1391 | * @skb: buffer to remove |
1392 | * @list: list to use | ||
1350 | * | 1393 | * |
1351 | * Place a packet after a given packet in a list. The list locks are taken | 1394 | * Remove a packet from a list. The list locks are taken and this |
1352 | * and this function is atomic with respect to other list locked calls | 1395 | * function is atomic with respect to other list locked calls |
1353 | * | 1396 | * |
1354 | * Works even without knowing the list it is sitting on, which can be | 1397 | * You must know what list the SKB is on. |
1355 | * handy at times. It also means that THE LIST MUST EXIST when you | ||
1356 | * unlink. Thus a list must have its contents unlinked before it is | ||
1357 | * destroyed. | ||
1358 | */ | 1398 | */ |
1359 | void skb_unlink(struct sk_buff *skb) | 1399 | void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) |
1360 | { | 1400 | { |
1361 | struct sk_buff_head *list = skb->list; | 1401 | unsigned long flags; |
1362 | |||
1363 | if (list) { | ||
1364 | unsigned long flags; | ||
1365 | 1402 | ||
1366 | spin_lock_irqsave(&list->lock, flags); | 1403 | spin_lock_irqsave(&list->lock, flags); |
1367 | if (skb->list == list) | 1404 | __skb_unlink(skb, list); |
1368 | __skb_unlink(skb, skb->list); | 1405 | spin_unlock_irqrestore(&list->lock, flags); |
1369 | spin_unlock_irqrestore(&list->lock, flags); | ||
1370 | } | ||
1371 | } | 1406 | } |
1372 | 1407 | ||
1373 | |||
1374 | /** | 1408 | /** |
1375 | * skb_append - append a buffer | 1409 | * skb_append - append a buffer |
1376 | * @old: buffer to insert after | 1410 | * @old: buffer to insert after |
1377 | * @newsk: buffer to insert | 1411 | * @newsk: buffer to insert |
1412 | * @list: list to use | ||
1378 | * | 1413 | * |
1379 | * Place a packet after a given packet in a list. The list locks are taken | 1414 | * Place a packet after a given packet in a list. The list locks are taken |
1380 | * and this function is atomic with respect to other list locked calls. | 1415 | * and this function is atomic with respect to other list locked calls. |
1381 | * A buffer cannot be placed on two lists at the same time. | 1416 | * A buffer cannot be placed on two lists at the same time. |
1382 | */ | 1417 | */ |
1383 | 1418 | void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) | |
1384 | void skb_append(struct sk_buff *old, struct sk_buff *newsk) | ||
1385 | { | 1419 | { |
1386 | unsigned long flags; | 1420 | unsigned long flags; |
1387 | 1421 | ||
1388 | spin_lock_irqsave(&old->list->lock, flags); | 1422 | spin_lock_irqsave(&list->lock, flags); |
1389 | __skb_append(old, newsk); | 1423 | __skb_append(old, newsk, list); |
1390 | spin_unlock_irqrestore(&old->list->lock, flags); | 1424 | spin_unlock_irqrestore(&list->lock, flags); |
1391 | } | 1425 | } |
1392 | 1426 | ||
1393 | 1427 | ||
@@ -1395,19 +1429,21 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk) | |||
1395 | * skb_insert - insert a buffer | 1429 | * skb_insert - insert a buffer |
1396 | * @old: buffer to insert before | 1430 | * @old: buffer to insert before |
1397 | * @newsk: buffer to insert | 1431 | * @newsk: buffer to insert |
1432 | * @list: list to use | ||
1433 | * | ||
1434 | * Place a packet before a given packet in a list. The list locks are | ||
1435 | * taken and this function is atomic with respect to other list locked | ||
1436 | * calls. | ||
1398 | * | 1437 | * |
1399 | * Place a packet before a given packet in a list. The list locks are taken | ||
1400 | * and this function is atomic with respect to other list locked calls | ||
1401 | * A buffer cannot be placed on two lists at the same time. | 1438 | * A buffer cannot be placed on two lists at the same time. |
1402 | */ | 1439 | */ |
1403 | 1440 | void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) | |
1404 | void skb_insert(struct sk_buff *old, struct sk_buff *newsk) | ||
1405 | { | 1441 | { |
1406 | unsigned long flags; | 1442 | unsigned long flags; |
1407 | 1443 | ||
1408 | spin_lock_irqsave(&old->list->lock, flags); | 1444 | spin_lock_irqsave(&list->lock, flags); |
1409 | __skb_insert(newsk, old->prev, old, old->list); | 1445 | __skb_insert(newsk, old->prev, old, list); |
1410 | spin_unlock_irqrestore(&old->list->lock, flags); | 1446 | spin_unlock_irqrestore(&list->lock, flags); |
1411 | } | 1447 | } |
1412 | 1448 | ||
1413 | #if 0 | 1449 | #if 0 |
@@ -1663,12 +1699,23 @@ void __init skb_init(void) | |||
1663 | NULL, NULL); | 1699 | NULL, NULL); |
1664 | if (!skbuff_head_cache) | 1700 | if (!skbuff_head_cache) |
1665 | panic("cannot create skbuff cache"); | 1701 | panic("cannot create skbuff cache"); |
1702 | |||
1703 | skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", | ||
1704 | (2*sizeof(struct sk_buff)) + | ||
1705 | sizeof(atomic_t), | ||
1706 | 0, | ||
1707 | SLAB_HWCACHE_ALIGN, | ||
1708 | NULL, NULL); | ||
1709 | if (!skbuff_fclone_cache) | ||
1710 | panic("cannot create skbuff cache"); | ||
1711 | |||
1712 | do_gettimeofday(&skb_tv_base); | ||
1666 | } | 1713 | } |
1667 | 1714 | ||
1668 | EXPORT_SYMBOL(___pskb_trim); | 1715 | EXPORT_SYMBOL(___pskb_trim); |
1669 | EXPORT_SYMBOL(__kfree_skb); | 1716 | EXPORT_SYMBOL(__kfree_skb); |
1670 | EXPORT_SYMBOL(__pskb_pull_tail); | 1717 | EXPORT_SYMBOL(__pskb_pull_tail); |
1671 | EXPORT_SYMBOL(alloc_skb); | 1718 | EXPORT_SYMBOL(__alloc_skb); |
1672 | EXPORT_SYMBOL(pskb_copy); | 1719 | EXPORT_SYMBOL(pskb_copy); |
1673 | EXPORT_SYMBOL(pskb_expand_head); | 1720 | EXPORT_SYMBOL(pskb_expand_head); |
1674 | EXPORT_SYMBOL(skb_checksum); | 1721 | EXPORT_SYMBOL(skb_checksum); |
@@ -1696,3 +1743,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read); | |||
1696 | EXPORT_SYMBOL(skb_seq_read); | 1743 | EXPORT_SYMBOL(skb_seq_read); |
1697 | EXPORT_SYMBOL(skb_abort_seq_read); | 1744 | EXPORT_SYMBOL(skb_abort_seq_read); |
1698 | EXPORT_SYMBOL(skb_find_text); | 1745 | EXPORT_SYMBOL(skb_find_text); |
1746 | EXPORT_SYMBOL(skb_tv_base); | ||
diff --git a/net/core/sock.c b/net/core/sock.c index 12f6d9a2a522..ccd10fd65682 100644 --- a/net/core/sock.c +++ b/net/core/sock.c | |||
@@ -260,7 +260,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, | |||
260 | 260 | ||
261 | if (val > sysctl_wmem_max) | 261 | if (val > sysctl_wmem_max) |
262 | val = sysctl_wmem_max; | 262 | val = sysctl_wmem_max; |
263 | 263 | set_sndbuf: | |
264 | sk->sk_userlocks |= SOCK_SNDBUF_LOCK; | 264 | sk->sk_userlocks |= SOCK_SNDBUF_LOCK; |
265 | if ((val * 2) < SOCK_MIN_SNDBUF) | 265 | if ((val * 2) < SOCK_MIN_SNDBUF) |
266 | sk->sk_sndbuf = SOCK_MIN_SNDBUF; | 266 | sk->sk_sndbuf = SOCK_MIN_SNDBUF; |
@@ -274,6 +274,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname, | |||
274 | sk->sk_write_space(sk); | 274 | sk->sk_write_space(sk); |
275 | break; | 275 | break; |
276 | 276 | ||
277 | case SO_SNDBUFFORCE: | ||
278 | if (!capable(CAP_NET_ADMIN)) { | ||
279 | ret = -EPERM; | ||
280 | break; | ||
281 | } | ||
282 | goto set_sndbuf; | ||
283 | |||
277 | case SO_RCVBUF: | 284 | case SO_RCVBUF: |
278 | /* Don't error on this BSD doesn't and if you think | 285 | /* Don't error on this BSD doesn't and if you think |
279 | about it this is right. Otherwise apps have to | 286 | about it this is right. Otherwise apps have to |
@@ -282,7 +289,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, | |||
282 | 289 | ||
283 | if (val > sysctl_rmem_max) | 290 | if (val > sysctl_rmem_max) |
284 | val = sysctl_rmem_max; | 291 | val = sysctl_rmem_max; |
285 | 292 | set_rcvbuf: | |
286 | sk->sk_userlocks |= SOCK_RCVBUF_LOCK; | 293 | sk->sk_userlocks |= SOCK_RCVBUF_LOCK; |
287 | /* FIXME: is this lower bound the right one? */ | 294 | /* FIXME: is this lower bound the right one? */ |
288 | if ((val * 2) < SOCK_MIN_RCVBUF) | 295 | if ((val * 2) < SOCK_MIN_RCVBUF) |
@@ -291,6 +298,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname, | |||
291 | sk->sk_rcvbuf = val * 2; | 298 | sk->sk_rcvbuf = val * 2; |
292 | break; | 299 | break; |
293 | 300 | ||
301 | case SO_RCVBUFFORCE: | ||
302 | if (!capable(CAP_NET_ADMIN)) { | ||
303 | ret = -EPERM; | ||
304 | break; | ||
305 | } | ||
306 | goto set_rcvbuf; | ||
307 | |||
294 | case SO_KEEPALIVE: | 308 | case SO_KEEPALIVE: |
295 | #ifdef CONFIG_INET | 309 | #ifdef CONFIG_INET |
296 | if (sk->sk_protocol == IPPROTO_TCP) | 310 | if (sk->sk_protocol == IPPROTO_TCP) |
@@ -686,6 +700,80 @@ void sk_free(struct sock *sk) | |||
686 | module_put(owner); | 700 | module_put(owner); |
687 | } | 701 | } |
688 | 702 | ||
703 | struct sock *sk_clone(const struct sock *sk, const unsigned int __nocast priority) | ||
704 | { | ||
705 | struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0); | ||
706 | |||
707 | if (newsk != NULL) { | ||
708 | struct sk_filter *filter; | ||
709 | |||
710 | memcpy(newsk, sk, sk->sk_prot->obj_size); | ||
711 | |||
712 | /* SANITY */ | ||
713 | sk_node_init(&newsk->sk_node); | ||
714 | sock_lock_init(newsk); | ||
715 | bh_lock_sock(newsk); | ||
716 | |||
717 | atomic_set(&newsk->sk_rmem_alloc, 0); | ||
718 | atomic_set(&newsk->sk_wmem_alloc, 0); | ||
719 | atomic_set(&newsk->sk_omem_alloc, 0); | ||
720 | skb_queue_head_init(&newsk->sk_receive_queue); | ||
721 | skb_queue_head_init(&newsk->sk_write_queue); | ||
722 | |||
723 | rwlock_init(&newsk->sk_dst_lock); | ||
724 | rwlock_init(&newsk->sk_callback_lock); | ||
725 | |||
726 | newsk->sk_dst_cache = NULL; | ||
727 | newsk->sk_wmem_queued = 0; | ||
728 | newsk->sk_forward_alloc = 0; | ||
729 | newsk->sk_send_head = NULL; | ||
730 | newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; | ||
731 | newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; | ||
732 | |||
733 | sock_reset_flag(newsk, SOCK_DONE); | ||
734 | skb_queue_head_init(&newsk->sk_error_queue); | ||
735 | |||
736 | filter = newsk->sk_filter; | ||
737 | if (filter != NULL) | ||
738 | sk_filter_charge(newsk, filter); | ||
739 | |||
740 | if (unlikely(xfrm_sk_clone_policy(newsk))) { | ||
741 | /* It is still raw copy of parent, so invalidate | ||
742 | * destructor and make plain sk_free() */ | ||
743 | newsk->sk_destruct = NULL; | ||
744 | sk_free(newsk); | ||
745 | newsk = NULL; | ||
746 | goto out; | ||
747 | } | ||
748 | |||
749 | newsk->sk_err = 0; | ||
750 | newsk->sk_priority = 0; | ||
751 | atomic_set(&newsk->sk_refcnt, 2); | ||
752 | |||
753 | /* | ||
754 | * Increment the counter in the same struct proto as the master | ||
755 | * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that | ||
756 | * is the same as sk->sk_prot->socks, as this field was copied | ||
757 | * with memcpy). | ||
758 | * | ||
759 | * This _changes_ the previous behaviour, where | ||
760 | * tcp_create_openreq_child always was incrementing the | ||
761 | * equivalent to tcp_prot->socks (inet_sock_nr), so this have | ||
762 | * to be taken into account in all callers. -acme | ||
763 | */ | ||
764 | sk_refcnt_debug_inc(newsk); | ||
765 | newsk->sk_socket = NULL; | ||
766 | newsk->sk_sleep = NULL; | ||
767 | |||
768 | if (newsk->sk_prot->sockets_allocated) | ||
769 | atomic_inc(newsk->sk_prot->sockets_allocated); | ||
770 | } | ||
771 | out: | ||
772 | return newsk; | ||
773 | } | ||
774 | |||
775 | EXPORT_SYMBOL_GPL(sk_clone); | ||
776 | |||
689 | void __init sk_init(void) | 777 | void __init sk_init(void) |
690 | { | 778 | { |
691 | if (num_physpages <= 4096) { | 779 | if (num_physpages <= 4096) { |
@@ -1353,11 +1441,7 @@ void sk_common_release(struct sock *sk) | |||
1353 | 1441 | ||
1354 | xfrm_sk_free_policy(sk); | 1442 | xfrm_sk_free_policy(sk); |
1355 | 1443 | ||
1356 | #ifdef INET_REFCNT_DEBUG | 1444 | sk_refcnt_debug_release(sk); |
1357 | if (atomic_read(&sk->sk_refcnt) != 1) | ||
1358 | printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n", | ||
1359 | sk, atomic_read(&sk->sk_refcnt)); | ||
1360 | #endif | ||
1361 | sock_put(sk); | 1445 | sock_put(sk); |
1362 | } | 1446 | } |
1363 | 1447 | ||
@@ -1368,7 +1452,8 @@ static LIST_HEAD(proto_list); | |||
1368 | 1452 | ||
1369 | int proto_register(struct proto *prot, int alloc_slab) | 1453 | int proto_register(struct proto *prot, int alloc_slab) |
1370 | { | 1454 | { |
1371 | char *request_sock_slab_name; | 1455 | char *request_sock_slab_name = NULL; |
1456 | char *timewait_sock_slab_name; | ||
1372 | int rc = -ENOBUFS; | 1457 | int rc = -ENOBUFS; |
1373 | 1458 | ||
1374 | if (alloc_slab) { | 1459 | if (alloc_slab) { |
@@ -1399,6 +1484,23 @@ int proto_register(struct proto *prot, int alloc_slab) | |||
1399 | goto out_free_request_sock_slab_name; | 1484 | goto out_free_request_sock_slab_name; |
1400 | } | 1485 | } |
1401 | } | 1486 | } |
1487 | |||
1488 | if (prot->twsk_obj_size) { | ||
1489 | static const char mask[] = "tw_sock_%s"; | ||
1490 | |||
1491 | timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); | ||
1492 | |||
1493 | if (timewait_sock_slab_name == NULL) | ||
1494 | goto out_free_request_sock_slab; | ||
1495 | |||
1496 | sprintf(timewait_sock_slab_name, mask, prot->name); | ||
1497 | prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, | ||
1498 | prot->twsk_obj_size, | ||
1499 | 0, SLAB_HWCACHE_ALIGN, | ||
1500 | NULL, NULL); | ||
1501 | if (prot->twsk_slab == NULL) | ||
1502 | goto out_free_timewait_sock_slab_name; | ||
1503 | } | ||
1402 | } | 1504 | } |
1403 | 1505 | ||
1404 | write_lock(&proto_list_lock); | 1506 | write_lock(&proto_list_lock); |
@@ -1407,6 +1509,13 @@ int proto_register(struct proto *prot, int alloc_slab) | |||
1407 | rc = 0; | 1509 | rc = 0; |
1408 | out: | 1510 | out: |
1409 | return rc; | 1511 | return rc; |
1512 | out_free_timewait_sock_slab_name: | ||
1513 | kfree(timewait_sock_slab_name); | ||
1514 | out_free_request_sock_slab: | ||
1515 | if (prot->rsk_prot && prot->rsk_prot->slab) { | ||
1516 | kmem_cache_destroy(prot->rsk_prot->slab); | ||
1517 | prot->rsk_prot->slab = NULL; | ||
1518 | } | ||
1410 | out_free_request_sock_slab_name: | 1519 | out_free_request_sock_slab_name: |
1411 | kfree(request_sock_slab_name); | 1520 | kfree(request_sock_slab_name); |
1412 | out_free_sock_slab: | 1521 | out_free_sock_slab: |
@@ -1434,6 +1543,14 @@ void proto_unregister(struct proto *prot) | |||
1434 | prot->rsk_prot->slab = NULL; | 1543 | prot->rsk_prot->slab = NULL; |
1435 | } | 1544 | } |
1436 | 1545 | ||
1546 | if (prot->twsk_slab != NULL) { | ||
1547 | const char *name = kmem_cache_name(prot->twsk_slab); | ||
1548 | |||
1549 | kmem_cache_destroy(prot->twsk_slab); | ||
1550 | kfree(name); | ||
1551 | prot->twsk_slab = NULL; | ||
1552 | } | ||
1553 | |||
1437 | list_del(&prot->node); | 1554 | list_del(&prot->node); |
1438 | write_unlock(&proto_list_lock); | 1555 | write_unlock(&proto_list_lock); |
1439 | } | 1556 | } |
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8f817ad9f546..2f278c8e4743 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c | |||
@@ -9,23 +9,18 @@ | |||
9 | #include <linux/sysctl.h> | 9 | #include <linux/sysctl.h> |
10 | #include <linux/config.h> | 10 | #include <linux/config.h> |
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/socket.h> | ||
13 | #include <net/sock.h> | ||
12 | 14 | ||
13 | #ifdef CONFIG_SYSCTL | 15 | #ifdef CONFIG_SYSCTL |
14 | 16 | ||
15 | extern int netdev_max_backlog; | 17 | extern int netdev_max_backlog; |
16 | extern int netdev_budget; | ||
17 | extern int weight_p; | 18 | extern int weight_p; |
18 | extern int net_msg_cost; | ||
19 | extern int net_msg_burst; | ||
20 | 19 | ||
21 | extern __u32 sysctl_wmem_max; | 20 | extern __u32 sysctl_wmem_max; |
22 | extern __u32 sysctl_rmem_max; | 21 | extern __u32 sysctl_rmem_max; |
23 | extern __u32 sysctl_wmem_default; | ||
24 | extern __u32 sysctl_rmem_default; | ||
25 | 22 | ||
26 | extern int sysctl_core_destroy_delay; | 23 | extern int sysctl_core_destroy_delay; |
27 | extern int sysctl_optmem_max; | ||
28 | extern int sysctl_somaxconn; | ||
29 | 24 | ||
30 | #ifdef CONFIG_NET_DIVERT | 25 | #ifdef CONFIG_NET_DIVERT |
31 | extern char sysctl_divert_version[]; | 26 | extern char sysctl_divert_version[]; |
diff --git a/net/core/utils.c b/net/core/utils.c index 88eb8b68e26b..7b5970fc9e40 100644 --- a/net/core/utils.c +++ b/net/core/utils.c | |||
@@ -16,7 +16,9 @@ | |||
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/jiffies.h> | 17 | #include <linux/jiffies.h> |
18 | #include <linux/kernel.h> | 18 | #include <linux/kernel.h> |
19 | #include <linux/inet.h> | ||
19 | #include <linux/mm.h> | 20 | #include <linux/mm.h> |
21 | #include <linux/net.h> | ||
20 | #include <linux/string.h> | 22 | #include <linux/string.h> |
21 | #include <linux/types.h> | 23 | #include <linux/types.h> |
22 | #include <linux/random.h> | 24 | #include <linux/random.h> |
diff --git a/net/core/wireless.c b/net/core/wireless.c index 3ff5639c0b78..5caae2399f3a 100644 --- a/net/core/wireless.c +++ b/net/core/wireless.c | |||
@@ -571,10 +571,6 @@ static int wireless_seq_show(struct seq_file *seq, void *v) | |||
571 | return 0; | 571 | return 0; |
572 | } | 572 | } |
573 | 573 | ||
574 | extern void *dev_seq_start(struct seq_file *seq, loff_t *pos); | ||
575 | extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos); | ||
576 | extern void dev_seq_stop(struct seq_file *seq, void *v); | ||
577 | |||
578 | static struct seq_operations wireless_seq_ops = { | 574 | static struct seq_operations wireless_seq_ops = { |
579 | .start = dev_seq_start, | 575 | .start = dev_seq_start, |
580 | .next = dev_seq_next, | 576 | .next = dev_seq_next, |
@@ -1144,8 +1140,8 @@ static inline void rtmsg_iwinfo(struct net_device * dev, | |||
1144 | kfree_skb(skb); | 1140 | kfree_skb(skb); |
1145 | return; | 1141 | return; |
1146 | } | 1142 | } |
1147 | NETLINK_CB(skb).dst_groups = RTMGRP_LINK; | 1143 | NETLINK_CB(skb).dst_group = RTNLGRP_LINK; |
1148 | netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_ATOMIC); | 1144 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC); |
1149 | } | 1145 | } |
1150 | #endif /* WE_EVENT_NETLINK */ | 1146 | #endif /* WE_EVENT_NETLINK */ |
1151 | 1147 | ||
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig new file mode 100644 index 000000000000..187ac182e24b --- /dev/null +++ b/net/dccp/Kconfig | |||
@@ -0,0 +1,50 @@ | |||
1 | menu "DCCP Configuration (EXPERIMENTAL)" | ||
2 | depends on INET && EXPERIMENTAL | ||
3 | |||
4 | config IP_DCCP | ||
5 | tristate "The DCCP Protocol (EXPERIMENTAL)" | ||
6 | ---help--- | ||
7 | Datagram Congestion Control Protocol | ||
8 | |||
9 | From draft-ietf-dccp-spec-11 <http://www.icir.org/kohler/dcp/draft-ietf-dccp-spec-11.txt>. | ||
10 | |||
11 | The Datagram Congestion Control Protocol (DCCP) is a transport | ||
12 | protocol that implements bidirectional, unicast connections of | ||
13 | congestion-controlled, unreliable datagrams. It should be suitable | ||
14 | for use by applications such as streaming media, Internet telephony, | ||
15 | and on-line games | ||
16 | |||
17 | To compile this protocol support as a module, choose M here: the | ||
18 | module will be called dccp. | ||
19 | |||
20 | If in doubt, say N. | ||
21 | |||
22 | config INET_DCCP_DIAG | ||
23 | depends on IP_DCCP && INET_DIAG | ||
24 | def_tristate y if (IP_DCCP = y && INET_DIAG = y) | ||
25 | def_tristate m | ||
26 | |||
27 | source "net/dccp/ccids/Kconfig" | ||
28 | |||
29 | menu "DCCP Kernel Hacking" | ||
30 | depends on IP_DCCP && DEBUG_KERNEL=y | ||
31 | |||
32 | config IP_DCCP_DEBUG | ||
33 | bool "DCCP debug messages" | ||
34 | ---help--- | ||
35 | Only use this if you're hacking DCCP. | ||
36 | |||
37 | Just say N. | ||
38 | |||
39 | config IP_DCCP_UNLOAD_HACK | ||
40 | depends on IP_DCCP=m && IP_DCCP_CCID3=m | ||
41 | bool "DCCP control sock unload hack" | ||
42 | ---help--- | ||
43 | Enable this to be able to unload the dccp module when the it | ||
44 | has only one refcount held, the control sock one. Just execute | ||
45 | "rmmod dccp_ccid3 dccp" | ||
46 | |||
47 | Just say N. | ||
48 | endmenu | ||
49 | |||
50 | endmenu | ||
diff --git a/net/dccp/Makefile b/net/dccp/Makefile new file mode 100644 index 000000000000..fb97bb042455 --- /dev/null +++ b/net/dccp/Makefile | |||
@@ -0,0 +1,10 @@ | |||
1 | obj-$(CONFIG_IP_DCCP) += dccp.o | ||
2 | |||
3 | dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \ | ||
4 | timer.o | ||
5 | |||
6 | obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o | ||
7 | |||
8 | dccp_diag-y := diag.o | ||
9 | |||
10 | obj-y += ccids/ | ||
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c new file mode 100644 index 000000000000..9d8fc0e289ea --- /dev/null +++ b/net/dccp/ccid.c | |||
@@ -0,0 +1,139 @@ | |||
1 | /* | ||
2 | * net/dccp/ccid.c | ||
3 | * | ||
4 | * An implementation of the DCCP protocol | ||
5 | * Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
6 | * | ||
7 | * CCID infrastructure | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify it | ||
10 | * under the terms of the GNU General Public License version 2 as | ||
11 | * published by the Free Software Foundation. | ||
12 | */ | ||
13 | |||
14 | #include "ccid.h" | ||
15 | |||
16 | static struct ccid *ccids[CCID_MAX]; | ||
17 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) | ||
18 | static atomic_t ccids_lockct = ATOMIC_INIT(0); | ||
19 | static DEFINE_SPINLOCK(ccids_lock); | ||
20 | |||
21 | /* | ||
22 | * The strategy is: modifications ccids vector are short, do not sleep and | ||
23 | * veeery rare, but read access should be free of any exclusive locks. | ||
24 | */ | ||
25 | static void ccids_write_lock(void) | ||
26 | { | ||
27 | spin_lock(&ccids_lock); | ||
28 | while (atomic_read(&ccids_lockct) != 0) { | ||
29 | spin_unlock(&ccids_lock); | ||
30 | yield(); | ||
31 | spin_lock(&ccids_lock); | ||
32 | } | ||
33 | } | ||
34 | |||
35 | static inline void ccids_write_unlock(void) | ||
36 | { | ||
37 | spin_unlock(&ccids_lock); | ||
38 | } | ||
39 | |||
40 | static inline void ccids_read_lock(void) | ||
41 | { | ||
42 | atomic_inc(&ccids_lockct); | ||
43 | spin_unlock_wait(&ccids_lock); | ||
44 | } | ||
45 | |||
46 | static inline void ccids_read_unlock(void) | ||
47 | { | ||
48 | atomic_dec(&ccids_lockct); | ||
49 | } | ||
50 | |||
51 | #else | ||
52 | #define ccids_write_lock() do { } while(0) | ||
53 | #define ccids_write_unlock() do { } while(0) | ||
54 | #define ccids_read_lock() do { } while(0) | ||
55 | #define ccids_read_unlock() do { } while(0) | ||
56 | #endif | ||
57 | |||
58 | int ccid_register(struct ccid *ccid) | ||
59 | { | ||
60 | int err; | ||
61 | |||
62 | if (ccid->ccid_init == NULL) | ||
63 | return -1; | ||
64 | |||
65 | ccids_write_lock(); | ||
66 | err = -EEXIST; | ||
67 | if (ccids[ccid->ccid_id] == NULL) { | ||
68 | ccids[ccid->ccid_id] = ccid; | ||
69 | err = 0; | ||
70 | } | ||
71 | ccids_write_unlock(); | ||
72 | if (err == 0) | ||
73 | pr_info("CCID: Registered CCID %d (%s)\n", | ||
74 | ccid->ccid_id, ccid->ccid_name); | ||
75 | return err; | ||
76 | } | ||
77 | |||
78 | EXPORT_SYMBOL_GPL(ccid_register); | ||
79 | |||
80 | int ccid_unregister(struct ccid *ccid) | ||
81 | { | ||
82 | ccids_write_lock(); | ||
83 | ccids[ccid->ccid_id] = NULL; | ||
84 | ccids_write_unlock(); | ||
85 | pr_info("CCID: Unregistered CCID %d (%s)\n", | ||
86 | ccid->ccid_id, ccid->ccid_name); | ||
87 | return 0; | ||
88 | } | ||
89 | |||
90 | EXPORT_SYMBOL_GPL(ccid_unregister); | ||
91 | |||
92 | struct ccid *ccid_init(unsigned char id, struct sock *sk) | ||
93 | { | ||
94 | struct ccid *ccid; | ||
95 | |||
96 | #ifdef CONFIG_KMOD | ||
97 | if (ccids[id] == NULL) | ||
98 | request_module("net-dccp-ccid-%d", id); | ||
99 | #endif | ||
100 | ccids_read_lock(); | ||
101 | |||
102 | ccid = ccids[id]; | ||
103 | if (ccid == NULL) | ||
104 | goto out; | ||
105 | |||
106 | if (!try_module_get(ccid->ccid_owner)) | ||
107 | goto out_err; | ||
108 | |||
109 | if (ccid->ccid_init(sk) != 0) | ||
110 | goto out_module_put; | ||
111 | out: | ||
112 | ccids_read_unlock(); | ||
113 | return ccid; | ||
114 | out_module_put: | ||
115 | module_put(ccid->ccid_owner); | ||
116 | out_err: | ||
117 | ccid = NULL; | ||
118 | goto out; | ||
119 | } | ||
120 | |||
121 | EXPORT_SYMBOL_GPL(ccid_init); | ||
122 | |||
123 | void ccid_exit(struct ccid *ccid, struct sock *sk) | ||
124 | { | ||
125 | if (ccid == NULL) | ||
126 | return; | ||
127 | |||
128 | ccids_read_lock(); | ||
129 | |||
130 | if (ccids[ccid->ccid_id] != NULL) { | ||
131 | if (ccid->ccid_exit != NULL) | ||
132 | ccid->ccid_exit(sk); | ||
133 | module_put(ccid->ccid_owner); | ||
134 | } | ||
135 | |||
136 | ccids_read_unlock(); | ||
137 | } | ||
138 | |||
139 | EXPORT_SYMBOL_GPL(ccid_exit); | ||
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h new file mode 100644 index 000000000000..962f1e9e2f7e --- /dev/null +++ b/net/dccp/ccid.h | |||
@@ -0,0 +1,180 @@ | |||
1 | #ifndef _CCID_H | ||
2 | #define _CCID_H | ||
3 | /* | ||
4 | * net/dccp/ccid.h | ||
5 | * | ||
6 | * An implementation of the DCCP protocol | ||
7 | * Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
8 | * | ||
9 | * CCID infrastructure | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify it | ||
12 | * under the terms of the GNU General Public License version 2 as | ||
13 | * published by the Free Software Foundation. | ||
14 | */ | ||
15 | |||
16 | #include <net/sock.h> | ||
17 | #include <linux/dccp.h> | ||
18 | #include <linux/list.h> | ||
19 | #include <linux/module.h> | ||
20 | |||
21 | #define CCID_MAX 255 | ||
22 | |||
23 | struct ccid { | ||
24 | unsigned char ccid_id; | ||
25 | const char *ccid_name; | ||
26 | struct module *ccid_owner; | ||
27 | int (*ccid_init)(struct sock *sk); | ||
28 | void (*ccid_exit)(struct sock *sk); | ||
29 | int (*ccid_hc_rx_init)(struct sock *sk); | ||
30 | int (*ccid_hc_tx_init)(struct sock *sk); | ||
31 | void (*ccid_hc_rx_exit)(struct sock *sk); | ||
32 | void (*ccid_hc_tx_exit)(struct sock *sk); | ||
33 | void (*ccid_hc_rx_packet_recv)(struct sock *sk, | ||
34 | struct sk_buff *skb); | ||
35 | int (*ccid_hc_rx_parse_options)(struct sock *sk, | ||
36 | unsigned char option, | ||
37 | unsigned char len, u16 idx, | ||
38 | unsigned char* value); | ||
39 | void (*ccid_hc_rx_insert_options)(struct sock *sk, | ||
40 | struct sk_buff *skb); | ||
41 | void (*ccid_hc_tx_insert_options)(struct sock *sk, | ||
42 | struct sk_buff *skb); | ||
43 | void (*ccid_hc_tx_packet_recv)(struct sock *sk, | ||
44 | struct sk_buff *skb); | ||
45 | int (*ccid_hc_tx_parse_options)(struct sock *sk, | ||
46 | unsigned char option, | ||
47 | unsigned char len, u16 idx, | ||
48 | unsigned char* value); | ||
49 | int (*ccid_hc_tx_send_packet)(struct sock *sk, | ||
50 | struct sk_buff *skb, int len); | ||
51 | void (*ccid_hc_tx_packet_sent)(struct sock *sk, int more, | ||
52 | int len); | ||
53 | void (*ccid_hc_rx_get_info)(struct sock *sk, | ||
54 | struct tcp_info *info); | ||
55 | void (*ccid_hc_tx_get_info)(struct sock *sk, | ||
56 | struct tcp_info *info); | ||
57 | }; | ||
58 | |||
59 | extern int ccid_register(struct ccid *ccid); | ||
60 | extern int ccid_unregister(struct ccid *ccid); | ||
61 | |||
62 | extern struct ccid *ccid_init(unsigned char id, struct sock *sk); | ||
63 | extern void ccid_exit(struct ccid *ccid, struct sock *sk); | ||
64 | |||
65 | static inline void __ccid_get(struct ccid *ccid) | ||
66 | { | ||
67 | __module_get(ccid->ccid_owner); | ||
68 | } | ||
69 | |||
70 | static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, | ||
71 | struct sk_buff *skb, int len) | ||
72 | { | ||
73 | int rc = 0; | ||
74 | if (ccid->ccid_hc_tx_send_packet != NULL) | ||
75 | rc = ccid->ccid_hc_tx_send_packet(sk, skb, len); | ||
76 | return rc; | ||
77 | } | ||
78 | |||
79 | static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, | ||
80 | int more, int len) | ||
81 | { | ||
82 | if (ccid->ccid_hc_tx_packet_sent != NULL) | ||
83 | ccid->ccid_hc_tx_packet_sent(sk, more, len); | ||
84 | } | ||
85 | |||
86 | static inline int ccid_hc_rx_init(struct ccid *ccid, struct sock *sk) | ||
87 | { | ||
88 | int rc = 0; | ||
89 | if (ccid->ccid_hc_rx_init != NULL) | ||
90 | rc = ccid->ccid_hc_rx_init(sk); | ||
91 | return rc; | ||
92 | } | ||
93 | |||
94 | static inline int ccid_hc_tx_init(struct ccid *ccid, struct sock *sk) | ||
95 | { | ||
96 | int rc = 0; | ||
97 | if (ccid->ccid_hc_tx_init != NULL) | ||
98 | rc = ccid->ccid_hc_tx_init(sk); | ||
99 | return rc; | ||
100 | } | ||
101 | |||
102 | static inline void ccid_hc_rx_exit(struct ccid *ccid, struct sock *sk) | ||
103 | { | ||
104 | if (ccid->ccid_hc_rx_exit != NULL && | ||
105 | dccp_sk(sk)->dccps_hc_rx_ccid_private != NULL) | ||
106 | ccid->ccid_hc_rx_exit(sk); | ||
107 | } | ||
108 | |||
109 | static inline void ccid_hc_tx_exit(struct ccid *ccid, struct sock *sk) | ||
110 | { | ||
111 | if (ccid->ccid_hc_tx_exit != NULL && | ||
112 | dccp_sk(sk)->dccps_hc_tx_ccid_private != NULL) | ||
113 | ccid->ccid_hc_tx_exit(sk); | ||
114 | } | ||
115 | |||
116 | static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, | ||
117 | struct sk_buff *skb) | ||
118 | { | ||
119 | if (ccid->ccid_hc_rx_packet_recv != NULL) | ||
120 | ccid->ccid_hc_rx_packet_recv(sk, skb); | ||
121 | } | ||
122 | |||
123 | static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, | ||
124 | struct sk_buff *skb) | ||
125 | { | ||
126 | if (ccid->ccid_hc_tx_packet_recv != NULL) | ||
127 | ccid->ccid_hc_tx_packet_recv(sk, skb); | ||
128 | } | ||
129 | |||
130 | static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, | ||
131 | unsigned char option, | ||
132 | unsigned char len, u16 idx, | ||
133 | unsigned char* value) | ||
134 | { | ||
135 | int rc = 0; | ||
136 | if (ccid->ccid_hc_tx_parse_options != NULL) | ||
137 | rc = ccid->ccid_hc_tx_parse_options(sk, option, len, idx, | ||
138 | value); | ||
139 | return rc; | ||
140 | } | ||
141 | |||
142 | static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, | ||
143 | unsigned char option, | ||
144 | unsigned char len, u16 idx, | ||
145 | unsigned char* value) | ||
146 | { | ||
147 | int rc = 0; | ||
148 | if (ccid->ccid_hc_rx_parse_options != NULL) | ||
149 | rc = ccid->ccid_hc_rx_parse_options(sk, option, len, idx, value); | ||
150 | return rc; | ||
151 | } | ||
152 | |||
153 | static inline void ccid_hc_tx_insert_options(struct ccid *ccid, struct sock *sk, | ||
154 | struct sk_buff *skb) | ||
155 | { | ||
156 | if (ccid->ccid_hc_tx_insert_options != NULL) | ||
157 | ccid->ccid_hc_tx_insert_options(sk, skb); | ||
158 | } | ||
159 | |||
160 | static inline void ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, | ||
161 | struct sk_buff *skb) | ||
162 | { | ||
163 | if (ccid->ccid_hc_rx_insert_options != NULL) | ||
164 | ccid->ccid_hc_rx_insert_options(sk, skb); | ||
165 | } | ||
166 | |||
167 | static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk, | ||
168 | struct tcp_info *info) | ||
169 | { | ||
170 | if (ccid->ccid_hc_rx_get_info != NULL) | ||
171 | ccid->ccid_hc_rx_get_info(sk, info); | ||
172 | } | ||
173 | |||
174 | static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk, | ||
175 | struct tcp_info *info) | ||
176 | { | ||
177 | if (ccid->ccid_hc_tx_get_info != NULL) | ||
178 | ccid->ccid_hc_tx_get_info(sk, info); | ||
179 | } | ||
180 | #endif /* _CCID_H */ | ||
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig new file mode 100644 index 000000000000..7684d83946a4 --- /dev/null +++ b/net/dccp/ccids/Kconfig | |||
@@ -0,0 +1,29 @@ | |||
1 | menu "DCCP CCIDs Configuration (EXPERIMENTAL)" | ||
2 | depends on IP_DCCP && EXPERIMENTAL | ||
3 | |||
4 | config IP_DCCP_CCID3 | ||
5 | tristate "CCID3 (TFRC) (EXPERIMENTAL)" | ||
6 | depends on IP_DCCP | ||
7 | ---help--- | ||
8 | CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based | ||
9 | rate-controlled congestion control mechanism. TFRC is designed to | ||
10 | be reasonably fair when competing for bandwidth with TCP-like flows, | ||
11 | where a flow is "reasonably fair" if its sending rate is generally | ||
12 | within a factor of two of the sending rate of a TCP flow under the | ||
13 | same conditions. However, TFRC has a much lower variation of | ||
14 | throughput over time compared with TCP, which makes CCID 3 more | ||
15 | suitable than CCID 2 for applications such streaming media where a | ||
16 | relatively smooth sending rate is of importance. | ||
17 | |||
18 | CCID 3 is further described in [CCID 3 PROFILE]. The TFRC | ||
19 | congestion control algorithms were initially described in RFC 3448. | ||
20 | |||
21 | This text was extracted from draft-ietf-dccp-spec-11.txt. | ||
22 | |||
23 | If in doubt, say M. | ||
24 | |||
25 | config IP_DCCP_TFRC_LIB | ||
26 | depends on IP_DCCP_CCID3 | ||
27 | def_tristate IP_DCCP_CCID3 | ||
28 | |||
29 | endmenu | ||
diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile new file mode 100644 index 000000000000..956f79f50743 --- /dev/null +++ b/net/dccp/ccids/Makefile | |||
@@ -0,0 +1,5 @@ | |||
1 | obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o | ||
2 | |||
3 | dccp_ccid3-y := ccid3.o | ||
4 | |||
5 | obj-y += lib/ | ||
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c new file mode 100644 index 000000000000..7bf3b3a91e97 --- /dev/null +++ b/net/dccp/ccids/ccid3.c | |||
@@ -0,0 +1,1221 @@ | |||
1 | /* | ||
2 | * net/dccp/ccids/ccid3.c | ||
3 | * | ||
4 | * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. | ||
5 | * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> | ||
6 | * | ||
7 | * An implementation of the DCCP protocol | ||
8 | * | ||
9 | * This code has been developed by the University of Waikato WAND | ||
10 | * research group. For further information please see http://www.wand.net.nz/ | ||
11 | * | ||
12 | * This code also uses code from Lulea University, rereleased as GPL by its | ||
13 | * authors: | ||
14 | * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon | ||
15 | * | ||
16 | * Changes to meet Linux coding standards, to make it meet latest ccid3 draft | ||
17 | * and to make it work as a loadable module in the DCCP stack written by | ||
18 | * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. | ||
19 | * | ||
20 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
21 | * | ||
22 | * This program is free software; you can redistribute it and/or modify | ||
23 | * it under the terms of the GNU General Public License as published by | ||
24 | * the Free Software Foundation; either version 2 of the License, or | ||
25 | * (at your option) any later version. | ||
26 | * | ||
27 | * This program is distributed in the hope that it will be useful, | ||
28 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
29 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
30 | * GNU General Public License for more details. | ||
31 | * | ||
32 | * You should have received a copy of the GNU General Public License | ||
33 | * along with this program; if not, write to the Free Software | ||
34 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
35 | */ | ||
36 | |||
37 | #include <linux/config.h> | ||
38 | #include "../ccid.h" | ||
39 | #include "../dccp.h" | ||
40 | #include "lib/packet_history.h" | ||
41 | #include "lib/loss_interval.h" | ||
42 | #include "lib/tfrc.h" | ||
43 | #include "ccid3.h" | ||
44 | |||
45 | /* | ||
46 | * Reason for maths with 10 here is to avoid 32 bit overflow when a is big. | ||
47 | */ | ||
48 | static inline u32 usecs_div(const u32 a, const u32 b) | ||
49 | { | ||
50 | const u32 tmp = a * (USEC_PER_SEC / 10); | ||
51 | return b > 20 ? tmp / (b / 10) : tmp; | ||
52 | } | ||
53 | |||
/* Module-wide debug switch: when non-zero, ccid3_pr_debug() emits
 * KERN_DEBUG messages prefixed with the calling function's name. */
static int ccid3_debug;

#ifdef CCID3_DEBUG
#define ccid3_pr_debug(format, a...) \
	do { if (ccid3_debug) \
		printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \
	} while (0)
#else
/* Compiles away entirely when CCID3_DEBUG is not defined. */
#define ccid3_pr_debug(format, a...)
#endif

/* Slab-backed histories shared by every CCID3 socket: TX packet
 * history, RX packet history and loss-interval history. */
static struct dccp_tx_hist *ccid3_tx_hist;
static struct dccp_rx_hist *ccid3_rx_hist;
static struct dccp_li_hist *ccid3_li_hist;
68 | |||
/* Per-socket CCID constructor: nothing to set up beyond a debug trace;
 * the real state lives in the TX/RX half-connection init hooks below. */
static int ccid3_init(struct sock *sk)
{
	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
	return 0;
}
74 | |||
/* Per-socket CCID destructor counterpart of ccid3_init(); trace only. */
static void ccid3_exit(struct sock *sk)
{
	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
}
79 | |||
/* TFRC sender states */
enum ccid3_hc_tx_states {
	TFRC_SSTATE_NO_SENT = 1,	/* no packet sent yet */
	TFRC_SSTATE_NO_FBACK,		/* sending, no feedback received yet */
	TFRC_SSTATE_FBACK,		/* feedback received, normal operation */
	TFRC_SSTATE_TERM,		/* half-connection being torn down */
};
87 | |||
#ifdef CCID3_DEBUG
/* Map a TX state to a printable name for ccid3_pr_debug() traces. */
static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
{
	static char *ccid3_state_names[] = {
	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
	[TFRC_SSTATE_FBACK]    = "FBACK",
	[TFRC_SSTATE_TERM]     = "TERM",
	};

	return ccid3_state_names[state];
}
#endif
101 | |||
102 | static inline void ccid3_hc_tx_set_state(struct sock *sk, | ||
103 | enum ccid3_hc_tx_states state) | ||
104 | { | ||
105 | struct dccp_sock *dp = dccp_sk(sk); | ||
106 | struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; | ||
107 | enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; | ||
108 | |||
109 | ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", | ||
110 | dccp_role(sk), sk, ccid3_tx_state_name(oldstate), | ||
111 | ccid3_tx_state_name(state)); | ||
112 | WARN_ON(state == oldstate); | ||
113 | hctx->ccid3hctx_state = state; | ||
114 | } | ||
115 | |||
116 | /* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */ | ||
117 | static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx) | ||
118 | { | ||
119 | /* | ||
120 | * If no feedback spec says t_ipi is 1 second (set elsewhere and then | ||
121 | * doubles after every no feedback timer (separate function) | ||
122 | */ | ||
123 | if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) | ||
124 | hctx->ccid3hctx_t_ipi = usecs_div(hctx->ccid3hctx_s, | ||
125 | hctx->ccid3hctx_x); | ||
126 | } | ||
127 | |||
128 | /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ | ||
129 | static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx) | ||
130 | { | ||
131 | hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, | ||
132 | TFRC_OPSYS_HALF_TIME_GRAN); | ||
133 | } | ||
134 | |||
/*
 * Update X by
 *    If (p > 0)
 *       x_calc = calcX(s, R, p);
 *       X = max(min(X_calc, 2 * X_recv), s / t_mbi);
 *    Else
 *       If (now - tld >= R)
 *          X = max(min(2 * X, 2 * X_recv), s / R);
 *          tld = now;
 */
static void ccid3_hc_tx_update_x(struct sock *sk)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;

	/* To avoid large error in calcX */
	if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) {
		/* Loss seen: use the throughput equation, capped above by
		 * twice the receive rate and floored at one packet per
		 * maximum back-off time. */
		hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s,
						     hctx->ccid3hctx_rtt,
						     hctx->ccid3hctx_p);
		hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_calc,
						     2 * hctx->ccid3hctx_x_recv),
					       (hctx->ccid3hctx_s /
						TFRC_MAX_BACK_OFF_TIME));
	} else {
		struct timeval now;

		/* No loss yet: at most once per RTT, allow the rate to
		 * double, capped by 2 * X_recv and floored at one packet
		 * per RTT. */
		do_gettimeofday(&now);
		if (timeval_delta(&now, &hctx->ccid3hctx_t_ld) >=
		    hctx->ccid3hctx_rtt) {
			hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_recv,
							     hctx->ccid3hctx_x) * 2,
						       usecs_div(hctx->ccid3hctx_s,
								 hctx->ccid3hctx_rtt));
			hctx->ccid3hctx_t_ld = now;
		}
	}
}
173 | |||
174 | static void ccid3_hc_tx_no_feedback_timer(unsigned long data) | ||
175 | { | ||
176 | struct sock *sk = (struct sock *)data; | ||
177 | struct dccp_sock *dp = dccp_sk(sk); | ||
178 | unsigned long next_tmout = 0; | ||
179 | struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; | ||
180 | |||
181 | bh_lock_sock(sk); | ||
182 | if (sock_owned_by_user(sk)) { | ||
183 | /* Try again later. */ | ||
184 | /* XXX: set some sensible MIB */ | ||
185 | sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, | ||
186 | jiffies + HZ / 5); | ||
187 | goto out; | ||
188 | } | ||
189 | |||
190 | ccid3_pr_debug("%s, sk=%p, state=%s\n", dccp_role(sk), sk, | ||
191 | ccid3_tx_state_name(hctx->ccid3hctx_state)); | ||
192 | |||
193 | switch (hctx->ccid3hctx_state) { | ||
194 | case TFRC_SSTATE_TERM: | ||
195 | goto out; | ||
196 | case TFRC_SSTATE_NO_FBACK: | ||
197 | /* Halve send rate */ | ||
198 | hctx->ccid3hctx_x /= 2; | ||
199 | if (hctx->ccid3hctx_x < (hctx->ccid3hctx_s / | ||
200 | TFRC_MAX_BACK_OFF_TIME)) | ||
201 | hctx->ccid3hctx_x = (hctx->ccid3hctx_s / | ||
202 | TFRC_MAX_BACK_OFF_TIME); | ||
203 | |||
204 | ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d " | ||
205 | "bytes/s\n", | ||
206 | dccp_role(sk), sk, | ||
207 | ccid3_tx_state_name(hctx->ccid3hctx_state), | ||
208 | hctx->ccid3hctx_x); | ||
209 | next_tmout = max_t(u32, 2 * usecs_div(hctx->ccid3hctx_s, | ||
210 | hctx->ccid3hctx_x), | ||
211 | TFRC_INITIAL_TIMEOUT); | ||
212 | /* | ||
213 | * FIXME - not sure above calculation is correct. See section | ||
214 | * 5 of CCID3 11 should adjust tx_t_ipi and double that to | ||
215 | * achieve it really | ||
216 | */ | ||
217 | break; | ||
218 | case TFRC_SSTATE_FBACK: | ||
219 | /* | ||
220 | * Check if IDLE since last timeout and recv rate is less than | ||
221 | * 4 packets per RTT | ||
222 | */ | ||
223 | if (!hctx->ccid3hctx_idle || | ||
224 | (hctx->ccid3hctx_x_recv >= | ||
225 | 4 * usecs_div(hctx->ccid3hctx_s, hctx->ccid3hctx_rtt))) { | ||
226 | ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n", | ||
227 | dccp_role(sk), sk, | ||
228 | ccid3_tx_state_name(hctx->ccid3hctx_state)); | ||
229 | /* Halve sending rate */ | ||
230 | |||
231 | /* If (X_calc > 2 * X_recv) | ||
232 | * X_recv = max(X_recv / 2, s / (2 * t_mbi)); | ||
233 | * Else | ||
234 | * X_recv = X_calc / 4; | ||
235 | */ | ||
236 | BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P && | ||
237 | hctx->ccid3hctx_x_calc == 0); | ||
238 | |||
239 | /* check also if p is zero -> x_calc is infinity? */ | ||
240 | if (hctx->ccid3hctx_p < TFRC_SMALLEST_P || | ||
241 | hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv) | ||
242 | hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2, | ||
243 | hctx->ccid3hctx_s / (2 * TFRC_MAX_BACK_OFF_TIME)); | ||
244 | else | ||
245 | hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc / 4; | ||
246 | |||
247 | /* Update sending rate */ | ||
248 | ccid3_hc_tx_update_x(sk); | ||
249 | } | ||
250 | /* | ||
251 | * Schedule no feedback timer to expire in | ||
252 | * max(4 * R, 2 * s / X) | ||
253 | */ | ||
254 | next_tmout = max_t(u32, hctx->ccid3hctx_t_rto, | ||
255 | 2 * usecs_div(hctx->ccid3hctx_s, | ||
256 | hctx->ccid3hctx_x)); | ||
257 | break; | ||
258 | default: | ||
259 | printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", | ||
260 | __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); | ||
261 | dump_stack(); | ||
262 | goto out; | ||
263 | } | ||
264 | |||
265 | sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, | ||
266 | jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout))); | ||
267 | hctx->ccid3hctx_idle = 1; | ||
268 | out: | ||
269 | bh_unlock_sock(sk); | ||
270 | sock_put(sk); | ||
271 | } | ||
272 | |||
/*
 * Per-packet send gate for DATA/DATAACK packets.
 *
 * Returns 0 when the packet may be sent now, a positive delay in
 * milliseconds when the caller should wait, or a negative errno
 * (-ENOTCONN, -ENOBUFS, -EINVAL) on error.
 */
static int ccid3_hc_tx_send_packet(struct sock *sk,
				   struct sk_buff *skb, int len)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
	struct dccp_tx_hist_entry *new_packet;
	struct timeval now;
	long delay;
	int rc = -ENOTCONN;

	/* Check if pure ACK or Terminating*/

	/*
	 * XXX: We only call this function for DATA and DATAACK, on, these
	 * packets can have zero length, but why the comment about "pure ACK"?
	 */
	if (hctx == NULL || len == 0 ||
	    hctx->ccid3hctx_state == TFRC_SSTATE_TERM)
		goto out;

	/* See if last packet allocated was not sent */
	new_packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
	if (new_packet == NULL || new_packet->dccphtx_sent) {
		/* Reserve a history slot up front so the later
		 * packet_sent() callback cannot fail on allocation. */
		new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist,
						    SLAB_ATOMIC);

		rc = -ENOBUFS;
		if (new_packet == NULL) {
			ccid3_pr_debug("%s, sk=%p, not enough mem to add "
				       "to history, send refused\n",
				       dccp_role(sk), sk);
			goto out;
		}

		dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, new_packet);
	}

	do_gettimeofday(&now);

	switch (hctx->ccid3hctx_state) {
	case TFRC_SSTATE_NO_SENT:
		/* Very first packet: arm the no-feedback timer and
		 * initialise window counter and nominal send time. */
		ccid3_pr_debug("%s, sk=%p, first packet(%llu)\n",
			       dccp_role(sk), sk, dp->dccps_gss);

		hctx->ccid3hctx_no_feedback_timer.function = ccid3_hc_tx_no_feedback_timer;
		hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk;
		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
			       jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT));
		hctx->ccid3hctx_last_win_count = 0;
		hctx->ccid3hctx_t_last_win_count = now;
		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
		hctx->ccid3hctx_t_ipi = TFRC_INITIAL_TIMEOUT;

		/* Set nominal send time for initial packet */
		hctx->ccid3hctx_t_nom = now;
		timeval_add_usecs(&hctx->ccid3hctx_t_nom,
				  hctx->ccid3hctx_t_ipi);
		ccid3_calc_new_delta(hctx);
		rc = 0;
		break;
	case TFRC_SSTATE_NO_FBACK:
	case TFRC_SSTATE_FBACK:
		/* Positive delta means t_nom (minus the scheduling slack
		 * delta) is still in the future -> tell caller to wait. */
		delay = (timeval_delta(&now, &hctx->ccid3hctx_t_nom) -
			 hctx->ccid3hctx_delta);
		ccid3_pr_debug("send_packet delay=%ld\n", delay);
		delay /= -1000;
		/* divide by -1000 is to convert to ms and get sign right */
		rc = delay > 0 ? delay : 0;
		break;
	default:
		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
		       __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
		dump_stack();
		rc = -EINVAL;
		break;
	}

	/* Can we send? if so add options and add to packet history */
	if (rc == 0)
		new_packet->dccphtx_ccval =
			DCCP_SKB_CB(skb)->dccpd_ccval =
				hctx->ccid3hctx_last_win_count;
out:
	return rc;
}
358 | |||
/*
 * Post-send hook: timestamps the history entry reserved by
 * ccid3_hc_tx_send_packet(), advances the window counter (CCVal) and
 * schedules the nominal time for the next packet.
 */
static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
	struct timeval now;

	BUG_ON(hctx == NULL);

	if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) {
		ccid3_pr_debug("%s, sk=%p, while state is TFRC_SSTATE_TERM!\n",
			       dccp_role(sk), sk);
		return;
	}

	do_gettimeofday(&now);

	/* check if we have sent a data packet */
	if (len > 0) {
		unsigned long quarter_rtt;
		struct dccp_tx_hist_entry *packet;

		packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
		if (packet == NULL) {
			printk(KERN_CRIT "%s: packet doesn't exists in "
			       "history!\n", __FUNCTION__);
			return;
		}
		if (packet->dccphtx_sent) {
			printk(KERN_CRIT "%s: no unsent packet in history!\n",
			       __FUNCTION__);
			return;
		}
		packet->dccphtx_tstamp = now;
		packet->dccphtx_seqno = dp->dccps_gss;
		/*
		 * Check if win_count have changed
		 * Algorithm in "8.1. Window Counter Value" in
		 * draft-ietf-dccp-ccid3-11.txt
		 */
		quarter_rtt = timeval_delta(&now, &hctx->ccid3hctx_t_last_win_count);
		/* guard against division by zero for tiny RTT estimates */
		if (likely(hctx->ccid3hctx_rtt > 8))
			quarter_rtt /= hctx->ccid3hctx_rtt / 4;

		if (quarter_rtt > 0) {
			/* advance by at most 5 quarter-RTTs, modulo 16 */
			hctx->ccid3hctx_t_last_win_count = now;
			hctx->ccid3hctx_last_win_count = (hctx->ccid3hctx_last_win_count +
							  min_t(unsigned long, quarter_rtt, 5)) % 16;
			ccid3_pr_debug("%s, sk=%p, window changed from "
				       "%u to %u!\n",
				       dccp_role(sk), sk,
				       packet->dccphtx_ccval,
				       hctx->ccid3hctx_last_win_count);
		}

		hctx->ccid3hctx_idle = 0;
		packet->dccphtx_rtt = hctx->ccid3hctx_rtt;
		packet->dccphtx_sent = 1;
	} else
		ccid3_pr_debug("%s, sk=%p, seqno=%llu NOT inserted!\n",
			       dccp_role(sk), sk, dp->dccps_gss);

	switch (hctx->ccid3hctx_state) {
	case TFRC_SSTATE_NO_SENT:
		/* if first wasn't pure ack */
		if (len != 0)
			printk(KERN_CRIT "%s: %s, First packet sent is noted "
			       "as a data packet\n",
			       __FUNCTION__, dccp_role(sk));
		return;
	case TFRC_SSTATE_NO_FBACK:
	case TFRC_SSTATE_FBACK:
		if (len > 0) {
			/* next nominal send time = now + t_ipi */
			hctx->ccid3hctx_t_nom = now;
			ccid3_calc_new_t_ipi(hctx);
			ccid3_calc_new_delta(hctx);
			timeval_add_usecs(&hctx->ccid3hctx_t_nom,
					  hctx->ccid3hctx_t_ipi);
		}
		break;
	default:
		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
		       __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
		dump_stack();
		break;
	}
}
445 | |||
446 | static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | ||
447 | { | ||
448 | struct dccp_sock *dp = dccp_sk(sk); | ||
449 | struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; | ||
450 | struct ccid3_options_received *opt_recv; | ||
451 | struct dccp_tx_hist_entry *packet; | ||
452 | unsigned long next_tmout; | ||
453 | u32 t_elapsed; | ||
454 | u32 pinv; | ||
455 | u32 x_recv; | ||
456 | u32 r_sample; | ||
457 | |||
458 | if (hctx == NULL) | ||
459 | return; | ||
460 | |||
461 | if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) { | ||
462 | ccid3_pr_debug("%s, sk=%p, received a packet when " | ||
463 | "terminating!\n", dccp_role(sk), sk); | ||
464 | return; | ||
465 | } | ||
466 | |||
467 | /* we are only interested in ACKs */ | ||
468 | if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || | ||
469 | DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) | ||
470 | return; | ||
471 | |||
472 | opt_recv = &hctx->ccid3hctx_options_received; | ||
473 | |||
474 | t_elapsed = dp->dccps_options_received.dccpor_elapsed_time; | ||
475 | x_recv = opt_recv->ccid3or_receive_rate; | ||
476 | pinv = opt_recv->ccid3or_loss_event_rate; | ||
477 | |||
478 | switch (hctx->ccid3hctx_state) { | ||
479 | case TFRC_SSTATE_NO_SENT: | ||
480 | /* FIXME: what to do here? */ | ||
481 | return; | ||
482 | case TFRC_SSTATE_NO_FBACK: | ||
483 | case TFRC_SSTATE_FBACK: | ||
484 | /* Calculate new round trip sample by | ||
485 | * R_sample = (now - t_recvdata) - t_delay */ | ||
486 | /* get t_recvdata from history */ | ||
487 | packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist, | ||
488 | DCCP_SKB_CB(skb)->dccpd_ack_seq); | ||
489 | if (packet == NULL) { | ||
490 | ccid3_pr_debug("%s, sk=%p, seqno %llu(%s) does't " | ||
491 | "exist in history!\n", | ||
492 | dccp_role(sk), sk, | ||
493 | DCCP_SKB_CB(skb)->dccpd_ack_seq, | ||
494 | dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); | ||
495 | return; | ||
496 | } | ||
497 | |||
498 | /* Update RTT */ | ||
499 | r_sample = timeval_now_delta(&packet->dccphtx_tstamp); | ||
500 | /* FIXME: */ | ||
501 | // r_sample -= usecs_to_jiffies(t_elapsed * 10); | ||
502 | |||
503 | /* Update RTT estimate by | ||
504 | * If (No feedback recv) | ||
505 | * R = R_sample; | ||
506 | * Else | ||
507 | * R = q * R + (1 - q) * R_sample; | ||
508 | * | ||
509 | * q is a constant, RFC 3448 recomments 0.9 | ||
510 | */ | ||
511 | if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { | ||
512 | ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); | ||
513 | hctx->ccid3hctx_rtt = r_sample; | ||
514 | } else | ||
515 | hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 + | ||
516 | r_sample / 10; | ||
517 | |||
518 | ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, " | ||
519 | "r_sample=%us\n", dccp_role(sk), sk, | ||
520 | hctx->ccid3hctx_rtt, r_sample); | ||
521 | |||
522 | /* Update timeout interval */ | ||
523 | hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, | ||
524 | USEC_PER_SEC); | ||
525 | |||
526 | /* Update receive rate */ | ||
527 | hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */ | ||
528 | |||
529 | /* Update loss event rate */ | ||
530 | if (pinv == ~0 || pinv == 0) | ||
531 | hctx->ccid3hctx_p = 0; | ||
532 | else { | ||
533 | hctx->ccid3hctx_p = 1000000 / pinv; | ||
534 | |||
535 | if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) { | ||
536 | hctx->ccid3hctx_p = TFRC_SMALLEST_P; | ||
537 | ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", | ||
538 | dccp_role(sk), sk); | ||
539 | } | ||
540 | } | ||
541 | |||
542 | /* unschedule no feedback timer */ | ||
543 | sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); | ||
544 | |||
545 | /* Update sending rate */ | ||
546 | ccid3_hc_tx_update_x(sk); | ||
547 | |||
548 | /* Update next send time */ | ||
549 | timeval_sub_usecs(&hctx->ccid3hctx_t_nom, | ||
550 | hctx->ccid3hctx_t_ipi); | ||
551 | ccid3_calc_new_t_ipi(hctx); | ||
552 | timeval_add_usecs(&hctx->ccid3hctx_t_nom, | ||
553 | hctx->ccid3hctx_t_ipi); | ||
554 | ccid3_calc_new_delta(hctx); | ||
555 | |||
556 | /* remove all packets older than the one acked from history */ | ||
557 | dccp_tx_hist_purge_older(ccid3_tx_hist, | ||
558 | &hctx->ccid3hctx_hist, packet); | ||
559 | /* | ||
560 | * As we have calculated new ipi, delta, t_nom it is possible that | ||
561 | * we now can send a packet, so wake up dccp_wait_for_ccids. | ||
562 | */ | ||
563 | sk->sk_write_space(sk); | ||
564 | |||
565 | /* | ||
566 | * Schedule no feedback timer to expire in | ||
567 | * max(4 * R, 2 * s / X) | ||
568 | */ | ||
569 | next_tmout = max(hctx->ccid3hctx_t_rto, | ||
570 | 2 * usecs_div(hctx->ccid3hctx_s, | ||
571 | hctx->ccid3hctx_x)); | ||
572 | |||
573 | ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to " | ||
574 | "expire in %lu jiffies (%luus)\n", | ||
575 | dccp_role(sk), sk, | ||
576 | usecs_to_jiffies(next_tmout), next_tmout); | ||
577 | |||
578 | sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, | ||
579 | jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout))); | ||
580 | |||
581 | /* set idle flag */ | ||
582 | hctx->ccid3hctx_idle = 1; | ||
583 | break; | ||
584 | default: | ||
585 | printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", | ||
586 | __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); | ||
587 | dump_stack(); | ||
588 | break; | ||
589 | } | ||
590 | } | ||
591 | |||
592 | static void ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb) | ||
593 | { | ||
594 | const struct dccp_sock *dp = dccp_sk(sk); | ||
595 | struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; | ||
596 | |||
597 | if (hctx == NULL || !(sk->sk_state == DCCP_OPEN || | ||
598 | sk->sk_state == DCCP_PARTOPEN)) | ||
599 | return; | ||
600 | |||
601 | DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; | ||
602 | } | ||
603 | |||
/*
 * Parse the CCID3-specific options (loss event rate, loss intervals,
 * receive rate) on the sender side.  Cached values are keyed on
 * dccps_gsr and reset when a packet with a new sequence number arrives.
 *
 * Returns 0 on success or -EINVAL for a malformed option length.
 */
static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
				     unsigned char len, u16 idx,
				     unsigned char *value)
{
	int rc = 0;
	struct dccp_sock *dp = dccp_sk(sk);
	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
	struct ccid3_options_received *opt_recv;

	if (hctx == NULL)
		return 0;

	opt_recv = &hctx->ccid3hctx_options_received;

	/* first option of a new ack: reset the cached option values */
	if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
		opt_recv->ccid3or_seqno = dp->dccps_gsr;
		opt_recv->ccid3or_loss_event_rate = ~0;
		opt_recv->ccid3or_loss_intervals_idx = 0;
		opt_recv->ccid3or_loss_intervals_len = 0;
		opt_recv->ccid3or_receive_rate = 0;
	}

	switch (option) {
	case TFRC_OPT_LOSS_EVENT_RATE:
		if (len != 4) {
			ccid3_pr_debug("%s, sk=%p, invalid len for "
				       "TFRC_OPT_LOSS_EVENT_RATE\n",
				       dccp_role(sk), sk);
			rc = -EINVAL;
		} else {
			opt_recv->ccid3or_loss_event_rate = ntohl(*(u32 *)value);
			ccid3_pr_debug("%s, sk=%p, LOSS_EVENT_RATE=%u\n",
				       dccp_role(sk), sk,
				       opt_recv->ccid3or_loss_event_rate);
		}
		break;
	case TFRC_OPT_LOSS_INTERVALS:
		/* only the location/length of the raw option is recorded */
		opt_recv->ccid3or_loss_intervals_idx = idx;
		opt_recv->ccid3or_loss_intervals_len = len;
		ccid3_pr_debug("%s, sk=%p, LOSS_INTERVALS=(%u, %u)\n",
			       dccp_role(sk), sk,
			       opt_recv->ccid3or_loss_intervals_idx,
			       opt_recv->ccid3or_loss_intervals_len);
		break;
	case TFRC_OPT_RECEIVE_RATE:
		if (len != 4) {
			ccid3_pr_debug("%s, sk=%p, invalid len for "
				       "TFRC_OPT_RECEIVE_RATE\n",
				       dccp_role(sk), sk);
			rc = -EINVAL;
		} else {
			opt_recv->ccid3or_receive_rate = ntohl(*(u32 *)value);
			ccid3_pr_debug("%s, sk=%p, RECEIVE_RATE=%u\n",
				       dccp_role(sk), sk,
				       opt_recv->ccid3or_receive_rate);
		}
		break;
	}

	return rc;
}
665 | |||
666 | static int ccid3_hc_tx_init(struct sock *sk) | ||
667 | { | ||
668 | struct dccp_sock *dp = dccp_sk(sk); | ||
669 | struct ccid3_hc_tx_sock *hctx; | ||
670 | |||
671 | ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); | ||
672 | |||
673 | hctx = dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx), | ||
674 | gfp_any()); | ||
675 | if (hctx == NULL) | ||
676 | return -ENOMEM; | ||
677 | |||
678 | memset(hctx, 0, sizeof(*hctx)); | ||
679 | |||
680 | if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE && | ||
681 | dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE) | ||
682 | hctx->ccid3hctx_s = dp->dccps_packet_size; | ||
683 | else | ||
684 | hctx->ccid3hctx_s = TFRC_STD_PACKET_SIZE; | ||
685 | |||
686 | /* Set transmission rate to 1 packet per second */ | ||
687 | hctx->ccid3hctx_x = hctx->ccid3hctx_s; | ||
688 | hctx->ccid3hctx_t_rto = USEC_PER_SEC; | ||
689 | hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; | ||
690 | INIT_LIST_HEAD(&hctx->ccid3hctx_hist); | ||
691 | init_timer(&hctx->ccid3hctx_no_feedback_timer); | ||
692 | |||
693 | return 0; | ||
694 | } | ||
695 | |||
/*
 * TX half-connection destructor: moves to TERM (silencing further
 * callbacks), stops the no-feedback timer, purges the packet history
 * and frees the private state.
 */
static void ccid3_hc_tx_exit(struct sock *sk)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;

	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
	BUG_ON(hctx == NULL);

	ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);

	/* Empty packet history */
	dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist);

	kfree(dp->dccps_hc_tx_ccid_private);
	dp->dccps_hc_tx_ccid_private = NULL;
}
713 | |||
714 | /* | ||
715 | * RX Half Connection methods | ||
716 | */ | ||
717 | |||
/* TFRC receiver states */
enum ccid3_hc_rx_states {
	TFRC_RSTATE_NO_DATA = 1,	/* no data packet received yet */
	TFRC_RSTATE_DATA,		/* data received, feedback in progress */
	TFRC_RSTATE_TERM    = 127,	/* half-connection being torn down */
};
724 | |||
#ifdef CCID3_DEBUG
/* Map an RX state to a printable name for ccid3_pr_debug() traces. */
static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
{
	static char *ccid3_rx_state_names[] = {
	[TFRC_RSTATE_NO_DATA] = "NO_DATA",
	[TFRC_RSTATE_DATA]    = "DATA",
	[TFRC_RSTATE_TERM]    = "TERM",
	};

	return ccid3_rx_state_names[state];
}
#endif
737 | |||
738 | static inline void ccid3_hc_rx_set_state(struct sock *sk, | ||
739 | enum ccid3_hc_rx_states state) | ||
740 | { | ||
741 | struct dccp_sock *dp = dccp_sk(sk); | ||
742 | struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; | ||
743 | enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; | ||
744 | |||
745 | ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", | ||
746 | dccp_role(sk), sk, ccid3_rx_state_name(oldstate), | ||
747 | ccid3_rx_state_name(state)); | ||
748 | WARN_ON(state == oldstate); | ||
749 | hcrx->ccid3hcrx_state = state; | ||
750 | } | ||
751 | |||
/*
 * Build and send a feedback packet: compute X_recv (receive rate in
 * bytes/s since the last feedback), record elapsed time and the inverse
 * loss event rate, then trigger an ACK carrying the options.
 */
static void ccid3_hc_rx_send_feedback(struct sock *sk)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
	struct dccp_rx_hist_entry *packet;
	struct timeval now;

	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);

	do_gettimeofday(&now);

	switch (hcrx->ccid3hcrx_state) {
	case TFRC_RSTATE_NO_DATA:
		hcrx->ccid3hcrx_x_recv = 0;
		break;
	case TFRC_RSTATE_DATA: {
		/* X_recv = bytes received / time since last feedback */
		const u32 delta = timeval_delta(&now,
					&hcrx->ccid3hcrx_tstamp_last_feedback);

		hcrx->ccid3hcrx_x_recv = (hcrx->ccid3hcrx_bytes_recv *
					  USEC_PER_SEC);
		if (likely(delta > 1))
			hcrx->ccid3hcrx_x_recv /= delta;
	}
		break;
	default:
		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
		       __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state);
		dump_stack();
		return;
	}

	packet = dccp_rx_hist_find_data_packet(&hcrx->ccid3hcrx_hist);
	if (packet == NULL) {
		printk(KERN_CRIT "%s: %s, sk=%p, no data packet in history!\n",
		       __FUNCTION__, dccp_role(sk), sk);
		dump_stack();
		return;
	}

	hcrx->ccid3hcrx_tstamp_last_feedback = now;
	hcrx->ccid3hcrx_last_counter = packet->dccphrx_ccval;
	hcrx->ccid3hcrx_seqno_last_counter = packet->dccphrx_seqno;
	hcrx->ccid3hcrx_bytes_recv = 0;

	/* Convert to multiples of 10us */
	hcrx->ccid3hcrx_elapsed_time =
		timeval_delta(&now, &packet->dccphrx_tstamp) / 10;
	if (hcrx->ccid3hcrx_p == 0)
		hcrx->ccid3hcrx_pinv = ~0;	/* "no loss observed" marker */
	else
		hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p;
	dccp_send_ack(sk);
}
806 | |||
/*
 * Receiver-side option insertion: stamps CCVal and, on packets that
 * carry an ACK, appends elapsed time, timestamp, loss event rate and
 * receive rate options (the last two converted to network byte order).
 */
static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
{
	const struct dccp_sock *dp = dccp_sk(sk);
	u32 x_recv, pinv;
	struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;

	if (hcrx == NULL || !(sk->sk_state == DCCP_OPEN ||
			      sk->sk_state == DCCP_PARTOPEN))
		return;

	DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter;

	if (dccp_packet_without_ack(skb))
		return;

	if (hcrx->ccid3hcrx_elapsed_time != 0)
		dccp_insert_option_elapsed_time(sk, skb,
						hcrx->ccid3hcrx_elapsed_time);
	dccp_insert_option_timestamp(sk, skb);
	x_recv = htonl(hcrx->ccid3hcrx_x_recv);
	pinv = htonl(hcrx->ccid3hcrx_pinv);
	dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
			   &pinv, sizeof(pinv));
	dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
			   &x_recv, sizeof(x_recv));
}
833 | |||
/* calculate first loss interval
 *
 * returns estimated loss interval in usecs */

static u32 ccid3_hc_rx_calc_first_li(struct sock *sk)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
	struct dccp_rx_hist_entry *entry, *next, *tail = NULL;
	u32 rtt, delta, x_recv, fval, p, tmp2;
	struct timeval tstamp = { 0, };
	int interval = 0;
	int win_count = 0;
	int step = 0;
	u64 tmp1;

	/*
	 * Approximate the RTT from the window-counter (CCVal) spacing:
	 * remember the newest data packet (step 0), then scan back for a
	 * data packet more than 4 quarter-RTT counter ticks older.
	 */
	list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist,
				 dccphrx_node) {
		if (dccp_rx_hist_entry_data_packet(entry)) {
			tail = entry;

			switch (step) {
			case 0:
				tstamp	  = entry->dccphrx_tstamp;
				win_count = entry->dccphrx_ccval;
				step = 1;
				break;
			case 1:
				interval = win_count - entry->dccphrx_ccval;
				/* counter wraps modulo TFRC_WIN_COUNT_LIMIT */
				if (interval < 0)
					interval += TFRC_WIN_COUNT_LIMIT;
				if (interval > 4)
					goto found;
				break;
			}
		}
	}

	if (step == 0) {
		printk(KERN_CRIT "%s: %s, sk=%p, packet history contains no "
				 "data packets!\n",
		       __FUNCTION__, dccp_role(sk), sk);
		return ~0;
	}

	if (interval == 0) {
		ccid3_pr_debug("%s, sk=%p, Could not find a win_count "
			       "interval > 0. Defaulting to 1\n",
			       dccp_role(sk), sk);
		interval = 1;
	}
found:
	/* @interval quarter-RTTs elapsed between tail and tstamp */
	rtt = timeval_delta(&tstamp, &tail->dccphrx_tstamp) * 4 / interval;
	ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n",
		       dccp_role(sk), sk, rtt);
	if (rtt == 0)
		rtt = 1;

	delta = timeval_now_delta(&hcrx->ccid3hcrx_tstamp_last_feedback);
	x_recv = hcrx->ccid3hcrx_bytes_recv * USEC_PER_SEC;
	if (likely(delta > 1))
		x_recv /= delta;

	tmp1 = (u64)x_recv * (u64)rtt;
	do_div(tmp1,10000000);
	tmp2 = (u32)tmp1;
	/* NOTE(review): tmp2 is 0 when x_recv * rtt < 10^7, making the
	 * division below divide by zero — confirm callers guarantee a
	 * large enough receive-rate/RTT product. */
	fval = (hcrx->ccid3hcrx_s * 100000) / tmp2;
	/* do not alter order above or you will get overflow on 32 bit */
	p = tfrc_calc_x_reverse_lookup(fval);
	ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied "
		       "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);

	if (p == 0)
		return ~0;
	else
		return 1000000 / p;
}
911 | |||
912 | static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss) | ||
913 | { | ||
914 | struct dccp_sock *dp = dccp_sk(sk); | ||
915 | struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; | ||
916 | |||
917 | if (seq_loss != DCCP_MAX_SEQNO + 1 && | ||
918 | list_empty(&hcrx->ccid3hcrx_li_hist)) { | ||
919 | struct dccp_li_hist_entry *li_tail; | ||
920 | |||
921 | li_tail = dccp_li_hist_interval_new(ccid3_li_hist, | ||
922 | &hcrx->ccid3hcrx_li_hist, | ||
923 | seq_loss, win_loss); | ||
924 | if (li_tail == NULL) | ||
925 | return; | ||
926 | li_tail->dccplih_interval = ccid3_hc_rx_calc_first_li(sk); | ||
927 | } | ||
928 | /* FIXME: find end of interval */ | ||
929 | } | ||
930 | |||
931 | static void ccid3_hc_rx_detect_loss(struct sock *sk) | ||
932 | { | ||
933 | struct dccp_sock *dp = dccp_sk(sk); | ||
934 | struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; | ||
935 | u8 win_loss; | ||
936 | const u64 seq_loss = dccp_rx_hist_detect_loss(&hcrx->ccid3hcrx_hist, | ||
937 | &hcrx->ccid3hcrx_li_hist, | ||
938 | &win_loss); | ||
939 | |||
940 | ccid3_hc_rx_update_li(sk, seq_loss, win_loss); | ||
941 | } | ||
942 | |||
943 | static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) | ||
944 | { | ||
945 | struct dccp_sock *dp = dccp_sk(sk); | ||
946 | struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; | ||
947 | const struct dccp_options_received *opt_recv; | ||
948 | struct dccp_rx_hist_entry *packet; | ||
949 | struct timeval now; | ||
950 | u8 win_count; | ||
951 | u32 p_prev; | ||
952 | int ins; | ||
953 | |||
954 | if (hcrx == NULL) | ||
955 | return; | ||
956 | |||
957 | BUG_ON(!(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA || | ||
958 | hcrx->ccid3hcrx_state == TFRC_RSTATE_DATA)); | ||
959 | |||
960 | opt_recv = &dp->dccps_options_received; | ||
961 | |||
962 | switch (DCCP_SKB_CB(skb)->dccpd_type) { | ||
963 | case DCCP_PKT_ACK: | ||
964 | if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA) | ||
965 | return; | ||
966 | case DCCP_PKT_DATAACK: | ||
967 | if (opt_recv->dccpor_timestamp_echo == 0) | ||
968 | break; | ||
969 | p_prev = hcrx->ccid3hcrx_rtt; | ||
970 | do_gettimeofday(&now); | ||
971 | hcrx->ccid3hcrx_rtt = timeval_usecs(&now) - | ||
972 | (opt_recv->dccpor_timestamp_echo - | ||
973 | opt_recv->dccpor_elapsed_time) * 10; | ||
974 | if (p_prev != hcrx->ccid3hcrx_rtt) | ||
975 | ccid3_pr_debug("%s, New RTT=%luus, elapsed time=%u\n", | ||
976 | dccp_role(sk), hcrx->ccid3hcrx_rtt, | ||
977 | opt_recv->dccpor_elapsed_time); | ||
978 | break; | ||
979 | case DCCP_PKT_DATA: | ||
980 | break; | ||
981 | default: | ||
982 | ccid3_pr_debug("%s, sk=%p, not DATA/DATAACK/ACK packet(%s)\n", | ||
983 | dccp_role(sk), sk, | ||
984 | dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); | ||
985 | return; | ||
986 | } | ||
987 | |||
988 | packet = dccp_rx_hist_entry_new(ccid3_rx_hist, opt_recv->dccpor_ndp, | ||
989 | skb, SLAB_ATOMIC); | ||
990 | if (packet == NULL) { | ||
991 | ccid3_pr_debug("%s, sk=%p, Not enough mem to add rx packet " | ||
992 | "to history (consider it lost)!", | ||
993 | dccp_role(sk), sk); | ||
994 | return; | ||
995 | } | ||
996 | |||
997 | win_count = packet->dccphrx_ccval; | ||
998 | |||
999 | ins = dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist, | ||
1000 | &hcrx->ccid3hcrx_li_hist, packet); | ||
1001 | |||
1002 | if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK) | ||
1003 | return; | ||
1004 | |||
1005 | switch (hcrx->ccid3hcrx_state) { | ||
1006 | case TFRC_RSTATE_NO_DATA: | ||
1007 | ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial " | ||
1008 | "feedback\n", | ||
1009 | dccp_role(sk), sk, | ||
1010 | dccp_state_name(sk->sk_state), skb); | ||
1011 | ccid3_hc_rx_send_feedback(sk); | ||
1012 | ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); | ||
1013 | return; | ||
1014 | case TFRC_RSTATE_DATA: | ||
1015 | hcrx->ccid3hcrx_bytes_recv += skb->len - | ||
1016 | dccp_hdr(skb)->dccph_doff * 4; | ||
1017 | if (ins != 0) | ||
1018 | break; | ||
1019 | |||
1020 | do_gettimeofday(&now); | ||
1021 | if (timeval_delta(&now, &hcrx->ccid3hcrx_tstamp_last_ack) >= | ||
1022 | hcrx->ccid3hcrx_rtt) { | ||
1023 | hcrx->ccid3hcrx_tstamp_last_ack = now; | ||
1024 | ccid3_hc_rx_send_feedback(sk); | ||
1025 | } | ||
1026 | return; | ||
1027 | default: | ||
1028 | printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", | ||
1029 | __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state); | ||
1030 | dump_stack(); | ||
1031 | return; | ||
1032 | } | ||
1033 | |||
1034 | /* Dealing with packet loss */ | ||
1035 | ccid3_pr_debug("%s, sk=%p(%s), data loss! Reacting...\n", | ||
1036 | dccp_role(sk), sk, dccp_state_name(sk->sk_state)); | ||
1037 | |||
1038 | ccid3_hc_rx_detect_loss(sk); | ||
1039 | p_prev = hcrx->ccid3hcrx_p; | ||
1040 | |||
1041 | /* Calculate loss event rate */ | ||
1042 | if (!list_empty(&hcrx->ccid3hcrx_li_hist)) | ||
1043 | /* Scaling up by 1000000 as fixed decimal */ | ||
1044 | hcrx->ccid3hcrx_p = 1000000 / dccp_li_hist_calc_i_mean(&hcrx->ccid3hcrx_li_hist); | ||
1045 | |||
1046 | if (hcrx->ccid3hcrx_p > p_prev) { | ||
1047 | ccid3_hc_rx_send_feedback(sk); | ||
1048 | return; | ||
1049 | } | ||
1050 | } | ||
1051 | |||
1052 | static int ccid3_hc_rx_init(struct sock *sk) | ||
1053 | { | ||
1054 | struct dccp_sock *dp = dccp_sk(sk); | ||
1055 | struct ccid3_hc_rx_sock *hcrx; | ||
1056 | |||
1057 | ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); | ||
1058 | |||
1059 | hcrx = dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx), | ||
1060 | gfp_any()); | ||
1061 | if (hcrx == NULL) | ||
1062 | return -ENOMEM; | ||
1063 | |||
1064 | memset(hcrx, 0, sizeof(*hcrx)); | ||
1065 | |||
1066 | if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE && | ||
1067 | dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE) | ||
1068 | hcrx->ccid3hcrx_s = dp->dccps_packet_size; | ||
1069 | else | ||
1070 | hcrx->ccid3hcrx_s = TFRC_STD_PACKET_SIZE; | ||
1071 | |||
1072 | hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; | ||
1073 | INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist); | ||
1074 | INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist); | ||
1075 | /* | ||
1076 | * XXX this seems to be paranoid, need to think more about this, for | ||
1077 | * now start with something different than zero. -acme | ||
1078 | */ | ||
1079 | hcrx->ccid3hcrx_rtt = USEC_PER_SEC / 5; | ||
1080 | return 0; | ||
1081 | } | ||
1082 | |||
1083 | static void ccid3_hc_rx_exit(struct sock *sk) | ||
1084 | { | ||
1085 | struct dccp_sock *dp = dccp_sk(sk); | ||
1086 | struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; | ||
1087 | |||
1088 | ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); | ||
1089 | |||
1090 | if (hcrx == NULL) | ||
1091 | return; | ||
1092 | |||
1093 | ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); | ||
1094 | |||
1095 | /* Empty packet history */ | ||
1096 | dccp_rx_hist_purge(ccid3_rx_hist, &hcrx->ccid3hcrx_hist); | ||
1097 | |||
1098 | /* Empty loss interval history */ | ||
1099 | dccp_li_hist_purge(ccid3_li_hist, &hcrx->ccid3hcrx_li_hist); | ||
1100 | |||
1101 | kfree(dp->dccps_hc_rx_ccid_private); | ||
1102 | dp->dccps_hc_rx_ccid_private = NULL; | ||
1103 | } | ||
1104 | |||
1105 | static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) | ||
1106 | { | ||
1107 | const struct dccp_sock *dp = dccp_sk(sk); | ||
1108 | const struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; | ||
1109 | |||
1110 | if (hcrx == NULL) | ||
1111 | return; | ||
1112 | |||
1113 | info->tcpi_ca_state = hcrx->ccid3hcrx_state; | ||
1114 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; | ||
1115 | info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; | ||
1116 | } | ||
1117 | |||
1118 | static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) | ||
1119 | { | ||
1120 | const struct dccp_sock *dp = dccp_sk(sk); | ||
1121 | const struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; | ||
1122 | |||
1123 | if (hctx == NULL) | ||
1124 | return; | ||
1125 | |||
1126 | info->tcpi_rto = hctx->ccid3hctx_t_rto; | ||
1127 | info->tcpi_rtt = hctx->ccid3hctx_rtt; | ||
1128 | } | ||
1129 | |||
/* CCID3 operations table, registered with the DCCP core at module init */
static struct ccid ccid3 = {
	.ccid_id		   = 3,
	.ccid_name		   = "ccid3",
	.ccid_owner		   = THIS_MODULE,
	.ccid_init		   = ccid3_init,
	.ccid_exit		   = ccid3_exit,
	.ccid_hc_tx_init	   = ccid3_hc_tx_init,
	.ccid_hc_tx_exit	   = ccid3_hc_tx_exit,
	.ccid_hc_tx_send_packet	   = ccid3_hc_tx_send_packet,
	.ccid_hc_tx_packet_sent	   = ccid3_hc_tx_packet_sent,
	.ccid_hc_tx_packet_recv	   = ccid3_hc_tx_packet_recv,
	.ccid_hc_tx_insert_options = ccid3_hc_tx_insert_options,
	.ccid_hc_tx_parse_options  = ccid3_hc_tx_parse_options,
	.ccid_hc_rx_init	   = ccid3_hc_rx_init,
	.ccid_hc_rx_exit	   = ccid3_hc_rx_exit,
	.ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
	.ccid_hc_rx_packet_recv	   = ccid3_hc_rx_packet_recv,
	.ccid_hc_rx_get_info	   = ccid3_hc_rx_get_info,
	.ccid_hc_tx_get_info	   = ccid3_hc_tx_get_info,
};
1150 | |||
/* Module parameter: non-zero enables the ccid3_pr_debug() messages */
module_param(ccid3_debug, int, 0444);
MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
1153 | |||
1154 | static __init int ccid3_module_init(void) | ||
1155 | { | ||
1156 | int rc = -ENOBUFS; | ||
1157 | |||
1158 | ccid3_rx_hist = dccp_rx_hist_new("ccid3"); | ||
1159 | if (ccid3_rx_hist == NULL) | ||
1160 | goto out; | ||
1161 | |||
1162 | ccid3_tx_hist = dccp_tx_hist_new("ccid3"); | ||
1163 | if (ccid3_tx_hist == NULL) | ||
1164 | goto out_free_rx; | ||
1165 | |||
1166 | ccid3_li_hist = dccp_li_hist_new("ccid3"); | ||
1167 | if (ccid3_li_hist == NULL) | ||
1168 | goto out_free_tx; | ||
1169 | |||
1170 | rc = ccid_register(&ccid3); | ||
1171 | if (rc != 0) | ||
1172 | goto out_free_loss_interval_history; | ||
1173 | out: | ||
1174 | return rc; | ||
1175 | |||
1176 | out_free_loss_interval_history: | ||
1177 | dccp_li_hist_delete(ccid3_li_hist); | ||
1178 | ccid3_li_hist = NULL; | ||
1179 | out_free_tx: | ||
1180 | dccp_tx_hist_delete(ccid3_tx_hist); | ||
1181 | ccid3_tx_hist = NULL; | ||
1182 | out_free_rx: | ||
1183 | dccp_rx_hist_delete(ccid3_rx_hist); | ||
1184 | ccid3_rx_hist = NULL; | ||
1185 | goto out; | ||
1186 | } | ||
1187 | module_init(ccid3_module_init); | ||
1188 | |||
/*
 * Module unload: unregister the CCID and delete the history slab caches
 * created by ccid3_module_init() (NULL-checked defensively).
 */
static __exit void ccid3_module_exit(void)
{
#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
	/*
	 * Hack to use while developing, so that we get rid of the control
	 * sock, that is what keeps a refcount on dccp.ko -acme
	 */
	extern void dccp_ctl_sock_exit(void);

	dccp_ctl_sock_exit();
#endif
	ccid_unregister(&ccid3);

	if (ccid3_tx_hist != NULL) {
		dccp_tx_hist_delete(ccid3_tx_hist);
		ccid3_tx_hist = NULL;
	}
	if (ccid3_rx_hist != NULL) {
		dccp_rx_hist_delete(ccid3_rx_hist);
		ccid3_rx_hist = NULL;
	}
	if (ccid3_li_hist != NULL) {
		dccp_li_hist_delete(ccid3_li_hist);
		ccid3_li_hist = NULL;
	}
}
1215 | module_exit(ccid3_module_exit); | ||
1216 | |||
1217 | MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, " | ||
1218 | "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>"); | ||
1219 | MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID"); | ||
1220 | MODULE_LICENSE("GPL"); | ||
1221 | MODULE_ALIAS("net-dccp-ccid-3"); | ||
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h new file mode 100644 index 000000000000..ee8cbace6630 --- /dev/null +++ b/net/dccp/ccids/ccid3.h | |||
@@ -0,0 +1,137 @@ | |||
1 | /* | ||
2 | * net/dccp/ccids/ccid3.h | ||
3 | * | ||
4 | * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. | ||
5 | * | ||
6 | * An implementation of the DCCP protocol | ||
7 | * | ||
8 | * This code has been developed by the University of Waikato WAND | ||
9 | * research group. For further information please see http://www.wand.net.nz/ | ||
10 | * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz | ||
11 | * | ||
12 | * This code also uses code from Lulea University, rereleased as GPL by its | ||
13 | * authors: | ||
14 | * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon | ||
15 | * | ||
16 | * Changes to meet Linux coding standards, to make it meet latest ccid3 draft | ||
17 | * and to make it work as a loadable module in the DCCP stack written by | ||
18 | * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. | ||
19 | * | ||
20 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
21 | * | ||
22 | * This program is free software; you can redistribute it and/or modify | ||
23 | * it under the terms of the GNU General Public License as published by | ||
24 | * the Free Software Foundation; either version 2 of the License, or | ||
25 | * (at your option) any later version. | ||
26 | * | ||
27 | * This program is distributed in the hope that it will be useful, | ||
28 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
29 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
30 | * GNU General Public License for more details. | ||
31 | * | ||
32 | * You should have received a copy of the GNU General Public License | ||
33 | * along with this program; if not, write to the Free Software | ||
34 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
35 | */ | ||
36 | #ifndef _DCCP_CCID3_H_ | ||
37 | #define _DCCP_CCID3_H_ | ||
38 | |||
39 | #include <linux/config.h> | ||
40 | #include <linux/list.h> | ||
41 | #include <linux/time.h> | ||
42 | #include <linux/types.h> | ||
43 | |||
44 | #define TFRC_MIN_PACKET_SIZE 16 | ||
45 | #define TFRC_STD_PACKET_SIZE 256 | ||
46 | #define TFRC_MAX_PACKET_SIZE 65535 | ||
47 | |||
48 | /* Two seconds as per CCID3 spec */ | ||
49 | #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) | ||
50 | |||
51 | /* In usecs - half the scheduling granularity as per RFC3448 4.6 */ | ||
52 | #define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) | ||
53 | |||
54 | /* In seconds */ | ||
55 | #define TFRC_MAX_BACK_OFF_TIME 64 | ||
56 | |||
57 | #define TFRC_SMALLEST_P 40 | ||
58 | |||
/* DCCP option numbers used for the TFRC feedback options */
enum ccid3_options {
	TFRC_OPT_LOSS_EVENT_RATE = 192,
	TFRC_OPT_LOSS_INTERVALS	 = 193,
	TFRC_OPT_RECEIVE_RATE	 = 194,
};
64 | |||
/* CCID3 option values parsed from a received packet */
struct ccid3_options_received {
	u64 ccid3or_seqno:48,		/* seqno the options belong to */
	    ccid3or_loss_intervals_idx:16;
	u16 ccid3or_loss_intervals_len;
	u32 ccid3or_loss_event_rate;	/* from TFRC_OPT_LOSS_EVENT_RATE */
	u32 ccid3or_receive_rate;	/* from TFRC_OPT_RECEIVE_RATE */
};
72 | |||
/** struct ccid3_hc_tx_sock - CCID3 sender half connection sock
 *
 * @ccid3hctx_state - Sender state
 * @ccid3hctx_x - Current sending rate
 * @ccid3hctx_x_recv - Receive rate
 * @ccid3hctx_x_calc - Calculated send (?) rate
 * @ccid3hctx_s - Packet size
 * @ccid3hctx_rtt - Estimate of current round trip time in usecs
 * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
 * @ccid3hctx_last_win_count - Last window counter sent
 * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
 * 				 with last_win_count value sent
 * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
 * @ccid3hctx_idle - FIXME
 * @ccid3hctx_t_ld - Time last doubled during slow start
 * @ccid3hctx_t_nom - Nominal send time of next packet
 * @ccid3hctx_t_rto - Timeout value (usecs, presumably the no-feedback
 * 		      timeout - TODO confirm against the timer code)
 * @ccid3hctx_t_ipi - Interpacket (send) interval
 * @ccid3hctx_delta - Send timer delta
 * @ccid3hctx_hist - Packet history
 * @ccid3hctx_options_received - CCID3 options parsed from the last
 * 				 received packet
 */
struct ccid3_hc_tx_sock {
	u32			ccid3hctx_x;
	u32			ccid3hctx_x_recv;
	u32			ccid3hctx_x_calc;
	u16			ccid3hctx_s;
	u32			ccid3hctx_rtt;
	u32			ccid3hctx_p;
	u8			ccid3hctx_state;
	u8			ccid3hctx_last_win_count;
	u8			ccid3hctx_idle;
	struct timeval		ccid3hctx_t_last_win_count;
	struct timer_list	ccid3hctx_no_feedback_timer;
	struct timeval		ccid3hctx_t_ld;
	struct timeval		ccid3hctx_t_nom;
	u32			ccid3hctx_t_rto;
	u32			ccid3hctx_t_ipi;
	u32			ccid3hctx_delta;
	struct list_head	ccid3hctx_hist;
	struct ccid3_options_received ccid3hctx_options_received;
};
113 | |||
/** struct ccid3_hc_rx_sock - CCID3 receiver half connection sock
 *
 * @ccid3hcrx_seqno_last_counter - 48-bit sequence number (presumably the
 * 				   last one seen - TODO confirm)
 * @ccid3hcrx_state - Receiver state (TFRC_RSTATE_*)
 * @ccid3hcrx_last_counter - Last window counter value; copied into
 * 			     dccpd_ccval on outgoing packets
 * @ccid3hcrx_rtt - RTT estimate in usecs, also used as the minimum
 * 		    spacing between feedback packets
 * @ccid3hcrx_p - Loss event rate (0-1) scaled by 1000000
 * @ccid3hcrx_bytes_recv - Payload bytes received since last feedback
 * @ccid3hcrx_tstamp_last_feedback - Time the last feedback was sent
 * @ccid3hcrx_tstamp_last_ack - Time of the last ack-paced feedback
 * @ccid3hcrx_hist - Packet history
 * @ccid3hcrx_li_hist - Loss interval history
 * @ccid3hcrx_s - Packet size
 * @ccid3hcrx_pinv - Value sent in the TFRC_OPT_LOSS_EVENT_RATE option
 * 		     (inverse loss event rate - TODO confirm)
 * @ccid3hcrx_elapsed_time - Elapsed time option value sent when non-zero
 * @ccid3hcrx_x_recv - Receive rate, sent in TFRC_OPT_RECEIVE_RATE
 */
struct ccid3_hc_rx_sock {
	u64			ccid3hcrx_seqno_last_counter:48,
				ccid3hcrx_state:8,
				ccid3hcrx_last_counter:4;
	unsigned long		ccid3hcrx_rtt;
	u32			ccid3hcrx_p;
	u32			ccid3hcrx_bytes_recv;
	struct timeval		ccid3hcrx_tstamp_last_feedback;
	struct timeval		ccid3hcrx_tstamp_last_ack;
	struct list_head	ccid3hcrx_hist;
	struct list_head	ccid3hcrx_li_hist;
	u16			ccid3hcrx_s;
	u32			ccid3hcrx_pinv;
	u32			ccid3hcrx_elapsed_time;
	u32			ccid3hcrx_x_recv;
};
130 | |||
/*
 * Accessors for half-connection fields that safely evaluate to 0 when
 * the CCID private area has not been allocated for socket @s.
 */
#define ccid3_hc_tx_field(s,field) (s->dccps_hc_tx_ccid_private == NULL ? 0 : \
	((struct ccid3_hc_tx_sock *)s->dccps_hc_tx_ccid_private)->ccid3hctx_##field)

#define ccid3_hc_rx_field(s,field) (s->dccps_hc_rx_ccid_private == NULL ? 0 : \
	((struct ccid3_hc_rx_sock *)s->dccps_hc_rx_ccid_private)->ccid3hcrx_##field)
136 | |||
137 | #endif /* _DCCP_CCID3_H_ */ | ||
diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile new file mode 100644 index 000000000000..5f940a6cbaca --- /dev/null +++ b/net/dccp/ccids/lib/Makefile | |||
@@ -0,0 +1,3 @@ | |||
1 | obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o | ||
2 | |||
3 | dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o | ||
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c new file mode 100644 index 000000000000..4c01a54143ad --- /dev/null +++ b/net/dccp/ccids/lib/loss_interval.c | |||
@@ -0,0 +1,144 @@ | |||
1 | /* | ||
2 | * net/dccp/ccids/lib/loss_interval.c | ||
3 | * | ||
4 | * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. | ||
5 | * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> | ||
6 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | */ | ||
13 | |||
14 | #include <linux/config.h> | ||
15 | #include <linux/module.h> | ||
16 | |||
17 | #include "loss_interval.h" | ||
18 | |||
19 | struct dccp_li_hist *dccp_li_hist_new(const char *name) | ||
20 | { | ||
21 | struct dccp_li_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); | ||
22 | static const char dccp_li_hist_mask[] = "li_hist_%s"; | ||
23 | char *slab_name; | ||
24 | |||
25 | if (hist == NULL) | ||
26 | goto out; | ||
27 | |||
28 | slab_name = kmalloc(strlen(name) + sizeof(dccp_li_hist_mask) - 1, | ||
29 | GFP_ATOMIC); | ||
30 | if (slab_name == NULL) | ||
31 | goto out_free_hist; | ||
32 | |||
33 | sprintf(slab_name, dccp_li_hist_mask, name); | ||
34 | hist->dccplih_slab = kmem_cache_create(slab_name, | ||
35 | sizeof(struct dccp_li_hist_entry), | ||
36 | 0, SLAB_HWCACHE_ALIGN, | ||
37 | NULL, NULL); | ||
38 | if (hist->dccplih_slab == NULL) | ||
39 | goto out_free_slab_name; | ||
40 | out: | ||
41 | return hist; | ||
42 | out_free_slab_name: | ||
43 | kfree(slab_name); | ||
44 | out_free_hist: | ||
45 | kfree(hist); | ||
46 | hist = NULL; | ||
47 | goto out; | ||
48 | } | ||
49 | |||
50 | EXPORT_SYMBOL_GPL(dccp_li_hist_new); | ||
51 | |||
52 | void dccp_li_hist_delete(struct dccp_li_hist *hist) | ||
53 | { | ||
54 | const char* name = kmem_cache_name(hist->dccplih_slab); | ||
55 | |||
56 | kmem_cache_destroy(hist->dccplih_slab); | ||
57 | kfree(name); | ||
58 | kfree(hist); | ||
59 | } | ||
60 | |||
61 | EXPORT_SYMBOL_GPL(dccp_li_hist_delete); | ||
62 | |||
63 | void dccp_li_hist_purge(struct dccp_li_hist *hist, struct list_head *list) | ||
64 | { | ||
65 | struct dccp_li_hist_entry *entry, *next; | ||
66 | |||
67 | list_for_each_entry_safe(entry, next, list, dccplih_node) { | ||
68 | list_del_init(&entry->dccplih_node); | ||
69 | kmem_cache_free(hist->dccplih_slab, entry); | ||
70 | } | ||
71 | } | ||
72 | |||
73 | EXPORT_SYMBOL_GPL(dccp_li_hist_purge); | ||
74 | |||
/*
 * Weights used to calculate the loss event rate average.  These are
 * integers as per section 8 of RFC 3448 (stored scaled by 4 so we can
 * divide by 4 when we use them).
 */
static const int dccp_li_hist_w[DCCP_LI_HIST_IVAL_F_LENGTH] = {
	4, 4, 4, 4, 3, 2, 1, 1,
};
83 | |||
/*
 * Compute the weighted average loss interval over the interval history:
 * i_tot0 weights the newest DCCP_LI_HIST_IVAL_F_LENGTH entries with
 * dccp_li_hist_w[], i_tot1 is the same sum shifted by one entry (i.e.
 * excluding the newest, still-open interval), and the larger of the two
 * is used, per the TFRC average loss interval calculation.
 *
 * Returns the weighted mean (i_tot * 4 / w_tot, undoing the 4x weight
 * scaling), or 0 when the estimate cannot be formed - see NOTE below.
 */
u32 dccp_li_hist_calc_i_mean(struct list_head *list)
{
	struct dccp_li_hist_entry *li_entry, *li_next;
	int i = 0;
	u32 i_tot;
	u32 i_tot0 = 0;	/* weighted sum including the newest interval */
	u32 i_tot1 = 0;	/* weighted sum excluding the newest interval */
	u32 w_tot = 0;

	list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) {
		if (i < DCCP_LI_HIST_IVAL_F_LENGTH) {
			i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i];
			w_tot += dccp_li_hist_w[i];
		}

		/* one-entry-older sum: entry i carries weight w[i - 1] */
		if (i != 0)
			i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1];

		if (++i > DCCP_LI_HIST_IVAL_F_LENGTH)
			break;
	}

	/*
	 * NOTE(review): this returns 0 unless the list holds exactly
	 * DCCP_LI_HIST_IVAL_F_LENGTH entries (with more entries the loop
	 * above breaks with i == LENGTH + 1).  dccp_li_hist_interval_new()
	 * creates LENGTH + 1 entries, so verify callers treat a 0 return
	 * as "no estimate" and never divide by it.
	 */
	if (i != DCCP_LI_HIST_IVAL_F_LENGTH)
		return 0;

	i_tot = max(i_tot0, i_tot1);

	/* FIXME: Why do we do this? -Ian McDonald */
	if (i_tot * 4 < w_tot)
		i_tot = w_tot * 4;

	return i_tot * 4 / w_tot;
}
117 | |||
118 | EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean); | ||
119 | |||
/*
 * Seed a fresh loss interval history: allocate entries, prepend them all
 * to @list, and stamp the last-added entry (the list head) with
 * @seq_loss / @win_loss.
 *
 * Returns the first entry allocated (the oldest of the new entries, the
 * one the caller fills in with the first computed interval), or NULL on
 * allocation failure, in which case the whole list is purged.
 */
struct dccp_li_hist_entry *dccp_li_hist_interval_new(struct dccp_li_hist *hist,
						     struct list_head *list,
						     const u64 seq_loss,
						     const u8 win_loss)
{
	struct dccp_li_hist_entry *tail = NULL, *entry;
	int i;

	/*
	 * NOTE(review): the <= bound allocates LENGTH + 1 entries, while
	 * dccp_li_hist_calc_i_mean() only yields a mean for exactly LENGTH
	 * entries - confirm this off-by-one is intended.
	 */
	for (i = 0; i <= DCCP_LI_HIST_IVAL_F_LENGTH; ++i) {
		entry = dccp_li_hist_entry_new(hist, SLAB_ATOMIC);
		if (entry == NULL) {
			dccp_li_hist_purge(hist, list);
			return NULL;
		}
		if (tail == NULL)
			tail = entry;
		list_add(&entry->dccplih_node, list);
	}

	/* entry is the last allocation, i.e. the current list head */
	entry->dccplih_seqno = seq_loss;
	entry->dccplih_win_count = win_loss;
	return tail;
}
143 | |||
144 | EXPORT_SYMBOL_GPL(dccp_li_hist_interval_new); | ||
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h new file mode 100644 index 000000000000..13ad47ba1420 --- /dev/null +++ b/net/dccp/ccids/lib/loss_interval.h | |||
@@ -0,0 +1,61 @@ | |||
1 | #ifndef _DCCP_LI_HIST_ | ||
2 | #define _DCCP_LI_HIST_ | ||
3 | /* | ||
4 | * net/dccp/ccids/lib/loss_interval.h | ||
5 | * | ||
6 | * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. | ||
7 | * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> | ||
8 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify it | ||
11 | * under the terms of the GNU General Public License as published by the Free | ||
12 | * Software Foundation; either version 2 of the License, or (at your option) | ||
13 | * any later version. | ||
14 | */ | ||
15 | |||
16 | #include <linux/config.h> | ||
17 | #include <linux/list.h> | ||
18 | #include <linux/slab.h> | ||
19 | #include <linux/time.h> | ||
20 | |||
21 | #define DCCP_LI_HIST_IVAL_F_LENGTH 8 | ||
22 | |||
/* A loss interval history: wraps the slab cache its entries come from */
struct dccp_li_hist {
	kmem_cache_t *dccplih_slab;
};
26 | |||
27 | extern struct dccp_li_hist *dccp_li_hist_new(const char *name); | ||
28 | extern void dccp_li_hist_delete(struct dccp_li_hist *hist); | ||
29 | |||
/* One entry in the loss interval history */
struct dccp_li_hist_entry {
	struct list_head dccplih_node;		/* link in per-sock list */
	u64		 dccplih_seqno:48,	/* seqno where loss occurred */
			 dccplih_win_count:4;	/* window counter at loss */
	u32		 dccplih_interval;	/* interval length in usecs
						 * (see calc_first_li) */
};
36 | |||
/*
 * Allocate one (uninitialized) entry from @hist's slab cache with
 * allocation priority @prio (e.g. SLAB_ATOMIC); NULL on failure.
 */
static inline struct dccp_li_hist_entry *
	dccp_li_hist_entry_new(struct dccp_li_hist *hist,
			       const unsigned int __nocast prio)
{
	return kmem_cache_alloc(hist->dccplih_slab, prio);
}
43 | |||
/*
 * Return @entry to @hist's slab cache.  The NULL check is required:
 * unlike kfree(), kmem_cache_free() must not be passed NULL.
 */
static inline void dccp_li_hist_entry_delete(struct dccp_li_hist *hist,
					     struct dccp_li_hist_entry *entry)
{
	if (entry != NULL)
		kmem_cache_free(hist->dccplih_slab, entry);
}
50 | |||
51 | extern void dccp_li_hist_purge(struct dccp_li_hist *hist, | ||
52 | struct list_head *list); | ||
53 | |||
54 | extern u32 dccp_li_hist_calc_i_mean(struct list_head *list); | ||
55 | |||
56 | extern struct dccp_li_hist_entry * | ||
57 | dccp_li_hist_interval_new(struct dccp_li_hist *hist, | ||
58 | struct list_head *list, | ||
59 | const u64 seq_loss, | ||
60 | const u8 win_loss); | ||
61 | #endif /* _DCCP_LI_HIST_ */ | ||
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c new file mode 100644 index 000000000000..d3f9d2053830 --- /dev/null +++ b/net/dccp/ccids/lib/packet_history.c | |||
@@ -0,0 +1,398 @@ | |||
1 | /* | ||
 2 | * net/dccp/ccids/lib/packet_history.c | ||
3 | * | ||
4 | * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. | ||
5 | * | ||
6 | * An implementation of the DCCP protocol | ||
7 | * | ||
8 | * This code has been developed by the University of Waikato WAND | ||
9 | * research group. For further information please see http://www.wand.net.nz/ | ||
10 | * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz | ||
11 | * | ||
12 | * This code also uses code from Lulea University, rereleased as GPL by its | ||
13 | * authors: | ||
14 | * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon | ||
15 | * | ||
16 | * Changes to meet Linux coding standards, to make it meet latest ccid3 draft | ||
17 | * and to make it work as a loadable module in the DCCP stack written by | ||
18 | * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. | ||
19 | * | ||
20 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
21 | * | ||
22 | * This program is free software; you can redistribute it and/or modify | ||
23 | * it under the terms of the GNU General Public License as published by | ||
24 | * the Free Software Foundation; either version 2 of the License, or | ||
25 | * (at your option) any later version. | ||
26 | * | ||
27 | * This program is distributed in the hope that it will be useful, | ||
28 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
29 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
30 | * GNU General Public License for more details. | ||
31 | * | ||
32 | * You should have received a copy of the GNU General Public License | ||
33 | * along with this program; if not, write to the Free Software | ||
34 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
35 | */ | ||
36 | |||
37 | #include <linux/config.h> | ||
38 | #include <linux/module.h> | ||
39 | #include <linux/string.h> | ||
40 | |||
41 | #include "packet_history.h" | ||
42 | |||
43 | struct dccp_rx_hist *dccp_rx_hist_new(const char *name) | ||
44 | { | ||
45 | struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); | ||
46 | static const char dccp_rx_hist_mask[] = "rx_hist_%s"; | ||
47 | char *slab_name; | ||
48 | |||
49 | if (hist == NULL) | ||
50 | goto out; | ||
51 | |||
52 | slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1, | ||
53 | GFP_ATOMIC); | ||
54 | if (slab_name == NULL) | ||
55 | goto out_free_hist; | ||
56 | |||
57 | sprintf(slab_name, dccp_rx_hist_mask, name); | ||
58 | hist->dccprxh_slab = kmem_cache_create(slab_name, | ||
59 | sizeof(struct dccp_rx_hist_entry), | ||
60 | 0, SLAB_HWCACHE_ALIGN, | ||
61 | NULL, NULL); | ||
62 | if (hist->dccprxh_slab == NULL) | ||
63 | goto out_free_slab_name; | ||
64 | out: | ||
65 | return hist; | ||
66 | out_free_slab_name: | ||
67 | kfree(slab_name); | ||
68 | out_free_hist: | ||
69 | kfree(hist); | ||
70 | hist = NULL; | ||
71 | goto out; | ||
72 | } | ||
73 | |||
74 | EXPORT_SYMBOL_GPL(dccp_rx_hist_new); | ||
75 | |||
76 | void dccp_rx_hist_delete(struct dccp_rx_hist *hist) | ||
77 | { | ||
78 | const char* name = kmem_cache_name(hist->dccprxh_slab); | ||
79 | |||
80 | kmem_cache_destroy(hist->dccprxh_slab); | ||
81 | kfree(name); | ||
82 | kfree(hist); | ||
83 | } | ||
84 | |||
85 | EXPORT_SYMBOL_GPL(dccp_rx_hist_delete); | ||
86 | |||
87 | void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list) | ||
88 | { | ||
89 | struct dccp_rx_hist_entry *entry, *next; | ||
90 | |||
91 | list_for_each_entry_safe(entry, next, list, dccphrx_node) { | ||
92 | list_del_init(&entry->dccphrx_node); | ||
93 | kmem_cache_free(hist->dccprxh_slab, entry); | ||
94 | } | ||
95 | } | ||
96 | |||
97 | EXPORT_SYMBOL_GPL(dccp_rx_hist_purge); | ||
98 | |||
99 | struct dccp_rx_hist_entry * | ||
100 | dccp_rx_hist_find_data_packet(const struct list_head *list) | ||
101 | { | ||
102 | struct dccp_rx_hist_entry *entry, *packet = NULL; | ||
103 | |||
104 | list_for_each_entry(entry, list, dccphrx_node) | ||
105 | if (entry->dccphrx_type == DCCP_PKT_DATA || | ||
106 | entry->dccphrx_type == DCCP_PKT_DATAACK) { | ||
107 | packet = entry; | ||
108 | break; | ||
109 | } | ||
110 | |||
111 | return packet; | ||
112 | } | ||
113 | |||
114 | EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet); | ||
115 | |||
/*
 * Insert @packet into the (seqno-descending) receive history @rx_list,
 * then trim the history so it keeps no more than needed for loss
 * detection / RTT approximation.
 *
 * Returns 1 when @packet was judged too old to matter (more than
 * TFRC_RECV_NUM_LATE_LOSS newer data packets already seen) and was
 * freed instead of inserted; returns 0 otherwise.
 */
int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
			    struct list_head *rx_list,
			    struct list_head *li_list,
			    struct dccp_rx_hist_entry *packet)
{
	struct dccp_rx_hist_entry *entry, *next, *iter;
	u8 num_later = 0;

	iter = dccp_rx_hist_head(rx_list);
	if (iter == NULL)
		/* Empty history: packet becomes the head. */
		dccp_rx_hist_add_entry(rx_list, packet);
	else {
		const u64 seqno = packet->dccphrx_seqno;

		if (after48(seqno, iter->dccphrx_seqno))
			/* Newer than current head: insert at head. */
			dccp_rx_hist_add_entry(rx_list, packet);
		else {
			/* Out-of-order arrival: walk the list to find the
			 * insertion point, counting newer data packets. */
			if (dccp_rx_hist_entry_data_packet(iter))
				num_later = 1;

			list_for_each_entry_continue(iter, rx_list,
						     dccphrx_node) {
				if (after48(seqno, iter->dccphrx_seqno)) {
					dccp_rx_hist_add_entry(&iter->dccphrx_node,
							       packet);
					goto trim_history;
				}

				if (dccp_rx_hist_entry_data_packet(iter))
					num_later++;

				if (num_later == TFRC_RECV_NUM_LATE_LOSS) {
					/* Too old to be useful: drop it. */
					dccp_rx_hist_entry_delete(hist, packet);
					return 1;
				}
			}

			if (num_later < TFRC_RECV_NUM_LATE_LOSS)
				dccp_rx_hist_add_entry(rx_list, packet);
			/*
			 * FIXME: else what? should we destroy the packet
			 * like above?
			 */
		}
	}

trim_history:
	/*
	 * Trim history (remove all packets after the NUM_LATE_LOSS + 1
	 * data packets)
	 */
	num_later = TFRC_RECV_NUM_LATE_LOSS + 1;

	if (!list_empty(li_list)) {
		/* Loss-interval history exists: keep only NUM_LATE_LOSS + 1
		 * data packets; everything older is deleted. */
		list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
			if (num_later == 0) {
				list_del_init(&entry->dccphrx_node);
				dccp_rx_hist_entry_delete(hist, entry);
			} else if (dccp_rx_hist_entry_data_packet(entry))
				--num_later;
		}
	} else {
		int step = 0;
		u8 win_count = 0; /* Not needed, but lets shut up gcc */
		int tmp;
		/*
		 * We have no loss interval history so we need at least one
		 * RTT's worth of data packets to approximate the RTT.
		 * The state machine below (step 0..3) skips the first
		 * NUM_LATE_LOSS + 1 data packets, records the window counter
		 * of the next one, and deletes everything older than roughly
		 * one RTT (measured in window-counter units).
		 */
		list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
			if (num_later == 0) {
				switch (step) {
				case 0:
					step = 1;
					/* OK, find next data packet */
					num_later = 1;
					break;
				case 1:
					step = 2;
					/* OK, find next data packet */
					num_later = 1;
					win_count = entry->dccphrx_ccval;
					break;
				case 2:
					/* ccval wraps at TFRC_WIN_COUNT_LIMIT,
					 * hence the modular correction. */
					tmp = win_count - entry->dccphrx_ccval;
					if (tmp < 0)
						tmp += TFRC_WIN_COUNT_LIMIT;
					if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) {
						/*
						 * We have found a packet older
						 * than one rtt remove the rest
						 */
						step = 3;
					} else /* OK, find next data packet */
						num_later = 1;
					break;
				case 3:
					/* Past the one-RTT horizon: purge. */
					list_del_init(&entry->dccphrx_node);
					dccp_rx_hist_entry_delete(hist, entry);
					break;
				}
			} else if (dccp_rx_hist_entry_data_packet(entry))
				--num_later;
		}
	}

	return 0;
}

EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet);
226 | |||
/*
 * Scan the receive history for a lost data packet.
 *
 * b_loss is the entry just after TFRC_RECV_NUM_LATE_LOSS data packets
 * from the head; a_loss is the next data packet after that. A gap in
 * sequence numbers between them that is not fully explained by the NDP
 * (non-data packet) counts indicates a loss.
 *
 * Returns the sequence number following the lost packet's predecessor
 * (i.e. the first lost seqno), or DCCP_MAX_SEQNO + 1 when no loss was
 * detected. *win_loss is set to the window counter (ccval) at the loss,
 * or 0 when there is none.
 */
u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
			     struct list_head *li_list, u8 *win_loss)
{
	struct dccp_rx_hist_entry *entry, *next, *packet;
	struct dccp_rx_hist_entry *a_loss = NULL;
	struct dccp_rx_hist_entry *b_loss = NULL;
	u64 seq_loss = DCCP_MAX_SEQNO + 1;	/* sentinel: no loss found */
	u8 num_later = TFRC_RECV_NUM_LATE_LOSS;

	/* Skip NUM_LATE_LOSS data packets from the head to find b_loss. */
	list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
		if (num_later == 0) {
			b_loss = entry;
			break;
		} else if (dccp_rx_hist_entry_data_packet(entry))
			--num_later;
	}

	if (b_loss == NULL)
		goto out;

	/* a_loss = next data packet after b_loss. */
	num_later = 1;
	list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
		if (num_later == 0) {
			a_loss = entry;
			break;
		} else if (dccp_rx_hist_entry_data_packet(entry))
			--num_later;
	}

	if (a_loss == NULL) {
		if (list_empty(li_list)) {
			/* no loss event has occurred yet */
			LIMIT_NETDEBUG("%s: TODO: find a lost data packet by "
				       "comparing to initial seqno\n",
				       __FUNCTION__);
			goto out;
		} else {
			LIMIT_NETDEBUG("%s: Less than 4 data pkts in history!",
				       __FUNCTION__);
			goto out;
		}
	}

	/* Locate a lost data packet */
	entry = packet = b_loss;
	list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
		u64 delta = dccp_delta_seqno(entry->dccphrx_seqno,
					     packet->dccphrx_seqno);

		if (delta != 0) {
			/* A data packet accounts for one step of the gap. */
			if (dccp_rx_hist_entry_data_packet(packet))
				--delta;
			/*
			 * FIXME: check this, probably this % usage is because
			 * in earlier drafts the ndp count was just 8 bits
			 * long, but now it can be up to 24 bits long.
			 */
#if 0
			if (delta % DCCP_NDP_LIMIT !=
			    (packet->dccphrx_ndp -
			     entry->dccphrx_ndp) % DCCP_NDP_LIMIT)
#endif
			/* Gap not explained by non-data packets => loss. */
			if (delta != packet->dccphrx_ndp - entry->dccphrx_ndp) {
				seq_loss = entry->dccphrx_seqno;
				dccp_inc_seqno(&seq_loss);
			}
		}
		packet = entry;
		if (packet == a_loss)
			break;
	}
out:
	/* a_loss is non-NULL whenever seq_loss was set above. */
	if (seq_loss != DCCP_MAX_SEQNO + 1)
		*win_loss = a_loss->dccphrx_ccval;
	else
		*win_loss = 0; /* Paranoia */

	return seq_loss;
}

EXPORT_SYMBOL_GPL(dccp_rx_hist_detect_loss);
308 | |||
309 | struct dccp_tx_hist *dccp_tx_hist_new(const char *name) | ||
310 | { | ||
311 | struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); | ||
312 | static const char dccp_tx_hist_mask[] = "tx_hist_%s"; | ||
313 | char *slab_name; | ||
314 | |||
315 | if (hist == NULL) | ||
316 | goto out; | ||
317 | |||
318 | slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1, | ||
319 | GFP_ATOMIC); | ||
320 | if (slab_name == NULL) | ||
321 | goto out_free_hist; | ||
322 | |||
323 | sprintf(slab_name, dccp_tx_hist_mask, name); | ||
324 | hist->dccptxh_slab = kmem_cache_create(slab_name, | ||
325 | sizeof(struct dccp_tx_hist_entry), | ||
326 | 0, SLAB_HWCACHE_ALIGN, | ||
327 | NULL, NULL); | ||
328 | if (hist->dccptxh_slab == NULL) | ||
329 | goto out_free_slab_name; | ||
330 | out: | ||
331 | return hist; | ||
332 | out_free_slab_name: | ||
333 | kfree(slab_name); | ||
334 | out_free_hist: | ||
335 | kfree(hist); | ||
336 | hist = NULL; | ||
337 | goto out; | ||
338 | } | ||
339 | |||
340 | EXPORT_SYMBOL_GPL(dccp_tx_hist_new); | ||
341 | |||
342 | void dccp_tx_hist_delete(struct dccp_tx_hist *hist) | ||
343 | { | ||
344 | const char* name = kmem_cache_name(hist->dccptxh_slab); | ||
345 | |||
346 | kmem_cache_destroy(hist->dccptxh_slab); | ||
347 | kfree(name); | ||
348 | kfree(hist); | ||
349 | } | ||
350 | |||
351 | EXPORT_SYMBOL_GPL(dccp_tx_hist_delete); | ||
352 | |||
353 | struct dccp_tx_hist_entry * | ||
354 | dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq) | ||
355 | { | ||
356 | struct dccp_tx_hist_entry *packet = NULL, *entry; | ||
357 | |||
358 | list_for_each_entry(entry, list, dccphtx_node) | ||
359 | if (entry->dccphtx_seqno == seq) { | ||
360 | packet = entry; | ||
361 | break; | ||
362 | } | ||
363 | |||
364 | return packet; | ||
365 | } | ||
366 | |||
367 | EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry); | ||
368 | |||
369 | void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, | ||
370 | struct list_head *list, | ||
371 | struct dccp_tx_hist_entry *packet) | ||
372 | { | ||
373 | struct dccp_tx_hist_entry *next; | ||
374 | |||
375 | list_for_each_entry_safe_continue(packet, next, list, dccphtx_node) { | ||
376 | list_del_init(&packet->dccphtx_node); | ||
377 | dccp_tx_hist_entry_delete(hist, packet); | ||
378 | } | ||
379 | } | ||
380 | |||
381 | EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older); | ||
382 | |||
383 | void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list) | ||
384 | { | ||
385 | struct dccp_tx_hist_entry *entry, *next; | ||
386 | |||
387 | list_for_each_entry_safe(entry, next, list, dccphtx_node) { | ||
388 | list_del_init(&entry->dccphtx_node); | ||
389 | dccp_tx_hist_entry_delete(hist, entry); | ||
390 | } | ||
391 | } | ||
392 | |||
393 | EXPORT_SYMBOL_GPL(dccp_tx_hist_purge); | ||
394 | |||
395 | MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, " | ||
396 | "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>"); | ||
397 | MODULE_DESCRIPTION("DCCP TFRC library"); | ||
398 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h new file mode 100644 index 000000000000..fb90a91aa93d --- /dev/null +++ b/net/dccp/ccids/lib/packet_history.h | |||
@@ -0,0 +1,199 @@ | |||
1 | /* | ||
 2 | * net/dccp/ccids/lib/packet_history.h | ||
3 | * | ||
4 | * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. | ||
5 | * | ||
6 | * An implementation of the DCCP protocol | ||
7 | * | ||
8 | * This code has been developed by the University of Waikato WAND | ||
9 | * research group. For further information please see http://www.wand.net.nz/ | ||
10 | * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz | ||
11 | * | ||
12 | * This code also uses code from Lulea University, rereleased as GPL by its | ||
13 | * authors: | ||
14 | * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon | ||
15 | * | ||
16 | * Changes to meet Linux coding standards, to make it meet latest ccid3 draft | ||
17 | * and to make it work as a loadable module in the DCCP stack written by | ||
18 | * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. | ||
19 | * | ||
20 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
21 | * | ||
22 | * This program is free software; you can redistribute it and/or modify | ||
23 | * it under the terms of the GNU General Public License as published by | ||
24 | * the Free Software Foundation; either version 2 of the License, or | ||
25 | * (at your option) any later version. | ||
26 | * | ||
27 | * This program is distributed in the hope that it will be useful, | ||
28 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
29 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
30 | * GNU General Public License for more details. | ||
31 | * | ||
32 | * You should have received a copy of the GNU General Public License | ||
33 | * along with this program; if not, write to the Free Software | ||
34 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
35 | */ | ||
36 | |||
37 | #ifndef _DCCP_PKT_HIST_ | ||
38 | #define _DCCP_PKT_HIST_ | ||
39 | |||
40 | #include <linux/config.h> | ||
41 | #include <linux/list.h> | ||
42 | #include <linux/slab.h> | ||
43 | #include <linux/time.h> | ||
44 | |||
45 | #include "../../dccp.h" | ||
46 | |||
47 | /* Number of later packets received before one is considered lost */ | ||
48 | #define TFRC_RECV_NUM_LATE_LOSS 3 | ||
49 | |||
50 | #define TFRC_WIN_COUNT_PER_RTT 4 | ||
51 | #define TFRC_WIN_COUNT_LIMIT 16 | ||
52 | |||
/* One transmitted packet recorded in the TX history. */
struct dccp_tx_hist_entry {
	struct list_head dccphtx_node;		/* link in the history list */
	u64		 dccphtx_seqno:48,	/* 48-bit DCCP sequence number */
			 dccphtx_ccval:4,	/* 4-bit window counter (ccval) */
			 dccphtx_sent:1;	/* cleared on allocation in
						 * dccp_tx_hist_entry_new; presumably set
						 * once actually sent — confirm at callers */
	u32		 dccphtx_rtt;		/* RTT estimate — units not shown here */
	struct timeval	 dccphtx_tstamp;	/* timestamp — set by callers, not here */
};
61 | |||
/* One received packet recorded in the RX history
 * (fields filled in by dccp_rx_hist_entry_new). */
struct dccp_rx_hist_entry {
	struct list_head dccphrx_node;		/* link in the history list */
	u64		 dccphrx_seqno:48,	/* 48-bit DCCP sequence number */
			 dccphrx_ccval:4,	/* window counter from the DCCP header */
			 dccphrx_type:4;	/* packet type from the DCCP header */
	u32		 dccphrx_ndp;		/* In fact it is from 8 to 24 bits */
	struct timeval	 dccphrx_tstamp;	/* arrival time (do_gettimeofday) */
};
70 | |||
71 | struct dccp_tx_hist { | ||
72 | kmem_cache_t *dccptxh_slab; | ||
73 | }; | ||
74 | |||
75 | extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name); | ||
76 | extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist); | ||
77 | |||
78 | struct dccp_rx_hist { | ||
79 | kmem_cache_t *dccprxh_slab; | ||
80 | }; | ||
81 | |||
82 | extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name); | ||
83 | extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist); | ||
84 | extern struct dccp_rx_hist_entry * | ||
85 | dccp_rx_hist_find_data_packet(const struct list_head *list); | ||
86 | |||
87 | static inline struct dccp_tx_hist_entry * | ||
88 | dccp_tx_hist_entry_new(struct dccp_tx_hist *hist, | ||
89 | const unsigned int __nocast prio) | ||
90 | { | ||
91 | struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab, | ||
92 | prio); | ||
93 | |||
94 | if (entry != NULL) | ||
95 | entry->dccphtx_sent = 0; | ||
96 | |||
97 | return entry; | ||
98 | } | ||
99 | |||
100 | static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist, | ||
101 | struct dccp_tx_hist_entry *entry) | ||
102 | { | ||
103 | if (entry != NULL) | ||
104 | kmem_cache_free(hist->dccptxh_slab, entry); | ||
105 | } | ||
106 | |||
107 | extern struct dccp_tx_hist_entry * | ||
108 | dccp_tx_hist_find_entry(const struct list_head *list, | ||
109 | const u64 seq); | ||
110 | |||
/* Insert @entry at the head of @list, so the history is kept newest-first. */
static inline void dccp_tx_hist_add_entry(struct list_head *list,
					  struct dccp_tx_hist_entry *entry)
{
	list_add(&entry->dccphtx_node, list);
}
116 | |||
117 | extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, | ||
118 | struct list_head *list, | ||
119 | struct dccp_tx_hist_entry *next); | ||
120 | |||
121 | extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist, | ||
122 | struct list_head *list); | ||
123 | |||
124 | static inline struct dccp_tx_hist_entry * | ||
125 | dccp_tx_hist_head(struct list_head *list) | ||
126 | { | ||
127 | struct dccp_tx_hist_entry *head = NULL; | ||
128 | |||
129 | if (!list_empty(list)) | ||
130 | head = list_entry(list->next, struct dccp_tx_hist_entry, | ||
131 | dccphtx_node); | ||
132 | return head; | ||
133 | } | ||
134 | |||
135 | static inline struct dccp_rx_hist_entry * | ||
136 | dccp_rx_hist_entry_new(struct dccp_rx_hist *hist, | ||
137 | const u32 ndp, | ||
138 | const struct sk_buff *skb, | ||
139 | const unsigned int __nocast prio) | ||
140 | { | ||
141 | struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab, | ||
142 | prio); | ||
143 | |||
144 | if (entry != NULL) { | ||
145 | const struct dccp_hdr *dh = dccp_hdr(skb); | ||
146 | |||
147 | entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; | ||
148 | entry->dccphrx_ccval = dh->dccph_ccval; | ||
149 | entry->dccphrx_type = dh->dccph_type; | ||
150 | entry->dccphrx_ndp = ndp; | ||
151 | do_gettimeofday(&(entry->dccphrx_tstamp)); | ||
152 | } | ||
153 | |||
154 | return entry; | ||
155 | } | ||
156 | |||
157 | static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist, | ||
158 | struct dccp_rx_hist_entry *entry) | ||
159 | { | ||
160 | if (entry != NULL) | ||
161 | kmem_cache_free(hist->dccprxh_slab, entry); | ||
162 | } | ||
163 | |||
164 | extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist, | ||
165 | struct list_head *list); | ||
166 | |||
/* Insert @entry at the head of @list, so the history is kept newest-first. */
static inline void dccp_rx_hist_add_entry(struct list_head *list,
					  struct dccp_rx_hist_entry *entry)
{
	list_add(&entry->dccphrx_node, list);
}
172 | |||
173 | static inline struct dccp_rx_hist_entry * | ||
174 | dccp_rx_hist_head(struct list_head *list) | ||
175 | { | ||
176 | struct dccp_rx_hist_entry *head = NULL; | ||
177 | |||
178 | if (!list_empty(list)) | ||
179 | head = list_entry(list->next, struct dccp_rx_hist_entry, | ||
180 | dccphrx_node); | ||
181 | return head; | ||
182 | } | ||
183 | |||
184 | static inline int | ||
185 | dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry) | ||
186 | { | ||
187 | return entry->dccphrx_type == DCCP_PKT_DATA || | ||
188 | entry->dccphrx_type == DCCP_PKT_DATAACK; | ||
189 | } | ||
190 | |||
191 | extern int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, | ||
192 | struct list_head *rx_list, | ||
193 | struct list_head *li_list, | ||
194 | struct dccp_rx_hist_entry *packet); | ||
195 | |||
196 | extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list, | ||
197 | struct list_head *li_list, u8 *win_loss); | ||
198 | |||
199 | #endif /* _DCCP_PKT_HIST_ */ | ||
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h new file mode 100644 index 000000000000..130c4c40cfe3 --- /dev/null +++ b/net/dccp/ccids/lib/tfrc.h | |||
@@ -0,0 +1,22 @@ | |||
1 | #ifndef _TFRC_H_ | ||
2 | #define _TFRC_H_ | ||
3 | /* | ||
4 | * net/dccp/ccids/lib/tfrc.h | ||
5 | * | ||
6 | * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. | ||
7 | * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> | ||
8 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
9 | * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation; either version 2 of the License, or | ||
14 | * (at your option) any later version. | ||
15 | */ | ||
16 | |||
17 | #include <linux/types.h> | ||
18 | |||
19 | extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); | ||
20 | extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); | ||
21 | |||
22 | #endif /* _TFRC_H_ */ | ||
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c new file mode 100644 index 000000000000..d2b5933b4510 --- /dev/null +++ b/net/dccp/ccids/lib/tfrc_equation.c | |||
@@ -0,0 +1,644 @@ | |||
1 | /* | ||
2 | * net/dccp/ccids/lib/tfrc_equation.c | ||
3 | * | ||
4 | * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. | ||
5 | * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> | ||
6 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
7 | * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | */ | ||
14 | |||
15 | #include <linux/config.h> | ||
16 | #include <linux/module.h> | ||
17 | |||
18 | #include <asm/bug.h> | ||
19 | #include <asm/div64.h> | ||
20 | |||
21 | #include "tfrc.h" | ||
22 | |||
23 | #define TFRC_CALC_X_ARRSIZE 500 | ||
24 | |||
25 | #define TFRC_CALC_X_SPLIT 50000 | ||
26 | /* equivalent to 0.05 */ | ||
27 | |||
28 | static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { | ||
29 | { 37172, 8172 }, | ||
30 | { 53499, 11567 }, | ||
31 | { 66664, 14180 }, | ||
32 | { 78298, 16388 }, | ||
33 | { 89021, 18339 }, | ||
34 | { 99147, 20108 }, | ||
35 | { 108858, 21738 }, | ||
36 | { 118273, 23260 }, | ||
37 | { 127474, 24693 }, | ||
38 | { 136520, 26052 }, | ||
39 | { 145456, 27348 }, | ||
40 | { 154316, 28589 }, | ||
41 | { 163130, 29783 }, | ||
42 | { 171919, 30935 }, | ||
43 | { 180704, 32049 }, | ||
44 | { 189502, 33130 }, | ||
45 | { 198328, 34180 }, | ||
46 | { 207194, 35202 }, | ||
47 | { 216114, 36198 }, | ||
48 | { 225097, 37172 }, | ||
49 | { 234153, 38123 }, | ||
50 | { 243294, 39055 }, | ||
51 | { 252527, 39968 }, | ||
52 | { 261861, 40864 }, | ||
53 | { 271305, 41743 }, | ||
54 | { 280866, 42607 }, | ||
55 | { 290553, 43457 }, | ||
56 | { 300372, 44293 }, | ||
57 | { 310333, 45117 }, | ||
58 | { 320441, 45929 }, | ||
59 | { 330705, 46729 }, | ||
60 | { 341131, 47518 }, | ||
61 | { 351728, 48297 }, | ||
62 | { 362501, 49066 }, | ||
63 | { 373460, 49826 }, | ||
64 | { 384609, 50577 }, | ||
65 | { 395958, 51320 }, | ||
66 | { 407513, 52054 }, | ||
67 | { 419281, 52780 }, | ||
68 | { 431270, 53499 }, | ||
69 | { 443487, 54211 }, | ||
70 | { 455940, 54916 }, | ||
71 | { 468635, 55614 }, | ||
72 | { 481581, 56306 }, | ||
73 | { 494785, 56991 }, | ||
74 | { 508254, 57671 }, | ||
75 | { 521996, 58345 }, | ||
76 | { 536019, 59014 }, | ||
77 | { 550331, 59677 }, | ||
78 | { 564939, 60335 }, | ||
79 | { 579851, 60988 }, | ||
80 | { 595075, 61636 }, | ||
81 | { 610619, 62279 }, | ||
82 | { 626491, 62918 }, | ||
83 | { 642700, 63553 }, | ||
84 | { 659253, 64183 }, | ||
85 | { 676158, 64809 }, | ||
86 | { 693424, 65431 }, | ||
87 | { 711060, 66050 }, | ||
88 | { 729073, 66664 }, | ||
89 | { 747472, 67275 }, | ||
90 | { 766266, 67882 }, | ||
91 | { 785464, 68486 }, | ||
92 | { 805073, 69087 }, | ||
93 | { 825103, 69684 }, | ||
94 | { 845562, 70278 }, | ||
95 | { 866460, 70868 }, | ||
96 | { 887805, 71456 }, | ||
97 | { 909606, 72041 }, | ||
98 | { 931873, 72623 }, | ||
99 | { 954614, 73202 }, | ||
100 | { 977839, 73778 }, | ||
101 | { 1001557, 74352 }, | ||
102 | { 1025777, 74923 }, | ||
103 | { 1050508, 75492 }, | ||
104 | { 1075761, 76058 }, | ||
105 | { 1101544, 76621 }, | ||
106 | { 1127867, 77183 }, | ||
107 | { 1154739, 77741 }, | ||
108 | { 1182172, 78298 }, | ||
109 | { 1210173, 78852 }, | ||
110 | { 1238753, 79405 }, | ||
111 | { 1267922, 79955 }, | ||
112 | { 1297689, 80503 }, | ||
113 | { 1328066, 81049 }, | ||
114 | { 1359060, 81593 }, | ||
115 | { 1390684, 82135 }, | ||
116 | { 1422947, 82675 }, | ||
117 | { 1455859, 83213 }, | ||
118 | { 1489430, 83750 }, | ||
119 | { 1523671, 84284 }, | ||
120 | { 1558593, 84817 }, | ||
121 | { 1594205, 85348 }, | ||
122 | { 1630518, 85878 }, | ||
123 | { 1667543, 86406 }, | ||
124 | { 1705290, 86932 }, | ||
125 | { 1743770, 87457 }, | ||
126 | { 1782994, 87980 }, | ||
127 | { 1822973, 88501 }, | ||
128 | { 1863717, 89021 }, | ||
129 | { 1905237, 89540 }, | ||
130 | { 1947545, 90057 }, | ||
131 | { 1990650, 90573 }, | ||
132 | { 2034566, 91087 }, | ||
133 | { 2079301, 91600 }, | ||
134 | { 2124869, 92111 }, | ||
135 | { 2171279, 92622 }, | ||
136 | { 2218543, 93131 }, | ||
137 | { 2266673, 93639 }, | ||
138 | { 2315680, 94145 }, | ||
139 | { 2365575, 94650 }, | ||
140 | { 2416371, 95154 }, | ||
141 | { 2468077, 95657 }, | ||
142 | { 2520707, 96159 }, | ||
143 | { 2574271, 96660 }, | ||
144 | { 2628782, 97159 }, | ||
145 | { 2684250, 97658 }, | ||
146 | { 2740689, 98155 }, | ||
147 | { 2798110, 98651 }, | ||
148 | { 2856524, 99147 }, | ||
149 | { 2915944, 99641 }, | ||
150 | { 2976382, 100134 }, | ||
151 | { 3037850, 100626 }, | ||
152 | { 3100360, 101117 }, | ||
153 | { 3163924, 101608 }, | ||
154 | { 3228554, 102097 }, | ||
155 | { 3294263, 102586 }, | ||
156 | { 3361063, 103073 }, | ||
157 | { 3428966, 103560 }, | ||
158 | { 3497984, 104045 }, | ||
159 | { 3568131, 104530 }, | ||
160 | { 3639419, 105014 }, | ||
161 | { 3711860, 105498 }, | ||
162 | { 3785467, 105980 }, | ||
163 | { 3860253, 106462 }, | ||
164 | { 3936229, 106942 }, | ||
165 | { 4013410, 107422 }, | ||
166 | { 4091808, 107902 }, | ||
167 | { 4171435, 108380 }, | ||
168 | { 4252306, 108858 }, | ||
169 | { 4334431, 109335 }, | ||
170 | { 4417825, 109811 }, | ||
171 | { 4502501, 110287 }, | ||
172 | { 4588472, 110762 }, | ||
173 | { 4675750, 111236 }, | ||
174 | { 4764349, 111709 }, | ||
175 | { 4854283, 112182 }, | ||
176 | { 4945564, 112654 }, | ||
177 | { 5038206, 113126 }, | ||
178 | { 5132223, 113597 }, | ||
179 | { 5227627, 114067 }, | ||
180 | { 5324432, 114537 }, | ||
181 | { 5422652, 115006 }, | ||
182 | { 5522299, 115474 }, | ||
183 | { 5623389, 115942 }, | ||
184 | { 5725934, 116409 }, | ||
185 | { 5829948, 116876 }, | ||
186 | { 5935446, 117342 }, | ||
187 | { 6042439, 117808 }, | ||
188 | { 6150943, 118273 }, | ||
189 | { 6260972, 118738 }, | ||
190 | { 6372538, 119202 }, | ||
191 | { 6485657, 119665 }, | ||
192 | { 6600342, 120128 }, | ||
193 | { 6716607, 120591 }, | ||
194 | { 6834467, 121053 }, | ||
195 | { 6953935, 121514 }, | ||
196 | { 7075025, 121976 }, | ||
197 | { 7197752, 122436 }, | ||
198 | { 7322131, 122896 }, | ||
199 | { 7448175, 123356 }, | ||
200 | { 7575898, 123815 }, | ||
201 | { 7705316, 124274 }, | ||
202 | { 7836442, 124733 }, | ||
203 | { 7969291, 125191 }, | ||
204 | { 8103877, 125648 }, | ||
205 | { 8240216, 126105 }, | ||
206 | { 8378321, 126562 }, | ||
207 | { 8518208, 127018 }, | ||
208 | { 8659890, 127474 }, | ||
209 | { 8803384, 127930 }, | ||
210 | { 8948702, 128385 }, | ||
211 | { 9095861, 128840 }, | ||
212 | { 9244875, 129294 }, | ||
213 | { 9395760, 129748 }, | ||
214 | { 9548529, 130202 }, | ||
215 | { 9703198, 130655 }, | ||
216 | { 9859782, 131108 }, | ||
217 | { 10018296, 131561 }, | ||
218 | { 10178755, 132014 }, | ||
219 | { 10341174, 132466 }, | ||
220 | { 10505569, 132917 }, | ||
221 | { 10671954, 133369 }, | ||
222 | { 10840345, 133820 }, | ||
223 | { 11010757, 134271 }, | ||
224 | { 11183206, 134721 }, | ||
225 | { 11357706, 135171 }, | ||
226 | { 11534274, 135621 }, | ||
227 | { 11712924, 136071 }, | ||
228 | { 11893673, 136520 }, | ||
229 | { 12076536, 136969 }, | ||
230 | { 12261527, 137418 }, | ||
231 | { 12448664, 137867 }, | ||
232 | { 12637961, 138315 }, | ||
233 | { 12829435, 138763 }, | ||
234 | { 13023101, 139211 }, | ||
235 | { 13218974, 139658 }, | ||
236 | { 13417071, 140106 }, | ||
237 | { 13617407, 140553 }, | ||
238 | { 13819999, 140999 }, | ||
239 | { 14024862, 141446 }, | ||
240 | { 14232012, 141892 }, | ||
241 | { 14441465, 142339 }, | ||
242 | { 14653238, 142785 }, | ||
243 | { 14867346, 143230 }, | ||
244 | { 15083805, 143676 }, | ||
245 | { 15302632, 144121 }, | ||
246 | { 15523842, 144566 }, | ||
247 | { 15747453, 145011 }, | ||
248 | { 15973479, 145456 }, | ||
249 | { 16201939, 145900 }, | ||
250 | { 16432847, 146345 }, | ||
251 | { 16666221, 146789 }, | ||
252 | { 16902076, 147233 }, | ||
253 | { 17140429, 147677 }, | ||
254 | { 17381297, 148121 }, | ||
255 | { 17624696, 148564 }, | ||
256 | { 17870643, 149007 }, | ||
257 | { 18119154, 149451 }, | ||
258 | { 18370247, 149894 }, | ||
259 | { 18623936, 150336 }, | ||
260 | { 18880241, 150779 }, | ||
261 | { 19139176, 151222 }, | ||
262 | { 19400759, 151664 }, | ||
263 | { 19665007, 152107 }, | ||
264 | { 19931936, 152549 }, | ||
265 | { 20201564, 152991 }, | ||
266 | { 20473907, 153433 }, | ||
267 | { 20748982, 153875 }, | ||
268 | { 21026807, 154316 }, | ||
269 | { 21307399, 154758 }, | ||
270 | { 21590773, 155199 }, | ||
271 | { 21876949, 155641 }, | ||
272 | { 22165941, 156082 }, | ||
273 | { 22457769, 156523 }, | ||
274 | { 22752449, 156964 }, | ||
275 | { 23049999, 157405 }, | ||
276 | { 23350435, 157846 }, | ||
277 | { 23653774, 158287 }, | ||
278 | { 23960036, 158727 }, | ||
279 | { 24269236, 159168 }, | ||
280 | { 24581392, 159608 }, | ||
281 | { 24896521, 160049 }, | ||
282 | { 25214642, 160489 }, | ||
283 | { 25535772, 160929 }, | ||
284 | { 25859927, 161370 }, | ||
285 | { 26187127, 161810 }, | ||
286 | { 26517388, 162250 }, | ||
287 | { 26850728, 162690 }, | ||
288 | { 27187165, 163130 }, | ||
289 | { 27526716, 163569 }, | ||
290 | { 27869400, 164009 }, | ||
291 | { 28215234, 164449 }, | ||
292 | { 28564236, 164889 }, | ||
293 | { 28916423, 165328 }, | ||
294 | { 29271815, 165768 }, | ||
295 | { 29630428, 166208 }, | ||
296 | { 29992281, 166647 }, | ||
297 | { 30357392, 167087 }, | ||
298 | { 30725779, 167526 }, | ||
299 | { 31097459, 167965 }, | ||
300 | { 31472452, 168405 }, | ||
301 | { 31850774, 168844 }, | ||
302 | { 32232445, 169283 }, | ||
303 | { 32617482, 169723 }, | ||
304 | { 33005904, 170162 }, | ||
305 | { 33397730, 170601 }, | ||
306 | { 33792976, 171041 }, | ||
307 | { 34191663, 171480 }, | ||
308 | { 34593807, 171919 }, | ||
309 | { 34999428, 172358 }, | ||
310 | { 35408544, 172797 }, | ||
311 | { 35821174, 173237 }, | ||
312 | { 36237335, 173676 }, | ||
313 | { 36657047, 174115 }, | ||
314 | { 37080329, 174554 }, | ||
315 | { 37507197, 174993 }, | ||
316 | { 37937673, 175433 }, | ||
317 | { 38371773, 175872 }, | ||
318 | { 38809517, 176311 }, | ||
319 | { 39250924, 176750 }, | ||
320 | { 39696012, 177190 }, | ||
321 | { 40144800, 177629 }, | ||
322 | { 40597308, 178068 }, | ||
323 | { 41053553, 178507 }, | ||
324 | { 41513554, 178947 }, | ||
325 | { 41977332, 179386 }, | ||
326 | { 42444904, 179825 }, | ||
327 | { 42916290, 180265 }, | ||
328 | { 43391509, 180704 }, | ||
329 | { 43870579, 181144 }, | ||
330 | { 44353520, 181583 }, | ||
331 | { 44840352, 182023 }, | ||
332 | { 45331092, 182462 }, | ||
333 | { 45825761, 182902 }, | ||
334 | { 46324378, 183342 }, | ||
335 | { 46826961, 183781 }, | ||
336 | { 47333531, 184221 }, | ||
337 | { 47844106, 184661 }, | ||
338 | { 48358706, 185101 }, | ||
339 | { 48877350, 185541 }, | ||
340 | { 49400058, 185981 }, | ||
341 | { 49926849, 186421 }, | ||
342 | { 50457743, 186861 }, | ||
343 | { 50992759, 187301 }, | ||
344 | { 51531916, 187741 }, | ||
345 | { 52075235, 188181 }, | ||
346 | { 52622735, 188622 }, | ||
347 | { 53174435, 189062 }, | ||
348 | { 53730355, 189502 }, | ||
349 | { 54290515, 189943 }, | ||
350 | { 54854935, 190383 }, | ||
351 | { 55423634, 190824 }, | ||
352 | { 55996633, 191265 }, | ||
353 | { 56573950, 191706 }, | ||
354 | { 57155606, 192146 }, | ||
355 | { 57741621, 192587 }, | ||
356 | { 58332014, 193028 }, | ||
357 | { 58926806, 193470 }, | ||
358 | { 59526017, 193911 }, | ||
359 | { 60129666, 194352 }, | ||
360 | { 60737774, 194793 }, | ||
361 | { 61350361, 195235 }, | ||
362 | { 61967446, 195677 }, | ||
363 | { 62589050, 196118 }, | ||
364 | { 63215194, 196560 }, | ||
365 | { 63845897, 197002 }, | ||
366 | { 64481179, 197444 }, | ||
367 | { 65121061, 197886 }, | ||
368 | { 65765563, 198328 }, | ||
369 | { 66414705, 198770 }, | ||
370 | { 67068508, 199213 }, | ||
371 | { 67726992, 199655 }, | ||
372 | { 68390177, 200098 }, | ||
373 | { 69058085, 200540 }, | ||
374 | { 69730735, 200983 }, | ||
375 | { 70408147, 201426 }, | ||
376 | { 71090343, 201869 }, | ||
377 | { 71777343, 202312 }, | ||
378 | { 72469168, 202755 }, | ||
379 | { 73165837, 203199 }, | ||
380 | { 73867373, 203642 }, | ||
381 | { 74573795, 204086 }, | ||
382 | { 75285124, 204529 }, | ||
383 | { 76001380, 204973 }, | ||
384 | { 76722586, 205417 }, | ||
385 | { 77448761, 205861 }, | ||
386 | { 78179926, 206306 }, | ||
387 | { 78916102, 206750 }, | ||
388 | { 79657310, 207194 }, | ||
389 | { 80403571, 207639 }, | ||
390 | { 81154906, 208084 }, | ||
391 | { 81911335, 208529 }, | ||
392 | { 82672880, 208974 }, | ||
393 | { 83439562, 209419 }, | ||
394 | { 84211402, 209864 }, | ||
395 | { 84988421, 210309 }, | ||
396 | { 85770640, 210755 }, | ||
397 | { 86558080, 211201 }, | ||
398 | { 87350762, 211647 }, | ||
399 | { 88148708, 212093 }, | ||
400 | { 88951938, 212539 }, | ||
401 | { 89760475, 212985 }, | ||
402 | { 90574339, 213432 }, | ||
403 | { 91393551, 213878 }, | ||
404 | { 92218133, 214325 }, | ||
405 | { 93048107, 214772 }, | ||
406 | { 93883493, 215219 }, | ||
407 | { 94724314, 215666 }, | ||
408 | { 95570590, 216114 }, | ||
409 | { 96422343, 216561 }, | ||
410 | { 97279594, 217009 }, | ||
411 | { 98142366, 217457 }, | ||
412 | { 99010679, 217905 }, | ||
413 | { 99884556, 218353 }, | ||
414 | { 100764018, 218801 }, | ||
415 | { 101649086, 219250 }, | ||
416 | { 102539782, 219698 }, | ||
417 | { 103436128, 220147 }, | ||
418 | { 104338146, 220596 }, | ||
419 | { 105245857, 221046 }, | ||
420 | { 106159284, 221495 }, | ||
421 | { 107078448, 221945 }, | ||
422 | { 108003370, 222394 }, | ||
423 | { 108934074, 222844 }, | ||
424 | { 109870580, 223294 }, | ||
425 | { 110812910, 223745 }, | ||
426 | { 111761087, 224195 }, | ||
427 | { 112715133, 224646 }, | ||
428 | { 113675069, 225097 }, | ||
429 | { 114640918, 225548 }, | ||
430 | { 115612702, 225999 }, | ||
431 | { 116590442, 226450 }, | ||
432 | { 117574162, 226902 }, | ||
433 | { 118563882, 227353 }, | ||
434 | { 119559626, 227805 }, | ||
435 | { 120561415, 228258 }, | ||
436 | { 121569272, 228710 }, | ||
437 | { 122583219, 229162 }, | ||
438 | { 123603278, 229615 }, | ||
439 | { 124629471, 230068 }, | ||
440 | { 125661822, 230521 }, | ||
441 | { 126700352, 230974 }, | ||
442 | { 127745083, 231428 }, | ||
443 | { 128796039, 231882 }, | ||
444 | { 129853241, 232336 }, | ||
445 | { 130916713, 232790 }, | ||
446 | { 131986475, 233244 }, | ||
447 | { 133062553, 233699 }, | ||
448 | { 134144966, 234153 }, | ||
449 | { 135233739, 234608 }, | ||
450 | { 136328894, 235064 }, | ||
451 | { 137430453, 235519 }, | ||
452 | { 138538440, 235975 }, | ||
453 | { 139652876, 236430 }, | ||
454 | { 140773786, 236886 }, | ||
455 | { 141901190, 237343 }, | ||
456 | { 143035113, 237799 }, | ||
457 | { 144175576, 238256 }, | ||
458 | { 145322604, 238713 }, | ||
459 | { 146476218, 239170 }, | ||
460 | { 147636442, 239627 }, | ||
461 | { 148803298, 240085 }, | ||
462 | { 149976809, 240542 }, | ||
463 | { 151156999, 241000 }, | ||
464 | { 152343890, 241459 }, | ||
465 | { 153537506, 241917 }, | ||
466 | { 154737869, 242376 }, | ||
467 | { 155945002, 242835 }, | ||
468 | { 157158929, 243294 }, | ||
469 | { 158379673, 243753 }, | ||
470 | { 159607257, 244213 }, | ||
471 | { 160841704, 244673 }, | ||
472 | { 162083037, 245133 }, | ||
473 | { 163331279, 245593 }, | ||
474 | { 164586455, 246054 }, | ||
475 | { 165848586, 246514 }, | ||
476 | { 167117696, 246975 }, | ||
477 | { 168393810, 247437 }, | ||
478 | { 169676949, 247898 }, | ||
479 | { 170967138, 248360 }, | ||
480 | { 172264399, 248822 }, | ||
481 | { 173568757, 249284 }, | ||
482 | { 174880235, 249747 }, | ||
483 | { 176198856, 250209 }, | ||
484 | { 177524643, 250672 }, | ||
485 | { 178857621, 251136 }, | ||
486 | { 180197813, 251599 }, | ||
487 | { 181545242, 252063 }, | ||
488 | { 182899933, 252527 }, | ||
489 | { 184261908, 252991 }, | ||
490 | { 185631191, 253456 }, | ||
491 | { 187007807, 253920 }, | ||
492 | { 188391778, 254385 }, | ||
493 | { 189783129, 254851 }, | ||
494 | { 191181884, 255316 }, | ||
495 | { 192588065, 255782 }, | ||
496 | { 194001698, 256248 }, | ||
497 | { 195422805, 256714 }, | ||
498 | { 196851411, 257181 }, | ||
499 | { 198287540, 257648 }, | ||
500 | { 199731215, 258115 }, | ||
501 | { 201182461, 258582 }, | ||
502 | { 202641302, 259050 }, | ||
503 | { 204107760, 259518 }, | ||
504 | { 205581862, 259986 }, | ||
505 | { 207063630, 260454 }, | ||
506 | { 208553088, 260923 }, | ||
507 | { 210050262, 261392 }, | ||
508 | { 211555174, 261861 }, | ||
509 | { 213067849, 262331 }, | ||
510 | { 214588312, 262800 }, | ||
511 | { 216116586, 263270 }, | ||
512 | { 217652696, 263741 }, | ||
513 | { 219196666, 264211 }, | ||
514 | { 220748520, 264682 }, | ||
515 | { 222308282, 265153 }, | ||
516 | { 223875978, 265625 }, | ||
517 | { 225451630, 266097 }, | ||
518 | { 227035265, 266569 }, | ||
519 | { 228626905, 267041 }, | ||
520 | { 230226576, 267514 }, | ||
521 | { 231834302, 267986 }, | ||
522 | { 233450107, 268460 }, | ||
523 | { 235074016, 268933 }, | ||
524 | { 236706054, 269407 }, | ||
525 | { 238346244, 269881 }, | ||
526 | { 239994613, 270355 }, | ||
527 | { 241651183, 270830 }, | ||
528 | { 243315981, 271305 } | ||
529 | }; | ||
530 | |||
531 | /* Calculate the send rate as per section 3.1 of RFC3448 | ||
532 | |||
533 | Returns send rate in bytes per second | ||
534 | |||
Integer maths and lookups are used because floating point is not allowed in the kernel
536 | |||
537 | The function for Xcalc as per section 3.1 of RFC3448 is: | ||
538 | |||
539 | X = s | ||
540 | ------------------------------------------------------------- | ||
541 | R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2))) | ||
542 | |||
543 | where | ||
X is the transmit rate in bytes/second
545 | s is the packet size in bytes | ||
546 | R is the round trip time in seconds | ||
547 | p is the loss event rate, between 0 and 1.0, of the number of loss events | ||
548 | as a fraction of the number of packets transmitted | ||
549 | t_RTO is the TCP retransmission timeout value in seconds | ||
550 | b is the number of packets acknowledged by a single TCP acknowledgement | ||
551 | |||
552 | we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes: | ||
553 | |||
554 | X = s | ||
555 | ----------------------------------------------------------------------- | ||
556 | R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2))) | ||
557 | |||
558 | |||
559 | which we can break down into: | ||
560 | |||
561 | X = s | ||
562 | -------- | ||
563 | R * f(p) | ||
564 | |||
565 | where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p)) | ||
566 | |||
567 | Function parameters: | ||
568 | s - bytes | ||
569 | R - RTT in usecs | ||
570 | p - loss rate (decimal fraction multiplied by 1,000,000) | ||
571 | |||
572 | Returns Xcalc in bytes per second | ||
573 | |||
574 | DON'T alter this code unless you run test cases against it as the code | ||
has been manipulated to stop underflow/overflow.
576 | |||
577 | */ | ||
u32 tfrc_calc_x(u16 s, u32 R, u32 p)
{
	int index;	/* row into the f(p) lookup table */
	u32 f;		/* f(p) from the table, scaled by 1,000,000 */
	u64 tmp1, tmp2;

	/*
	 * The table covers two ranges at different granularities:
	 * column 1 for p in (0, TFRC_CALC_X_SPLIT), column 0 for
	 * p in [TFRC_CALC_X_SPLIT, 1000000].
	 */
	if (p < TFRC_CALC_X_SPLIT)
		index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1;
	else
		index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1;

	if (index < 0)
		/* p should be 0 unless there is a bug in my code */
		index = 0;

	if (R == 0)
		R = 1; /* RTT can't be zero or else divide by zero */

	BUG_ON(index >= TFRC_CALC_X_ARRSIZE);

	if (p >= TFRC_CALC_X_SPLIT)
		f = tfrc_calc_x_lookup[index][0];
	else
		f = tfrc_calc_x_lookup[index][1];

	/*
	 * X = s / (R * f(p)), with R in usecs and f pre-scaled; the
	 * scaling constants keep the intermediates inside 64 bits.
	 * do_div() divides its first operand in place by the second.
	 */
	tmp1 = ((u64)s * 100000000);
	tmp2 = ((u64)R * (u64)f);
	do_div(tmp2, 10000);
	do_div(tmp1, tmp2);
	/* Don't alter above math unless you test due to overflow on 32 bit */

	return (u32)tmp1;
}
611 | |||
612 | EXPORT_SYMBOL_GPL(tfrc_calc_x); | ||
613 | |||
614 | /* | ||
615 | * args: fvalue - function value to match | ||
616 | * returns: p closest to that value | ||
617 | * | ||
618 | * both fvalue and p are multiplied by 1,000,000 to use ints | ||
619 | */ | ||
620 | u32 tfrc_calc_x_reverse_lookup(u32 fvalue) | ||
621 | { | ||
622 | int ctr = 0; | ||
623 | int small; | ||
624 | |||
625 | if (fvalue < tfrc_calc_x_lookup[0][1]) | ||
626 | return 0; | ||
627 | |||
628 | if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) | ||
629 | small = 1; | ||
630 | else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) | ||
631 | return 1000000; | ||
632 | else | ||
633 | small = 0; | ||
634 | |||
635 | while (fvalue > tfrc_calc_x_lookup[ctr][small]) | ||
636 | ctr++; | ||
637 | |||
638 | if (small) | ||
639 | return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE; | ||
640 | else | ||
641 | return 1000000 * ctr / TFRC_CALC_X_ARRSIZE; | ||
642 | } | ||
643 | |||
644 | EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); | ||
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h new file mode 100644 index 000000000000..33456c0d5937 --- /dev/null +++ b/net/dccp/dccp.h | |||
@@ -0,0 +1,493 @@ | |||
1 | #ifndef _DCCP_H | ||
2 | #define _DCCP_H | ||
3 | /* | ||
4 | * net/dccp/dccp.h | ||
5 | * | ||
6 | * An implementation of the DCCP protocol | ||
7 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
8 | * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify it | ||
11 | * under the terms of the GNU General Public License version 2 as | ||
12 | * published by the Free Software Foundation. | ||
13 | */ | ||
14 | |||
15 | #include <linux/config.h> | ||
16 | #include <linux/dccp.h> | ||
17 | #include <net/snmp.h> | ||
18 | #include <net/sock.h> | ||
19 | #include <net/tcp.h> | ||
20 | |||
21 | #ifdef CONFIG_IP_DCCP_DEBUG | ||
22 | extern int dccp_debug; | ||
23 | |||
24 | #define dccp_pr_debug(format, a...) \ | ||
25 | do { if (dccp_debug) \ | ||
26 | printk(KERN_DEBUG "%s: " format, __FUNCTION__ , ##a); \ | ||
27 | } while (0) | ||
28 | #define dccp_pr_debug_cat(format, a...) do { if (dccp_debug) \ | ||
29 | printk(format, ##a); } while (0) | ||
30 | #else | ||
31 | #define dccp_pr_debug(format, a...) | ||
32 | #define dccp_pr_debug_cat(format, a...) | ||
33 | #endif | ||
34 | |||
35 | extern struct inet_hashinfo dccp_hashinfo; | ||
36 | |||
37 | extern atomic_t dccp_orphan_count; | ||
38 | extern int dccp_tw_count; | ||
39 | extern void dccp_tw_deschedule(struct inet_timewait_sock *tw); | ||
40 | |||
41 | extern void dccp_time_wait(struct sock *sk, int state, int timeo); | ||
42 | |||
43 | /* FIXME: Right size this */ | ||
44 | #define DCCP_MAX_OPT_LEN 128 | ||
45 | |||
46 | #define DCCP_MAX_PACKET_HDR 32 | ||
47 | |||
48 | #define MAX_DCCP_HEADER (DCCP_MAX_PACKET_HDR + DCCP_MAX_OPT_LEN + MAX_HEADER) | ||
49 | |||
50 | #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT | ||
51 | * state, about 60 seconds */ | ||
52 | |||
53 | /* draft-ietf-dccp-spec-11.txt initial RTO value */ | ||
54 | #define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ)) | ||
55 | |||
56 | /* Maximal interval between probes for local resources. */ | ||
57 | #define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U)) | ||
58 | |||
59 | #define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ | ||
60 | |||
61 | extern struct proto dccp_v4_prot; | ||
62 | |||
63 | /* is seq1 < seq2 ? */ | ||
64 | static inline int before48(const u64 seq1, const u64 seq2) | ||
65 | { | ||
66 | return (s64)((seq1 << 16) - (seq2 << 16)) < 0; | ||
67 | } | ||
68 | |||
69 | /* is seq1 > seq2 ? */ | ||
70 | static inline int after48(const u64 seq1, const u64 seq2) | ||
71 | { | ||
72 | return (s64)((seq2 << 16) - (seq1 << 16)) < 0; | ||
73 | } | ||
74 | |||
75 | /* is seq2 <= seq1 <= seq3 ? */ | ||
76 | static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3) | ||
77 | { | ||
78 | return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); | ||
79 | } | ||
80 | |||
81 | static inline u64 max48(const u64 seq1, const u64 seq2) | ||
82 | { | ||
83 | return after48(seq1, seq2) ? seq1 : seq2; | ||
84 | } | ||
85 | |||
/* DCCP MIB (SNMP-style) counter indices; the commented entries name
 * their TCP MIB counterparts.
 */
enum {
	DCCP_MIB_NUM = 0,
	DCCP_MIB_ACTIVEOPENS,			/* ActiveOpens */
	DCCP_MIB_ESTABRESETS,			/* EstabResets */
	DCCP_MIB_CURRESTAB,			/* CurrEstab */
	DCCP_MIB_OUTSEGS,			/* OutSegs */
	DCCP_MIB_OUTRSTS,
	DCCP_MIB_ABORTONTIMEOUT,
	DCCP_MIB_TIMEOUTS,
	DCCP_MIB_ABORTFAILED,
	DCCP_MIB_PASSIVEOPENS,
	DCCP_MIB_ATTEMPTFAILS,
	DCCP_MIB_OUTDATAGRAMS,
	DCCP_MIB_INERRS,
	DCCP_MIB_OPTMANDATORYERROR,
	DCCP_MIB_INVALIDOPT,
	__DCCP_MIB_MAX		/* sentinel: total number of counters */
};
104 | |||
105 | #define DCCP_MIB_MAX __DCCP_MIB_MAX | ||
/* MIB counter block, aligned as the SNMP helpers require */
struct dccp_mib {
	unsigned long	mibs[DCCP_MIB_MAX];
} __SNMP_MIB_ALIGN__;
109 | |||
110 | DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); | ||
111 | #define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) | ||
112 | #define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field) | ||
113 | #define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field) | ||
114 | #define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) | ||
115 | #define DCCP_ADD_STATS_BH(field, val) \ | ||
116 | SNMP_ADD_STATS_BH(dccp_statistics, field, val) | ||
117 | #define DCCP_ADD_STATS_USER(field, val) \ | ||
118 | SNMP_ADD_STATS_USER(dccp_statistics, field, val) | ||
119 | |||
120 | extern int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb); | ||
121 | extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb); | ||
122 | |||
123 | extern int dccp_send_response(struct sock *sk); | ||
124 | extern void dccp_send_ack(struct sock *sk); | ||
125 | extern void dccp_send_delayed_ack(struct sock *sk); | ||
126 | extern void dccp_send_sync(struct sock *sk, const u64 seq, | ||
127 | const enum dccp_pkt_type pkt_type); | ||
128 | |||
129 | extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo); | ||
130 | extern void dccp_write_space(struct sock *sk); | ||
131 | |||
132 | extern void dccp_init_xmit_timers(struct sock *sk); | ||
/* Stop this socket's transmit timers via the inet connection sock helper. */
static inline void dccp_clear_xmit_timers(struct sock *sk)
{
	inet_csk_clear_xmit_timers(sk);
}
137 | |||
138 | extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); | ||
139 | |||
140 | extern const char *dccp_packet_name(const int type); | ||
141 | extern const char *dccp_state_name(const int state); | ||
142 | |||
/*
 * Transition @sk to @state, updating the MIB counters and, on entry to
 * DCCP_CLOSED, unhashing the socket and releasing its bound port.
 */
static inline void dccp_set_state(struct sock *sk, const int state)
{
	const int oldstate = sk->sk_state;

	dccp_pr_debug("%s(%p) %-10.10s -> %s\n",
		      dccp_role(sk), sk,
		      dccp_state_name(oldstate), dccp_state_name(state));
	WARN_ON(state == oldstate);	/* a transition to self is a bug */

	switch (state) {
	case DCCP_OPEN:
		if (oldstate != DCCP_OPEN)
			DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
		break;

	case DCCP_CLOSED:
		if (oldstate == DCCP_CLOSING || oldstate == DCCP_OPEN)
			DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);

		sk->sk_prot->unhash(sk);
		/* release the port unless the user locked the binding */
		if (inet_csk(sk)->icsk_bind_hash != NULL &&
		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			inet_put_port(&dccp_hashinfo, sk);
		/* fall through */
	default:
		if (oldstate == DCCP_OPEN)
			DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
	}

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
	sk->sk_state = state;
}
177 | |||
/*
 * Move the socket to DCCP_CLOSED, stop its timers and either wake up
 * anyone still holding it or, if it is already orphaned, destroy it.
 */
static inline void dccp_done(struct sock *sk)
{
	dccp_set_state(sk, DCCP_CLOSED);
	dccp_clear_xmit_timers(sk);

	sk->sk_shutdown = SHUTDOWN_MASK;	/* no more sends or receives */

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_state_change(sk);	/* wake up socket waiters */
	else
		inet_csk_destroy_sock(sk);	/* orphan: release it now */
}
190 | |||
/*
 * Initialise a freshly minted request sock from the incoming packet.
 * Note: @dp is currently unused (see the FIXME below).
 */
static inline void dccp_openreq_init(struct request_sock *req,
				     struct dccp_sock *dp,
				     struct sk_buff *skb)
{
	/*
	 * FIXME: fill in the other req fields from the DCCP options
	 * received
	 */
	inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; /* peer's port */
	inet_rsk(req)->acked = 0;
	req->rcv_wnd = 0;
}
203 | |||
204 | extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); | ||
205 | |||
206 | extern struct sock *dccp_create_openreq_child(struct sock *sk, | ||
207 | const struct request_sock *req, | ||
208 | const struct sk_buff *skb); | ||
209 | |||
210 | extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); | ||
211 | |||
212 | extern void dccp_v4_err(struct sk_buff *skb, u32); | ||
213 | |||
214 | extern int dccp_v4_rcv(struct sk_buff *skb); | ||
215 | |||
216 | extern struct sock *dccp_v4_request_recv_sock(struct sock *sk, | ||
217 | struct sk_buff *skb, | ||
218 | struct request_sock *req, | ||
219 | struct dst_entry *dst); | ||
220 | extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, | ||
221 | struct request_sock *req, | ||
222 | struct request_sock **prev); | ||
223 | |||
224 | extern int dccp_child_process(struct sock *parent, struct sock *child, | ||
225 | struct sk_buff *skb); | ||
226 | extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | ||
227 | struct dccp_hdr *dh, unsigned len); | ||
228 | extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, | ||
229 | const struct dccp_hdr *dh, const unsigned len); | ||
230 | |||
231 | extern void dccp_close(struct sock *sk, long timeout); | ||
232 | extern struct sk_buff *dccp_make_response(struct sock *sk, | ||
233 | struct dst_entry *dst, | ||
234 | struct request_sock *req); | ||
235 | extern struct sk_buff *dccp_make_reset(struct sock *sk, | ||
236 | struct dst_entry *dst, | ||
237 | enum dccp_reset_codes code); | ||
238 | |||
239 | extern int dccp_connect(struct sock *sk); | ||
240 | extern int dccp_disconnect(struct sock *sk, int flags); | ||
241 | extern int dccp_getsockopt(struct sock *sk, int level, int optname, | ||
242 | char __user *optval, int __user *optlen); | ||
243 | extern int dccp_setsockopt(struct sock *sk, int level, int optname, | ||
244 | char __user *optval, int optlen); | ||
245 | extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); | ||
246 | extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, | ||
247 | struct msghdr *msg, size_t size); | ||
248 | extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, | ||
249 | struct msghdr *msg, size_t len, int nonblock, | ||
250 | int flags, int *addr_len); | ||
251 | extern void dccp_shutdown(struct sock *sk, int how); | ||
252 | |||
253 | extern int dccp_v4_checksum(const struct sk_buff *skb, | ||
254 | const u32 saddr, const u32 daddr); | ||
255 | |||
256 | extern int dccp_v4_send_reset(struct sock *sk, | ||
257 | enum dccp_reset_codes code); | ||
258 | extern void dccp_send_close(struct sock *sk, const int active); | ||
259 | |||
/* DCCP-private per-packet state, stored in the skb control buffer */
struct dccp_skb_cb {
	__u8 dccpd_type;	/* packet type (DCCP_PKT_*) */
	__u8 dccpd_reset_code;	/* reset code, for DCCP-Reset packets */
	__u8 dccpd_service;	/* service code -- TODO confirm semantics */
	__u8 dccpd_ccval;	/* CCVal field, presumably for the CCIDs -- confirm */
	__u64 dccpd_seq;	/* packet sequence number */
	__u64 dccpd_ack_seq;	/* acknowledgement number carried, if any */
	int dccpd_opt_len;	/* accumulated length of options */
};
269 | |||
270 | #define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) | ||
271 | |||
272 | static inline int dccp_non_data_packet(const struct sk_buff *skb) | ||
273 | { | ||
274 | const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; | ||
275 | |||
276 | return type == DCCP_PKT_ACK || | ||
277 | type == DCCP_PKT_CLOSE || | ||
278 | type == DCCP_PKT_CLOSEREQ || | ||
279 | type == DCCP_PKT_RESET || | ||
280 | type == DCCP_PKT_SYNC || | ||
281 | type == DCCP_PKT_SYNCACK; | ||
282 | } | ||
283 | |||
284 | static inline int dccp_packet_without_ack(const struct sk_buff *skb) | ||
285 | { | ||
286 | const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; | ||
287 | |||
288 | return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; | ||
289 | } | ||
290 | |||
291 | #define DCCP_MAX_SEQNO ((((u64)1) << 48) - 1) | ||
292 | #define DCCP_PKT_WITHOUT_ACK_SEQ (DCCP_MAX_SEQNO << 2) | ||
293 | |||
294 | static inline void dccp_set_seqno(u64 *seqno, u64 value) | ||
295 | { | ||
296 | if (value > DCCP_MAX_SEQNO) | ||
297 | value -= DCCP_MAX_SEQNO + 1; | ||
298 | *seqno = value; | ||
299 | } | ||
300 | |||
301 | static inline u64 dccp_delta_seqno(u64 seqno1, u64 seqno2) | ||
302 | { | ||
303 | return ((seqno2 << 16) - (seqno1 << 16)) >> 16; | ||
304 | } | ||
305 | |||
306 | static inline void dccp_inc_seqno(u64 *seqno) | ||
307 | { | ||
308 | if (++*seqno > DCCP_MAX_SEQNO) | ||
309 | *seqno = 0; | ||
310 | } | ||
311 | |||
/*
 * Write the 48-bit sequence number @gss into the wire header: the high
 * 16 bits into the base header's bitfield, the low 32 bits into the
 * extension header that immediately follows it.
 */
static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
{
	struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh +
							   sizeof(*dh));

	/* bitfield layout differs by endianness, hence the extra shift */
#if defined(__LITTLE_ENDIAN_BITFIELD)
	dh->dccph_seq = htonl((gss >> 32)) >> 8;
#elif defined(__BIG_ENDIAN_BITFIELD)
	dh->dccph_seq = htonl((gss >> 32));
#else
#error "Adjust your <asm/byteorder.h> defines"
#endif
	dhx->dccph_seq_low = htonl(gss & 0xffffffff);
}
326 | |||
/*
 * Write the 48-bit acknowledgement number @gsr into the ack subheader,
 * split into a 16-bit high part and a 32-bit low part (same endianness
 * handling as dccp_hdr_set_seq()).
 */
static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
				    const u64 gsr)
{
#if defined(__LITTLE_ENDIAN_BITFIELD)
	dhack->dccph_ack_nr_high = htonl((gsr >> 32)) >> 8;
#elif defined(__BIG_ENDIAN_BITFIELD)
	dhack->dccph_ack_nr_high = htonl((gsr >> 32));
#else
#error "Adjust your <asm/byteorder.h> defines"
#endif
	dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff);
}
339 | |||
/*
 * Record @seq as the greatest sequence number received (GSR) and derive
 * the valid-sequence window [swl, swh] around it from the negotiated
 * sequence window option.
 */
static inline void dccp_update_gsr(struct sock *sk, u64 seq)
{
	struct dccp_sock *dp = dccp_sk(sk);

	dp->dccps_gsr = seq;
	/* lower limit: GSR + 1 - W/4 */
	dccp_set_seqno(&dp->dccps_swl,
		       (dp->dccps_gsr + 1 -
		        (dp->dccps_options.dccpo_sequence_window / 4)));
	/* upper limit: GSR + 3*W/4 */
	dccp_set_seqno(&dp->dccps_swh,
		       (dp->dccps_gsr +
		        (3 * dp->dccps_options.dccpo_sequence_window) / 4));
}
352 | |||
/*
 * Record @seq as the greatest sequence number sent (GSS); the ack
 * window high mark tracks GSS, the low mark trails it by the sequence
 * window size.
 */
static inline void dccp_update_gss(struct sock *sk, u64 seq)
{
	struct dccp_sock *dp = dccp_sk(sk);

	dp->dccps_awh = dp->dccps_gss = seq;
	/* lower limit: GSS - W + 1 */
	dccp_set_seqno(&dp->dccps_awl,
		       (dp->dccps_gss -
			dp->dccps_options.dccpo_sequence_window + 1));
}
362 | |||
363 | extern void dccp_insert_options(struct sock *sk, struct sk_buff *skb); | ||
364 | extern void dccp_insert_option_elapsed_time(struct sock *sk, | ||
365 | struct sk_buff *skb, | ||
366 | u32 elapsed_time); | ||
367 | extern void dccp_insert_option_timestamp(struct sock *sk, | ||
368 | struct sk_buff *skb); | ||
369 | extern void dccp_insert_option(struct sock *sk, struct sk_buff *skb, | ||
370 | unsigned char option, | ||
371 | const void *value, unsigned char len); | ||
372 | |||
373 | extern struct socket *dccp_ctl_socket; | ||
374 | |||
375 | #define DCCP_ACKPKTS_STATE_RECEIVED 0 | ||
376 | #define DCCP_ACKPKTS_STATE_ECN_MARKED (1 << 6) | ||
377 | #define DCCP_ACKPKTS_STATE_NOT_RECEIVED (3 << 6) | ||
378 | |||
379 | #define DCCP_ACKPKTS_STATE_MASK 0xC0 /* 11000000 */ | ||
380 | #define DCCP_ACKPKTS_LEN_MASK 0x3F /* 00111111 */ | ||
381 | |||
/** struct dccp_ackpkts - acknowledgeable packets
 *
 * This data structure is the one defined in the DCCP draft
 * Appendix A.
 *
 * @dccpap_buf_head - circular buffer head
 * @dccpap_buf_tail - circular buffer tail
 * @dccpap_buf_ackno - ack # of the most recent packet acknowledgeable in the
 * 		       buffer (i.e. %dccpap_buf_head)
 * @dccpap_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
 * 		       by the buffer with State 0
 *
 * Additionally, the HC-Receiver must keep some information about the
 * Ack Vectors it has recently sent. For each packet sent carrying an
 * Ack Vector, it remembers four variables:
 *
 * @dccpap_ack_seqno - the Sequence Number used for the packet
 * 		       (HC-Receiver seqno)
 * @dccpap_ack_ptr - the value of buf_head at the time of acknowledgement.
 * @dccpap_ack_ackno - the Acknowledgement Number used for the packet
 * 		       (HC-Sender seqno)
 * @dccpap_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
 *
 * @dccpap_buf_vector_len - NOTE(review): looks like the length of the ack
 * 			    vector currently held in the buffer -- confirm
 * @dccpap_ack_vector_len - NOTE(review): looks like the length of the most
 * 			    recently sent ack vector -- confirm
 * @dccpap_buf_len - circular buffer length
 * @dccpap_time - the time in usecs
 * @dccpap_buf - circular buffer of acknowledgeable packets
 */
struct dccp_ackpkts {
	unsigned int		dccpap_buf_head;
	unsigned int		dccpap_buf_tail;
	u64			dccpap_buf_ackno;
	u64			dccpap_ack_seqno;
	u64			dccpap_ack_ackno;
	unsigned int		dccpap_ack_ptr;
	unsigned int		dccpap_buf_vector_len;
	unsigned int		dccpap_ack_vector_len;
	unsigned int		dccpap_buf_len;
	struct timeval		dccpap_time;
	u8			dccpap_buf_nonce;
	u8			dccpap_ack_nonce;
	u8			dccpap_buf[0];	/* flexible trailing storage */
};
424 | |||
425 | extern struct dccp_ackpkts * | ||
426 | dccp_ackpkts_alloc(unsigned int len, | ||
427 | const unsigned int __nocast priority); | ||
428 | extern void dccp_ackpkts_free(struct dccp_ackpkts *ap); | ||
429 | extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state); | ||
430 | extern void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, | ||
431 | struct sock *sk, u64 ackno); | ||
432 | |||
433 | static inline suseconds_t timeval_usecs(const struct timeval *tv) | ||
434 | { | ||
435 | return tv->tv_sec * USEC_PER_SEC + tv->tv_usec; | ||
436 | } | ||
437 | |||
438 | static inline suseconds_t timeval_delta(const struct timeval *large, | ||
439 | const struct timeval *small) | ||
440 | { | ||
441 | time_t secs = large->tv_sec - small->tv_sec; | ||
442 | suseconds_t usecs = large->tv_usec - small->tv_usec; | ||
443 | |||
444 | if (usecs < 0) { | ||
445 | secs--; | ||
446 | usecs += USEC_PER_SEC; | ||
447 | } | ||
448 | return secs * USEC_PER_SEC + usecs; | ||
449 | } | ||
450 | |||
451 | static inline void timeval_add_usecs(struct timeval *tv, | ||
452 | const suseconds_t usecs) | ||
453 | { | ||
454 | tv->tv_usec += usecs; | ||
455 | while (tv->tv_usec >= USEC_PER_SEC) { | ||
456 | tv->tv_sec++; | ||
457 | tv->tv_usec -= USEC_PER_SEC; | ||
458 | } | ||
459 | } | ||
460 | |||
461 | static inline void timeval_sub_usecs(struct timeval *tv, | ||
462 | const suseconds_t usecs) | ||
463 | { | ||
464 | tv->tv_usec -= usecs; | ||
465 | while (tv->tv_usec < 0) { | ||
466 | tv->tv_sec--; | ||
467 | tv->tv_usec += USEC_PER_SEC; | ||
468 | } | ||
469 | } | ||
470 | |||
/*
 * Returns the difference in usecs between timeval
 * passed in and current time
 */
static inline suseconds_t timeval_now_delta(const struct timeval *tv)
{
	struct timeval now;
	do_gettimeofday(&now);	/* kernel wall-clock time */
	return timeval_delta(&now, tv);
}
481 | |||
482 | #ifdef CONFIG_IP_DCCP_DEBUG | ||
483 | extern void dccp_ackvector_print(const u64 ackno, | ||
484 | const unsigned char *vector, int len); | ||
485 | extern void dccp_ackpkts_print(const struct dccp_ackpkts *ap); | ||
486 | #else | ||
/* No-op stubs used when CONFIG_IP_DCCP_DEBUG is not set */
static inline void dccp_ackvector_print(const u64 ackno,
					const unsigned char *vector,
					int len) { }
static inline void dccp_ackpkts_print(const struct dccp_ackpkts *ap) { }
491 | #endif | ||
492 | |||
493 | #endif /* _DCCP_H */ | ||
diff --git a/net/dccp/diag.c b/net/dccp/diag.c new file mode 100644 index 000000000000..f675d8e642d3 --- /dev/null +++ b/net/dccp/diag.c | |||
@@ -0,0 +1,71 @@ | |||
1 | /* | ||
2 | * net/dccp/diag.c | ||
3 | * | ||
4 | * An implementation of the DCCP protocol | ||
5 | * Arnaldo Carvalho de Melo <acme@mandriva.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify it | ||
8 | * under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #include <linux/config.h> | ||
13 | |||
14 | #include <linux/module.h> | ||
15 | #include <linux/inet_diag.h> | ||
16 | |||
17 | #include "ccid.h" | ||
18 | #include "dccp.h" | ||
19 | |||
20 | static void dccp_get_info(struct sock *sk, struct tcp_info *info) | ||
21 | { | ||
22 | struct dccp_sock *dp = dccp_sk(sk); | ||
23 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
24 | |||
25 | memset(info, 0, sizeof(*info)); | ||
26 | |||
27 | info->tcpi_state = sk->sk_state; | ||
28 | info->tcpi_retransmits = icsk->icsk_retransmits; | ||
29 | info->tcpi_probes = icsk->icsk_probes_out; | ||
30 | info->tcpi_backoff = icsk->icsk_backoff; | ||
31 | info->tcpi_pmtu = dp->dccps_pmtu_cookie; | ||
32 | |||
33 | if (dp->dccps_options.dccpo_send_ack_vector) | ||
34 | info->tcpi_options |= TCPI_OPT_SACK; | ||
35 | |||
36 | ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); | ||
37 | ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info); | ||
38 | } | ||
39 | |||
40 | static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, | ||
41 | void *_info) | ||
42 | { | ||
43 | r->idiag_rqueue = r->idiag_wqueue = 0; | ||
44 | |||
45 | if (_info != NULL) | ||
46 | dccp_get_info(sk, _info); | ||
47 | } | ||
48 | |||
/* Hooks DCCP sockets into the generic inet_diag socket-dump machinery. */
static struct inet_diag_handler dccp_diag_handler = {
	.idiag_hashinfo	 = &dccp_hashinfo,
	.idiag_get_info	 = dccp_diag_get_info,
	.idiag_type	 = DCCPDIAG_GETSOCK,
	.idiag_info_size = sizeof(struct tcp_info),
};
55 | |||
/* Module init: register the DCCP handler with the inet_diag core. */
static int __init dccp_diag_init(void)
{
	return inet_diag_register(&dccp_diag_handler);
}
60 | |||
/* Module exit: unregister the DCCP inet_diag handler. */
static void __exit dccp_diag_fini(void)
{
	inet_diag_unregister(&dccp_diag_handler);
}
65 | |||
/* Standard module plumbing and metadata. */
module_init(dccp_diag_init);
module_exit(dccp_diag_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
MODULE_DESCRIPTION("DCCP inet_diag handler");
diff --git a/net/dccp/input.c b/net/dccp/input.c new file mode 100644 index 000000000000..ef29cef1dafe --- /dev/null +++ b/net/dccp/input.c | |||
@@ -0,0 +1,600 @@ | |||
1 | /* | ||
2 | * net/dccp/input.c | ||
3 | * | ||
4 | * An implementation of the DCCP protocol | ||
5 | * Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | */ | ||
12 | |||
13 | #include <linux/config.h> | ||
14 | #include <linux/dccp.h> | ||
15 | #include <linux/skbuff.h> | ||
16 | |||
17 | #include <net/sock.h> | ||
18 | |||
19 | #include "ccid.h" | ||
20 | #include "dccp.h" | ||
21 | |||
/*
 * dccp_fin - queue @skb as the DCCP equivalent of a TCP FIN
 *
 * Marks the socket shut down for receive and DONE, strips the DCCP
 * header (dccph_doff is in 32-bit words, hence * 4), appends the packet
 * to the receive queue and wakes the reader so dccp_recvmsg can return.
 */
static void dccp_fin(struct sock *sk, struct sk_buff *skb)
{
	sk->sk_shutdown |= RCV_SHUTDOWN;
	sock_set_flag(sk, SOCK_DONE);
	__skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	skb_set_owner_r(skb, sk);
	sk->sk_data_ready(sk, 0);
}
31 | |||
/*
 * dccp_rcv_close - handle a received DCCP-Close packet
 *
 * Answers with Reset(Closed), queues the packet as a fin so readers see
 * EOF, moves the socket to CLOSED and signals HUP to async waiters.
 */
static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
{
	dccp_v4_send_reset(sk, DCCP_RESET_CODE_CLOSED);
	dccp_fin(sk, skb);
	dccp_set_state(sk, DCCP_CLOSED);
	sk_wake_async(sk, 1, POLL_HUP);
}
39 | |||
40 | static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) | ||
41 | { | ||
42 | /* | ||
43 | * Step 7: Check for unexpected packet types | ||
44 | * If (S.is_server and P.type == CloseReq) | ||
45 | * Send Sync packet acknowledging P.seqno | ||
46 | * Drop packet and return | ||
47 | */ | ||
48 | if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { | ||
49 | dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); | ||
50 | return; | ||
51 | } | ||
52 | |||
53 | dccp_set_state(sk, DCCP_CLOSING); | ||
54 | dccp_send_close(sk, 0); | ||
55 | } | ||
56 | |||
57 | static inline void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) | ||
58 | { | ||
59 | struct dccp_sock *dp = dccp_sk(sk); | ||
60 | |||
61 | if (dp->dccps_options.dccpo_send_ack_vector) | ||
62 | dccp_ackpkts_check_rcv_ackno(dp->dccps_hc_rx_ackpkts, sk, | ||
63 | DCCP_SKB_CB(skb)->dccpd_ack_seq); | ||
64 | } | ||
65 | |||
66 | static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) | ||
67 | { | ||
68 | const struct dccp_hdr *dh = dccp_hdr(skb); | ||
69 | struct dccp_sock *dp = dccp_sk(sk); | ||
70 | u64 lswl, lawl; | ||
71 | |||
72 | /* | ||
73 | * Step 5: Prepare sequence numbers for Sync | ||
74 | * If P.type == Sync or P.type == SyncAck, | ||
75 | * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL, | ||
76 | * / * P is valid, so update sequence number variables | ||
77 | * accordingly. After this update, P will pass the tests | ||
78 | * in Step 6. A SyncAck is generated if necessary in | ||
79 | * Step 15 * / | ||
80 | * Update S.GSR, S.SWL, S.SWH | ||
81 | * Otherwise, | ||
82 | * Drop packet and return | ||
83 | */ | ||
84 | if (dh->dccph_type == DCCP_PKT_SYNC || | ||
85 | dh->dccph_type == DCCP_PKT_SYNCACK) { | ||
86 | if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, | ||
87 | dp->dccps_awl, dp->dccps_awh) && | ||
88 | !before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swl)) | ||
89 | dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); | ||
90 | else | ||
91 | return -1; | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * Step 6: Check sequence numbers | ||
96 | * Let LSWL = S.SWL and LAWL = S.AWL | ||
97 | * If P.type == CloseReq or P.type == Close or P.type == Reset, | ||
98 | * LSWL := S.GSR + 1, LAWL := S.GAR | ||
99 | * If LSWL <= P.seqno <= S.SWH | ||
100 | * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH), | ||
101 | * Update S.GSR, S.SWL, S.SWH | ||
102 | * If P.type != Sync, | ||
103 | * Update S.GAR | ||
104 | * Otherwise, | ||
105 | * Send Sync packet acknowledging P.seqno | ||
106 | * Drop packet and return | ||
107 | */ | ||
108 | lswl = dp->dccps_swl; | ||
109 | lawl = dp->dccps_awl; | ||
110 | |||
111 | if (dh->dccph_type == DCCP_PKT_CLOSEREQ || | ||
112 | dh->dccph_type == DCCP_PKT_CLOSE || | ||
113 | dh->dccph_type == DCCP_PKT_RESET) { | ||
114 | lswl = dp->dccps_gsr; | ||
115 | dccp_inc_seqno(&lswl); | ||
116 | lawl = dp->dccps_gar; | ||
117 | } | ||
118 | |||
119 | if (between48(DCCP_SKB_CB(skb)->dccpd_seq, lswl, dp->dccps_swh) && | ||
120 | (DCCP_SKB_CB(skb)->dccpd_ack_seq == DCCP_PKT_WITHOUT_ACK_SEQ || | ||
121 | between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, | ||
122 | lawl, dp->dccps_awh))) { | ||
123 | dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); | ||
124 | |||
125 | if (dh->dccph_type != DCCP_PKT_SYNC && | ||
126 | (DCCP_SKB_CB(skb)->dccpd_ack_seq != | ||
127 | DCCP_PKT_WITHOUT_ACK_SEQ)) | ||
128 | dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq; | ||
129 | } else { | ||
130 | LIMIT_NETDEBUG(KERN_WARNING "DCCP: Step 6 failed for %s packet, " | ||
131 | "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " | ||
132 | "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " | ||
133 | "sending SYNC...\n", | ||
134 | dccp_packet_name(dh->dccph_type), | ||
135 | (unsigned long long) lswl, | ||
136 | (unsigned long long) | ||
137 | DCCP_SKB_CB(skb)->dccpd_seq, | ||
138 | (unsigned long long) dp->dccps_swh, | ||
139 | (DCCP_SKB_CB(skb)->dccpd_ack_seq == | ||
140 | DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" : "exists", | ||
141 | (unsigned long long) lawl, | ||
142 | (unsigned long long) | ||
143 | DCCP_SKB_CB(skb)->dccpd_ack_seq, | ||
144 | (unsigned long long) dp->dccps_awh); | ||
145 | dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); | ||
146 | return -1; | ||
147 | } | ||
148 | |||
149 | return 0; | ||
150 | } | ||
151 | |||
/*
 * dccp_rcv_established - receive path for a socket in the OPEN state
 * @sk:  the socket
 * @skb: the received packet
 * @dh:  the packet's DCCP header
 * @len: packet length (currently unused in this function)
 *
 * Validates sequence numbers and options, records the packet in the ack
 * vector, hands it to both CCIDs, then dispatches on packet type: data is
 * queued for the reader, teardown packets move the state machine, and
 * unexpected types are answered with a Sync.  Returns 0; the skb is
 * either queued or freed before returning.
 *
 * Note the control flow: the Request/Response cases can goto send_sync
 * inside the Response case's if-body — intentional, per Step 7 below.
 */
int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
			 const struct dccp_hdr *dh, const unsigned len)
{
	struct dccp_sock *dp = dccp_sk(sk);

	/* Steps 5/6: drop packets outside the sequence windows. */
	if (dccp_check_seqno(sk, skb))
		goto discard;

	/* Step 8: parse options; malformed options drop the packet. */
	if (dccp_parse_options(sk, skb))
		goto discard;

	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
		dccp_event_ack_recv(sk, skb);

	/*
	 * FIXME: check ECN to see if we should use
	 * DCCP_ACKPKTS_STATE_ECN_MARKED
	 */
	if (dp->dccps_options.dccpo_send_ack_vector) {
		struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;

		if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts,
				     DCCP_SKB_CB(skb)->dccpd_seq,
				     DCCP_ACKPKTS_STATE_RECEIVED)) {
			/* Buffer full: force an ack soon to drain it. */
			LIMIT_NETDEBUG(KERN_WARNING "DCCP: acknowledgeable "
				       "packets buffer full!\n");
			ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
			inet_csk_schedule_ack(sk);
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  TCP_DELACK_MIN,
						  DCCP_RTO_MAX);
			goto discard;
		}

		/*
		 * FIXME: this activation is probably wrong, have to study more
		 * TCP delack machinery and how it fits into DCCP draft, but
		 * for now it kinda "works" 8)
		 */
		if (!inet_csk_ack_scheduled(sk)) {
			inet_csk_schedule_ack(sk);
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5 * HZ,
						  DCCP_RTO_MAX);
		}
	}

	/* Let both half-connection CCIDs see every packet. */
	ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
	ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);

	switch (dccp_hdr(skb)->dccph_type) {
	case DCCP_PKT_DATAACK:
	case DCCP_PKT_DATA:
		/*
		 * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED
		 * option if it is.
		 */
		__skb_pull(skb, dh->dccph_doff * 4);
		__skb_queue_tail(&sk->sk_receive_queue, skb);
		skb_set_owner_r(skb, sk);
		sk->sk_data_ready(sk, 0);
		return 0;
	case DCCP_PKT_ACK:
		goto discard;
	case DCCP_PKT_RESET:
		/*
		 * Step 9: Process Reset
		 *	If P.type == Reset,
		 *		Tear down connection
		 *		S.state := TIMEWAIT
		 *		Set TIMEWAIT timer
		 *		Drop packet and return
		 */
		dccp_fin(sk, skb);
		dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
		return 0;
	case DCCP_PKT_CLOSEREQ:
		dccp_rcv_closereq(sk, skb);
		goto discard;
	case DCCP_PKT_CLOSE:
		dccp_rcv_close(sk, skb);
		return 0;
	case DCCP_PKT_REQUEST:
		/* Step 7
		 *   or (S.is_server and P.type == Response)
		 *   or (S.is_client and P.type == Request)
		 *   or (S.state >= OPEN and P.type == Request
		 *	and P.seqno >= S.OSR)
		 *    or (S.state >= OPEN and P.type == Response
		 *	and P.seqno >= S.OSR)
		 *    or (S.state == RESPOND and P.type == Data),
		 *  Send Sync packet acknowledging P.seqno
		 *  Drop packet and return
		 */
		if (dp->dccps_role != DCCP_ROLE_LISTEN)
			goto send_sync;
		goto check_seq;
	case DCCP_PKT_RESPONSE:
		if (dp->dccps_role != DCCP_ROLE_CLIENT)
			goto send_sync;
check_seq:
		if (!before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_osr)) {
send_sync:
			dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
				       DCCP_PKT_SYNC);
		}
		break;
	case DCCP_PKT_SYNC:
		dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
			       DCCP_PKT_SYNCACK);
		/*
		 * From the draft:
		 *
		 * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets
		 * MAY have non-zero-length application data areas, whose
		 * contents * receivers MUST ignore.
		 */
		goto discard;
	}

	DCCP_INC_STATS_BH(DCCP_MIB_INERRS);
discard:
	__kfree_skb(skb);
	return 0;
}
276 | |||
/*
 * dccp_rcv_request_sent_state_process - client side of the handshake
 *
 * Handles a packet arriving while the socket is in REQUESTING: a valid
 * Response completes the client's side of the three-way handshake and
 * moves the socket to PARTOPEN.
 *
 * Return values: 0 if the skb was consumed, -1 if the caller should fall
 * through to the generic processing (an Ack was sent), 1 if the packet
 * was invalid and the caller must send a Reset.
 */
static int dccp_rcv_request_sent_state_process(struct sock *sk,
					       struct sk_buff *skb,
					       const struct dccp_hdr *dh,
					       const unsigned len)
{
	/*
	 * Step 4: Prepare sequence numbers in REQUEST
	 *     If S.state == REQUEST,
	 *	  If (P.type == Response or P.type == Reset)
	 *		and S.AWL <= P.ackno <= S.AWH,
	 *	     / * Set sequence number variables corresponding to the
	 *		other endpoint, so P will pass the tests in Step 6 * /
	 *	     Set S.GSR, S.ISR, S.SWL, S.SWH
	 *	     / * Response processing continues in Step 10; Reset
	 *		processing continues in Step 9 * /
	 */
	if (dh->dccph_type == DCCP_PKT_RESPONSE) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		struct dccp_sock *dp = dccp_sk(sk);

		/* Stop the REQUEST timer */
		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
		/* The retransmittable Request must still be queued here. */
		BUG_TRAP(sk->sk_send_head != NULL);
		__kfree_skb(sk->sk_send_head);
		sk->sk_send_head = NULL;

		/* The Response must ack something we actually sent. */
		if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
			       dp->dccps_awl, dp->dccps_awh)) {
			dccp_pr_debug("invalid ackno: S.AWL=%llu, "
				      "P.ackno=%llu, S.AWH=%llu \n",
				      (unsigned long long)dp->dccps_awl,
			   (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
				      (unsigned long long)dp->dccps_awh);
			goto out_invalid_packet;
		}

		dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
		dccp_update_gsr(sk, dp->dccps_isr);
		/*
		 * SWL and AWL are initially adjusted so that they are not less than
		 * the initial Sequence Numbers received and sent, respectively:
		 *	SWL := max(GSR + 1 - floor(W/4), ISR),
		 *	AWL := max(GSS - W' + 1, ISS).
		 * These adjustments MUST be applied only at the beginning of the
		 * connection.
		 *
		 * AWL was adjusted in dccp_v4_connect -acme
		 */
		dccp_set_seqno(&dp->dccps_swl,
			       max48(dp->dccps_swl, dp->dccps_isr));

		if (ccid_hc_rx_init(dp->dccps_hc_rx_ccid, sk) != 0 ||
		    ccid_hc_tx_init(dp->dccps_hc_tx_ccid, sk) != 0) {
			ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
			ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
			/* FIXME: send appropriate RESET code */
			goto out_invalid_packet;
		}

		dccp_sync_mss(sk, dp->dccps_pmtu_cookie);

		/*
		 *    Step 10: Process REQUEST state (second part)
		 *       If S.state == REQUEST,
		 *	  / * If we get here, P is a valid Response from the
		 *	      server (see Step 4), and we should move to
		 *	      PARTOPEN state.  PARTOPEN means send an Ack,
		 *	      don't send Data packets, retransmit Acks
		 *	      periodically, and always include any Init Cookie
		 *	      from the Response * /
		 *	  S.state := PARTOPEN
		 *	  Set PARTOPEN timer
		 * 	  Continue with S.state == PARTOPEN
		 *	  / * Step 12 will send the Ack completing the
		 *	      three-way handshake * /
		 */
		dccp_set_state(sk, DCCP_PARTOPEN);

		/* Make sure socket is routed, for correct metrics. */
		inet_sk_rebuild_header(sk);

		if (!sock_flag(sk, SOCK_DEAD)) {
			sk->sk_state_change(sk);
			sk_wake_async(sk, 0, POLL_OUT);
		}

		if (sk->sk_write_pending || icsk->icsk_ack.pingpong ||
		    icsk->icsk_accept_queue.rskq_defer_accept) {
			/* Save one ACK. Data will be ready after
			 * several ticks, if write_pending is set.
			 *
			 * It may be deleted, but with this feature tcpdumps
			 * look so _wonderfully_ clever, that I was not able
			 * to stand against the temptation 8)     --ANK
			 */
			/*
			 * OK, in DCCP we can as well do a similar trick, its
			 * even in the draft, but there is no need for us to
			 * schedule an ack here, as dccp_sendmsg does this for
			 * us, also stated in the draft. -acme
			 */
			__kfree_skb(skb);
			return 0;
		}
		dccp_send_ack(sk);
		return -1;
	}

out_invalid_packet:
	return 1; /* dccp_v4_do_rcv will send a reset, but...
		     FIXME: the reset code should be
			    DCCP_RESET_CODE_PACKET_ERROR */
}
390 | |||
/*
 * dccp_rcv_respond_partopen_state_process - finish the handshake
 *
 * Handles packets arriving in RESPOND or PARTOPEN.  An Ack/DataAck
 * completes the handshake and moves the socket to OPEN; a Reset just
 * stops the delayed-ack timer (teardown proper happens in the caller).
 *
 * Returns 1 if the skb was queued (by dccp_rcv_established for a
 * DataAck), 0 otherwise; the caller frees unqueued skbs.
 */
static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
						   struct sk_buff *skb,
						   const struct dccp_hdr *dh,
						   const unsigned len)
{
	int queued = 0;

	switch (dh->dccph_type) {
	case DCCP_PKT_RESET:
		inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
		break;
	case DCCP_PKT_DATAACK:
	case DCCP_PKT_ACK:
		/*
		 * FIXME: we should be reseting the PARTOPEN (DELACK) timer
		 * here but only if we haven't used the DELACK timer for
		 * something else, like sending a delayed ack for a TIMESTAMP
		 * echo, etc, for now were not clearing it, sending an extra
		 * ACK when there is nothing else to do in DELACK is not a big
		 * deal after all.
		 */

		/* Stop the PARTOPEN timer */
		if (sk->sk_state == DCCP_PARTOPEN)
			inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);

		/* OSR: first sequence number seen in the OPEN state. */
		dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
		dccp_set_state(sk, DCCP_OPEN);

		if (dh->dccph_type == DCCP_PKT_DATAACK) {
			dccp_rcv_established(sk, skb, dh, len);
			queued = 1; /* packet was queued
				       (by dccp_rcv_established) */
		}
		break;
	}

	return queued;
}
430 | |||
/*
 * dccp_rcv_state_process - receive path for sockets NOT in the OPEN state
 * @sk:  the socket
 * @skb: the received packet
 * @dh:  the packet's DCCP header
 * @len: packet length
 *
 * Dispatches a packet according to the socket state: LISTEN spawns a new
 * connection on Request, REQUESTING handles the client handshake,
 * RESPOND/PARTOPEN finish it.  Returns 0 if the skb was consumed (queued
 * or freed) and 1 if the caller must send a Reset.
 *
 * Note: the final `if (!queued) { discard: ... }` is also the goto target
 * for every earlier drop — the skb is freed exactly once.
 */
int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
			   struct dccp_hdr *dh, unsigned len)
{
	struct dccp_sock *dp = dccp_sk(sk);
	const int old_state = sk->sk_state;
	int queued = 0;

	/*
	 *  Step 3: Process LISTEN state
	 * 	(Continuing from dccp_v4_do_rcv and dccp_v6_do_rcv)
	 *
	 *     If S.state == LISTEN,
	 *	  If P.type == Request or P contains a valid Init Cookie
	 *	  	option,
	 *	     * Must scan the packet's options to check for an Init
	 *		Cookie.  Only the Init Cookie is processed here,
	 *		however; other options are processed in Step 8.  This
	 *		scan need only be performed if the endpoint uses Init
	 *		Cookies *
	 *	     * Generate a new socket and switch to that socket *
	 *	     Set S := new socket for this port pair
	 *	     S.state = RESPOND
	 *	     Choose S.ISS (initial seqno) or set from Init Cookie
	 *	     Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
	 *	     Continue with S.state == RESPOND
	 *	     * A Response packet will be generated in Step 11 *
	 *	  Otherwise,
	 *	     Generate Reset(No Connection) unless P.type == Reset
	 *	     Drop packet and return
	 *
	 * NOTE: the check for the packet types is done in
	 *	 dccp_rcv_state_process
	 */
	if (sk->sk_state == DCCP_LISTEN) {
		if (dh->dccph_type == DCCP_PKT_REQUEST) {
			if (dccp_v4_conn_request(sk, skb) < 0)
				return 1;

			/* FIXME: do congestion control initialization */
			goto discard;
		}
		if (dh->dccph_type == DCCP_PKT_RESET)
			goto discard;

		/* Caller (dccp_v4_do_rcv) will send Reset(No Connection)*/
		return 1;
	}

	/* REQUESTING defers seqno/option checks to Step 4 (handled below). */
	if (sk->sk_state != DCCP_REQUESTING) {
		if (dccp_check_seqno(sk, skb))
			goto discard;

		/*
		 * Step 8: Process options and mark acknowledgeable
		 */
		if (dccp_parse_options(sk, skb))
			goto discard;

		if (DCCP_SKB_CB(skb)->dccpd_ack_seq !=
		    DCCP_PKT_WITHOUT_ACK_SEQ)
			dccp_event_ack_recv(sk, skb);

		ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
		ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);

		/*
		 * FIXME: check ECN to see if we should use
		 * DCCP_ACKPKTS_STATE_ECN_MARKED
		 */
		if (dp->dccps_options.dccpo_send_ack_vector) {
			if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts,
					     DCCP_SKB_CB(skb)->dccpd_seq,
					     DCCP_ACKPKTS_STATE_RECEIVED))
				goto discard;
			/*
			 * FIXME: this activation is probably wrong, have to
			 * study more TCP delack machinery and how it fits into
			 * DCCP draft, but for now it kinda "works" 8)
			 */
			if ((dp->dccps_hc_rx_ackpkts->dccpap_ack_seqno ==
			     DCCP_MAX_SEQNO + 1) &&
			    !inet_csk_ack_scheduled(sk)) {
				inet_csk_schedule_ack(sk);
				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
							  TCP_DELACK_MIN,
							  DCCP_RTO_MAX);
			}
		}
	}

	/*
	 *  Step 9: Process Reset
	 *	If P.type == Reset,
	 *		Tear down connection
	 *		S.state := TIMEWAIT
	 *		Set TIMEWAIT timer
	 *		Drop packet and return
	*/
	if (dh->dccph_type == DCCP_PKT_RESET) {
		/*
		 * Queue the equivalent of TCP fin so that dccp_recvmsg
		 * exits the loop
		 */
		dccp_fin(sk, skb);
		dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
		return 0;
		/*
		 *   Step 7: Check for unexpected packet types
		 *      If (S.is_server and P.type == CloseReq)
		 *	    or (S.is_server and P.type == Response)
		 *	    or (S.is_client and P.type == Request)
		 *	    or (S.state == RESPOND and P.type == Data),
		 *	  Send Sync packet acknowledging P.seqno
		 *	  Drop packet and return
		 */
	} else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
		    (dh->dccph_type == DCCP_PKT_RESPONSE ||
		     dh->dccph_type == DCCP_PKT_CLOSEREQ)) ||
		    (dp->dccps_role == DCCP_ROLE_CLIENT &&
		     dh->dccph_type == DCCP_PKT_REQUEST) ||
		    (sk->sk_state == DCCP_RESPOND &&
		     dh->dccph_type == DCCP_PKT_DATA)) {
		dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
			       DCCP_PKT_SYNC);
		goto discard;
	} else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
		dccp_rcv_closereq(sk, skb);
		goto discard;
	} else if (dh->dccph_type == DCCP_PKT_CLOSE) {
		dccp_rcv_close(sk, skb);
		return 0;
	}

	switch (sk->sk_state) {
	case DCCP_CLOSED:
		return 1;

	case DCCP_REQUESTING:
		/* FIXME: do congestion control initialization */

		queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
		if (queued >= 0)
			return queued;

		__kfree_skb(skb);
		return 0;

	case DCCP_RESPOND:
	case DCCP_PARTOPEN:
		queued = dccp_rcv_respond_partopen_state_process(sk, skb,
								 dh, len);
		break;
	}

	/* Completing the handshake wakes anyone blocked in connect(). */
	if (dh->dccph_type == DCCP_PKT_ACK ||
	    dh->dccph_type == DCCP_PKT_DATAACK) {
		switch (old_state) {
		case DCCP_PARTOPEN:
			sk->sk_state_change(sk);
			sk_wake_async(sk, 0, POLL_OUT);
			break;
		}
	}

	if (!queued) {
discard:
		__kfree_skb(skb);
	}
	return 0;
}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c new file mode 100644 index 000000000000..3fc75dbee4b8 --- /dev/null +++ b/net/dccp/ipv4.c | |||
@@ -0,0 +1,1356 @@ | |||
1 | /* | ||
2 | * net/dccp/ipv4.c | ||
3 | * | ||
4 | * An implementation of the DCCP protocol | ||
5 | * Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | */ | ||
12 | |||
13 | #include <linux/config.h> | ||
14 | #include <linux/dccp.h> | ||
15 | #include <linux/icmp.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/skbuff.h> | ||
18 | #include <linux/random.h> | ||
19 | |||
20 | #include <net/icmp.h> | ||
21 | #include <net/inet_hashtables.h> | ||
22 | #include <net/sock.h> | ||
23 | #include <net/tcp_states.h> | ||
24 | #include <net/xfrm.h> | ||
25 | |||
26 | #include "ccid.h" | ||
27 | #include "dccp.h" | ||
28 | |||
/*
 * The shared DCCP socket hash tables (listening, established, bind).
 * The ehash/bhash arrays themselves are sized and allocated at protocol
 * init time; only the locks and the port rover are initialized here.
 */
struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
	.lhash_lock	= RW_LOCK_UNLOCKED,
	.lhash_users	= ATOMIC_INIT(0),
	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
	.port_rover	= 1024 - 1,
};

/* Exported for the dccp_diag module. */
EXPORT_SYMBOL_GPL(dccp_hashinfo);
38 | |||
/* Bind @sk to local port @snum via the generic inet_csk port allocator. */
static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
{
	return inet_csk_get_port(&dccp_hashinfo, sk, snum);
}
43 | |||
/* Insert @sk into the DCCP socket hash tables. */
static void dccp_v4_hash(struct sock *sk)
{
	inet_hash(&dccp_hashinfo, sk);
}
48 | |||
/* Remove @sk from the DCCP socket hash tables. */
static void dccp_v4_unhash(struct sock *sk)
{
	inet_unhash(&dccp_hashinfo, sk);
}
53 | |||
/*
 * __dccp_v4_check_established - verify a 4-tuple is unique and hash @sk
 * @lport: proposed local port
 * @twp:   if non-NULL, receives any matching TIME-WAIT socket for the
 *         caller to recycle; if NULL, a found TIME-WAIT entry is
 *         descheduled and released here.
 *
 * Called with local bh disabled.  Walks the established-hash chain (and
 * the TIME-WAIT chain that follows it at head + ehash_size) under the
 * bucket lock; on success records num/sport and adds @sk to the chain.
 * Returns 0 on success, -EADDRNOTAVAIL if the 4-tuple is taken.
 */
static int __dccp_v4_check_established(struct sock *sk, const __u16 lport,
				       struct inet_timewait_sock **twp)
{
	struct inet_sock *inet = inet_sk(sk);
	const u32 daddr = inet->rcv_saddr;
	const u32 saddr = inet->daddr;
	const int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
	const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport,
				      dccp_hashinfo.ehash_size);
	struct inet_ehash_bucket *head = &dccp_hashinfo.ehash[hash];
	const struct sock *sk2;
	const struct hlist_node *node;
	struct inet_timewait_sock *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp != NULL) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw != NULL) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, &dccp_death_row);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		inet_twsk_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
115 | |||
/*
 * dccp_v4_hash_connect - bind a port for a connect operation and hash @sk
 *
 * If the socket has no local port yet (snum == 0), walks the ephemeral
 * port range from the shared port rover looking for a bind bucket that
 * either doesn't exist yet or whose 4-tuple check passes, then binds and
 * hashes the socket while still holding the bucket lock.  If a port was
 * already bound, just hashes the socket (falling back to the established
 * check when the bind bucket is shared).  Returns 0 or a -errno.
 *
 * Locking: the success path out of the search loop ("ok:") leaves
 * head->lock held and bhs disabled until after the socket is hashed.
 */
static int dccp_v4_hash_connect(struct sock *sk)
{
	const unsigned short snum = inet_sk(sk)->num;
 	struct inet_bind_hashbucket *head;
 	struct inet_bind_bucket *tb;
	int ret;

 	if (snum == 0) {
 		int rover;
 		int low = sysctl_local_port_range[0];
 		int high = sysctl_local_port_range[1];
 		int remaining = (high - low) + 1;
		struct hlist_node *node;
 		struct inet_timewait_sock *tw = NULL;

 		local_bh_disable();

 		/* TODO. Actually it is not so bad idea to remove
 		 * dccp_hashinfo.portalloc_lock before next submission to
		 * Linus.
 		 * As soon as we touch this place at all it is time to think.
 		 *
		 * Now it protects single _advisory_ variable
		 * dccp_hashinfo.port_rover, hence it is mostly useless.
 		 * Code will work nicely if we just delete it, but
 		 * I am afraid in contented case it will work not better or
 		 * even worse: another cpu just will hit the same bucket
 		 * and spin there.
 		 * So some cpu salt could remove both contention and
 		 * memory pingpong. Any ideas how to do this in a nice way?
 		 */
 		spin_lock(&dccp_hashinfo.portalloc_lock);
 		rover = dccp_hashinfo.port_rover;

		do {
 			rover++;
 			if ((rover < low) || (rover > high))
 				rover = low;
 			head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
						    dccp_hashinfo.bhash_size)];
 			spin_lock(&head->lock);

 			/* Does not bother with rcv_saddr checks,
 			 * because the established check is already
 			 * unique enough.
 			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
 				if (tb->port == rover) {
 					BUG_TRAP(!hlist_empty(&tb->owners));
 					if (tb->fastreuse >= 0)
 						goto next_port;
 					if (!__dccp_v4_check_established(sk,
									 rover,
									 &tw))
 						goto ok;
 					goto next_port;
 				}
 			}

 			tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
						     head, rover);
 			if (tb == NULL) {
 				spin_unlock(&head->lock);
 				break;
 			}
 			tb->fastreuse = -1;
 			goto ok;

 		next_port:
 			spin_unlock(&head->lock);
 		} while (--remaining > 0);
 		dccp_hashinfo.port_rover = rover;
 		spin_unlock(&dccp_hashinfo.portalloc_lock);

 		local_bh_enable();

 		return -EADDRNOTAVAIL;

ok:
 		/* All locks still held and bhs disabled */
 		dccp_hashinfo.port_rover = rover;
 		spin_unlock(&dccp_hashinfo.portalloc_lock);

 		inet_bind_hash(sk, tb, rover);
 		if (sk_unhashed(sk)) {
 			inet_sk(sk)->sport = htons(rover);
 			__inet_hash(&dccp_hashinfo, sk, 0);
 		}
 		spin_unlock(&head->lock);

 		if (tw != NULL) {
 			inet_twsk_deschedule(tw, &dccp_death_row);
 			inet_twsk_put(tw);
 		}

		ret = 0;
		goto out;
 	}

 	head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
						 dccp_hashinfo.bhash_size)];
 	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
		__inet_hash(&dccp_hashinfo, sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __dccp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
235 | |||
/*
 * Connect this socket to a remote DCCP peer (active/client open).
 *
 * Mirrors tcp_v4_connect(): validates the sockaddr, resolves a route
 * (honouring IP source-routing options), binds a local port via
 * dccp_v4_hash_connect(), derives a secure initial sequence number and
 * finally transmits the DCCP-Request through dccp_connect().
 *
 * Returns 0 on success or a negative errno; on failure the socket is
 * moved back to DCCP_CLOSED and the local port, if taken, is released.
 */
static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
			   int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dccp_sock *dp = dccp_sk(sk);
	const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	dp->dccps_role = DCCP_ROLE_CLIENT;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	/* With source routing the first hop comes from the IP options,
	 * not from the final destination address. */
	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt != NULL && inet->opt->srr) {
		if (daddr == 0)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_DCCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	/* DCCP is point-to-point; refuse multicast/broadcast routes. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (inet->opt == NULL || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (inet->saddr == 0)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	/* IP options enlarge every header we emit. */
	dp->dccps_ext_header_len = 0;
	if (inet->opt != NULL)
		dp->dccps_ext_header_len = inet->opt->optlen;
	/*
	 * Socket identity is still unknown (sport may be zero).
	 * However we set state to DCCP_REQUESTING and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	dccp_set_state(sk, DCCP_REQUESTING);
	err = dccp_v4_hash_connect(sk);
	if (err != 0)
		goto failure;

	/* The port may have changed during hash_connect; refresh route. */
	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err != 0)
		goto failure;

	/* OK, now commit destination to socket. */
	sk_setup_caps(sk, &rt->u.dst);

	dp->dccps_gar =
		dp->dccps_iss = secure_dccp_sequence_number(inet->saddr,
							    inet->daddr,
							    inet->sport,
							    usin->sin_port);
	dccp_update_gss(sk, dp->dccps_iss);

	/*
	 * SWL and AWL are initially adjusted so that they are not less than
	 * the initial Sequence Numbers received and sent, respectively:
	 *	SWL := max(GSR + 1 - floor(W/4), ISR),
	 *	AWL := max(GSS - W' + 1, ISS).
	 * These adjustments MUST be applied only at the beginning of the
	 * connection.
	 */
	dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));

	inet->id = dp->dccps_iss ^ jiffies;

	/* dccp_connect() takes over the route reference held in rt. */
	err = dccp_connect(sk);
	rt = NULL;
	if (err != 0)
		goto failure;
out:
	return err;
failure:
	/*
	 * This unhashes the socket and releases the local port, if necessary.
	 */
	dccp_set_state(sk, DCCP_CLOSED);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	goto out;
}
340 | |||
/*
 * This routine does path MTU discovery as defined in RFC 1191.
 * Called on reception of an ICMP_FRAG_NEEDED for this connection;
 * shrinks the cached path MTU and probes upward with a DCCP-Sync.
 */
static inline void dccp_do_pmtu_discovery(struct sock *sk,
					  const struct iphdr *iph,
					  u32 mtu)
{
	struct dst_entry *dst;
	const struct inet_sock *inet = inet_sk(sk);
	const struct dccp_sock *dp = dccp_sk(sk);

	/* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == DCCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	/* Only resync if the cached MSS is stale and the user allows
	 * fragmentation-avoiding PMTU discovery on this socket. */
	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    dp->dccps_pmtu_cookie > mtu) {
		dccp_sync_mss(sk, mtu);

		/*
		 * From: draft-ietf-dccp-spec-11.txt
		 *
		 *	DCCP-Sync packets are the best choice for upward
		 *	probing, since DCCP-Sync probes do not risk application
		 *	data loss.
		 */
		dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
	} /* else let the usual retransmit timer handle it */
}
392 | |||
393 | static void dccp_v4_ctl_send_ack(struct sk_buff *rxskb) | ||
394 | { | ||
395 | int err; | ||
396 | struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; | ||
397 | const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) + | ||
398 | sizeof(struct dccp_hdr_ext) + | ||
399 | sizeof(struct dccp_hdr_ack_bits); | ||
400 | struct sk_buff *skb; | ||
401 | |||
402 | if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL) | ||
403 | return; | ||
404 | |||
405 | skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC); | ||
406 | if (skb == NULL) | ||
407 | return; | ||
408 | |||
409 | /* Reserve space for headers. */ | ||
410 | skb_reserve(skb, MAX_DCCP_HEADER); | ||
411 | |||
412 | skb->dst = dst_clone(rxskb->dst); | ||
413 | |||
414 | skb->h.raw = skb_push(skb, dccp_hdr_ack_len); | ||
415 | dh = dccp_hdr(skb); | ||
416 | memset(dh, 0, dccp_hdr_ack_len); | ||
417 | |||
418 | /* Build DCCP header and checksum it. */ | ||
419 | dh->dccph_type = DCCP_PKT_ACK; | ||
420 | dh->dccph_sport = rxdh->dccph_dport; | ||
421 | dh->dccph_dport = rxdh->dccph_sport; | ||
422 | dh->dccph_doff = dccp_hdr_ack_len / 4; | ||
423 | dh->dccph_x = 1; | ||
424 | |||
425 | dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); | ||
426 | dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), | ||
427 | DCCP_SKB_CB(rxskb)->dccpd_seq); | ||
428 | |||
429 | bh_lock_sock(dccp_ctl_socket->sk); | ||
430 | err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk, | ||
431 | rxskb->nh.iph->daddr, | ||
432 | rxskb->nh.iph->saddr, NULL); | ||
433 | bh_unlock_sock(dccp_ctl_socket->sk); | ||
434 | |||
435 | if (err == NET_XMIT_CN || err == 0) { | ||
436 | DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); | ||
437 | DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); | ||
438 | } | ||
439 | } | ||
440 | |||
/*
 * request_sock_ops hook: acknowledge a packet matching a pending
 * connection request. The request itself is not consulted; the Ack is
 * built entirely from the received skb via dccp_v4_ctl_send_ack().
 */
static void dccp_v4_reqsk_send_ack(struct sk_buff *skb,
				   struct request_sock *req)
{
	dccp_v4_ctl_send_ack(skb);
}
446 | |||
447 | static int dccp_v4_send_response(struct sock *sk, struct request_sock *req, | ||
448 | struct dst_entry *dst) | ||
449 | { | ||
450 | int err = -1; | ||
451 | struct sk_buff *skb; | ||
452 | |||
453 | /* First, grab a route. */ | ||
454 | |||
455 | if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL) | ||
456 | goto out; | ||
457 | |||
458 | skb = dccp_make_response(sk, dst, req); | ||
459 | if (skb != NULL) { | ||
460 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
461 | |||
462 | err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, | ||
463 | ireq->rmt_addr, | ||
464 | ireq->opt); | ||
465 | if (err == NET_XMIT_CN) | ||
466 | err = 0; | ||
467 | } | ||
468 | |||
469 | out: | ||
470 | dst_release(dst); | ||
471 | return err; | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * This routine is called by the ICMP module when it gets some sort of error | ||
476 | * condition. If err < 0 then the socket should be closed and the error | ||
477 | * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code. | ||
478 | * After adjustment header points to the first 8 bytes of the tcp header. We | ||
479 | * need to find the appropriate port. | ||
480 | * | ||
481 | * The locking strategy used here is very "optimistic". When someone else | ||
482 | * accesses the socket the ICMP is just dropped and for some paths there is no | ||
483 | * check at all. A more general error queue to queue errors for later handling | ||
484 | * is probably better. | ||
485 | */ | ||
486 | void dccp_v4_err(struct sk_buff *skb, u32 info) | ||
487 | { | ||
488 | const struct iphdr *iph = (struct iphdr *)skb->data; | ||
489 | const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + | ||
490 | (iph->ihl << 2)); | ||
491 | struct dccp_sock *dp; | ||
492 | struct inet_sock *inet; | ||
493 | const int type = skb->h.icmph->type; | ||
494 | const int code = skb->h.icmph->code; | ||
495 | struct sock *sk; | ||
496 | __u64 seq; | ||
497 | int err; | ||
498 | |||
499 | if (skb->len < (iph->ihl << 2) + 8) { | ||
500 | ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); | ||
501 | return; | ||
502 | } | ||
503 | |||
504 | sk = inet_lookup(&dccp_hashinfo, iph->daddr, dh->dccph_dport, | ||
505 | iph->saddr, dh->dccph_sport, inet_iif(skb)); | ||
506 | if (sk == NULL) { | ||
507 | ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); | ||
508 | return; | ||
509 | } | ||
510 | |||
511 | if (sk->sk_state == DCCP_TIME_WAIT) { | ||
512 | inet_twsk_put((struct inet_timewait_sock *)sk); | ||
513 | return; | ||
514 | } | ||
515 | |||
516 | bh_lock_sock(sk); | ||
517 | /* If too many ICMPs get dropped on busy | ||
518 | * servers this needs to be solved differently. | ||
519 | */ | ||
520 | if (sock_owned_by_user(sk)) | ||
521 | NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); | ||
522 | |||
523 | if (sk->sk_state == DCCP_CLOSED) | ||
524 | goto out; | ||
525 | |||
526 | dp = dccp_sk(sk); | ||
527 | seq = dccp_hdr_seq(skb); | ||
528 | if (sk->sk_state != DCCP_LISTEN && | ||
529 | !between48(seq, dp->dccps_swl, dp->dccps_swh)) { | ||
530 | NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS); | ||
531 | goto out; | ||
532 | } | ||
533 | |||
534 | switch (type) { | ||
535 | case ICMP_SOURCE_QUENCH: | ||
536 | /* Just silently ignore these. */ | ||
537 | goto out; | ||
538 | case ICMP_PARAMETERPROB: | ||
539 | err = EPROTO; | ||
540 | break; | ||
541 | case ICMP_DEST_UNREACH: | ||
542 | if (code > NR_ICMP_UNREACH) | ||
543 | goto out; | ||
544 | |||
545 | if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ | ||
546 | if (!sock_owned_by_user(sk)) | ||
547 | dccp_do_pmtu_discovery(sk, iph, info); | ||
548 | goto out; | ||
549 | } | ||
550 | |||
551 | err = icmp_err_convert[code].errno; | ||
552 | break; | ||
553 | case ICMP_TIME_EXCEEDED: | ||
554 | err = EHOSTUNREACH; | ||
555 | break; | ||
556 | default: | ||
557 | goto out; | ||
558 | } | ||
559 | |||
560 | switch (sk->sk_state) { | ||
561 | struct request_sock *req , **prev; | ||
562 | case DCCP_LISTEN: | ||
563 | if (sock_owned_by_user(sk)) | ||
564 | goto out; | ||
565 | req = inet_csk_search_req(sk, &prev, dh->dccph_dport, | ||
566 | iph->daddr, iph->saddr); | ||
567 | if (!req) | ||
568 | goto out; | ||
569 | |||
570 | /* | ||
571 | * ICMPs are not backlogged, hence we cannot get an established | ||
572 | * socket here. | ||
573 | */ | ||
574 | BUG_TRAP(!req->sk); | ||
575 | |||
576 | if (seq != dccp_rsk(req)->dreq_iss) { | ||
577 | NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); | ||
578 | goto out; | ||
579 | } | ||
580 | /* | ||
581 | * Still in RESPOND, just remove it silently. | ||
582 | * There is no good way to pass the error to the newly | ||
583 | * created socket, and POSIX does not want network | ||
584 | * errors returned from accept(). | ||
585 | */ | ||
586 | inet_csk_reqsk_queue_drop(sk, req, prev); | ||
587 | goto out; | ||
588 | |||
589 | case DCCP_REQUESTING: | ||
590 | case DCCP_RESPOND: | ||
591 | if (!sock_owned_by_user(sk)) { | ||
592 | DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); | ||
593 | sk->sk_err = err; | ||
594 | |||
595 | sk->sk_error_report(sk); | ||
596 | |||
597 | dccp_done(sk); | ||
598 | } else | ||
599 | sk->sk_err_soft = err; | ||
600 | goto out; | ||
601 | } | ||
602 | |||
603 | /* If we've already connected we will keep trying | ||
604 | * until we time out, or the user gives up. | ||
605 | * | ||
606 | * rfc1122 4.2.3.9 allows to consider as hard errors | ||
607 | * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, | ||
608 | * but it is obsoleted by pmtu discovery). | ||
609 | * | ||
610 | * Note, that in modern internet, where routing is unreliable | ||
611 | * and in each dark corner broken firewalls sit, sending random | ||
612 | * errors ordered by their masters even this two messages finally lose | ||
613 | * their original sense (even Linux sends invalid PORT_UNREACHs) | ||
614 | * | ||
615 | * Now we are in compliance with RFCs. | ||
616 | * --ANK (980905) | ||
617 | */ | ||
618 | |||
619 | inet = inet_sk(sk); | ||
620 | if (!sock_owned_by_user(sk) && inet->recverr) { | ||
621 | sk->sk_err = err; | ||
622 | sk->sk_error_report(sk); | ||
623 | } else /* Only an error on timeout */ | ||
624 | sk->sk_err_soft = err; | ||
625 | out: | ||
626 | bh_unlock_sock(sk); | ||
627 | sock_put(sk); | ||
628 | } | ||
629 | |||
/*
 * Send a DCCP-Reset with the given reset code on an established socket
 * and shut down the CCIDs' half-connections. Returns 0 on success (or
 * congestion-notified transmit), otherwise a negative errno from header
 * rebuilding or transmission.
 */
int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code)
{
	struct sk_buff *skb;
	/*
	 * FIXME: what if rebuild_header fails?
	 * Should we be doing a rebuild_header here?
	 */
	int err = inet_sk_rebuild_header(sk);

	if (err != 0)
		return err;

	skb = dccp_make_reset(sk, sk->sk_dst_cache, code);
	if (skb != NULL) {
		const struct dccp_sock *dp = dccp_sk(sk);
		const struct inet_sock *inet = inet_sk(sk);

		err = ip_build_and_send_pkt(skb, sk,
					    inet->saddr, inet->daddr, NULL);
		/* Congestion notification is not a transmit failure. */
		if (err == NET_XMIT_CN)
			err = 0;

		/* The connection is going away: tear down both CCIDs. */
		ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
		ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
	}

	return err;
}
658 | |||
659 | static inline u64 dccp_v4_init_sequence(const struct sock *sk, | ||
660 | const struct sk_buff *skb) | ||
661 | { | ||
662 | return secure_dccp_sequence_number(skb->nh.iph->daddr, | ||
663 | skb->nh.iph->saddr, | ||
664 | dccp_hdr(skb)->dccph_dport, | ||
665 | dccp_hdr(skb)->dccph_sport); | ||
666 | } | ||
667 | |||
/*
 * Handle an incoming DCCP-Request on a listening socket: allocate and
 * initialize a request_sock, send the DCCP-Response and queue the
 * request until the handshake-completing Ack arrives.
 *
 * Returns 0 on success, -1 when the request is dropped (queues full,
 * allocation failure, broadcast/multicast destination, or the Response
 * could not be sent).
 */
int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct dccp_sock dp;	/* scratch sock for dccp_openreq_init() */
	struct request_sock *req;
	struct dccp_request_sock *dreq;
	const __u32 saddr = skb->nh.iph->saddr;
	const __u32 daddr = skb->nh.iph->daddr;
	struct dst_entry *dst = NULL;

	/* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/*
	 * TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk))
		goto drop;

	/*
	 * Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = reqsk_alloc(sk->sk_prot->rsk_prot);
	if (req == NULL)
		goto drop;

	/* FIXME: process options */

	dccp_openreq_init(req, &dp, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	/* FIXME: Merge Aristeu's option parsing code when ready */
	req->rcv_wnd = 100; /* Fake, option parsing will get the
			       right value */
	ireq->opt = NULL;

	/*
	 * Step 3: Process LISTEN state
	 *
	 *	Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
	 *
	 * In fact we defer setting S.GSR, S.SWL, S.SWH to
	 * dccp_create_openreq_child.
	 */
	dreq = dccp_rsk(req);
	dreq->dreq_isr = DCCP_SKB_CB(skb)->dccpd_seq;
	dreq->dreq_iss = dccp_v4_init_sequence(sk, skb);
	dreq->dreq_service = dccp_hdr_request(skb)->dccph_req_service;

	if (dccp_v4_send_response(sk, req, dst))
		goto drop_and_free;

	/* Park the request until the final Ack (or timeout). */
	inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
	return 0;

drop_and_free:
	/*
	 * FIXME: should be reqsk_free after implementing req->rsk_ops
	 */
	__reqsk_free(req);
drop:
	DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
	return -1;
}
744 | |||
/*
 * The three way handshake has completed - we got a valid ACK or DATAACK -
 * now create the new socket.
 *
 * This is the equivalent of TCP's tcp_v4_syn_recv_sock
 *
 * Returns the new child socket (hashed and with its port inherited), or
 * NULL if the accept queue is full, no route could be found, or the
 * child allocation failed.
 */
struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
				       struct request_sock *req,
				       struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct dccp_sock *newdp;
	struct sock *newsk;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = dccp_create_openreq_child(sk, req, skb);
	if (newsk == NULL)
		goto exit;

	sk_setup_caps(newsk, dst);

	/* Copy the connection identity from the request into the child. */
	newdp = dccp_sk(newsk);
	newinet = inet_sk(newsk);
	newinet->daddr = ireq = inet_rsk(req), ireq->rmt_addr;
	newinet->rcv_saddr = ireq->loc_addr;
	newinet->saddr = ireq->loc_addr;
	newinet->opt = ireq->opt;
	ireq->opt = NULL;	/* IP options ownership moved to the child */
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = skb->nh.iph->ttl;
	newinet->id = jiffies;

	dccp_sync_mss(newsk, dst_mtu(dst));

	/* Make the child visible: established hash + bound port. */
	__inet_hash(&dccp_hashinfo, newsk, 0);
	__inet_inherit_port(&dccp_hashinfo, sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}
798 | |||
/*
 * Demultiplex a packet arriving on a LISTEN socket: if it matches a
 * pending connection request, run it through dccp_check_req(); if it
 * matches an established socket, hand that back (locked) instead.
 *
 * Returns the socket the packet should be processed on, NULL to
 * discard, or the listening socket itself when no better match exists.
 */
static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	const struct dccp_hdr *dh = dccp_hdr(skb);
	const struct iphdr *iph = skb->nh.iph;
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev,
						       dh->dccph_sport,
						       iph->saddr, iph->daddr);
	if (req != NULL)
		return dccp_check_req(sk, skb, req, prev);

	nsk = __inet_lookup_established(&dccp_hashinfo,
					iph->saddr, dh->dccph_sport,
					iph->daddr, ntohs(dh->dccph_dport),
					inet_iif(skb));
	if (nsk != NULL) {
		if (nsk->sk_state != DCCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		/* Timewait sockets carry their own refcounting scheme. */
		inet_twsk_put((struct inet_timewait_sock *)nsk);
		return NULL;
	}

	return sk;
}
827 | |||
828 | int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr, | ||
829 | const u32 daddr) | ||
830 | { | ||
831 | const struct dccp_hdr* dh = dccp_hdr(skb); | ||
832 | int checksum_len; | ||
833 | u32 tmp; | ||
834 | |||
835 | if (dh->dccph_cscov == 0) | ||
836 | checksum_len = skb->len; | ||
837 | else { | ||
838 | checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32); | ||
839 | checksum_len = checksum_len < skb->len ? checksum_len : | ||
840 | skb->len; | ||
841 | } | ||
842 | |||
843 | tmp = csum_partial((unsigned char *)dh, checksum_len, 0); | ||
844 | return csum_tcpudp_magic(saddr, daddr, checksum_len, | ||
845 | IPPROTO_DCCP, tmp); | ||
846 | } | ||
847 | |||
848 | static int dccp_v4_verify_checksum(struct sk_buff *skb, | ||
849 | const u32 saddr, const u32 daddr) | ||
850 | { | ||
851 | struct dccp_hdr *dh = dccp_hdr(skb); | ||
852 | int checksum_len; | ||
853 | u32 tmp; | ||
854 | |||
855 | if (dh->dccph_cscov == 0) | ||
856 | checksum_len = skb->len; | ||
857 | else { | ||
858 | checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32); | ||
859 | checksum_len = checksum_len < skb->len ? checksum_len : | ||
860 | skb->len; | ||
861 | } | ||
862 | tmp = csum_partial((unsigned char *)dh, checksum_len, 0); | ||
863 | return csum_tcpudp_magic(saddr, daddr, checksum_len, | ||
864 | IPPROTO_DCCP, tmp) == 0 ? 0 : -1; | ||
865 | } | ||
866 | |||
867 | static struct dst_entry* dccp_v4_route_skb(struct sock *sk, | ||
868 | struct sk_buff *skb) | ||
869 | { | ||
870 | struct rtable *rt; | ||
871 | struct flowi fl = { .oif = ((struct rtable *)skb->dst)->rt_iif, | ||
872 | .nl_u = { .ip4_u = | ||
873 | { .daddr = skb->nh.iph->saddr, | ||
874 | .saddr = skb->nh.iph->daddr, | ||
875 | .tos = RT_CONN_FLAGS(sk) } }, | ||
876 | .proto = sk->sk_protocol, | ||
877 | .uli_u = { .ports = | ||
878 | { .sport = dccp_hdr(skb)->dccph_dport, | ||
879 | .dport = dccp_hdr(skb)->dccph_sport } | ||
880 | } | ||
881 | }; | ||
882 | |||
883 | if (ip_route_output_flow(&rt, &fl, sk, 0)) { | ||
884 | IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); | ||
885 | return NULL; | ||
886 | } | ||
887 | |||
888 | return &rt->u.dst; | ||
889 | } | ||
890 | |||
/*
 * Send a DCCP-Reset in reply to a packet that has no owning socket,
 * using the shared dccp_ctl_socket. Ports and sequence numbers are
 * mirrored from the received packet; see "8.3.1. Abnormal Termination"
 * in draft-ietf-dccp-spec-11 for the sequence number rules.
 */
static void dccp_v4_ctl_send_reset(struct sk_buff *rxskb)
{
	int err;
	struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
	const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
				       sizeof(struct dccp_hdr_ext) +
				       sizeof(struct dccp_hdr_reset);
	struct sk_buff *skb;
	struct dst_entry *dst;
	u64 seqno;

	/* Never send a reset in response to a reset. */
	if (rxdh->dccph_type == DCCP_PKT_RESET)
		return;

	/* Only reply to packets that were actually addressed to us. */
	if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
		return;

	dst = dccp_v4_route_skb(dccp_ctl_socket->sk, rxskb);
	if (dst == NULL)
		return;

	skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC);
	if (skb == NULL)
		goto out;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_DCCP_HEADER);
	skb->dst = dst_clone(dst);

	skb->h.raw = skb_push(skb, dccp_hdr_reset_len);
	dh = dccp_hdr(skb);
	memset(dh, 0, dccp_hdr_reset_len);

	/* Build DCCP header and checksum it. */
	dh->dccph_type = DCCP_PKT_RESET;
	dh->dccph_sport = rxdh->dccph_dport;
	dh->dccph_dport = rxdh->dccph_sport;
	dh->dccph_doff = dccp_hdr_reset_len / 4;
	dh->dccph_x = 1;
	dccp_hdr_reset(skb)->dccph_reset_code =
		DCCP_SKB_CB(rxskb)->dccpd_reset_code;

	/* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */
	seqno = 0;
	if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
		dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);

	dccp_hdr_set_seq(dh, seqno);
	dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
			 DCCP_SKB_CB(rxskb)->dccpd_seq);

	dh->dccph_checksum = dccp_v4_checksum(skb, rxskb->nh.iph->saddr,
					      rxskb->nh.iph->daddr);

	bh_lock_sock(dccp_ctl_socket->sk);
	err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk,
				    rxskb->nh.iph->daddr,
				    rxskb->nh.iph->saddr, NULL);
	bh_unlock_sock(dccp_ctl_socket->sk);

	if (err == NET_XMIT_CN || err == 0) {
		DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
		DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
	}
out:
	 dst_release(dst);
}
959 | |||
/*
 * Main receive entry for a packet whose owning socket has been found
 * (and locked by the caller): fast path for DCCP_OPEN, request
 * demultiplexing for DCCP_LISTEN, generic state machine otherwise.
 * On protocol errors a Reset(No Connection) is sent and the skb freed.
 *
 * Always returns 0; the packet is consumed one way or another.
 */
int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct dccp_hdr *dh = dccp_hdr(skb);

	if (sk->sk_state == DCCP_OPEN) { /* Fast path */
		if (dccp_rcv_established(sk, skb, dh, skb->len))
			goto reset;
		return 0;
	}

	/*
	 *  Step 3: Process LISTEN state
	 *     If S.state == LISTEN,
	 *	  If P.type == Request or P contains a valid Init Cookie
	 *	  	option,
	 *	     * Must scan the packet's options to check for an Init
	 *		Cookie.  Only the Init Cookie is processed here,
	 *		however; other options are processed in Step 8.  This
	 *		scan need only be performed if the endpoint uses Init
	 *		Cookies *
	 *	     * Generate a new socket and switch to that socket *
	 *	     Set S := new socket for this port pair
	 *	     S.state = RESPOND
	 *	     Choose S.ISS (initial seqno) or set from Init Cookie
	 *	     Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
	 *	     Continue with S.state == RESPOND
	 *	     * A Response packet will be generated in Step 11 *
	 *	  Otherwise,
	 *	     Generate Reset(No Connection) unless P.type == Reset
	 *	     Drop packet and return
	 *
	 * NOTE: the check for the packet types is done in
	 *	 dccp_rcv_state_process
	 */
	if (sk->sk_state == DCCP_LISTEN) {
		struct sock *nsk = dccp_v4_hnd_req(sk, skb);

		if (nsk == NULL)
			goto discard;

		/* Packet belongs to a child socket: process it there. */
		if (nsk != sk) {
			if (dccp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	if (dccp_rcv_state_process(sk, skb, dh, skb->len))
		goto reset;
	return 0;

reset:
	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
	dccp_v4_ctl_send_reset(skb);
discard:
	kfree_skb(skb);
	return 0;
}
1018 | |||
/*
 * Validate the basics of an incoming DCCP packet (Step 1 of the
 * receive processing in the spec): header presence and length, known
 * packet type, plausible Data Offset, X-bit consistency and checksum.
 * Returns 1 if the packet must be dropped, 0 if it may be processed.
 */
static inline int dccp_invalid_packet(struct sk_buff *skb)
{
	const struct dccp_hdr *dh;

	if (skb->pkt_type != PACKET_HOST)
		return 1;

	if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) {
		LIMIT_NETDEBUG(KERN_WARNING "DCCP: pskb_may_pull failed\n");
		return 1;
	}

	dh = dccp_hdr(skb);

	/* If the packet type is not understood, drop packet and return */
	if (dh->dccph_type >= DCCP_PKT_INVALID) {
		LIMIT_NETDEBUG(KERN_WARNING "DCCP: invalid packet type\n");
		return 1;
	}

	/*
	 * If P.Data Offset is too small for packet type, or too large for
	 * packet, drop packet and return
	 */
	if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
		LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) "
					    "too small 1\n",
			       dh->dccph_doff);
		return 1;
	}

	if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) {
		LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) "
					    "too small 2\n",
			       dh->dccph_doff);
		return 1;
	}

	/* pskb_may_pull() may have reallocated the header: reload dh. */
	dh = dccp_hdr(skb);

	/*
	 * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
	 * has short sequence numbers), drop packet and return
	 */
	if (dh->dccph_x == 0 &&
	    dh->dccph_type != DCCP_PKT_DATA &&
	    dh->dccph_type != DCCP_PKT_ACK &&
	    dh->dccph_type != DCCP_PKT_DATAACK) {
		LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.type (%s) not Data, Ack "
					    "nor DataAck and P.X == 0\n",
			       dccp_packet_name(dh->dccph_type));
		return 1;
	}

	/* If the header checksum is incorrect, drop packet and return */
	if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
				    skb->nh.iph->daddr) < 0) {
		LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is "
					    "incorrect\n");
		return 1;
	}

	return 0;
}
1083 | |||
/*
 * dccp_v4_rcv - main IPv4 receive entry point for DCCP, called for every
 * incoming DCCP segment.  Validates the header, looks up the owning socket
 * and either processes the packet directly or queues it on the socket
 * backlog.  Returns 0 when the packet was consumed here.
 */
int dccp_v4_rcv(struct sk_buff *skb)
{
	const struct dccp_hdr *dh;
	struct sock *sk;
	int rc;

	/* Step 1: Check header basics: */

	if (dccp_invalid_packet(skb))
		goto discard_it;

	dh = dccp_hdr(skb);
#if 0
	/*
	 * Use something like this to simulate some DATA/DATAACK loss to test
	 * dccp_ackpkts_add, you'll get something like this on a session that
	 * sends 10 DATA/DATAACK packets:
	 *
	 * ackpkts_print: 281473596467422 |0,0|3,0|0,0|3,0|0,0|3,0|0,0|3,0|0,1|
	 *
	 * 0, 0 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == just this packet
	 * 0, 1 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == two adjacent packets
	 *             with the same state
	 * 3, 0 means: DCCP_ACKPKTS_STATE_NOT_RECEIVED, RLE == just this packet
	 *
	 * So...
	 *
	 * 281473596467422 was received
	 * 281473596467421 was not received
	 * 281473596467420 was received
	 * 281473596467419 was not received
	 * 281473596467418 was received
	 * 281473596467417 was not received
	 * 281473596467416 was received
	 * 281473596467415 was not received
	 * 281473596467414 was received
	 * 281473596467413 was received (this one was the 3way handshake
	 *                 RESPONSE)
	 *
	 */
	if (dh->dccph_type == DCCP_PKT_DATA ||
	    dh->dccph_type == DCCP_PKT_DATAACK) {
		static int discard = 0;

		if (discard) {
			discard = 0;
			goto discard_it;
		}
		discard = 1;
	}
#endif
	/* Cache seq/type in the skb control block for the rest of the path. */
	DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
	DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;

	dccp_pr_debug("%8.8s "
		      "src=%u.%u.%u.%u@%-5d "
		      "dst=%u.%u.%u.%u@%-5d seq=%llu",
		      dccp_packet_name(dh->dccph_type),
		      NIPQUAD(skb->nh.iph->saddr), ntohs(dh->dccph_sport),
		      NIPQUAD(skb->nh.iph->daddr), ntohs(dh->dccph_dport),
		      (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);

	/* Packet types without an Acknowledgement Number get a sentinel. */
	if (dccp_packet_without_ack(skb)) {
		DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
		dccp_pr_debug_cat("\n");
	} else {
		DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
		dccp_pr_debug_cat(", ack=%llu\n",
				  (unsigned long long)
				  DCCP_SKB_CB(skb)->dccpd_ack_seq);
	}

	/* Step 2:
	 *	Look up flow ID in table and get corresponding socket */
	sk = __inet_lookup(&dccp_hashinfo,
			   skb->nh.iph->saddr, dh->dccph_sport,
			   skb->nh.iph->daddr, ntohs(dh->dccph_dport),
			   inet_iif(skb));

	/*
	 * Step 2:
	 * 	If no socket ...
	 *		Generate Reset(No Connection) unless P.type == Reset
	 *		Drop packet and return
	 */
	if (sk == NULL) {
		dccp_pr_debug("failed to look up flow ID in table and "
			      "get corresponding socket\n");
		goto no_dccp_socket;
	}

	/*
	 * Step 2:
	 * 	... or S.state == TIMEWAIT,
	 *		Generate Reset(No Connection) unless P.type == Reset
	 *		Drop packet and return
	 */

	if (sk->sk_state == DCCP_TIME_WAIT) {
		dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: "
			      "do_time_wait\n");
		goto do_time_wait;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		dccp_pr_debug("xfrm4_policy_check failed\n");
		goto discard_and_relse;
	}

	/* Apply any attached socket filter (BPF) before queueing. */
	if (sk_filter(sk, skb, 0)) {
		dccp_pr_debug("sk_filter failed\n");
		goto discard_and_relse;
	}

	skb->dev = NULL;

	/*
	 * If the socket is owned by a user context, defer processing via
	 * the backlog; otherwise handle the packet right here in softirq.
	 */
	bh_lock_sock(sk);
	rc = 0;
	if (!sock_owned_by_user(sk))
		rc = dccp_v4_do_rcv(sk, skb);
	else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);
	return rc;

no_dccp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;
	/*
	 * Step 2:
	 *	Generate Reset(No Connection) unless P.type == Reset
	 *	Drop packet and return
	 */
	if (dh->dccph_type != DCCP_PKT_RESET) {
		DCCP_SKB_CB(skb)->dccpd_reset_code =
					DCCP_RESET_CODE_NO_CONNECTION;
		dccp_v4_ctl_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	/* Drop the timewait reference, then fall through to the reset path. */
	inet_twsk_put((struct inet_timewait_sock *)sk);
	goto no_dccp_socket;
}
1239 | |||
/*
 * dccp_v4_init_sock - per-socket initialization, called from sk->sk_prot->init
 * when a DCCP socket is created.  Sets up default options, the optional ack
 * vector buffer, the CCID control blocks and the transmit timers.
 * Returns 0 on success or -ENOMEM.
 */
static int dccp_v4_init_sock(struct sock *sk)
{
	struct dccp_sock *dp = dccp_sk(sk);
	/*
	 * The very first socket created is the DCCP control socket; it must
	 * not get CCID blocks (see FIXME below).  NOTE(review): a plain
	 * static flag like this is not SMP-safe — confirm init ordering
	 * guarantees only one "first" caller.
	 */
	static int dccp_ctl_socket_init = 1;

	dccp_options_init(&dp->dccps_options);

	if (dp->dccps_options.dccpo_send_ack_vector) {
		dp->dccps_hc_rx_ackpkts =
			dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN,
					   GFP_KERNEL);

		if (dp->dccps_hc_rx_ackpkts == NULL)
			return -ENOMEM;
	}

	/*
	 * FIXME: We're hardcoding the CCID, and doing this at this point makes
	 * the listening (master) sock get CCID control blocks, which is not
	 * necessary, but for now, to not mess with the test userspace apps,
	 * lets leave it here, later the real solution is to do this in a
	 * setsockopt(CCIDs-I-want/accept). -acme
	 */
	if (likely(!dccp_ctl_socket_init)) {
		dp->dccps_hc_rx_ccid = ccid_init(dp->dccps_options.dccpo_ccid,
						 sk);
		dp->dccps_hc_tx_ccid = ccid_init(dp->dccps_options.dccpo_ccid,
						 sk);
		/* On partial failure, unwind everything allocated above. */
		if (dp->dccps_hc_rx_ccid == NULL ||
		    dp->dccps_hc_tx_ccid == NULL) {
			ccid_exit(dp->dccps_hc_rx_ccid, sk);
			ccid_exit(dp->dccps_hc_tx_ccid, sk);
			dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts);
			dp->dccps_hc_rx_ackpkts = NULL;
			dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
			return -ENOMEM;
		}
	} else
		dccp_ctl_socket_init = 0;

	dccp_init_xmit_timers(sk);
	inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT;
	sk->sk_state = DCCP_CLOSED;
	sk->sk_write_space = dccp_write_space;
	dp->dccps_mss_cache = 536;	/* conservative default MSS */
	dp->dccps_role = DCCP_ROLE_UNDEFINED;

	return 0;
}
1289 | |||
/*
 * dccp_v4_destroy_sock - release all per-socket DCCP state when the socket
 * is destroyed: pending retransmit skb, bind bucket, ack vector buffer and
 * both CCID control blocks.  Always returns 0.
 */
static int dccp_v4_destroy_sock(struct sock *sk)
{
	struct dccp_sock *dp = dccp_sk(sk);

	/*
	 * DCCP doesn't use sk_write_queue, just sk_send_head
	 * for retransmissions
	 */
	if (sk->sk_send_head != NULL) {
		kfree_skb(sk->sk_send_head);
		sk->sk_send_head = NULL;
	}

	/* Clean up a referenced DCCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash != NULL)
		inet_put_port(&dccp_hashinfo, sk);

	/* Tear down the half-connection state before freeing the CCIDs. */
	ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
	ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
	dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts);
	dp->dccps_hc_rx_ackpkts = NULL;
	ccid_exit(dp->dccps_hc_rx_ccid, sk);
	ccid_exit(dp->dccps_hc_tx_ccid, sk);
	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;

	return 0;
}
1317 | |||
/* Free the IP options copied into the request sock at connection-request
 * time (kfree(NULL) is a no-op when none were present). */
static void dccp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}
1322 | |||
/* Operations for embryonic (pre-accept) DCCP/IPv4 connections kept as
 * request_socks on the listening socket's accept queue. */
static struct request_sock_ops dccp_request_sock_ops = {
	.family		= PF_INET,
	.obj_size	= sizeof(struct dccp_request_sock),
	.rtx_syn_ack	= dccp_v4_send_response,
	.send_ack	= dccp_v4_reqsk_send_ack,
	.destructor	= dccp_v4_reqsk_destructor,
	.send_reset	= dccp_v4_ctl_send_reset,
};
1331 | |||
/* DCCP/IPv4 protocol descriptor: glues the generic socket layer entry
 * points (connect, sendmsg, ...) to their DCCP implementations. */
struct proto dccp_v4_prot = {
	.name		= "DCCP",
	.owner		= THIS_MODULE,
	.close		= dccp_close,
	.connect	= dccp_v4_connect,
	.disconnect	= dccp_disconnect,
	.ioctl		= dccp_ioctl,
	.init		= dccp_v4_init_sock,
	.setsockopt	= dccp_setsockopt,
	.getsockopt	= dccp_getsockopt,
	.sendmsg	= dccp_sendmsg,
	.recvmsg	= dccp_recvmsg,
	.backlog_rcv	= dccp_v4_do_rcv,
	.hash		= dccp_v4_hash,
	.unhash		= dccp_v4_unhash,
	.accept		= inet_csk_accept,
	.get_port	= dccp_v4_get_port,
	.shutdown	= dccp_shutdown,
	.destroy	= dccp_v4_destroy_sock,
	.orphan_count	= &dccp_orphan_count,
	.max_header	= MAX_DCCP_HEADER,
	.obj_size	= sizeof(struct dccp_sock),
	.rsk_prot	= &dccp_request_sock_ops,
	.twsk_obj_size	= sizeof(struct inet_timewait_sock),
};
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c new file mode 100644 index 000000000000..ce5dff4ac22e --- /dev/null +++ b/net/dccp/minisocks.c | |||
@@ -0,0 +1,264 @@ | |||
1 | /* | ||
2 | * net/dccp/minisocks.c | ||
3 | * | ||
4 | * An implementation of the DCCP protocol | ||
5 | * Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | */ | ||
12 | |||
13 | #include <linux/config.h> | ||
14 | #include <linux/dccp.h> | ||
15 | #include <linux/skbuff.h> | ||
16 | #include <linux/timer.h> | ||
17 | |||
18 | #include <net/sock.h> | ||
19 | #include <net/xfrm.h> | ||
20 | #include <net/inet_timewait_sock.h> | ||
21 | |||
22 | #include "ccid.h" | ||
23 | #include "dccp.h" | ||
24 | |||
/*
 * TIME_WAIT bookkeeping shared with the inet timewait infrastructure:
 * a slow hangman timer for ordinary buckets plus a short-interval
 * "calendar" timer for buckets that must expire quickly.
 */
struct inet_timewait_death_row dccp_death_row = {
	.sysctl_max_tw_buckets = NR_FILE * 2,
	.period		= DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
	.death_lock	= SPIN_LOCK_UNLOCKED,
	.hashinfo	= &dccp_hashinfo,
	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
					    (unsigned long)&dccp_death_row),
	.twkill_work	= __WORK_INITIALIZER(dccp_death_row.twkill_work,
					     inet_twdr_twkill_work,
					     &dccp_death_row),
/* Short-time timewait calendar */

	.twcal_hand	= -1,
	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
					    (unsigned long)&dccp_death_row),
};
41 | |||
/*
 * dccp_time_wait - move @sk into TIME_WAIT substate @state for @timeo jiffies.
 * Allocates a lightweight timewait sock to take over the connection's hash
 * slot; if the timewait table is full the connection is simply closed.
 * Always finishes by calling dccp_done(sk).
 */
void dccp_time_wait(struct sock *sk, int state, int timeo)
{
	struct inet_timewait_sock *tw = NULL;

	if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
		tw = inet_twsk_alloc(sk, state);

	if (tw != NULL) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		/* 3.5 * RTO, computed with shifts (4*RTO - RTO/2). */
		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);

		/* Linkage updates. */
		__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);

		/* Get the TIME_WAIT timeout firing. */
		if (timeo < rto)
			timeo = rto;

		/* A full DCCP_TIME_WAIT overrides the caller's timeout. */
		tw->tw_timeout = DCCP_TIMEWAIT_LEN;
		if (state == DCCP_TIME_WAIT)
			timeo = DCCP_TIMEWAIT_LEN;

		inet_twsk_schedule(tw, &dccp_death_row, timeo,
				   DCCP_TIMEWAIT_LEN);
		inet_twsk_put(tw);
	} else {
		/* Sorry, if we're out of memory, just CLOSE this
		 * socket up. We've got bigger problems than
		 * non-graceful socket closings.
		 */
		LIMIT_NETDEBUG(KERN_INFO "DCCP: time wait bucket "
					 "table overflow\n");
	}

	dccp_done(sk);
}
78 | |||
79 | struct sock *dccp_create_openreq_child(struct sock *sk, | ||
80 | const struct request_sock *req, | ||
81 | const struct sk_buff *skb) | ||
82 | { | ||
83 | /* | ||
84 | * Step 3: Process LISTEN state | ||
85 | * | ||
86 | * // Generate a new socket and switch to that socket | ||
87 | * Set S := new socket for this port pair | ||
88 | */ | ||
89 | struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); | ||
90 | |||
91 | if (newsk != NULL) { | ||
92 | const struct dccp_request_sock *dreq = dccp_rsk(req); | ||
93 | struct inet_connection_sock *newicsk = inet_csk(sk); | ||
94 | struct dccp_sock *newdp = dccp_sk(newsk); | ||
95 | |||
96 | newdp->dccps_hc_rx_ackpkts = NULL; | ||
97 | newdp->dccps_role = DCCP_ROLE_SERVER; | ||
98 | newicsk->icsk_rto = DCCP_TIMEOUT_INIT; | ||
99 | |||
100 | if (newdp->dccps_options.dccpo_send_ack_vector) { | ||
101 | newdp->dccps_hc_rx_ackpkts = | ||
102 | dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN, | ||
103 | GFP_ATOMIC); | ||
104 | /* | ||
105 | * XXX: We're using the same CCIDs set on the parent, | ||
106 | * i.e. sk_clone copied the master sock and left the | ||
107 | * CCID pointers for this child, that is why we do the | ||
108 | * __ccid_get calls. | ||
109 | */ | ||
110 | if (unlikely(newdp->dccps_hc_rx_ackpkts == NULL)) | ||
111 | goto out_free; | ||
112 | } | ||
113 | |||
114 | if (unlikely(ccid_hc_rx_init(newdp->dccps_hc_rx_ccid, | ||
115 | newsk) != 0 || | ||
116 | ccid_hc_tx_init(newdp->dccps_hc_tx_ccid, | ||
117 | newsk) != 0)) { | ||
118 | dccp_ackpkts_free(newdp->dccps_hc_rx_ackpkts); | ||
119 | ccid_hc_rx_exit(newdp->dccps_hc_rx_ccid, newsk); | ||
120 | ccid_hc_tx_exit(newdp->dccps_hc_tx_ccid, newsk); | ||
121 | out_free: | ||
122 | /* It is still raw copy of parent, so invalidate | ||
123 | * destructor and make plain sk_free() */ | ||
124 | newsk->sk_destruct = NULL; | ||
125 | sk_free(newsk); | ||
126 | return NULL; | ||
127 | } | ||
128 | |||
129 | __ccid_get(newdp->dccps_hc_rx_ccid); | ||
130 | __ccid_get(newdp->dccps_hc_tx_ccid); | ||
131 | |||
132 | /* | ||
133 | * Step 3: Process LISTEN state | ||
134 | * | ||
135 | * Choose S.ISS (initial seqno) or set from Init Cookie | ||
136 | * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init | ||
137 | * Cookie | ||
138 | */ | ||
139 | |||
140 | /* See dccp_v4_conn_request */ | ||
141 | newdp->dccps_options.dccpo_sequence_window = req->rcv_wnd; | ||
142 | |||
143 | newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr; | ||
144 | dccp_update_gsr(newsk, dreq->dreq_isr); | ||
145 | |||
146 | newdp->dccps_iss = dreq->dreq_iss; | ||
147 | dccp_update_gss(newsk, dreq->dreq_iss); | ||
148 | |||
149 | /* | ||
150 | * SWL and AWL are initially adjusted so that they are not less than | ||
151 | * the initial Sequence Numbers received and sent, respectively: | ||
152 | * SWL := max(GSR + 1 - floor(W/4), ISR), | ||
153 | * AWL := max(GSS - W' + 1, ISS). | ||
154 | * These adjustments MUST be applied only at the beginning of the | ||
155 | * connection. | ||
156 | */ | ||
157 | dccp_set_seqno(&newdp->dccps_swl, | ||
158 | max48(newdp->dccps_swl, newdp->dccps_isr)); | ||
159 | dccp_set_seqno(&newdp->dccps_awl, | ||
160 | max48(newdp->dccps_awl, newdp->dccps_iss)); | ||
161 | |||
162 | dccp_init_xmit_timers(newsk); | ||
163 | |||
164 | DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); | ||
165 | } | ||
166 | return newsk; | ||
167 | } | ||
168 | |||
/*
 * Process an incoming packet for RESPOND sockets represented
 * as an request_sock.
 *
 * Handles retransmitted REQUESTs (by resending the RESPONSE), validates the
 * completing ACK/DATAACK against the stored ISS, and on success promotes the
 * request_sock to a full child socket on the parent's accept queue.
 * Returns the child socket, or NULL when the packet was consumed/dropped.
 */
struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
			    struct request_sock *req,
			    struct request_sock **prev)
{
	struct sock *child = NULL;

	/* Check for retransmitted REQUEST */
	if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
		if (after48(DCCP_SKB_CB(skb)->dccpd_seq,
			    dccp_rsk(req)->dreq_isr)) {
			struct dccp_request_sock *dreq = dccp_rsk(req);

			dccp_pr_debug("Retransmitted REQUEST\n");
			/* Send another RESPONSE packet */
			dccp_set_seqno(&dreq->dreq_iss, dreq->dreq_iss + 1);
			dccp_set_seqno(&dreq->dreq_isr,
				       DCCP_SKB_CB(skb)->dccpd_seq);
			req->rsk_ops->rtx_syn_ack(sk, req, NULL);
		}
		/* Network Duplicate, discard packet */
		return NULL;
	}

	/* Default reset code for the drop paths below. */
	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;

	/* Only an ACK or DATAACK may complete the handshake. */
	if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK &&
	    dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK)
		goto drop;

	/* Invalid ACK */
	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) {
		dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
			      "dreq_iss=%llu\n",
			      (unsigned long long)
			      DCCP_SKB_CB(skb)->dccpd_ack_seq,
			      (unsigned long long)
			      dccp_rsk(req)->dreq_iss);
		goto drop;
	}

	child = dccp_v4_request_recv_sock(sk, skb, req, NULL);
	if (child == NULL)
		goto listen_overflow;

	/* FIXME: deal with options */

	/* Move the request off the SYN queue and the child onto the
	 * accept queue. */
	inet_csk_reqsk_queue_unlink(sk, req, prev);
	inet_csk_reqsk_queue_removed(sk, req);
	inet_csk_reqsk_queue_add(sk, req, child);
out:
	return child;
listen_overflow:
	dccp_pr_debug("listen_overflow!\n");
	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
drop:
	if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
		req->rsk_ops->send_reset(skb);

	inet_csk_reqsk_queue_drop(sk, req, prev);
	goto out;
}
234 | |||
/*
 * Queue segment on the new socket if the new socket is active,
 * otherwise we just shortcircuit this and continue with
 * the new socket.
 *
 * Called with the child socket already bh-locked; unlocks it and drops
 * the caller's reference before returning the rcv result (0 on backlog).
 */
int dccp_child_process(struct sock *parent, struct sock *child,
		       struct sk_buff *skb)
{
	int ret = 0;
	const int state = child->sk_state;

	if (!sock_owned_by_user(child)) {
		ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb),
					     skb->len);

		/* Wakeup parent, send SIGIO */
		if (state == DCCP_RESPOND && child->sk_state != state)
			parent->sk_data_ready(parent, 0);
	} else {
		/* Alas, it is possible again, because we do lookup
		 * in main socket hash table and lock on listening
		 * socket does not protect us more.
		 */
		sk_add_backlog(child, skb);
	}

	bh_unlock_sock(child);
	sock_put(child);
	return ret;
}
diff --git a/net/dccp/options.c b/net/dccp/options.c new file mode 100644 index 000000000000..382c5894acb2 --- /dev/null +++ b/net/dccp/options.c | |||
@@ -0,0 +1,855 @@ | |||
1 | /* | ||
2 | * net/dccp/options.c | ||
3 | * | ||
4 | * An implementation of the DCCP protocol | ||
5 | * Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org> | ||
6 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> | ||
7 | * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version | ||
12 | * 2 of the License, or (at your option) any later version. | ||
13 | */ | ||
14 | #include <linux/config.h> | ||
15 | #include <linux/dccp.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/types.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/skbuff.h> | ||
20 | |||
21 | #include "ccid.h" | ||
22 | #include "dccp.h" | ||
23 | |||
24 | static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap, | ||
25 | struct sock *sk, | ||
26 | const u64 ackno, | ||
27 | const unsigned char len, | ||
28 | const unsigned char *vector); | ||
29 | |||
/* stores the default values for new connection. may be changed with sysctl */
/* (copied into each socket by dccp_options_init below) */
static const struct dccp_options dccpo_default_values = {
	.dccpo_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW,
	.dccpo_ccid	       = DCCPF_INITIAL_CCID,
	.dccpo_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR,
	.dccpo_send_ndp_count  = DCCPF_INITIAL_SEND_NDP_COUNT,
};
37 | |||
38 | void dccp_options_init(struct dccp_options *dccpo) | ||
39 | { | ||
40 | memcpy(dccpo, &dccpo_default_values, sizeof(*dccpo)); | ||
41 | } | ||
42 | |||
43 | static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) | ||
44 | { | ||
45 | u32 value = 0; | ||
46 | |||
47 | if (len > 3) | ||
48 | value += *bf++ << 24; | ||
49 | if (len > 2) | ||
50 | value += *bf++ << 16; | ||
51 | if (len > 1) | ||
52 | value += *bf++ << 8; | ||
53 | if (len > 0) | ||
54 | value += *bf; | ||
55 | |||
56 | return value; | ||
57 | } | ||
58 | |||
/*
 * dccp_parse_options - walk the option area of an incoming DCCP header and
 * record the recognized options in dp->dccps_options_received.
 *
 * The option area runs from the end of the fixed/generic header up to
 * dccph_doff * 4 bytes into the packet.  CCID-specific options (128-255)
 * are dispatched to the half-connection CCIDs.  Returns 0 on success or
 * -1 after setting an Option Error reset code on a malformed option.
 */
int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
{
	struct dccp_sock *dp = dccp_sk(sk);
#ifdef CONFIG_IP_DCCP_DEBUG
	const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
					"CLIENT rx opt: " : "server rx opt: ";
#endif
	const struct dccp_hdr *dh = dccp_hdr(skb);
	const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
	unsigned char *opt_ptr = options;
	const unsigned char *opt_end = (unsigned char *)dh +
					(dh->dccph_doff * 4);
	struct dccp_options_received *opt_recv = &dp->dccps_options_received;
	unsigned char opt, len;
	unsigned char *value;

	memset(opt_recv, 0, sizeof(*opt_recv));

	while (opt_ptr != opt_end) {
		opt   = *opt_ptr++;
		len   = 0;
		value = NULL;

		/* Check if this isn't a single byte option */
		if (opt > DCCPO_MAX_RESERVED) {
			if (opt_ptr == opt_end)
				goto out_invalid_option;

			len = *opt_ptr++;
			if (len < 3)
				goto out_invalid_option;
			/*
			 * Remove the type and len fields, leaving
			 * just the value size
			 */
			len	-= 2;
			value	= opt_ptr;
			opt_ptr += len;

			/* Reject options that claim to run past the area. */
			if (opt_ptr > opt_end)
				goto out_invalid_option;
		}

		switch (opt) {
		case DCCPO_PADDING:
			break;
		case DCCPO_NDP_COUNT:
			if (len > 3)
				goto out_invalid_option;

			opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
			dccp_pr_debug("%sNDP count=%d\n", debug_prefix,
				      opt_recv->dccpor_ndp);
			break;
		case DCCPO_ACK_VECTOR_0:
			if (len > DCCP_MAX_ACK_VECTOR_LEN)
				goto out_invalid_option;

			/* Ack vectors are meaningless on pure DATA packets. */
			if (pkt_type == DCCP_PKT_DATA)
				continue;

			opt_recv->dccpor_ack_vector_len = len;
			opt_recv->dccpor_ack_vector_idx = value - options;

			dccp_pr_debug("%sACK vector 0, len=%d, ack_ackno=%llu\n",
				      debug_prefix, len,
				      (unsigned long long)
				      DCCP_SKB_CB(skb)->dccpd_ack_seq);
			dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq,
					     value, len);
			dccp_ackpkts_check_rcv_ackvector(dp->dccps_hc_rx_ackpkts,
							 sk,
						 DCCP_SKB_CB(skb)->dccpd_ack_seq,
							 len, value);
			break;
		case DCCPO_TIMESTAMP:
			if (len != 4)
				goto out_invalid_option;

			opt_recv->dccpor_timestamp = ntohl(*(u32 *)value);

			/* Remember peer's timestamp so we can echo it back. */
			dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp;
			do_gettimeofday(&dp->dccps_timestamp_time);

			dccp_pr_debug("%sTIMESTAMP=%u, ackno=%llu\n",
				      debug_prefix, opt_recv->dccpor_timestamp,
				      (unsigned long long)
				      DCCP_SKB_CB(skb)->dccpd_ack_seq);
			break;
		case DCCPO_TIMESTAMP_ECHO:
			/* 4 bytes echo, optionally + 2 or 4 of elapsed time. */
			if (len != 4 && len != 6 && len != 8)
				goto out_invalid_option;

			opt_recv->dccpor_timestamp_echo = ntohl(*(u32 *)value);

			dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, ",
				      debug_prefix,
				      opt_recv->dccpor_timestamp_echo,
				      len + 2,
				      (unsigned long long)
				      DCCP_SKB_CB(skb)->dccpd_ack_seq);

			if (len > 4) {
				if (len == 6)
					opt_recv->dccpor_elapsed_time =
						 ntohs(*(u16 *)(value + 4));
				else
					opt_recv->dccpor_elapsed_time =
						 ntohl(*(u32 *)(value + 4));

				dccp_pr_debug("%sTIMESTAMP_ECHO ELAPSED_TIME=%d\n",
					      debug_prefix,
					      opt_recv->dccpor_elapsed_time);
			}
			break;
		case DCCPO_ELAPSED_TIME:
			if (len != 2 && len != 4)
				goto out_invalid_option;

			if (pkt_type == DCCP_PKT_DATA)
				continue;

			if (len == 2)
				opt_recv->dccpor_elapsed_time =
							ntohs(*(u16 *)value);
			else
				opt_recv->dccpor_elapsed_time =
							ntohl(*(u32 *)value);

			dccp_pr_debug("%sELAPSED_TIME=%d\n", debug_prefix,
				      opt_recv->dccpor_elapsed_time);
			break;
			/*
			 * From draft-ietf-dccp-spec-11.txt:
			 *
			 *	Option numbers 128 through 191 are for
			 *	options sent from the HC-Sender to the
			 *	HC-Receiver; option numbers 192 through 255
			 *	are for options sent from the HC-Receiver to
			 *	the HC-Sender.
			 */
		case 128 ... 191: {
			const u16 idx = value - options;

			if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
						     opt, len, idx,
						     value) != 0)
				goto out_invalid_option;
		}
			break;
		case 192 ... 255: {
			const u16 idx = value - options;

			if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
						     opt, len, idx,
						     value) != 0)
				goto out_invalid_option;
		}
			break;
		default:
			/* Unknown options are skipped, not fatal. */
			pr_info("DCCP(%p): option %d(len=%d) not "
				"implemented, ignoring\n",
				sk, opt, len);
			break;
		}
	}

	return 0;

out_invalid_option:
	DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
	pr_info("DCCP(%p): invalid option %d, len=%d\n", sk, opt, len);
	return -1;
}
235 | |||
236 | static void dccp_encode_value_var(const u32 value, unsigned char *to, | ||
237 | const unsigned int len) | ||
238 | { | ||
239 | if (len > 3) | ||
240 | *to++ = (value & 0xFF000000) >> 24; | ||
241 | if (len > 2) | ||
242 | *to++ = (value & 0xFF0000) >> 16; | ||
243 | if (len > 1) | ||
244 | *to++ = (value & 0xFF00) >> 8; | ||
245 | if (len > 0) | ||
246 | *to++ = (value & 0xFF); | ||
247 | } | ||
248 | |||
/* Number of bytes (1, 2 or 3) needed to carry NDP count @ndp on the wire. */
static inline int dccp_ndp_len(const int ndp)
{
	if (ndp <= 0xFF)
		return 1;
	return ndp <= 0xFFFF ? 2 : 3;
}
253 | |||
254 | void dccp_insert_option(struct sock *sk, struct sk_buff *skb, | ||
255 | const unsigned char option, | ||
256 | const void *value, const unsigned char len) | ||
257 | { | ||
258 | unsigned char *to; | ||
259 | |||
260 | if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) { | ||
261 | LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert " | ||
262 | "%d option!\n", option); | ||
263 | return; | ||
264 | } | ||
265 | |||
266 | DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2; | ||
267 | |||
268 | to = skb_push(skb, len + 2); | ||
269 | *to++ = option; | ||
270 | *to++ = len + 2; | ||
271 | |||
272 | memcpy(to, value, len); | ||
273 | } | ||
274 | |||
275 | EXPORT_SYMBOL_GPL(dccp_insert_option); | ||
276 | |||
/*
 * Maintain the NDP (non-data packet) counter and, when it is non-zero,
 * prepend an NDP Count option carrying the value accumulated *before*
 * this packet.  The counter resets on every data packet.
 */
static void dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
{
	struct dccp_sock *dp = dccp_sk(sk);
	int ndp = dp->dccps_ndp_count;

	if (dccp_non_data_packet(skb))
		++dp->dccps_ndp_count;
	else
		dp->dccps_ndp_count = 0;

	if (ndp > 0) {
		unsigned char *ptr;
		const int ndp_len = dccp_ndp_len(ndp);
		const int len = ndp_len + 2;	/* value + type + length */

		if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
			return;

		DCCP_SKB_CB(skb)->dccpd_opt_len += len;

		ptr = skb_push(skb, len);
		*ptr++ = DCCPO_NDP_COUNT;
		*ptr++ = len;	/* total option length, per the TLV format */
		dccp_encode_value_var(ndp, ptr, ndp_len);
	}
}
303 | |||
304 | static inline int dccp_elapsed_time_len(const u32 elapsed_time) | ||
305 | { | ||
306 | return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4; | ||
307 | } | ||
308 | |||
/*
 * Prepend an Elapsed Time option (units of 10 microseconds, per the DCCP
 * spec) to the packet being built.  A zero elapsed time is omitted
 * entirely; 2- or 4-byte encodings are chosen by dccp_elapsed_time_len().
 */
void dccp_insert_option_elapsed_time(struct sock *sk,
				     struct sk_buff *skb,
				     u32 elapsed_time)
{
#ifdef CONFIG_IP_DCCP_DEBUG
	struct dccp_sock *dp = dccp_sk(sk);
	const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
					"CLIENT TX opt: " : "server TX opt: ";
#endif
	const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
	const int len = 2 + elapsed_time_len;	/* includes type + length */
	unsigned char *to;

	if (elapsed_time_len == 0)
		return;

	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
		LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to "
					 "insert elapsed time!\n");
		return;
	}

	DCCP_SKB_CB(skb)->dccpd_opt_len += len;

	to    = skb_push(skb, len);
	*to++ = DCCPO_ELAPSED_TIME;
	*to++ = len;

	/* Value goes out in network byte order, width per the chosen len. */
	if (elapsed_time_len == 2) {
		const u16 var16 = htons((u16)elapsed_time);
		memcpy(to, &var16, 2);
	} else {
		const u32 var32 = htonl(elapsed_time);
		memcpy(to, &var32, 4);
	}

	dccp_pr_debug("%sELAPSED_TIME=%u, len=%d, seqno=%llu\n",
		      debug_prefix, elapsed_time,
		      len,
		      (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
}
350 | |||
351 | EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time); | ||
352 | |||
/*
 * Prepend an Ack Vector option built from the receiver's circular ack
 * packet buffer, preceded (when non-zero) by an Elapsed Time option.
 * The copy handles the ring buffer wrapping around its end, and the
 * single ack record is updated so the peer's next Ack can clear it.
 */
static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb)
{
	struct dccp_sock *dp = dccp_sk(sk);
#ifdef CONFIG_IP_DCCP_DEBUG
	const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
					"CLIENT TX opt: " : "server TX opt: ";
#endif
	struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
	int len = ap->dccpap_buf_vector_len + 2;
	/* Delta since last buffer update, in 10us units. */
	const u32 elapsed_time = timeval_now_delta(&ap->dccpap_time) / 10;
	unsigned char *to, *from;

	if (elapsed_time != 0)
		dccp_insert_option_elapsed_time(sk, skb, elapsed_time);

	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
		LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to "
					 "insert ACK Vector!\n");
		return;
	}

	/*
	 * XXX: now we have just one ack vector sent record, so
	 * we have to wait for it to be cleared.
	 *
	 * Of course this is not acceptable, but this is just for
	 * basic testing now.
	 */
	if (ap->dccpap_ack_seqno != DCCP_MAX_SEQNO + 1)
		return;

	DCCP_SKB_CB(skb)->dccpd_opt_len += len;

	to    = skb_push(skb, len);
	*to++ = DCCPO_ACK_VECTOR_0;
	*to++ = len;

	len  = ap->dccpap_buf_vector_len;
	from = ap->dccpap_buf + ap->dccpap_buf_head;

	/* Check if buf_head wraps */
	if (ap->dccpap_buf_head + len > ap->dccpap_buf_len) {
		const unsigned int tailsize = (ap->dccpap_buf_len -
					       ap->dccpap_buf_head);

		memcpy(to, from, tailsize);
		to   += tailsize;
		len  -= tailsize;
		from  = ap->dccpap_buf;
	}

	memcpy(to, from, len);
	/*
	 * From draft-ietf-dccp-spec-11.txt:
	 *
	 *	For each acknowledgement it sends, the HC-Receiver will add an
	 *	acknowledgement record.  ack_seqno will equal the HC-Receiver
	 *	sequence number it used for the ack packet; ack_ptr will equal
	 *	buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
	 *	equal buf_nonce.
	 *
	 * This implemention uses just one ack record for now.
	 */
	ap->dccpap_ack_seqno	  = DCCP_SKB_CB(skb)->dccpd_seq;
	ap->dccpap_ack_ptr	  = ap->dccpap_buf_head;
	ap->dccpap_ack_ackno	  = ap->dccpap_buf_ackno;
	ap->dccpap_ack_nonce	  = ap->dccpap_buf_nonce;
	ap->dccpap_ack_vector_len = ap->dccpap_buf_vector_len;

	dccp_pr_debug("%sACK Vector 0, len=%d, ack_seqno=%llu, "
		      "ack_ackno=%llu\n",
		      debug_prefix, ap->dccpap_ack_vector_len,
		      (unsigned long long) ap->dccpap_ack_seqno,
		      (unsigned long long) ap->dccpap_ack_ackno);
}
428 | |||
429 | void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb) | ||
430 | { | ||
431 | struct timeval tv; | ||
432 | u32 now; | ||
433 | |||
434 | do_gettimeofday(&tv); | ||
435 | now = (tv.tv_sec * USEC_PER_SEC + tv.tv_usec) / 10; | ||
436 | /* yes this will overflow but that is the point as we want a | ||
437 | * 10 usec 32 bit timer which mean it wraps every 11.9 hours */ | ||
438 | |||
439 | now = htonl(now); | ||
440 | dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now)); | ||
441 | } | ||
442 | |||
443 | EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp); | ||
444 | |||
/*
 * Insert a TIMESTAMP_ECHO option: type, length, the 4-octet echoed
 * timestamp, optionally followed by an ELAPSED TIME field (2 or 4 octets)
 * encoding how long ago the echoed TIMESTAMP arrived, in 10us units.
 * Clears the pending-echo state when done.
 */
static void dccp_insert_option_timestamp_echo(struct sock *sk,
					      struct sk_buff *skb)
{
	struct dccp_sock *dp = dccp_sk(sk);
#ifdef CONFIG_IP_DCCP_DEBUG
	const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
					"CLIENT TX opt: " : "server TX opt: ";
#endif
	u32 tstamp_echo;
	const u32 elapsed_time =
			timeval_now_delta(&dp->dccps_timestamp_time) / 10;
	/* 0, 2 or 4 octets depending on the magnitude of elapsed_time */
	const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
	const int len = 6 + elapsed_time_len;
	unsigned char *to;

	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
		LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert "
			       "timestamp echo!\n");
		return;
	}

	DCCP_SKB_CB(skb)->dccpd_opt_len += len;

	/* Options are built back-to-front; push, then fill forwards */
	to    = skb_push(skb, len);
	*to++ = DCCPO_TIMESTAMP_ECHO;
	*to++ = len;

	tstamp_echo = htonl(dp->dccps_timestamp_echo);
	memcpy(to, &tstamp_echo, 4);
	to += 4;

	/* elapsed_time_len may be 0, in which case nothing is appended */
	if (elapsed_time_len == 2) {
		const u16 var16 = htons((u16)elapsed_time);
		memcpy(to, &var16, 2);
	} else if (elapsed_time_len == 4) {
		const u32 var32 = htonl(elapsed_time);
		memcpy(to, &var32, 4);
	}

	dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, seqno=%llu\n",
		      debug_prefix, dp->dccps_timestamp_echo,
		      len,
		      (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);

	/* The pending echo has been sent: reset it */
	dp->dccps_timestamp_echo = 0;
	dp->dccps_timestamp_time.tv_sec = 0;
	dp->dccps_timestamp_time.tv_usec = 0;
}
493 | |||
/*
 * Append all currently applicable options to @skb and pad the option area
 * to a multiple of 4 octets, as the DCCP spec requires.  Options are
 * inserted back-to-front with skb_push(), so later insertions end up
 * earlier in the final packet.
 */
void dccp_insert_options(struct sock *sk, struct sk_buff *skb)
{
	struct dccp_sock *dp = dccp_sk(sk);

	DCCP_SKB_CB(skb)->dccpd_opt_len = 0;

	if (dp->dccps_options.dccpo_send_ndp_count)
		dccp_insert_option_ndp(sk, skb);

	if (!dccp_packet_without_ack(skb)) {
		/*
		 * Only ack-bearing packets may carry an ACK vector; send
		 * one if the feature is enabled and there is something to
		 * ack (buf_ackno != DCCP_MAX_SEQNO + 1, the unset marker).
		 */
		if (dp->dccps_options.dccpo_send_ack_vector &&
		    (dp->dccps_hc_rx_ackpkts->dccpap_buf_ackno !=
		     DCCP_MAX_SEQNO + 1))
			dccp_insert_option_ack_vector(sk, skb);

		/* Echo a pending TIMESTAMP from the peer, if any */
		if (dp->dccps_timestamp_echo != 0)
			dccp_insert_option_timestamp_echo(sk, skb);
	}

	/* Give both half-connection CCIDs a chance to add their options */
	ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb);
	ccid_hc_tx_insert_options(dp->dccps_hc_tx_ccid, sk, skb);

	/* XXX: insert other options when appropriate */

	if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) {
		/* The length of all options has to be a multiple of 4 */
		int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;

		if (padding != 0) {
			padding = 4 - padding;
			memset(skb_push(skb, padding), 0, padding);
			DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
		}
	}
}
529 | |||
530 | struct dccp_ackpkts *dccp_ackpkts_alloc(const unsigned int len, | ||
531 | const unsigned int __nocast priority) | ||
532 | { | ||
533 | struct dccp_ackpkts *ap = kmalloc(sizeof(*ap) + len, priority); | ||
534 | |||
535 | if (ap != NULL) { | ||
536 | #ifdef CONFIG_IP_DCCP_DEBUG | ||
537 | memset(ap->dccpap_buf, 0xFF, len); | ||
538 | #endif | ||
539 | ap->dccpap_buf_len = len; | ||
540 | ap->dccpap_buf_head = | ||
541 | ap->dccpap_buf_tail = | ||
542 | ap->dccpap_buf_len - 1; | ||
543 | ap->dccpap_buf_ackno = | ||
544 | ap->dccpap_ack_ackno = | ||
545 | ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; | ||
546 | ap->dccpap_buf_nonce = ap->dccpap_buf_nonce = 0; | ||
547 | ap->dccpap_ack_ptr = 0; | ||
548 | ap->dccpap_time.tv_sec = 0; | ||
549 | ap->dccpap_time.tv_usec = 0; | ||
550 | ap->dccpap_buf_vector_len = ap->dccpap_ack_vector_len = 0; | ||
551 | } | ||
552 | |||
553 | return ap; | ||
554 | } | ||
555 | |||
556 | void dccp_ackpkts_free(struct dccp_ackpkts *ap) | ||
557 | { | ||
558 | if (ap != NULL) { | ||
559 | #ifdef CONFIG_IP_DCCP_DEBUG | ||
560 | memset(ap, 0xFF, sizeof(*ap) + ap->dccpap_buf_len); | ||
561 | #endif | ||
562 | kfree(ap); | ||
563 | } | ||
564 | } | ||
565 | |||
566 | static inline u8 dccp_ackpkts_state(const struct dccp_ackpkts *ap, | ||
567 | const unsigned int index) | ||
568 | { | ||
569 | return ap->dccpap_buf[index] & DCCP_ACKPKTS_STATE_MASK; | ||
570 | } | ||
571 | |||
572 | static inline u8 dccp_ackpkts_len(const struct dccp_ackpkts *ap, | ||
573 | const unsigned int index) | ||
574 | { | ||
575 | return ap->dccpap_buf[index] & DCCP_ACKPKTS_LEN_MASK; | ||
576 | } | ||
577 | |||
/*
 * If several packets are missing, the HC-Receiver may prefer to enter multiple
 * bytes with run length 0, rather than a single byte with a larger run length;
 * this simplifies table updates if one of the missing packets arrives.
 */
static inline int dccp_ackpkts_set_buf_head_state(struct dccp_ackpkts *ap,
						  const unsigned int packets,
						  const unsigned char state)
{
	/*
	 * Advance buf_head by @packets slots (the buffer grows downwards,
	 * wrapping at slot 0), filling the intervening gap with NOT_RECEIVED
	 * entries of run length 0, and store @state in the new head slot.
	 * Returns 0 on success, -ENOBUFS if the vector would overflow.
	 */
	unsigned int gap;
	signed long new_head;

	if (ap->dccpap_buf_vector_len + packets > ap->dccpap_buf_len)
		return -ENOBUFS;

	gap = packets - 1;
	new_head = ap->dccpap_buf_head - packets;

	if (new_head < 0) {
		/*
		 * Head wrapped below slot 0: fill the portion of the gap
		 * that lies at the start of the buffer, then wrap new_head
		 * to the end.  The remaining gap is filled below.
		 */
		if (gap > 0) {
			memset(ap->dccpap_buf, DCCP_ACKPKTS_STATE_NOT_RECEIVED,
			       gap + new_head + 1);
			gap = -new_head;
		}
		new_head += ap->dccpap_buf_len;
	}

	ap->dccpap_buf_head = new_head;

	if (gap > 0)
		memset(ap->dccpap_buf + ap->dccpap_buf_head + 1,
		       DCCP_ACKPKTS_STATE_NOT_RECEIVED, gap);

	ap->dccpap_buf[ap->dccpap_buf_head] = state;
	ap->dccpap_buf_vector_len += packets;
	return 0;
}
615 | |||
616 | /* | ||
617 | * Implements the draft-ietf-dccp-spec-11.txt Appendix A | ||
618 | */ | ||
619 | int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state) | ||
620 | { | ||
621 | /* | ||
622 | * Check at the right places if the buffer is full, if it is, tell the | ||
623 | * caller to start dropping packets till the HC-Sender acks our ACK | ||
624 | * vectors, when we will free up space in dccpap_buf. | ||
625 | * | ||
626 | * We may well decide to do buffer compression, etc, but for now lets | ||
627 | * just drop. | ||
628 | * | ||
629 | * From Appendix A: | ||
630 | * | ||
631 | * Of course, the circular buffer may overflow, either when the | ||
632 | * HC-Sender is sending data at a very high rate, when the | ||
633 | * HC-Receiver's acknowledgements are not reaching the HC-Sender, | ||
634 | * or when the HC-Sender is forgetting to acknowledge those acks | ||
635 | * (so the HC-Receiver is unable to clean up old state). In this | ||
636 | * case, the HC-Receiver should either compress the buffer (by | ||
637 | * increasing run lengths when possible), transfer its state to | ||
638 | * a larger buffer, or, as a last resort, drop all received | ||
639 | * packets, without processing them whatsoever, until its buffer | ||
640 | * shrinks again. | ||
641 | */ | ||
642 | |||
643 | /* See if this is the first ackno being inserted */ | ||
644 | if (ap->dccpap_buf_vector_len == 0) { | ||
645 | ap->dccpap_buf[ap->dccpap_buf_head] = state; | ||
646 | ap->dccpap_buf_vector_len = 1; | ||
647 | } else if (after48(ackno, ap->dccpap_buf_ackno)) { | ||
648 | const u64 delta = dccp_delta_seqno(ap->dccpap_buf_ackno, | ||
649 | ackno); | ||
650 | |||
651 | /* | ||
652 | * Look if the state of this packet is the same as the | ||
653 | * previous ackno and if so if we can bump the head len. | ||
654 | */ | ||
655 | if (delta == 1 && | ||
656 | dccp_ackpkts_state(ap, ap->dccpap_buf_head) == state && | ||
657 | (dccp_ackpkts_len(ap, ap->dccpap_buf_head) < | ||
658 | DCCP_ACKPKTS_LEN_MASK)) | ||
659 | ap->dccpap_buf[ap->dccpap_buf_head]++; | ||
660 | else if (dccp_ackpkts_set_buf_head_state(ap, delta, state)) | ||
661 | return -ENOBUFS; | ||
662 | } else { | ||
663 | /* | ||
664 | * A.1.2. Old Packets | ||
665 | * | ||
666 | * When a packet with Sequence Number S arrives, and | ||
667 | * S <= buf_ackno, the HC-Receiver will scan the table | ||
668 | * for the byte corresponding to S. (Indexing structures | ||
669 | * could reduce the complexity of this scan.) | ||
670 | */ | ||
671 | u64 delta = dccp_delta_seqno(ackno, ap->dccpap_buf_ackno); | ||
672 | unsigned int index = ap->dccpap_buf_head; | ||
673 | |||
674 | while (1) { | ||
675 | const u8 len = dccp_ackpkts_len(ap, index); | ||
676 | const u8 state = dccp_ackpkts_state(ap, index); | ||
677 | /* | ||
678 | * valid packets not yet in dccpap_buf have a reserved | ||
679 | * entry, with a len equal to 0. | ||
680 | */ | ||
681 | if (state == DCCP_ACKPKTS_STATE_NOT_RECEIVED && | ||
682 | len == 0 && delta == 0) { /* Found our | ||
683 | reserved seat! */ | ||
684 | dccp_pr_debug("Found %llu reserved seat!\n", | ||
685 | (unsigned long long) ackno); | ||
686 | ap->dccpap_buf[index] = state; | ||
687 | goto out; | ||
688 | } | ||
689 | /* len == 0 means one packet */ | ||
690 | if (delta < len + 1) | ||
691 | goto out_duplicate; | ||
692 | |||
693 | delta -= len + 1; | ||
694 | if (++index == ap->dccpap_buf_len) | ||
695 | index = 0; | ||
696 | } | ||
697 | } | ||
698 | |||
699 | ap->dccpap_buf_ackno = ackno; | ||
700 | do_gettimeofday(&ap->dccpap_time); | ||
701 | out: | ||
702 | dccp_pr_debug(""); | ||
703 | dccp_ackpkts_print(ap); | ||
704 | return 0; | ||
705 | |||
706 | out_duplicate: | ||
707 | /* Duplicate packet */ | ||
708 | dccp_pr_debug("Received a dup or already considered lost " | ||
709 | "packet: %llu\n", (unsigned long long) ackno); | ||
710 | return -EILSEQ; | ||
711 | } | ||
712 | |||
713 | #ifdef CONFIG_IP_DCCP_DEBUG | ||
714 | void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, | ||
715 | int len) | ||
716 | { | ||
717 | if (!dccp_debug) | ||
718 | return; | ||
719 | |||
720 | printk("ACK vector len=%d, ackno=%llu |", len, | ||
721 | (unsigned long long) ackno); | ||
722 | |||
723 | while (len--) { | ||
724 | const u8 state = (*vector & DCCP_ACKPKTS_STATE_MASK) >> 6; | ||
725 | const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK); | ||
726 | |||
727 | printk("%d,%d|", state, rl); | ||
728 | ++vector; | ||
729 | } | ||
730 | |||
731 | printk("\n"); | ||
732 | } | ||
733 | |||
/* Dump the live portion of @ap's circular buffer, starting at buf_head. */
void dccp_ackpkts_print(const struct dccp_ackpkts *ap)
{
	dccp_ackvector_print(ap->dccpap_buf_ackno,
			     ap->dccpap_buf + ap->dccpap_buf_head,
			     ap->dccpap_buf_vector_len);
}
740 | #endif | ||
741 | |||
/*
 * Drop the (single) ack record that the HC-Sender has just acknowledged,
 * reclaiming its entries from the ack vector buffer.
 *
 * NOTE(review): "trow" looks like a typo for "throw"; kept unchanged as
 * callers in this file use this symbol name.
 */
static void dccp_ackpkts_trow_away_ack_record(struct dccp_ackpkts *ap)
{
	/*
	 * As we're keeping track of the ack vector size
	 * (dccpap_buf_vector_len) and the sent ack vector size
	 * (dccpap_ack_vector_len) we don't need dccpap_buf_tail at all, but
	 * keep this code here as in the future we'll implement a vector of
	 * ack records, as suggested in draft-ietf-dccp-spec-11.txt
	 * Appendix A. -acme
	 */
#if 0
	ap->dccpap_buf_tail = ap->dccpap_ack_ptr + 1;
	if (ap->dccpap_buf_tail >= ap->dccpap_buf_len)
		ap->dccpap_buf_tail -= ap->dccpap_buf_len;
#endif
	ap->dccpap_buf_vector_len -= ap->dccpap_ack_vector_len;
}
759 | |||
760 | void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk, | ||
761 | u64 ackno) | ||
762 | { | ||
763 | /* Check if we actually sent an ACK vector */ | ||
764 | if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1) | ||
765 | return; | ||
766 | |||
767 | if (ackno == ap->dccpap_ack_seqno) { | ||
768 | #ifdef CONFIG_IP_DCCP_DEBUG | ||
769 | struct dccp_sock *dp = dccp_sk(sk); | ||
770 | const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? | ||
771 | "CLIENT rx ack: " : "server rx ack: "; | ||
772 | #endif | ||
773 | dccp_pr_debug("%sACK packet 0, len=%d, ack_seqno=%llu, " | ||
774 | "ack_ackno=%llu, ACKED!\n", | ||
775 | debug_prefix, 1, | ||
776 | (unsigned long long) ap->dccpap_ack_seqno, | ||
777 | (unsigned long long) ap->dccpap_ack_ackno); | ||
778 | dccp_ackpkts_trow_away_ack_record(ap); | ||
779 | ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; | ||
780 | } | ||
781 | } | ||
782 | |||
/*
 * Process an ACK vector received from the peer: walk its run-length
 * entries and, if the range it covers includes the packet on which we
 * sent our own ACK vector (dccpap_ack_seqno) in a received state, throw
 * away the corresponding ack record.
 */
static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap,
					     struct sock *sk, u64 ackno,
					     const unsigned char len,
					     const unsigned char *vector)
{
	unsigned char i;

	/* Check if we actually sent an ACK vector */
	if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)
		return;
	/*
	 * We're in the receiver half connection, so if the received an ACK
	 * vector ackno (e.g. 50) before dccpap_ack_seqno (e.g. 52), we're
	 * not interested.
	 *
	 * Extra explanation with example:
	 *
	 * if we received an ACK vector with ackno 50, it can only be acking
	 * 50, 49, 48, etc, not 52 (the seqno for the ACK vector we sent).
	 */
	/* dccp_pr_debug("is %llu < %llu? ", ackno, ap->dccpap_ack_seqno); */
	if (before48(ackno, ap->dccpap_ack_seqno)) {
		/* dccp_pr_debug_cat("yes\n"); */
		return;
	}
	/* dccp_pr_debug_cat("no\n"); */

	/*
	 * Walk the vector entries from the highest-numbered packet (ackno)
	 * downwards; each entry covers rl + 1 consecutive sequence numbers.
	 */
	i = len;
	while (i--) {
		const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK);
		u64 ackno_end_rl;

		dccp_set_seqno(&ackno_end_rl, ackno - rl);

		/*
		 * dccp_pr_debug("is %llu <= %llu <= %llu? ", ackno_end_rl,
		 *		 ap->dccpap_ack_seqno, ackno);
		 */
		if (between48(ap->dccpap_ack_seqno, ackno_end_rl, ackno)) {
			const u8 state = (*vector &
					  DCCP_ACKPKTS_STATE_MASK) >> 6;
			/* dccp_pr_debug_cat("yes\n"); */

			/* Only a received state confirms the peer saw it */
			if (state != DCCP_ACKPKTS_STATE_NOT_RECEIVED) {
#ifdef CONFIG_IP_DCCP_DEBUG
				struct dccp_sock *dp = dccp_sk(sk);
				const char *debug_prefix =
					dp->dccps_role == DCCP_ROLE_CLIENT ?
					"CLIENT rx ack: " : "server rx ack: ";
#endif
				dccp_pr_debug("%sACK vector 0, len=%d, "
					      "ack_seqno=%llu, ack_ackno=%llu, "
					      "ACKED!\n",
					      debug_prefix, len,
					      (unsigned long long)
					      ap->dccpap_ack_seqno,
					      (unsigned long long)
					      ap->dccpap_ack_ackno);
				dccp_ackpkts_trow_away_ack_record(ap);
			}
			/*
			 * If dccpap_ack_seqno was not received, no problem
			 * we'll send another ACK vector.
			 */
			ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
			break;
		}
		/* dccp_pr_debug_cat("no\n"); */

		dccp_set_seqno(&ackno, ackno_end_rl - 1);
		++vector;
	}
}
diff --git a/net/dccp/output.c b/net/dccp/output.c new file mode 100644 index 000000000000..28de157a4326 --- /dev/null +++ b/net/dccp/output.c | |||
@@ -0,0 +1,528 @@ | |||
1 | /* | ||
2 | * net/dccp/output.c | ||
3 | * | ||
4 | * An implementation of the DCCP protocol | ||
5 | * Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | */ | ||
12 | |||
13 | #include <linux/config.h> | ||
14 | #include <linux/dccp.h> | ||
15 | #include <linux/skbuff.h> | ||
16 | |||
17 | #include <net/sock.h> | ||
18 | |||
19 | #include "ccid.h" | ||
20 | #include "dccp.h" | ||
21 | |||
/* An ACK just went out: any pending delayed-ACK timer is now redundant. */
static inline void dccp_event_ack_sent(struct sock *sk)
{
	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
26 | |||
27 | /* | ||
28 | * All SKB's seen here are completely headerless. It is our | ||
29 | * job to build the DCCP header, and pass the packet down to | ||
30 | * IP so it can do the same plus pass the packet off to the | ||
31 | * device. | ||
32 | */ | ||
/*
 * Build the DCCP header (including options) on a headerless @skb and hand
 * it to IP.  Assigns the next sequence number (dccps_gss) to the packet.
 * Returns 0 on success (NET_XMIT_CN is treated as success), a negative
 * error otherwise; -ENOBUFS if @skb is NULL.
 */
int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if (likely(skb != NULL)) {
		const struct inet_sock *inet = inet_sk(sk);
		struct dccp_sock *dp = dccp_sk(sk);
		struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
		struct dccp_hdr *dh;
		/* XXX For now we're using only 48 bits sequence numbers */
		const int dccp_header_size = sizeof(*dh) +
					     sizeof(struct dccp_hdr_ext) +
					  dccp_packet_hdr_len(dcb->dccpd_type);
		int err, set_ack = 1;
		u64 ackno = dp->dccps_gsr;

		dccp_inc_seqno(&dp->dccps_gss);

		switch (dcb->dccpd_type) {
		case DCCP_PKT_DATA:
			/* DATA packets carry no ack field */
			set_ack = 0;
			break;
		case DCCP_PKT_SYNC:
		case DCCP_PKT_SYNCACK:
			/* SYNC/SYNCACK ack the seqno stashed by the caller,
			 * not the greatest sequence received */
			ackno = dcb->dccpd_seq;
			break;
		}

		/* Options must be inserted before the header is pushed,
		 * since they are built back-to-front with skb_push() */
		dcb->dccpd_seq = dp->dccps_gss;
		dccp_insert_options(sk, skb);
		
		skb->h.raw = skb_push(skb, dccp_header_size);
		dh = dccp_hdr(skb);
		/*
		 * Data packets are not cloned as they are never retransmitted
		 */
		if (skb_cloned(skb))
			skb_set_owner_w(skb, sk);

		/* Build DCCP header and checksum it. */
		memset(dh, 0, dccp_header_size);
		dh->dccph_type	= dcb->dccpd_type;
		dh->dccph_sport	= inet->sport;
		dh->dccph_dport	= inet->dport;
		dh->dccph_doff	= (dccp_header_size + dcb->dccpd_opt_len) / 4;
		dh->dccph_ccval	= dcb->dccpd_ccval;
		/* XXX For now we're using only 48 bits sequence numbers */
		dh->dccph_x	= 1;

		dp->dccps_awh = dp->dccps_gss;
		dccp_hdr_set_seq(dh, dp->dccps_gss);
		if (set_ack)
			dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);

		/* Packet-type specific fixed fields */
		switch (dcb->dccpd_type) {
		case DCCP_PKT_REQUEST:
			dccp_hdr_request(skb)->dccph_req_service =
							dcb->dccpd_service;
			break;
		case DCCP_PKT_RESET:
			dccp_hdr_reset(skb)->dccph_reset_code =
							dcb->dccpd_reset_code;
			break;
		}

		dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr,
						      inet->daddr);

		if (set_ack)
			dccp_event_ack_sent(sk);

		DCCP_INC_STATS(DCCP_MIB_OUTSEGS);

		err = ip_queue_xmit(skb, 0);
		if (err <= 0)
			return err;

		/* NET_XMIT_CN is special. It does not guarantee,
		 * that this packet is lost. It tells that device
		 * is about to start to drop packets or already
		 * drops some packets of the same priority and
		 * invokes us to send less aggressively.
		 */
		return err == NET_XMIT_CN ? 0 : err;
	}
	return -ENOBUFS;
}
118 | |||
119 | unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) | ||
120 | { | ||
121 | struct dccp_sock *dp = dccp_sk(sk); | ||
122 | int mss_now; | ||
123 | |||
124 | /* | ||
125 | * FIXME: we really should be using the af_specific thing to support | ||
126 | * IPv6. | ||
127 | * mss_now = pmtu - tp->af_specific->net_header_len - | ||
128 | * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); | ||
129 | */ | ||
130 | mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) - | ||
131 | sizeof(struct dccp_hdr_ext); | ||
132 | |||
133 | /* Now subtract optional transport overhead */ | ||
134 | mss_now -= dp->dccps_ext_header_len; | ||
135 | |||
136 | /* | ||
137 | * FIXME: this should come from the CCID infrastructure, where, say, | ||
138 | * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets | ||
139 | * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED | ||
140 | * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to | ||
141 | * make it a multiple of 4 | ||
142 | */ | ||
143 | |||
144 | mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; | ||
145 | |||
146 | /* And store cached results */ | ||
147 | dp->dccps_pmtu_cookie = pmtu; | ||
148 | dp->dccps_mss_cache = mss_now; | ||
149 | |||
150 | return mss_now; | ||
151 | } | ||
152 | |||
/*
 * Socket write-space callback: wake anybody sleeping on the socket wait
 * queue and send SIGIO/poll notification once output space is available.
 */
void dccp_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);

	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	/* Should agree with poll, otherwise some programs break */
	if (sock_writeable(sk))
		sk_wake_async(sk, 2, POLL_OUT);

	read_unlock(&sk->sk_callback_lock);
}
165 | |||
/**
 * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
 * @sk: socket to wait for
 * @skb: packet we are about to send, handed to the CCID for its verdict
 * @timeo: for how long (jiffies); decremented by the time actually slept
 *
 * Loops until the TX CCID allows sending, the timeout runs out, a signal
 * arrives, or the socket errors/shuts down.  Returns 0 when sending is
 * allowed, -EPIPE on error/shutdown, -EAGAIN on timeout, or the signal
 * errno when interrupted.  Called with the socket locked; releases and
 * re-acquires the lock around the sleep.
 */
static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb,
			      long *timeo)
{
	struct dccp_sock *dp = dccp_sk(sk);
	DEFINE_WAIT(wait);
	long delay;
	int rc;

	while (1) {
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);

		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
			goto do_error;
		if (!*timeo)
			goto do_nonblock;
		if (signal_pending(current))
			goto do_interrupted;

		/* rc <= 0 means go ahead; rc > 0 is msecs to wait */
		rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
					    skb->len);
		if (rc <= 0)
			break;
		delay = msecs_to_jiffies(rc);
		if (delay > *timeo || delay < 0)
			goto do_nonblock;

		sk->sk_write_pending++;
		release_sock(sk);
		*timeo -= schedule_timeout(delay);
		lock_sock(sk);
		sk->sk_write_pending--;
	}
out:
	finish_wait(sk->sk_sleep, &wait);
	return rc;

do_error:
	rc = -EPIPE;
	goto out;
do_nonblock:
	rc = -EAGAIN;
	goto out;
do_interrupted:
	rc = sock_intr_errno(*timeo);
	goto out;
}
217 | |||
/*
 * Send a data packet: ask the CCID for permission (possibly sleeping up
 * to *@timeo), decide whether to piggyback an ACK (DATAACK vs DATA),
 * transmit, and notify the CCID of the sent packet.  Returns 0 on success
 * or the error from the CCID wait / transmit path.
 */
int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo)
{
	const struct dccp_sock *dp = dccp_sk(sk);
	int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
					 skb->len);

	if (err > 0)
		/* CCID asked us to back off: sleep until it relents */
		err = dccp_wait_for_ccid(sk, skb, timeo);

	if (err == 0) {
		const struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
		struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
		/* Save len now: dccp_transmit_skb consumes the skb */
		const int len = skb->len;

		if (sk->sk_state == DCCP_PARTOPEN) {
			/* See 8.1.5.  Handshake Completion */
			inet_csk_schedule_ack(sk);
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  inet_csk(sk)->icsk_rto,
						  DCCP_RTO_MAX);
			dcb->dccpd_type = DCCP_PKT_DATAACK;
			/*
			 * FIXME: we really should have a
			 * dccps_ack_pending or use icsk.
			 */
		} else if (inet_csk_ack_scheduled(sk) ||
			   dp->dccps_timestamp_echo != 0 ||
			   (dp->dccps_options.dccpo_send_ack_vector &&
			    ap->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1 &&
			    ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1))
			dcb->dccpd_type = DCCP_PKT_DATAACK;
		else
			dcb->dccpd_type = DCCP_PKT_DATA;

		err = dccp_transmit_skb(sk, skb);
		ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
	}

	return err;
}
258 | |||
259 | int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | ||
260 | { | ||
261 | if (inet_sk_rebuild_header(sk) != 0) | ||
262 | return -EHOSTUNREACH; /* Routing failure or similar. */ | ||
263 | |||
264 | return dccp_transmit_skb(sk, (skb_cloned(skb) ? | ||
265 | pskb_copy(skb, GFP_ATOMIC): | ||
266 | skb_clone(skb, GFP_ATOMIC))); | ||
267 | } | ||
268 | |||
/*
 * Build a DCCP-Response packet answering the connection request @req,
 * to be sent over @dst.  Returns the ready-to-send skb (with options,
 * header and checksum filled in) or NULL on allocation failure.
 */
struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
				   struct request_sock *req)
{
	struct dccp_hdr *dh;
	const int dccp_header_size = sizeof(struct dccp_hdr) +
				     sizeof(struct dccp_hdr_ext) +
				     sizeof(struct dccp_hdr_response);
	struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN +
					       dccp_header_size, 1,
					   GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size);

	skb->dst = dst_clone(dst);
	skb->csum = 0;

	/* Options go in first (built back-to-front with skb_push) */
	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
	DCCP_SKB_CB(skb)->dccpd_seq  = dccp_rsk(req)->dreq_iss;
	dccp_insert_options(sk, skb);

	skb->h.raw = skb_push(skb, dccp_header_size);

	dh = dccp_hdr(skb);
	memset(dh, 0, dccp_header_size);

	dh->dccph_sport	= inet_sk(sk)->sport;
	dh->dccph_dport	= inet_rsk(req)->rmt_port;
	dh->dccph_doff	= (dccp_header_size +
			   DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
	dh->dccph_type	= DCCP_PKT_RESPONSE;
	dh->dccph_x	= 1;
	/* Seq/ack come from the request sock's initial numbers */
	dccp_hdr_set_seq(dh, dccp_rsk(req)->dreq_iss);
	dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dccp_rsk(req)->dreq_isr);

	dh->dccph_checksum = dccp_v4_checksum(skb, inet_rsk(req)->loc_addr,
					      inet_rsk(req)->rmt_addr);

	DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
	return skb;
}
312 | |||
/*
 * Build a DCCP-Reset packet with reset code @code for this connection,
 * to be sent over @dst.  Consumes a sequence number (dccps_gss).
 * Returns the ready-to-send skb or NULL on allocation failure.
 */
struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
				const enum dccp_reset_codes code)
	
{
	struct dccp_hdr *dh;
	struct dccp_sock *dp = dccp_sk(sk);
	const int dccp_header_size = sizeof(struct dccp_hdr) +
				     sizeof(struct dccp_hdr_ext) +
				     sizeof(struct dccp_hdr_reset);
	struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN +
					       dccp_header_size, 1,
					   GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size);

	skb->dst = dst_clone(dst);
	skb->csum = 0;

	dccp_inc_seqno(&dp->dccps_gss);

	/* Options go in first (built back-to-front with skb_push) */
	DCCP_SKB_CB(skb)->dccpd_reset_code = code;
	DCCP_SKB_CB(skb)->dccpd_type	   = DCCP_PKT_RESET;
	DCCP_SKB_CB(skb)->dccpd_seq	   = dp->dccps_gss;
	dccp_insert_options(sk, skb);

	skb->h.raw = skb_push(skb, dccp_header_size);

	dh = dccp_hdr(skb);
	memset(dh, 0, dccp_header_size);

	dh->dccph_sport	= inet_sk(sk)->sport;
	dh->dccph_dport	= inet_sk(sk)->dport;
	dh->dccph_doff	= (dccp_header_size +
			   DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
	dh->dccph_type	= DCCP_PKT_RESET;
	dh->dccph_x	= 1;
	dccp_hdr_set_seq(dh, dp->dccps_gss);
	dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dp->dccps_gsr);

	dccp_hdr_reset(skb)->dccph_reset_code = code;

	dh->dccph_checksum = dccp_v4_checksum(skb, inet_sk(sk)->saddr,
					      inet_sk(sk)->daddr);

	DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
	return skb;
}
363 | |||
364 | /* | ||
365 | * Do all connect socket setups that can be done AF independent. | ||
366 | */ | ||
367 | static inline void dccp_connect_init(struct sock *sk) | ||
368 | { | ||
369 | struct dst_entry *dst = __sk_dst_get(sk); | ||
370 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
371 | |||
372 | sk->sk_err = 0; | ||
373 | sock_reset_flag(sk, SOCK_DONE); | ||
374 | |||
375 | dccp_sync_mss(sk, dst_mtu(dst)); | ||
376 | |||
377 | /* | ||
378 | * FIXME: set dp->{dccps_swh,dccps_swl}, with | ||
379 | * something like dccp_inc_seq | ||
380 | */ | ||
381 | |||
382 | icsk->icsk_retransmits = 0; | ||
383 | } | ||
384 | |||
/*
 * Start the DCCP handshake: send a DCCP-Request, keep a reference to it
 * in sk_send_head for retransmission, and arm the retransmit timer.
 * Returns 0 on success, -ENOBUFS if the request skb cannot be allocated.
 */
int dccp_connect(struct sock *sk)
{
	struct sk_buff *skb;
	struct inet_connection_sock *icsk = inet_csk(sk);

	dccp_connect_init(sk);

	/* +15 leaves slack for alignment of the reserved headroom */
	skb = alloc_skb(MAX_DCCP_HEADER + 15, sk->sk_allocation);
	if (unlikely(skb == NULL))
		return -ENOBUFS;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_DCCP_HEADER);

	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
	/* FIXME: set service to something meaningful, coming
	 * from userspace*/
	DCCP_SKB_CB(skb)->dccpd_service = 0;
	skb->csum = 0;
	skb_set_owner_w(skb, sk);

	BUG_TRAP(sk->sk_send_head == NULL);
	sk->sk_send_head = skb;
	/* Transmit a clone; the original stays queued for retransmit */
	dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
	DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the REQUEST until an answer. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  icsk->icsk_rto, DCCP_RTO_MAX);
	return 0;
}
416 | |||
/*
 * Send a pure ACK immediately.  If the atomic skb allocation fails, fall
 * back to scheduling a delayed ACK so that the acknowledgement is not lost.
 */
void dccp_send_ack(struct sock *sk)
{
	/* If we have been reset, we may not send again. */
	if (sk->sk_state != DCCP_CLOSED) {
		struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC);

		if (skb == NULL) {
			/* Out of memory: retry later via the delack timer.
			 * NOTE(review): reuses TCP's ATO/DELACK constants —
			 * presumably pending DCCP-specific tuning. */
			inet_csk_schedule_ack(sk);
			inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  TCP_DELACK_MAX,
						  DCCP_RTO_MAX);
			return;
		}

		/* Reserve space for headers */
		skb_reserve(skb, MAX_DCCP_HEADER);
		skb->csum = 0;
		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK;
		skb_set_owner_w(skb, sk);
		dccp_transmit_skb(sk, skb);
	}
}

EXPORT_SYMBOL_GPL(dccp_send_ack);
442 | |||
/*
 * Schedule an ACK to be sent later via the delayed-ACK timer, or send it
 * right away if the timer was blocked while the socket was locked.
 */
void dccp_send_delayed_ack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	/*
	 * FIXME: tune this timer. elapsed time fixes the skew, so no problem
	 * with using 2s, and active senders also piggyback the ACK into a
	 * DATAACK packet, so this is really for quiescent senders.
	 */
	unsigned long timeout = jiffies + 2 * HZ;

	/* Use new timeout only if there wasn't a older one earlier. */
	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
		/* If delack timer was blocked or is about to expire,
		 * send ACK now.
		 *
		 * FIXME: check the "about to expire" part
		 */
		if (icsk->icsk_ack.blocked) {
			dccp_send_ack(sk);
			return;
		}

		/* Keep the earlier deadline if it fires sooner than ours. */
		if (!time_before(timeout, icsk->icsk_ack.timeout))
			timeout = icsk->icsk_ack.timeout;
	}
	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
	icsk->icsk_ack.timeout = timeout;
	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
472 | |||
/*
 * Send a SYNC or SYNCACK packet (selected by @pkt_type) acknowledging
 * sequence number @seq.  Best-effort: silently dropped on allocation failure.
 */
void dccp_send_sync(struct sock *sk, const u64 seq,
		    const enum dccp_pkt_type pkt_type)
{
	/*
	 * We are not putting this on the write queue, so
	 * dccp_transmit_skb() will set the ownership to this
	 * sock.
	 */
	struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC);

	if (skb == NULL)
		/* FIXME: how to make sure the sync is sent? */
		return;

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_DCCP_HEADER);
	skb->csum = 0;
	DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
	DCCP_SKB_CB(skb)->dccpd_seq = seq;

	skb_set_owner_w(skb, sk);
	dccp_transmit_skb(sk, skb);
}
496 | |||
497 | /* | ||
498 | * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This | ||
499 | * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under | ||
500 | * any circumstances. | ||
501 | */ | ||
502 | void dccp_send_close(struct sock *sk, const int active) | ||
503 | { | ||
504 | struct dccp_sock *dp = dccp_sk(sk); | ||
505 | struct sk_buff *skb; | ||
506 | const unsigned int prio = active ? GFP_KERNEL : GFP_ATOMIC; | ||
507 | |||
508 | skb = alloc_skb(sk->sk_prot->max_header, prio); | ||
509 | if (skb == NULL) | ||
510 | return; | ||
511 | |||
512 | /* Reserve space for headers and prepare control bits. */ | ||
513 | skb_reserve(skb, sk->sk_prot->max_header); | ||
514 | skb->csum = 0; | ||
515 | DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ? | ||
516 | DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ; | ||
517 | |||
518 | skb_set_owner_w(skb, sk); | ||
519 | if (active) { | ||
520 | BUG_TRAP(sk->sk_send_head == NULL); | ||
521 | sk->sk_send_head = skb; | ||
522 | dccp_transmit_skb(sk, skb_clone(skb, prio)); | ||
523 | } else | ||
524 | dccp_transmit_skb(sk, skb); | ||
525 | |||
526 | ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); | ||
527 | ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); | ||
528 | } | ||
diff --git a/net/dccp/proto.c b/net/dccp/proto.c new file mode 100644 index 000000000000..18a0e69c9dc7 --- /dev/null +++ b/net/dccp/proto.c | |||
@@ -0,0 +1,826 @@ | |||
1 | /* | ||
2 | * net/dccp/proto.c | ||
3 | * | ||
4 | * An implementation of the DCCP protocol | ||
5 | * Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify it | ||
8 | * under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #include <linux/config.h> | ||
13 | #include <linux/dccp.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/types.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/skbuff.h> | ||
19 | #include <linux/netdevice.h> | ||
20 | #include <linux/in.h> | ||
21 | #include <linux/if_arp.h> | ||
22 | #include <linux/init.h> | ||
23 | #include <linux/random.h> | ||
24 | #include <net/checksum.h> | ||
25 | |||
26 | #include <net/inet_common.h> | ||
27 | #include <net/ip.h> | ||
28 | #include <net/protocol.h> | ||
29 | #include <net/sock.h> | ||
30 | #include <net/xfrm.h> | ||
31 | |||
32 | #include <asm/semaphore.h> | ||
33 | #include <linux/spinlock.h> | ||
34 | #include <linux/timer.h> | ||
35 | #include <linux/delay.h> | ||
36 | #include <linux/poll.h> | ||
37 | #include <linux/dccp.h> | ||
38 | |||
39 | #include "ccid.h" | ||
40 | #include "dccp.h" | ||
41 | |||
/* Per-CPU DCCP SNMP (MIB) counters; both halves allocated in init_dccp_v4_mibs(). */
DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;

/* Count of DCCP sockets detached from their owning process (see dccp_close()). */
atomic_t dccp_orphan_count = ATOMIC_INIT(0);

/* IPv4 stack hooks: entry points for incoming DCCP packets and ICMP errors. */
static struct net_protocol dccp_protocol = {
	.handler	= dccp_v4_rcv,
	.err_handler	= dccp_v4_err,
};
50 | |||
51 | const char *dccp_packet_name(const int type) | ||
52 | { | ||
53 | static const char *dccp_packet_names[] = { | ||
54 | [DCCP_PKT_REQUEST] = "REQUEST", | ||
55 | [DCCP_PKT_RESPONSE] = "RESPONSE", | ||
56 | [DCCP_PKT_DATA] = "DATA", | ||
57 | [DCCP_PKT_ACK] = "ACK", | ||
58 | [DCCP_PKT_DATAACK] = "DATAACK", | ||
59 | [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", | ||
60 | [DCCP_PKT_CLOSE] = "CLOSE", | ||
61 | [DCCP_PKT_RESET] = "RESET", | ||
62 | [DCCP_PKT_SYNC] = "SYNC", | ||
63 | [DCCP_PKT_SYNCACK] = "SYNCACK", | ||
64 | }; | ||
65 | |||
66 | if (type >= DCCP_NR_PKT_TYPES) | ||
67 | return "INVALID"; | ||
68 | else | ||
69 | return dccp_packet_names[type]; | ||
70 | } | ||
71 | |||
72 | EXPORT_SYMBOL_GPL(dccp_packet_name); | ||
73 | |||
74 | const char *dccp_state_name(const int state) | ||
75 | { | ||
76 | static char *dccp_state_names[] = { | ||
77 | [DCCP_OPEN] = "OPEN", | ||
78 | [DCCP_REQUESTING] = "REQUESTING", | ||
79 | [DCCP_PARTOPEN] = "PARTOPEN", | ||
80 | [DCCP_LISTEN] = "LISTEN", | ||
81 | [DCCP_RESPOND] = "RESPOND", | ||
82 | [DCCP_CLOSING] = "CLOSING", | ||
83 | [DCCP_TIME_WAIT] = "TIME_WAIT", | ||
84 | [DCCP_CLOSED] = "CLOSED", | ||
85 | }; | ||
86 | |||
87 | if (state >= DCCP_MAX_STATES) | ||
88 | return "INVALID STATE!"; | ||
89 | else | ||
90 | return dccp_state_names[state]; | ||
91 | } | ||
92 | |||
93 | EXPORT_SYMBOL_GPL(dccp_state_name); | ||
94 | |||
/* Mark the socket as a listener and start the connection-request queue.
 * NOTE(review): reuses TCP's SYN-queue hash size — confirm this is intended
 * for DCCP rather than a placeholder. */
static inline int dccp_listen_start(struct sock *sk)
{
	dccp_sk(sk)->dccps_role = DCCP_ROLE_LISTEN;
	return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
}
100 | |||
/*
 * Abort the connection and return the socket to CLOSED so it can be
 * reused (e.g. connect(AF_UNSPEC)).  Caller holds the socket lock.
 * Always returns 0 in the current implementation.
 */
int dccp_disconnect(struct sock *sk, int flags)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	int err = 0;
	const int old_state = sk->sk_state;

	if (old_state != DCCP_CLOSED)
		dccp_set_state(sk, DCCP_CLOSED);

	/* ABORT function of RFC793 */
	if (old_state == DCCP_LISTEN) {
		inet_csk_listen_stop(sk);
		/* FIXME: do the active reset thing */
	} else if (old_state == DCCP_REQUESTING)
		sk->sk_err = ECONNRESET;

	/* Drop all pending timers, queued input and the unsent head. */
	dccp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->sk_receive_queue);
	if (sk->sk_send_head != NULL) {
		__kfree_skb(sk->sk_send_head);
		sk->sk_send_head = NULL;
	}

	inet->dport = 0;

	/* Forget the local address unless the user explicitly bound it. */
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);

	sk->sk_shutdown = 0;
	sock_reset_flag(sk, SOCK_DONE);

	icsk->icsk_backoff = 0;
	inet_csk_delack_init(sk);
	__sk_dst_reset(sk);

	BUG_TRAP(!inet->num || icsk->icsk_bind_hash);

	sk->sk_error_report(sk);
	return err;
}
142 | |||
143 | /* | ||
144 | * Wait for a DCCP event. | ||
145 | * | ||
146 | * Note that we don't need to lock the socket, as the upper poll layers | ||
147 | * take care of normal races (between the test and the event) and we don't | ||
148 | * go look at any of the socket buffers directly. | ||
149 | */ | ||
150 | static unsigned int dccp_poll(struct file *file, struct socket *sock, | ||
151 | poll_table *wait) | ||
152 | { | ||
153 | unsigned int mask; | ||
154 | struct sock *sk = sock->sk; | ||
155 | |||
156 | poll_wait(file, sk->sk_sleep, wait); | ||
157 | if (sk->sk_state == DCCP_LISTEN) | ||
158 | return inet_csk_listen_poll(sk); | ||
159 | |||
160 | /* Socket is not locked. We are protected from async events | ||
161 | by poll logic and correct handling of state changes | ||
162 | made by another threads is impossible in any case. | ||
163 | */ | ||
164 | |||
165 | mask = 0; | ||
166 | if (sk->sk_err) | ||
167 | mask = POLLERR; | ||
168 | |||
169 | if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED) | ||
170 | mask |= POLLHUP; | ||
171 | if (sk->sk_shutdown & RCV_SHUTDOWN) | ||
172 | mask |= POLLIN | POLLRDNORM; | ||
173 | |||
174 | /* Connected? */ | ||
175 | if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { | ||
176 | if (atomic_read(&sk->sk_rmem_alloc) > 0) | ||
177 | mask |= POLLIN | POLLRDNORM; | ||
178 | |||
179 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { | ||
180 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { | ||
181 | mask |= POLLOUT | POLLWRNORM; | ||
182 | } else { /* send SIGIO later */ | ||
183 | set_bit(SOCK_ASYNC_NOSPACE, | ||
184 | &sk->sk_socket->flags); | ||
185 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | ||
186 | |||
187 | /* Race breaker. If space is freed after | ||
188 | * wspace test but before the flags are set, | ||
189 | * IO signal will be lost. | ||
190 | */ | ||
191 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) | ||
192 | mask |= POLLOUT | POLLWRNORM; | ||
193 | } | ||
194 | } | ||
195 | } | ||
196 | return mask; | ||
197 | } | ||
198 | |||
/* ioctl handler stub: no DCCP-specific ioctls are implemented yet, so every
 * command falls through to -ENOIOCTLCMD. */
int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	dccp_pr_debug("entry\n");
	return -ENOIOCTLCMD;
}
204 | |||
/*
 * setsockopt for SOL_DCCP; anything else is delegated to the IP layer.
 * Currently only DCCP_SOCKOPT_PACKET_SIZE is supported.
 */
int dccp_setsockopt(struct sock *sk, int level, int optname,
		    char __user *optval, int optlen)
{
	struct dccp_sock *dp;
	int err;
	int val;

	if (level != SOL_DCCP)
		return ip_setsockopt(sk, level, optname, optval, optlen);

	/* All current options are plain ints. */
	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);

	dp = dccp_sk(sk);
	err = 0;

	switch (optname) {
	case DCCP_SOCKOPT_PACKET_SIZE:
		/* NOTE(review): value is stored unvalidated — confirm range
		 * checking happens where it is consumed. */
		dp->dccps_packet_size = val;
		break;
	default:
		err = -ENOPROTOOPT;
		break;
	}

	release_sock(sk);
	return err;
}
238 | |||
239 | int dccp_getsockopt(struct sock *sk, int level, int optname, | ||
240 | char __user *optval, int __user *optlen) | ||
241 | { | ||
242 | struct dccp_sock *dp; | ||
243 | int val, len; | ||
244 | |||
245 | if (level != SOL_DCCP) | ||
246 | return ip_getsockopt(sk, level, optname, optval, optlen); | ||
247 | |||
248 | if (get_user(len, optlen)) | ||
249 | return -EFAULT; | ||
250 | |||
251 | len = min_t(unsigned int, len, sizeof(int)); | ||
252 | if (len < 0) | ||
253 | return -EINVAL; | ||
254 | |||
255 | dp = dccp_sk(sk); | ||
256 | |||
257 | switch (optname) { | ||
258 | case DCCP_SOCKOPT_PACKET_SIZE: | ||
259 | val = dp->dccps_packet_size; | ||
260 | break; | ||
261 | default: | ||
262 | return -ENOPROTOOPT; | ||
263 | } | ||
264 | |||
265 | if (put_user(len, optlen) || copy_to_user(optval, &val, len)) | ||
266 | return -EFAULT; | ||
267 | |||
268 | return 0; | ||
269 | } | ||
270 | |||
/*
 * Send one datagram.  DCCP preserves packet boundaries, so the message must
 * fit in a single packet (len <= MSS) — otherwise -EMSGSIZE.
 * Returns the number of bytes queued or a negative errno.
 */
int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		 size_t len)
{
	const struct dccp_sock *dp = dccp_sk(sk);
	const int flags = msg->msg_flags;
	const int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int rc, size;
	long timeo;

	if (len > dp->dccps_mss_cache)
		return -EMSGSIZE;

	lock_sock(sk);
	timeo = sock_sndtimeo(sk, noblock);

	/*
	 * We have to use sk_stream_wait_connect here to set sk_write_pending,
	 * so that the trick in dccp_rcv_request_sent_state_process.
	 */
	/* Wait for a connection to finish. */
	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING))
		if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_release;

	size = sk->sk_prot->max_header + len;
	/* Drop the lock around the (possibly sleeping) allocation.
	 * NOTE(review): socket state may change while unlocked — it is not
	 * re-validated after re-locking; confirm this is safe. */
	release_sock(sk);
	skb = sock_alloc_send_skb(sk, size, noblock, &rc);
	lock_sock(sk);
	if (skb == NULL)
		goto out_release;

	skb_reserve(skb, sk->sk_prot->max_header);
	rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	if (rc != 0)
		goto out_discard;

	rc = dccp_write_xmit(sk, skb, &timeo);
	/*
	 * XXX we don't use sk_write_queue, so just discard the packet.
	 * Current plan however is to _use_ sk_write_queue with
	 * an algorithm similar to tcp_sendmsg, where the main difference
	 * is that in DCCP we have to respect packet boundaries, so
	 * no coalescing of skbs.
	 *
	 * This bug was _quickly_ found & fixed by just looking at an OSTRA
	 * generated callgraph 8) -acme
	 */
	if (rc != 0)
		goto out_discard;
out_release:
	release_sock(sk);
	return rc ? : len;
out_discard:
	kfree_skb(skb);
	goto out_release;
}
328 | |||
/*
 * Receive one datagram.  Loops over the receive queue skipping non-data
 * packets, treats RESET/CLOSE as end-of-stream, and blocks (up to the
 * socket's receive timeout) when the queue is empty.
 * Returns the number of bytes copied, 0 at end-of-stream, or a negative
 * errno.
 */
int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		 size_t len, int nonblock, int flags, int *addr_len)
{
	const struct dccp_hdr *dh;
	long timeo;

	lock_sock(sk);

	if (sk->sk_state == DCCP_LISTEN) {
		len = -ENOTCONN;
		goto out;
	}

	timeo = sock_rcvtimeo(sk, nonblock);

	do {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		if (skb == NULL)
			goto verify_sock_status;

		dh = dccp_hdr(skb);

		/* Only DATA/DATAACK carry payload for the application. */
		if (dh->dccph_type == DCCP_PKT_DATA ||
		    dh->dccph_type == DCCP_PKT_DATAACK)
			goto found_ok_skb;

		if (dh->dccph_type == DCCP_PKT_RESET ||
		    dh->dccph_type == DCCP_PKT_CLOSE) {
			dccp_pr_debug("found fin ok!\n");
			len = 0;
			goto found_fin_ok;
		}
		/* Any other control packet is consumed and skipped. */
		dccp_pr_debug("packet_type=%s\n",
			      dccp_packet_name(dh->dccph_type));
		sk_eat_skb(sk, skb);
verify_sock_status:
		if (sock_flag(sk, SOCK_DONE)) {
			len = 0;
			break;
		}

		if (sk->sk_err) {
			len = sock_error(sk);
			break;
		}

		if (sk->sk_shutdown & RCV_SHUTDOWN) {
			len = 0;
			break;
		}

		if (sk->sk_state == DCCP_CLOSED) {
			if (!sock_flag(sk, SOCK_DONE)) {
				/* This occurs when user tries to read
				 * from never connected socket.
				 */
				len = -ENOTCONN;
				break;
			}
			len = 0;
			break;
		}

		if (!timeo) {
			len = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			len = sock_intr_errno(timeo);
			break;
		}

		/* Sleep until data arrives or the timeout expires. */
		sk_wait_data(sk, &timeo);
		continue;
found_ok_skb:
		/* Short reads truncate the datagram (packet boundaries!). */
		if (len > skb->len)
			len = skb->len;
		else if (len < skb->len)
			msg->msg_flags |= MSG_TRUNC;

		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
			/* Exception. Bailout! */
			len = -EFAULT;
			break;
		}
found_fin_ok:
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb);
		break;
	} while (1);
out:
	release_sock(sk);
	return len;
}
425 | |||
/*
 * listen(2) entry point for DCCP sockets: move a CLOSED socket into LISTEN,
 * or just adjust the backlog of an already-listening one.
 */
static int inet_dccp_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	unsigned char old_state;
	int err;

	lock_sock(sk);

	err = -EINVAL;
	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
		goto out;

	old_state = sk->sk_state;
	if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
		goto out;

	/* Really, if the socket is already in listen state
	 * we can only allow the backlog to be adjusted.
	 */
	if (old_state != DCCP_LISTEN) {
		/*
		 * FIXME: here it probably should be sk->sk_prot->listen_start
		 * see tcp_listen_start
		 */
		err = dccp_listen_start(sk);
		if (err)
			goto out;
	}
	sk->sk_max_ack_backlog = backlog;
	err = 0;

out:
	release_sock(sk);
	return err;
}
461 | |||
/*
 * close(2) state-transition table, indexed by current socket state.
 * Low bits (DCCP_STATE_MASK) hold the next state; the DCCP_ACTION_FIN flag
 * means a CLOSE/CLOSEREQ packet must be sent (see dccp_close_state()).
 */
static const unsigned char dccp_new_state[] = {
	/* current state:        new state:      action:	*/
	[0]		  = DCCP_CLOSED,
	[DCCP_OPEN]	  = DCCP_CLOSING | DCCP_ACTION_FIN,
	[DCCP_REQUESTING] = DCCP_CLOSED,
	[DCCP_PARTOPEN]	  = DCCP_CLOSING | DCCP_ACTION_FIN,
	[DCCP_LISTEN]	  = DCCP_CLOSED,
	[DCCP_RESPOND]	  = DCCP_CLOSED,
	[DCCP_CLOSING]	  = DCCP_CLOSED,
	[DCCP_TIME_WAIT]  = DCCP_CLOSED,
	[DCCP_CLOSED]	  = DCCP_CLOSED,
};
474 | |||
475 | static int dccp_close_state(struct sock *sk) | ||
476 | { | ||
477 | const int next = dccp_new_state[sk->sk_state]; | ||
478 | const int ns = next & DCCP_STATE_MASK; | ||
479 | |||
480 | if (ns != sk->sk_state) | ||
481 | dccp_set_state(sk, ns); | ||
482 | |||
483 | return next & DCCP_ACTION_FIN; | ||
484 | } | ||
485 | |||
/*
 * close(2) for a DCCP socket: flush queued input, send CLOSE/CLOSEREQ if the
 * state machine requires it, then orphan the socket and either destroy it
 * (already CLOSED) or leave it to the protocol/timers to finish.
 */
void dccp_close(struct sock *sk, long timeout)
{
	struct sk_buff *skb;

	lock_sock(sk);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == DCCP_LISTEN) {
		dccp_set_state(sk, DCCP_CLOSED);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/*
	 * We need to flush the recv. buffs. We do this only on the
	 * descriptor close, not protocol-sourced closes, because the
	 *reader process may not have drained the data yet!
	 */
	/* FIXME: check for unread data */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		__kfree_skb(skb);
	}

	if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
	} else if (dccp_close_state(sk)) {
		/* State machine says a CLOSE/CLOSEREQ must go out. */
		dccp_send_close(sk, 1);
	}

	/* Wait (bounded by @timeout) for the close handshake to progress. */
	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	/*
	 * It is the last release_sock in its life. It will remove backlog.
	 */
	release_sock(sk);
	/*
	 * Now socket is owned by kernel and we acquire BH lock
	 * to finish close. No need to check for user refs.
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	BUG_TRAP(!sock_owned_by_user(sk));

	sock_hold(sk);
	sock_orphan(sk);

	/*
	 * The last release_sock may have processed the CLOSE or RESET
	 * packet moving sock to CLOSED state, if not we have to fire
	 * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination"
	 * in draft-ietf-dccp-spec-11. -acme
	 */
	if (sk->sk_state == DCCP_CLOSING) {
		/* FIXME: should start at 2 * RTT */
		/* Timer for repeating the CLOSE/CLOSEREQ until an answer. */
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  inet_csk(sk)->icsk_rto,
					  DCCP_RTO_MAX);
#if 0
		/* Yeah, we should use sk->sk_prot->orphan_count, etc */
		dccp_set_state(sk, DCCP_CLOSED);
#endif
	}

	atomic_inc(sk->sk_prot->orphan_count);
	if (sk->sk_state == DCCP_CLOSED)
		inet_csk_destroy_sock(sk);

	/* Otherwise, socket is reprieved until protocol close. */

	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}
566 | |||
/* shutdown(2) stub: half-close semantics are not implemented yet, only a
 * debug trace is emitted. */
void dccp_shutdown(struct sock *sk, int how)
{
	dccp_pr_debug("entry\n");
}
571 | |||
/* BSD-socket-layer operations for PF_INET/SOCK_DCCP; mostly generic inet
 * helpers, with DCCP-specific poll and listen. */
static struct proto_ops inet_dccp_ops = {
	.family		= PF_INET,
	.owner		= THIS_MODULE,
	.release	= inet_release,
	.bind		= inet_bind,
	.connect	= inet_stream_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= inet_accept,
	.getname	= inet_getname,
	/* FIXME: work on tcp_poll to rename it to inet_csk_poll */
	.poll		= dccp_poll,
	.ioctl		= inet_ioctl,
	/* FIXME: work on inet_listen to rename it to sock_common_listen */
	.listen		= inet_dccp_listen,
	.shutdown	= inet_shutdown,
	.setsockopt	= sock_common_setsockopt,
	.getsockopt	= sock_common_getsockopt,
	.sendmsg	= inet_sendmsg,
	.recvmsg	= sock_common_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};
594 | |||
extern struct net_proto_family inet_family_ops;

/* Registration record tying SOCK_DCCP/IPPROTO_DCCP to our proto and ops;
 * .capability == -1 means no special capability required to create one. */
static struct inet_protosw dccp_v4_protosw = {
	.type		= SOCK_DCCP,
	.protocol	= IPPROTO_DCCP,
	.prot		= &dccp_v4_prot,
	.ops		= &inet_dccp_ops,
	.capability	= -1,
	.no_check	= 0,
	.flags		= 0,
};
606 | |||
607 | /* | ||
608 | * This is the global socket data structure used for responding to | ||
609 | * the Out-of-the-blue (OOTB) packets. A control sock will be created | ||
610 | * for this socket at the initialization time. | ||
611 | */ | ||
612 | struct socket *dccp_ctl_socket; | ||
613 | |||
614 | static char dccp_ctl_socket_err_msg[] __initdata = | ||
615 | KERN_ERR "DCCP: Failed to create the control socket.\n"; | ||
616 | |||
/*
 * Create the kernel-internal control socket used to emit OOTB responses
 * (e.g. RESETs for packets that match no socket).  Returns 0 or a negative
 * errno from sock_create_kern().
 */
static int __init dccp_ctl_sock_init(void)
{
	int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP,
				  &dccp_ctl_socket);
	if (rc < 0)
		printk(dccp_ctl_socket_err_msg);
	else {
		/* Sent from softirq context, so allocations must be atomic. */
		dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC;
		inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1;

		/* Unhash it so that IP input processing does not even
		 * see it, we do not wish this socket to see incoming
		 * packets.
		 */
		dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk);
	}

	return rc;
}
636 | |||
#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
/* Tear down the OOTB control socket; idempotent (no-op if never created). */
void dccp_ctl_sock_exit(void)
{
	if (dccp_ctl_socket == NULL)
		return;

	sock_release(dccp_ctl_socket);
	dccp_ctl_socket = NULL;
}

EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit);
#endif
648 | |||
649 | static int __init init_dccp_v4_mibs(void) | ||
650 | { | ||
651 | int rc = -ENOMEM; | ||
652 | |||
653 | dccp_statistics[0] = alloc_percpu(struct dccp_mib); | ||
654 | if (dccp_statistics[0] == NULL) | ||
655 | goto out; | ||
656 | |||
657 | dccp_statistics[1] = alloc_percpu(struct dccp_mib); | ||
658 | if (dccp_statistics[1] == NULL) | ||
659 | goto out_free_one; | ||
660 | |||
661 | rc = 0; | ||
662 | out: | ||
663 | return rc; | ||
664 | out_free_one: | ||
665 | free_percpu(dccp_statistics[0]); | ||
666 | dccp_statistics[0] = NULL; | ||
667 | goto out; | ||
668 | |||
669 | } | ||
670 | |||
/* Optional override for the established-hash size (read-only module param). */
static int thash_entries;
module_param(thash_entries, int, 0444);
MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");

#ifdef CONFIG_IP_DCCP_DEBUG
/* Enables dccp_pr_debug() output when built with debug support. */
int dccp_debug;
module_param(dccp_debug, int, 0444);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
#endif
680 | |||
/*
 * Module init: register the proto, size and allocate the established (ehash)
 * and bind (bhash) tables from total memory (mirroring TCP's heuristic),
 * allocate the MIBs, hook into IPv4 and create the control socket.
 * On any failure the goto ladder at the bottom unwinds in reverse order.
 */
static int __init dccp_init(void)
{
	unsigned long goal;
	int ehash_order, bhash_order, i;
	int rc = proto_register(&dccp_v4_prot, 1);

	if (rc)
		goto out;

	dccp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("dccp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!dccp_hashinfo.bind_bucket_cachep)
		goto out_proto_unregister;

	/*
	 * Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	if (num_physpages >= (128 * 1024))
		goal = num_physpages >> (21 - PAGE_SHIFT);
	else
		goal = num_physpages >> (23 - PAGE_SHIFT);

	if (thash_entries)
		goal = (thash_entries *
			sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
	for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
		;
	/* Retry with smaller orders if the page allocation fails;
	 * ehash_size is rounded down to a power of two and halved because
	 * the table holds two chains (established + time-wait) per slot. */
	do {
		dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
					sizeof(struct inet_ehash_bucket);
		dccp_hashinfo.ehash_size >>= 1;
		while (dccp_hashinfo.ehash_size &
		       (dccp_hashinfo.ehash_size - 1))
			dccp_hashinfo.ehash_size--;
		dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
			__get_free_pages(GFP_ATOMIC, ehash_order);
	} while (!dccp_hashinfo.ehash && --ehash_order > 0);

	if (!dccp_hashinfo.ehash) {
		printk(KERN_CRIT "Failed to allocate DCCP "
		       "established hash table\n");
		goto out_free_bind_bucket_cachep;
	}

	/* << 1: initialize both halves (established and time-wait chains). */
	for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
		rwlock_init(&dccp_hashinfo.ehash[i].lock);
		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
	}

	bhash_order = ehash_order;

	/* Cap the bind table at 64K entries, shrinking the order on
	 * allocation failure as above. */
	do {
		dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
					sizeof(struct inet_bind_hashbucket);
		if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
		    bhash_order > 0)
			continue;
		dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
			__get_free_pages(GFP_ATOMIC, bhash_order);
	} while (!dccp_hashinfo.bhash && --bhash_order >= 0);

	if (!dccp_hashinfo.bhash) {
		printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n");
		goto out_free_dccp_ehash;
	}

	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
	}

	if (init_dccp_v4_mibs())
		goto out_free_dccp_bhash;

	rc = -EAGAIN;
	if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP))
		goto out_free_dccp_v4_mibs;

	inet_register_protosw(&dccp_v4_protosw);

	rc = dccp_ctl_sock_init();
	if (rc)
		goto out_unregister_protosw;
out:
	return rc;
out_unregister_protosw:
	inet_unregister_protosw(&dccp_v4_protosw);
	inet_del_protocol(&dccp_protocol, IPPROTO_DCCP);
out_free_dccp_v4_mibs:
	free_percpu(dccp_statistics[0]);
	free_percpu(dccp_statistics[1]);
	dccp_statistics[0] = dccp_statistics[1] = NULL;
out_free_dccp_bhash:
	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
	dccp_hashinfo.bhash = NULL;
out_free_dccp_ehash:
	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
	dccp_hashinfo.ehash = NULL;
out_free_bind_bucket_cachep:
	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
	dccp_hashinfo.bind_bucket_cachep = NULL;
out_proto_unregister:
	proto_unregister(&dccp_v4_prot);
	goto out;
}
791 | |||
static const char dccp_del_proto_err_msg[] __exitdata =
	KERN_ERR "can't remove dccp net_protocol\n";

/* Module exit: undo dccp_init() in reverse registration order. */
static void __exit dccp_fini(void)
{
	inet_unregister_protosw(&dccp_v4_protosw);

	if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0)
		printk(dccp_del_proto_err_msg);

	free_percpu(dccp_statistics[0]);
	free_percpu(dccp_statistics[1]);
	/* Recompute the allocation orders from the table sizes, matching
	 * how dccp_init() sized them. */
	free_pages((unsigned long)dccp_hashinfo.bhash,
		   get_order(dccp_hashinfo.bhash_size *
			     sizeof(struct inet_bind_hashbucket)));
	free_pages((unsigned long)dccp_hashinfo.ehash,
		   get_order(dccp_hashinfo.ehash_size *
			     sizeof(struct inet_ehash_bucket)));
	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
	proto_unregister(&dccp_v4_prot);
}
813 | |||
module_init(dccp_init);
module_exit(dccp_fini);

/*
 * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
 * values directly, Also cover the case where the protocol is not specified,
 * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
 */
MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6");
MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/timer.c b/net/dccp/timer.c new file mode 100644 index 000000000000..aa34b576e228 --- /dev/null +++ b/net/dccp/timer.c | |||
@@ -0,0 +1,255 @@ | |||
1 | /* | ||
2 | * net/dccp/timer.c | ||
3 | * | ||
4 | * An implementation of the DCCP protocol | ||
5 | * Arnaldo Carvalho de Melo <acme@conectiva.com.br> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | */ | ||
12 | |||
13 | #include <linux/config.h> | ||
14 | #include <linux/dccp.h> | ||
15 | #include <linux/skbuff.h> | ||
16 | |||
17 | #include "dccp.h" | ||
18 | |||
19 | static void dccp_write_timer(unsigned long data); | ||
20 | static void dccp_keepalive_timer(unsigned long data); | ||
21 | static void dccp_delack_timer(unsigned long data); | ||
22 | |||
/*
 * Hook DCCP's retransmit, delayed-ACK and keepalive handlers into the
 * generic inet_connection_sock timer machinery for @sk.
 */
void dccp_init_xmit_timers(struct sock *sk)
{
	inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
				  &dccp_keepalive_timer);
}
28 | |||
/*
 * Abort the connection after a fatal write/retransmit timeout: report
 * the error to the socket owner, send a RESET(aborted) to the peer and
 * mark the socket done.
 */
static void dccp_write_err(struct sock *sk)
{
	/* Prefer a previously recorded soft error; default to ETIMEDOUT. */
	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
	sk->sk_error_report(sk);

	dccp_v4_send_reset(sk, DCCP_RESET_CODE_ABORTED);
	dccp_done(sk);
	DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT);
}
38 | |||
39 | /* A write timeout has occurred. Process the after effects. */ | ||
40 | static int dccp_write_timeout(struct sock *sk) | ||
41 | { | ||
42 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
43 | int retry_until; | ||
44 | |||
45 | if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { | ||
46 | if (icsk->icsk_retransmits != 0) | ||
47 | dst_negative_advice(&sk->sk_dst_cache); | ||
48 | retry_until = icsk->icsk_syn_retries ? : | ||
49 | /* FIXME! */ 3 /* FIXME! sysctl_tcp_syn_retries */; | ||
50 | } else { | ||
51 | if (icsk->icsk_retransmits >= | ||
52 | /* FIXME! sysctl_tcp_retries1 */ 5 /* FIXME! */) { | ||
53 | /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu | ||
54 | black hole detection. :-( | ||
55 | |||
56 | It is place to make it. It is not made. I do not want | ||
57 | to make it. It is disguisting. It does not work in any | ||
58 | case. Let me to cite the same draft, which requires for | ||
59 | us to implement this: | ||
60 | |||
61 | "The one security concern raised by this memo is that ICMP black holes | ||
62 | are often caused by over-zealous security administrators who block | ||
63 | all ICMP messages. It is vitally important that those who design and | ||
64 | deploy security systems understand the impact of strict filtering on | ||
65 | upper-layer protocols. The safest web site in the world is worthless | ||
66 | if most TCP implementations cannot transfer data from it. It would | ||
67 | be far nicer to have all of the black holes fixed rather than fixing | ||
68 | all of the TCP implementations." | ||
69 | |||
70 | Golden words :-). | ||
71 | */ | ||
72 | |||
73 | dst_negative_advice(&sk->sk_dst_cache); | ||
74 | } | ||
75 | |||
76 | retry_until = /* FIXME! */ 15 /* FIXME! sysctl_tcp_retries2 */; | ||
77 | /* | ||
78 | * FIXME: see tcp_write_timout and tcp_out_of_resources | ||
79 | */ | ||
80 | } | ||
81 | |||
82 | if (icsk->icsk_retransmits >= retry_until) { | ||
83 | /* Has it gone just too far? */ | ||
84 | dccp_write_err(sk); | ||
85 | return 1; | ||
86 | } | ||
87 | return 0; | ||
88 | } | ||
89 | |||
/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
/*
 * Delayed-ACK timer: fires when an ACK we deferred is now due. If the
 * socket is busy, reschedule shortly; otherwise send the pending ACK
 * and adjust the ACK-timeout estimate (ato).
 */
static void dccp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct inet_connection_sock *icsk = inet_csk(sk);

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		icsk->icsk_ack.blocked = 1;
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
		sk_reset_timer(sk, &icsk->icsk_delack_timer,
			       jiffies + TCP_DELACK_MIN);
		goto out;
	}

	/* Nothing to do if the socket is closed or no delack is pending. */
	if (sk->sk_state == DCCP_CLOSED ||
	    !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
		goto out;
	/* Fired early (timeout still in the future): re-arm and bail. */
	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_delack_timer,
			       icsk->icsk_ack.timeout);
		goto out;
	}

	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;

	if (inet_csk_ack_scheduled(sk)) {
		if (!icsk->icsk_ack.pingpong) {
			/* Delayed ACK missed: inflate ATO. */
			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
						 icsk->icsk_rto);
		} else {
			/* Delayed ACK missed: leave pingpong mode and
			 * deflate ATO.
			 */
			icsk->icsk_ack.pingpong = 0;
			icsk->icsk_ack.ato = TCP_ATO_MIN;
		}
		dccp_send_ack(sk);
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
	}
out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
136 | |||
/*
 * The DCCP retransmit timer.
 *
 * Retransmits the packet at the head of the write queue, backing off
 * the RTO exponentially, unless dccp_write_timeout() decides the
 * connection is past saving.
 */
static void dccp_retransmit_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	/*
	 * sk->sk_send_head has to have one skb with
	 * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP
	 * packet types (REQUEST, RESPONSE, the ACK in the 3way handshake
	 * (PARTOPEN timer), etc).
	 */
	BUG_TRAP(sk->sk_send_head != NULL);

	/*
	 * More than 4MSL (8 minutes) has passed, a RESET(aborted) was
	 * sent, no need to retransmit, this sock is dead.
	 */
	if (dccp_write_timeout(sk))
		goto out;

	/*
	 * We want to know the number of packets retransmitted, not the
	 * total number of retransmissions of clones of original packets.
	 */
	if (icsk->icsk_retransmits == 0)
		DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS);

	if (dccp_retransmit_skb(sk, sk->sk_send_head) < 0) {
		/*
		 * Retransmission failed because of local congestion,
		 * do not backoff.
		 */
		if (icsk->icsk_retransmits == 0)
			icsk->icsk_retransmits = 1;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  min(icsk->icsk_rto,
					      TCP_RESOURCE_PROBE_INTERVAL),
					  DCCP_RTO_MAX);
		goto out;
	}

	/* Successful retransmit: back the RTO off exponentially, capped
	 * at DCCP_RTO_MAX, and re-arm the retransmit timer. */
	icsk->icsk_backoff++;
	icsk->icsk_retransmits++;

	icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
				  DCCP_RTO_MAX);
	if (icsk->icsk_retransmits > 3 /* FIXME: sysctl_dccp_retries1 */)
		__sk_dst_reset(sk);
out:;
}
190 | |||
/*
 * Write-timer entry point (registered via dccp_init_xmit_timers).
 * Dispatches the pending icsk event — currently only ICSK_TIME_RETRANS —
 * once the socket lock can be taken and the timeout has really expired.
 */
static void dccp_write_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct inet_connection_sock *icsk = inet_csk(sk);
	int event = 0;

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later */
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
			       jiffies + (HZ / 20));
		goto out;
	}

	/* Nothing to do if the socket is closed or no event is pending. */
	if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending)
		goto out;

	/* Fired early (timeout still in the future): re-arm and bail. */
	if (time_after(icsk->icsk_timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
			       icsk->icsk_timeout);
		goto out;
	}

	/* Consume the pending event before handling it. */
	event = icsk->icsk_pending;
	icsk->icsk_pending = 0;

	switch (event) {
	case ICSK_TIME_RETRANS:
		dccp_retransmit_timer(sk);
		break;
	}
out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
226 | |||
/*
 * Timer for listening sockets: prune the request (pending-connection)
 * queue of stale entries via the generic icsk helper.
 */
static void dccp_response_timer(struct sock *sk)
{
	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT,
				   DCCP_RTO_MAX);
}
235 | |||
/*
 * Keepalive timer. For DCCP this currently only services listening
 * sockets (pruning their request queue); it has no keepalive-probe
 * role for established connections.
 */
static void dccp_keepalive_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	/* Only process if socket is not in use. */
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		inet_csk_reset_keepalive_timer(sk, HZ / 20);
		goto out;
	}

	if (sk->sk_state == DCCP_LISTEN) {
		dccp_response_timer(sk);
		goto out;
	}
out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index acdd18e6adb2..621680f127af 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c | |||
@@ -118,7 +118,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat | |||
118 | #include <linux/netfilter.h> | 118 | #include <linux/netfilter.h> |
119 | #include <linux/seq_file.h> | 119 | #include <linux/seq_file.h> |
120 | #include <net/sock.h> | 120 | #include <net/sock.h> |
121 | #include <net/tcp.h> | 121 | #include <net/tcp_states.h> |
122 | #include <net/flow.h> | 122 | #include <net/flow.h> |
123 | #include <asm/system.h> | 123 | #include <asm/system.h> |
124 | #include <asm/ioctls.h> | 124 | #include <asm/ioctls.h> |
@@ -1763,7 +1763,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
1763 | nskb = skb->next; | 1763 | nskb = skb->next; |
1764 | 1764 | ||
1765 | if (skb->len == 0) { | 1765 | if (skb->len == 0) { |
1766 | skb_unlink(skb); | 1766 | skb_unlink(skb, queue); |
1767 | kfree_skb(skb); | 1767 | kfree_skb(skb); |
1768 | /* | 1768 | /* |
1769 | * N.B. Don't refer to skb or cb after this point | 1769 | * N.B. Don't refer to skb or cb after this point |
@@ -2064,7 +2064,7 @@ static struct notifier_block dn_dev_notifier = { | |||
2064 | .notifier_call = dn_device_event, | 2064 | .notifier_call = dn_device_event, |
2065 | }; | 2065 | }; |
2066 | 2066 | ||
2067 | extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *); | 2067 | extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); |
2068 | 2068 | ||
2069 | static struct packet_type dn_dix_packet_type = { | 2069 | static struct packet_type dn_dix_packet_type = { |
2070 | .type = __constant_htons(ETH_P_DNA_RT), | 2070 | .type = __constant_htons(ETH_P_DNA_RT), |
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 00233ecbc9cb..5610bb16dbf9 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c | |||
@@ -752,16 +752,16 @@ static void rtmsg_ifa(int event, struct dn_ifaddr *ifa) | |||
752 | 752 | ||
753 | skb = alloc_skb(size, GFP_KERNEL); | 753 | skb = alloc_skb(size, GFP_KERNEL); |
754 | if (!skb) { | 754 | if (!skb) { |
755 | netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS); | 755 | netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, ENOBUFS); |
756 | return; | 756 | return; |
757 | } | 757 | } |
758 | if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) { | 758 | if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) { |
759 | kfree_skb(skb); | 759 | kfree_skb(skb); |
760 | netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL); | 760 | netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, EINVAL); |
761 | return; | 761 | return; |
762 | } | 762 | } |
763 | NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_IFADDR; | 763 | NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_IFADDR; |
764 | netlink_broadcast(rtnl, skb, 0, RTMGRP_DECnet_IFADDR, GFP_KERNEL); | 764 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_DECnet_IFADDR, GFP_KERNEL); |
765 | } | 765 | } |
766 | 766 | ||
767 | static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) | 767 | static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) |
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 202dbde9850d..369f25b60f3f 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c | |||
@@ -60,7 +60,7 @@ | |||
60 | #include <linux/inet.h> | 60 | #include <linux/inet.h> |
61 | #include <linux/route.h> | 61 | #include <linux/route.h> |
62 | #include <net/sock.h> | 62 | #include <net/sock.h> |
63 | #include <net/tcp.h> | 63 | #include <net/tcp_states.h> |
64 | #include <asm/system.h> | 64 | #include <asm/system.h> |
65 | #include <linux/fcntl.h> | 65 | #include <linux/fcntl.h> |
66 | #include <linux/mm.h> | 66 | #include <linux/mm.h> |
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 8cce1fdbda90..e0bebf4bbcad 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c | |||
@@ -479,7 +479,7 @@ int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff | |||
479 | xmit_count = cb2->xmit_count; | 479 | xmit_count = cb2->xmit_count; |
480 | segnum = cb2->segnum; | 480 | segnum = cb2->segnum; |
481 | /* Remove and drop ack'ed packet */ | 481 | /* Remove and drop ack'ed packet */ |
482 | skb_unlink(ack); | 482 | skb_unlink(ack, q); |
483 | kfree_skb(ack); | 483 | kfree_skb(ack); |
484 | ack = NULL; | 484 | ack = NULL; |
485 | 485 | ||
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 2399fa8a3f86..2c915f305be3 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c | |||
@@ -572,7 +572,7 @@ static int dn_route_ptp_hello(struct sk_buff *skb) | |||
572 | return NET_RX_SUCCESS; | 572 | return NET_RX_SUCCESS; |
573 | } | 573 | } |
574 | 574 | ||
575 | int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 575 | int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
576 | { | 576 | { |
577 | struct dn_skb_cb *cb; | 577 | struct dn_skb_cb *cb; |
578 | unsigned char flags = 0; | 578 | unsigned char flags = 0; |
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 28ba5777a25a..eeba56f99323 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c | |||
@@ -79,7 +79,7 @@ for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_n | |||
79 | static DEFINE_RWLOCK(dn_fib_tables_lock); | 79 | static DEFINE_RWLOCK(dn_fib_tables_lock); |
80 | struct dn_fib_table *dn_fib_tables[RT_TABLE_MAX + 1]; | 80 | struct dn_fib_table *dn_fib_tables[RT_TABLE_MAX + 1]; |
81 | 81 | ||
82 | static kmem_cache_t *dn_hash_kmem; | 82 | static kmem_cache_t *dn_hash_kmem __read_mostly; |
83 | static int dn_fib_hash_zombies; | 83 | static int dn_fib_hash_zombies; |
84 | 84 | ||
85 | static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz) | 85 | static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz) |
@@ -349,10 +349,10 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, int tb_id, | |||
349 | kfree_skb(skb); | 349 | kfree_skb(skb); |
350 | return; | 350 | return; |
351 | } | 351 | } |
352 | NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_ROUTE; | 352 | NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_ROUTE; |
353 | if (nlh->nlmsg_flags & NLM_F_ECHO) | 353 | if (nlh->nlmsg_flags & NLM_F_ECHO) |
354 | atomic_inc(&skb->users); | 354 | atomic_inc(&skb->users); |
355 | netlink_broadcast(rtnl, skb, pid, RTMGRP_DECnet_ROUTE, GFP_KERNEL); | 355 | netlink_broadcast(rtnl, skb, pid, RTNLGRP_DECnet_ROUTE, GFP_KERNEL); |
356 | if (nlh->nlmsg_flags & NLM_F_ECHO) | 356 | if (nlh->nlmsg_flags & NLM_F_ECHO) |
357 | netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); | 357 | netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); |
358 | } | 358 | } |
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 284a9998e53d..1ab94c6e22ed 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/netfilter.h> | 19 | #include <linux/netfilter.h> |
20 | #include <linux/spinlock.h> | 20 | #include <linux/spinlock.h> |
21 | #include <linux/netlink.h> | 21 | #include <linux/netlink.h> |
22 | #include <linux/netfilter_decnet.h> | ||
22 | 23 | ||
23 | #include <net/sock.h> | 24 | #include <net/sock.h> |
24 | #include <net/flow.h> | 25 | #include <net/flow.h> |
@@ -71,10 +72,10 @@ static void dnrmg_send_peer(struct sk_buff *skb) | |||
71 | 72 | ||
72 | switch(flags & DN_RT_CNTL_MSK) { | 73 | switch(flags & DN_RT_CNTL_MSK) { |
73 | case DN_RT_PKT_L1RT: | 74 | case DN_RT_PKT_L1RT: |
74 | group = DNRMG_L1_GROUP; | 75 | group = DNRNG_NLGRP_L1; |
75 | break; | 76 | break; |
76 | case DN_RT_PKT_L2RT: | 77 | case DN_RT_PKT_L2RT: |
77 | group = DNRMG_L2_GROUP; | 78 | group = DNRNG_NLGRP_L2; |
78 | break; | 79 | break; |
79 | default: | 80 | default: |
80 | return; | 81 | return; |
@@ -83,7 +84,7 @@ static void dnrmg_send_peer(struct sk_buff *skb) | |||
83 | skb2 = dnrmg_build_message(skb, &status); | 84 | skb2 = dnrmg_build_message(skb, &status); |
84 | if (skb2 == NULL) | 85 | if (skb2 == NULL) |
85 | return; | 86 | return; |
86 | NETLINK_CB(skb2).dst_groups = group; | 87 | NETLINK_CB(skb2).dst_group = group; |
87 | netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC); | 88 | netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC); |
88 | } | 89 | } |
89 | 90 | ||
@@ -138,7 +139,8 @@ static int __init init(void) | |||
138 | { | 139 | { |
139 | int rv = 0; | 140 | int rv = 0; |
140 | 141 | ||
141 | dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, dnrmg_receive_user_sk); | 142 | dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, DNRNG_NLGRP_MAX, |
143 | dnrmg_receive_user_sk, THIS_MODULE); | ||
142 | if (dnrmg == NULL) { | 144 | if (dnrmg == NULL) { |
143 | printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); | 145 | printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); |
144 | return -ENOMEM; | 146 | return -ENOMEM; |
@@ -162,6 +164,7 @@ static void __exit fini(void) | |||
162 | MODULE_DESCRIPTION("DECnet Routing Message Grabulator"); | 164 | MODULE_DESCRIPTION("DECnet Routing Message Grabulator"); |
163 | MODULE_AUTHOR("Steven Whitehouse <steve@chygwyn.com>"); | 165 | MODULE_AUTHOR("Steven Whitehouse <steve@chygwyn.com>"); |
164 | MODULE_LICENSE("GPL"); | 166 | MODULE_LICENSE("GPL"); |
167 | MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG); | ||
165 | 168 | ||
166 | module_init(init); | 169 | module_init(init); |
167 | module_exit(fini); | 170 | module_exit(fini); |
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index de691e119e17..4a62093eb343 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c | |||
@@ -159,7 +159,7 @@ static int econet_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
159 | err = memcpy_toiovec(msg->msg_iov, skb->data, copied); | 159 | err = memcpy_toiovec(msg->msg_iov, skb->data, copied); |
160 | if (err) | 160 | if (err) |
161 | goto out_free; | 161 | goto out_free; |
162 | sk->sk_stamp = skb->stamp; | 162 | skb_get_timestamp(skb, &sk->sk_stamp); |
163 | 163 | ||
164 | if (msg->msg_name) | 164 | if (msg->msg_name) |
165 | memcpy(msg->msg_name, skb->cb, msg->msg_namelen); | 165 | memcpy(msg->msg_name, skb->cb, msg->msg_namelen); |
@@ -869,7 +869,7 @@ static void aun_tx_ack(unsigned long seq, int result) | |||
869 | 869 | ||
870 | foundit: | 870 | foundit: |
871 | tx_result(skb->sk, eb->cookie, result); | 871 | tx_result(skb->sk, eb->cookie, result); |
872 | skb_unlink(skb); | 872 | skb_unlink(skb, &aun_queue); |
873 | spin_unlock_irqrestore(&aun_queue_lock, flags); | 873 | spin_unlock_irqrestore(&aun_queue_lock, flags); |
874 | kfree_skb(skb); | 874 | kfree_skb(skb); |
875 | } | 875 | } |
@@ -947,7 +947,7 @@ static void ab_cleanup(unsigned long h) | |||
947 | { | 947 | { |
948 | tx_result(skb->sk, eb->cookie, | 948 | tx_result(skb->sk, eb->cookie, |
949 | ECTYPE_TRANSMIT_NOT_PRESENT); | 949 | ECTYPE_TRANSMIT_NOT_PRESENT); |
950 | skb_unlink(skb); | 950 | skb_unlink(skb, &aun_queue); |
951 | kfree_skb(skb); | 951 | kfree_skb(skb); |
952 | } | 952 | } |
953 | skb = newskb; | 953 | skb = newskb; |
@@ -1009,7 +1009,7 @@ release: | |||
1009 | * Receive an Econet frame from a device. | 1009 | * Receive an Econet frame from a device. |
1010 | */ | 1010 | */ |
1011 | 1011 | ||
1012 | static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 1012 | static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
1013 | { | 1013 | { |
1014 | struct ec_framehdr *hdr; | 1014 | struct ec_framehdr *hdr; |
1015 | struct sock *sk; | 1015 | struct sock *sk; |
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index f6dbfb99b14d..87a052a9a84f 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c | |||
@@ -62,8 +62,6 @@ | |||
62 | #include <asm/system.h> | 62 | #include <asm/system.h> |
63 | #include <asm/checksum.h> | 63 | #include <asm/checksum.h> |
64 | 64 | ||
65 | extern int __init netdev_boot_setup(char *str); | ||
66 | |||
67 | __setup("ether=", netdev_boot_setup); | 65 | __setup("ether=", netdev_boot_setup); |
68 | 66 | ||
69 | /* | 67 | /* |
@@ -163,7 +161,6 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) | |||
163 | skb->mac.raw=skb->data; | 161 | skb->mac.raw=skb->data; |
164 | skb_pull(skb,ETH_HLEN); | 162 | skb_pull(skb,ETH_HLEN); |
165 | eth = eth_hdr(skb); | 163 | eth = eth_hdr(skb); |
166 | skb->input_dev = dev; | ||
167 | 164 | ||
168 | if(*eth->h_dest&1) | 165 | if(*eth->h_dest&1) |
169 | { | 166 | { |
diff --git a/net/ethernet/sysctl_net_ether.c b/net/ethernet/sysctl_net_ether.c index b81a6d532342..66b39fc342d2 100644 --- a/net/ethernet/sysctl_net_ether.c +++ b/net/ethernet/sysctl_net_ether.c | |||
@@ -7,6 +7,7 @@ | |||
7 | 7 | ||
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/sysctl.h> | 9 | #include <linux/sysctl.h> |
10 | #include <linux/if_ether.h> | ||
10 | 11 | ||
11 | ctl_table ether_table[] = { | 12 | ctl_table ether_table[] = { |
12 | {0} | 13 | {0} |
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 0b3d9f1d8069..e55136ae09f4 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig | |||
@@ -413,20 +413,19 @@ config INET_TUNNEL | |||
413 | 413 | ||
414 | If unsure, say Y. | 414 | If unsure, say Y. |
415 | 415 | ||
416 | config IP_TCPDIAG | 416 | config INET_DIAG |
417 | tristate "IP: TCP socket monitoring interface" | 417 | tristate "INET: socket monitoring interface" |
418 | default y | 418 | default y |
419 | ---help--- | 419 | ---help--- |
420 | Support for TCP socket monitoring interface used by native Linux | 420 | Support for INET (TCP, DCCP, etc) socket monitoring interface used by |
421 | tools such as ss. ss is included in iproute2, currently downloadable | 421 | native Linux tools such as ss. ss is included in iproute2, currently |
422 | at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support | 422 | downloadable at <http://developer.osdl.org/dev/iproute2>. |
423 | and have selected IPv6 as a module, you need to build this as a | ||
424 | module too. | ||
425 | 423 | ||
426 | If unsure, say Y. | 424 | If unsure, say Y. |
427 | 425 | ||
428 | config IP_TCPDIAG_IPV6 | 426 | config INET_TCP_DIAG |
429 | def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) | 427 | depends on INET_DIAG |
428 | def_tristate INET_DIAG | ||
430 | 429 | ||
431 | config TCP_CONG_ADVANCED | 430 | config TCP_CONG_ADVANCED |
432 | bool "TCP: advanced congestion control" | 431 | bool "TCP: advanced congestion control" |
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 55dc6cca1e7b..f0435d00db6b 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile | |||
@@ -4,11 +4,12 @@ | |||
4 | 4 | ||
5 | obj-y := route.o inetpeer.o protocol.o \ | 5 | obj-y := route.o inetpeer.o protocol.o \ |
6 | ip_input.o ip_fragment.o ip_forward.o ip_options.o \ | 6 | ip_input.o ip_fragment.o ip_forward.o ip_options.o \ |
7 | ip_output.o ip_sockglue.o \ | 7 | ip_output.o ip_sockglue.o inet_hashtables.o \ |
8 | inet_timewait_sock.o inet_connection_sock.o \ | ||
8 | tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ | 9 | tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ |
9 | tcp_minisocks.o tcp_cong.o \ | 10 | tcp_minisocks.o tcp_cong.o \ |
10 | datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ | 11 | datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ |
11 | sysctl_net_ipv4.o fib_frontend.o fib_semantics.o | 12 | sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o |
12 | 13 | ||
13 | obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o | 14 | obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o |
14 | obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o | 15 | obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o |
@@ -29,8 +30,9 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o | |||
29 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o | 30 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o |
30 | obj-$(CONFIG_NETFILTER) += netfilter/ | 31 | obj-$(CONFIG_NETFILTER) += netfilter/ |
31 | obj-$(CONFIG_IP_VS) += ipvs/ | 32 | obj-$(CONFIG_IP_VS) += ipvs/ |
32 | obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o | 33 | obj-$(CONFIG_INET_DIAG) += inet_diag.o |
33 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o | 34 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o |
35 | obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o | ||
34 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o | 36 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o |
35 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o | 37 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o |
36 | obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o | 38 | obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o |
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 163ae4068b5f..bf147f8db399 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
@@ -99,6 +99,7 @@ | |||
99 | #include <net/arp.h> | 99 | #include <net/arp.h> |
100 | #include <net/route.h> | 100 | #include <net/route.h> |
101 | #include <net/ip_fib.h> | 101 | #include <net/ip_fib.h> |
102 | #include <net/inet_connection_sock.h> | ||
102 | #include <net/tcp.h> | 103 | #include <net/tcp.h> |
103 | #include <net/udp.h> | 104 | #include <net/udp.h> |
104 | #include <linux/skbuff.h> | 105 | #include <linux/skbuff.h> |
@@ -112,11 +113,7 @@ | |||
112 | #include <linux/mroute.h> | 113 | #include <linux/mroute.h> |
113 | #endif | 114 | #endif |
114 | 115 | ||
115 | DEFINE_SNMP_STAT(struct linux_mib, net_statistics); | 116 | DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; |
116 | |||
117 | #ifdef INET_REFCNT_DEBUG | ||
118 | atomic_t inet_sock_nr; | ||
119 | #endif | ||
120 | 117 | ||
121 | extern void ip_mc_drop_socket(struct sock *sk); | 118 | extern void ip_mc_drop_socket(struct sock *sk); |
122 | 119 | ||
@@ -153,11 +150,7 @@ void inet_sock_destruct(struct sock *sk) | |||
153 | if (inet->opt) | 150 | if (inet->opt) |
154 | kfree(inet->opt); | 151 | kfree(inet->opt); |
155 | dst_release(sk->sk_dst_cache); | 152 | dst_release(sk->sk_dst_cache); |
156 | #ifdef INET_REFCNT_DEBUG | 153 | sk_refcnt_debug_dec(sk); |
157 | atomic_dec(&inet_sock_nr); | ||
158 | printk(KERN_DEBUG "INET socket %p released, %d are still alive\n", | ||
159 | sk, atomic_read(&inet_sock_nr)); | ||
160 | #endif | ||
161 | } | 154 | } |
162 | 155 | ||
163 | /* | 156 | /* |
@@ -210,7 +203,7 @@ int inet_listen(struct socket *sock, int backlog) | |||
210 | * we can only allow the backlog to be adjusted. | 203 | * we can only allow the backlog to be adjusted. |
211 | */ | 204 | */ |
212 | if (old_state != TCP_LISTEN) { | 205 | if (old_state != TCP_LISTEN) { |
213 | err = tcp_listen_start(sk); | 206 | err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); |
214 | if (err) | 207 | if (err) |
215 | goto out; | 208 | goto out; |
216 | } | 209 | } |
@@ -235,12 +228,14 @@ static int inet_create(struct socket *sock, int protocol) | |||
235 | struct proto *answer_prot; | 228 | struct proto *answer_prot; |
236 | unsigned char answer_flags; | 229 | unsigned char answer_flags; |
237 | char answer_no_check; | 230 | char answer_no_check; |
238 | int err; | 231 | int try_loading_module = 0; |
232 | int err = -ESOCKTNOSUPPORT; | ||
239 | 233 | ||
240 | sock->state = SS_UNCONNECTED; | 234 | sock->state = SS_UNCONNECTED; |
241 | 235 | ||
242 | /* Look for the requested type/protocol pair. */ | 236 | /* Look for the requested type/protocol pair. */ |
243 | answer = NULL; | 237 | answer = NULL; |
238 | lookup_protocol: | ||
244 | rcu_read_lock(); | 239 | rcu_read_lock(); |
245 | list_for_each_rcu(p, &inetsw[sock->type]) { | 240 | list_for_each_rcu(p, &inetsw[sock->type]) { |
246 | answer = list_entry(p, struct inet_protosw, list); | 241 | answer = list_entry(p, struct inet_protosw, list); |
@@ -261,9 +256,28 @@ static int inet_create(struct socket *sock, int protocol) | |||
261 | answer = NULL; | 256 | answer = NULL; |
262 | } | 257 | } |
263 | 258 | ||
264 | err = -ESOCKTNOSUPPORT; | 259 | if (unlikely(answer == NULL)) { |
265 | if (!answer) | 260 | if (try_loading_module < 2) { |
266 | goto out_rcu_unlock; | 261 | rcu_read_unlock(); |
262 | /* | ||
263 | * Be more specific, e.g. net-pf-2-proto-132-type-1 | ||
264 | * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) | ||
265 | */ | ||
266 | if (++try_loading_module == 1) | ||
267 | request_module("net-pf-%d-proto-%d-type-%d", | ||
268 | PF_INET, protocol, sock->type); | ||
269 | /* | ||
270 | * Fall back to generic, e.g. net-pf-2-proto-132 | ||
271 | * (net-pf-PF_INET-proto-IPPROTO_SCTP) | ||
272 | */ | ||
273 | else | ||
274 | request_module("net-pf-%d-proto-%d", | ||
275 | PF_INET, protocol); | ||
276 | goto lookup_protocol; | ||
277 | } else | ||
278 | goto out_rcu_unlock; | ||
279 | } | ||
280 | |||
267 | err = -EPERM; | 281 | err = -EPERM; |
268 | if (answer->capability > 0 && !capable(answer->capability)) | 282 | if (answer->capability > 0 && !capable(answer->capability)) |
269 | goto out_rcu_unlock; | 283 | goto out_rcu_unlock; |
@@ -317,9 +331,7 @@ static int inet_create(struct socket *sock, int protocol) | |||
317 | inet->mc_index = 0; | 331 | inet->mc_index = 0; |
318 | inet->mc_list = NULL; | 332 | inet->mc_list = NULL; |
319 | 333 | ||
320 | #ifdef INET_REFCNT_DEBUG | 334 | sk_refcnt_debug_inc(sk); |
321 | atomic_inc(&inet_sock_nr); | ||
322 | #endif | ||
323 | 335 | ||
324 | if (inet->num) { | 336 | if (inet->num) { |
325 | /* It assumes that any protocol which allows | 337 | /* It assumes that any protocol which allows |
@@ -847,10 +859,6 @@ static struct net_proto_family inet_family_ops = { | |||
847 | .owner = THIS_MODULE, | 859 | .owner = THIS_MODULE, |
848 | }; | 860 | }; |
849 | 861 | ||
850 | |||
851 | extern void tcp_init(void); | ||
852 | extern void tcp_v4_init(struct net_proto_family *); | ||
853 | |||
854 | /* Upon startup we insert all the elements in inetsw_array[] into | 862 | /* Upon startup we insert all the elements in inetsw_array[] into |
855 | * the linked list inetsw. | 863 | * the linked list inetsw. |
856 | */ | 864 | */ |
@@ -961,6 +969,119 @@ void inet_unregister_protosw(struct inet_protosw *p) | |||
961 | } | 969 | } |
962 | } | 970 | } |
963 | 971 | ||
972 | /* | ||
973 | * Shall we try to damage output packets if routing dev changes? | ||
974 | */ | ||
975 | |||
976 | int sysctl_ip_dynaddr; | ||
977 | |||
978 | static int inet_sk_reselect_saddr(struct sock *sk) | ||
979 | { | ||
980 | struct inet_sock *inet = inet_sk(sk); | ||
981 | int err; | ||
982 | struct rtable *rt; | ||
983 | __u32 old_saddr = inet->saddr; | ||
984 | __u32 new_saddr; | ||
985 | __u32 daddr = inet->daddr; | ||
986 | |||
987 | if (inet->opt && inet->opt->srr) | ||
988 | daddr = inet->opt->faddr; | ||
989 | |||
990 | /* Query new route. */ | ||
991 | err = ip_route_connect(&rt, daddr, 0, | ||
992 | RT_CONN_FLAGS(sk), | ||
993 | sk->sk_bound_dev_if, | ||
994 | sk->sk_protocol, | ||
995 | inet->sport, inet->dport, sk); | ||
996 | if (err) | ||
997 | return err; | ||
998 | |||
999 | sk_setup_caps(sk, &rt->u.dst); | ||
1000 | |||
1001 | new_saddr = rt->rt_src; | ||
1002 | |||
1003 | if (new_saddr == old_saddr) | ||
1004 | return 0; | ||
1005 | |||
1006 | if (sysctl_ip_dynaddr > 1) { | ||
1007 | printk(KERN_INFO "%s(): shifting inet->" | ||
1008 | "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", | ||
1009 | __FUNCTION__, | ||
1010 | NIPQUAD(old_saddr), | ||
1011 | NIPQUAD(new_saddr)); | ||
1012 | } | ||
1013 | |||
1014 | inet->saddr = inet->rcv_saddr = new_saddr; | ||
1015 | |||
1016 | /* | ||
1017 | * XXX The only one ugly spot where we need to | ||
1018 | * XXX really change the sockets identity after | ||
1019 | * XXX it has entered the hashes. -DaveM | ||
1020 | * | ||
1021 | * Besides that, it does not check for connection | ||
1022 | * uniqueness. Wait for troubles. | ||
1023 | */ | ||
1024 | __sk_prot_rehash(sk); | ||
1025 | return 0; | ||
1026 | } | ||
1027 | |||
1028 | int inet_sk_rebuild_header(struct sock *sk) | ||
1029 | { | ||
1030 | struct inet_sock *inet = inet_sk(sk); | ||
1031 | struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); | ||
1032 | u32 daddr; | ||
1033 | int err; | ||
1034 | |||
1035 | /* Route is OK, nothing to do. */ | ||
1036 | if (rt) | ||
1037 | return 0; | ||
1038 | |||
1039 | /* Reroute. */ | ||
1040 | daddr = inet->daddr; | ||
1041 | if (inet->opt && inet->opt->srr) | ||
1042 | daddr = inet->opt->faddr; | ||
1043 | { | ||
1044 | struct flowi fl = { | ||
1045 | .oif = sk->sk_bound_dev_if, | ||
1046 | .nl_u = { | ||
1047 | .ip4_u = { | ||
1048 | .daddr = daddr, | ||
1049 | .saddr = inet->saddr, | ||
1050 | .tos = RT_CONN_FLAGS(sk), | ||
1051 | }, | ||
1052 | }, | ||
1053 | .proto = sk->sk_protocol, | ||
1054 | .uli_u = { | ||
1055 | .ports = { | ||
1056 | .sport = inet->sport, | ||
1057 | .dport = inet->dport, | ||
1058 | }, | ||
1059 | }, | ||
1060 | }; | ||
1061 | |||
1062 | err = ip_route_output_flow(&rt, &fl, sk, 0); | ||
1063 | } | ||
1064 | if (!err) | ||
1065 | sk_setup_caps(sk, &rt->u.dst); | ||
1066 | else { | ||
1067 | /* Routing failed... */ | ||
1068 | sk->sk_route_caps = 0; | ||
1069 | /* | ||
1070 | * Other protocols have to map its equivalent state to TCP_SYN_SENT. | ||
1071 | * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme | ||
1072 | */ | ||
1073 | if (!sysctl_ip_dynaddr || | ||
1074 | sk->sk_state != TCP_SYN_SENT || | ||
1075 | (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || | ||
1076 | (err = inet_sk_reselect_saddr(sk)) != 0) | ||
1077 | sk->sk_err_soft = -err; | ||
1078 | } | ||
1079 | |||
1080 | return err; | ||
1081 | } | ||
1082 | |||
1083 | EXPORT_SYMBOL(inet_sk_rebuild_header); | ||
1084 | |||
964 | #ifdef CONFIG_IP_MULTICAST | 1085 | #ifdef CONFIG_IP_MULTICAST |
965 | static struct net_protocol igmp_protocol = { | 1086 | static struct net_protocol igmp_protocol = { |
966 | .handler = igmp_rcv, | 1087 | .handler = igmp_rcv, |
@@ -1007,7 +1128,6 @@ static int __init init_ipv4_mibs(void) | |||
1007 | } | 1128 | } |
1008 | 1129 | ||
1009 | static int ipv4_proc_init(void); | 1130 | static int ipv4_proc_init(void); |
1010 | extern void ipfrag_init(void); | ||
1011 | 1131 | ||
1012 | /* | 1132 | /* |
1013 | * IP protocol layer initialiser | 1133 | * IP protocol layer initialiser |
@@ -1128,19 +1248,10 @@ module_init(inet_init); | |||
1128 | /* ------------------------------------------------------------------------ */ | 1248 | /* ------------------------------------------------------------------------ */ |
1129 | 1249 | ||
1130 | #ifdef CONFIG_PROC_FS | 1250 | #ifdef CONFIG_PROC_FS |
1131 | extern int fib_proc_init(void); | ||
1132 | extern void fib_proc_exit(void); | ||
1133 | #ifdef CONFIG_IP_FIB_TRIE | 1251 | #ifdef CONFIG_IP_FIB_TRIE |
1134 | extern int fib_stat_proc_init(void); | 1252 | extern int fib_stat_proc_init(void); |
1135 | extern void fib_stat_proc_exit(void); | 1253 | extern void fib_stat_proc_exit(void); |
1136 | #endif | 1254 | #endif |
1137 | extern int ip_misc_proc_init(void); | ||
1138 | extern int raw_proc_init(void); | ||
1139 | extern void raw_proc_exit(void); | ||
1140 | extern int tcp4_proc_init(void); | ||
1141 | extern void tcp4_proc_exit(void); | ||
1142 | extern int udp4_proc_init(void); | ||
1143 | extern void udp4_proc_exit(void); | ||
1144 | 1255 | ||
1145 | static int __init ipv4_proc_init(void) | 1256 | static int __init ipv4_proc_init(void) |
1146 | { | 1257 | { |
@@ -1205,7 +1316,3 @@ EXPORT_SYMBOL(inet_stream_ops); | |||
1205 | EXPORT_SYMBOL(inet_unregister_protosw); | 1316 | EXPORT_SYMBOL(inet_unregister_protosw); |
1206 | EXPORT_SYMBOL(net_statistics); | 1317 | EXPORT_SYMBOL(net_statistics); |
1207 | EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); | 1318 | EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); |
1208 | |||
1209 | #ifdef INET_REFCNT_DEBUG | ||
1210 | EXPORT_SYMBOL(inet_sock_nr); | ||
1211 | #endif | ||
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index a642fd612853..8bf312bdea13 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c | |||
@@ -700,7 +700,7 @@ void arp_send(int type, int ptype, u32 dest_ip, | |||
700 | static void parp_redo(struct sk_buff *skb) | 700 | static void parp_redo(struct sk_buff *skb) |
701 | { | 701 | { |
702 | nf_reset(skb); | 702 | nf_reset(skb); |
703 | arp_rcv(skb, skb->dev, NULL); | 703 | arp_rcv(skb, skb->dev, NULL, skb->dev); |
704 | } | 704 | } |
705 | 705 | ||
706 | /* | 706 | /* |
@@ -865,7 +865,7 @@ static int arp_process(struct sk_buff *skb) | |||
865 | if (n) | 865 | if (n) |
866 | neigh_release(n); | 866 | neigh_release(n); |
867 | 867 | ||
868 | if (skb->stamp.tv_sec == LOCALLY_ENQUEUED || | 868 | if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || |
869 | skb->pkt_type == PACKET_HOST || | 869 | skb->pkt_type == PACKET_HOST || |
870 | in_dev->arp_parms->proxy_delay == 0) { | 870 | in_dev->arp_parms->proxy_delay == 0) { |
871 | arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); | 871 | arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); |
@@ -927,7 +927,7 @@ out: | |||
927 | * Receive an arp request from the device layer. | 927 | * Receive an arp request from the device layer. |
928 | */ | 928 | */ |
929 | 929 | ||
930 | int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 930 | int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
931 | { | 931 | { |
932 | struct arphdr *arp; | 932 | struct arphdr *arp; |
933 | 933 | ||
@@ -948,6 +948,8 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | |||
948 | if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) | 948 | if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) |
949 | goto out_of_mem; | 949 | goto out_of_mem; |
950 | 950 | ||
951 | memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); | ||
952 | |||
951 | return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); | 953 | return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); |
952 | 954 | ||
953 | freeskb: | 955 | freeskb: |
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index b1db561f2542..c1b42b5257f8 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c | |||
@@ -16,9 +16,10 @@ | |||
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/ip.h> | 17 | #include <linux/ip.h> |
18 | #include <linux/in.h> | 18 | #include <linux/in.h> |
19 | #include <net/ip.h> | ||
19 | #include <net/sock.h> | 20 | #include <net/sock.h> |
20 | #include <net/tcp.h> | ||
21 | #include <net/route.h> | 21 | #include <net/route.h> |
22 | #include <net/tcp_states.h> | ||
22 | 23 | ||
23 | int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | 24 | int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) |
24 | { | 25 | { |
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d8a10e3dd77d..ba2895ae8151 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c | |||
@@ -1111,13 +1111,12 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa) | |||
1111 | struct sk_buff *skb = alloc_skb(size, GFP_KERNEL); | 1111 | struct sk_buff *skb = alloc_skb(size, GFP_KERNEL); |
1112 | 1112 | ||
1113 | if (!skb) | 1113 | if (!skb) |
1114 | netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); | 1114 | netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, ENOBUFS); |
1115 | else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) { | 1115 | else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) { |
1116 | kfree_skb(skb); | 1116 | kfree_skb(skb); |
1117 | netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); | 1117 | netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, EINVAL); |
1118 | } else { | 1118 | } else { |
1119 | NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR; | 1119 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_IFADDR, GFP_KERNEL); |
1120 | netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL); | ||
1121 | } | 1120 | } |
1122 | } | 1121 | } |
1123 | 1122 | ||
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index ba57446d5d1f..b31ffc5053d2 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c | |||
@@ -331,8 +331,8 @@ static void esp4_err(struct sk_buff *skb, u32 info) | |||
331 | x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); | 331 | x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); |
332 | if (!x) | 332 | if (!x) |
333 | return; | 333 | return; |
334 | NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", | 334 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", |
335 | ntohl(esph->spi), ntohl(iph->daddr))); | 335 | ntohl(esph->spi), ntohl(iph->daddr)); |
336 | xfrm_state_put(x); | 336 | xfrm_state_put(x); |
337 | } | 337 | } |
338 | 338 | ||
@@ -395,10 +395,10 @@ static int esp_init_state(struct xfrm_state *x) | |||
395 | 395 | ||
396 | if (aalg_desc->uinfo.auth.icv_fullbits/8 != | 396 | if (aalg_desc->uinfo.auth.icv_fullbits/8 != |
397 | crypto_tfm_alg_digestsize(esp->auth.tfm)) { | 397 | crypto_tfm_alg_digestsize(esp->auth.tfm)) { |
398 | NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n", | 398 | NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", |
399 | x->aalg->alg_name, | 399 | x->aalg->alg_name, |
400 | crypto_tfm_alg_digestsize(esp->auth.tfm), | 400 | crypto_tfm_alg_digestsize(esp->auth.tfm), |
401 | aalg_desc->uinfo.auth.icv_fullbits/8)); | 401 | aalg_desc->uinfo.auth.icv_fullbits/8); |
402 | goto error; | 402 | goto error; |
403 | } | 403 | } |
404 | 404 | ||
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cd8e45ab9580..4e1379f71269 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c | |||
@@ -558,16 +558,15 @@ static void nl_fib_input(struct sock *sk, int len) | |||
558 | nl_fib_lookup(frn, tb); | 558 | nl_fib_lookup(frn, tb); |
559 | 559 | ||
560 | pid = nlh->nlmsg_pid; /*pid of sending process */ | 560 | pid = nlh->nlmsg_pid; /*pid of sending process */ |
561 | NETLINK_CB(skb).groups = 0; /* not in mcast group */ | ||
562 | NETLINK_CB(skb).pid = 0; /* from kernel */ | 561 | NETLINK_CB(skb).pid = 0; /* from kernel */ |
563 | NETLINK_CB(skb).dst_pid = pid; | 562 | NETLINK_CB(skb).dst_pid = pid; |
564 | NETLINK_CB(skb).dst_groups = 0; /* unicast */ | 563 | NETLINK_CB(skb).dst_group = 0; /* unicast */ |
565 | netlink_unicast(sk, skb, pid, MSG_DONTWAIT); | 564 | netlink_unicast(sk, skb, pid, MSG_DONTWAIT); |
566 | } | 565 | } |
567 | 566 | ||
568 | static void nl_fib_lookup_init(void) | 567 | static void nl_fib_lookup_init(void) |
569 | { | 568 | { |
570 | netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input); | 569 | netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE); |
571 | } | 570 | } |
572 | 571 | ||
573 | static void fib_disable_ip(struct net_device *dev, int force) | 572 | static void fib_disable_ip(struct net_device *dev, int force) |
@@ -662,5 +661,4 @@ void __init ip_fib_init(void) | |||
662 | } | 661 | } |
663 | 662 | ||
664 | EXPORT_SYMBOL(inet_addr_type); | 663 | EXPORT_SYMBOL(inet_addr_type); |
665 | EXPORT_SYMBOL(ip_dev_find); | ||
666 | EXPORT_SYMBOL(ip_rt_ioctl); | 664 | EXPORT_SYMBOL(ip_rt_ioctl); |
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index b10d6bb5ef3d..2a8c9afc3695 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c | |||
@@ -45,8 +45,8 @@ | |||
45 | 45 | ||
46 | #include "fib_lookup.h" | 46 | #include "fib_lookup.h" |
47 | 47 | ||
48 | static kmem_cache_t *fn_hash_kmem; | 48 | static kmem_cache_t *fn_hash_kmem __read_mostly; |
49 | static kmem_cache_t *fn_alias_kmem; | 49 | static kmem_cache_t *fn_alias_kmem __read_mostly; |
50 | 50 | ||
51 | struct fib_node { | 51 | struct fib_node { |
52 | struct hlist_node fn_hash; | 52 | struct hlist_node fn_hash; |
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index b729d97cfa93..ef6609ea0eb7 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h | |||
@@ -7,6 +7,7 @@ | |||
7 | 7 | ||
8 | struct fib_alias { | 8 | struct fib_alias { |
9 | struct list_head fa_list; | 9 | struct list_head fa_list; |
10 | struct rcu_head rcu; | ||
10 | struct fib_info *fa_info; | 11 | struct fib_info *fa_info; |
11 | u8 fa_tos; | 12 | u8 fa_tos; |
12 | u8 fa_type; | 13 | u8 fa_type; |
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e278cb9d0075..d41219e8037c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c | |||
@@ -290,10 +290,10 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa, | |||
290 | kfree_skb(skb); | 290 | kfree_skb(skb); |
291 | return; | 291 | return; |
292 | } | 292 | } |
293 | NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE; | 293 | NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE; |
294 | if (n->nlmsg_flags&NLM_F_ECHO) | 294 | if (n->nlmsg_flags&NLM_F_ECHO) |
295 | atomic_inc(&skb->users); | 295 | atomic_inc(&skb->users); |
296 | netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL); | 296 | netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL); |
297 | if (n->nlmsg_flags&NLM_F_ECHO) | 297 | if (n->nlmsg_flags&NLM_F_ECHO) |
298 | netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); | 298 | netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); |
299 | } | 299 | } |
@@ -854,6 +854,7 @@ failure: | |||
854 | return NULL; | 854 | return NULL; |
855 | } | 855 | } |
856 | 856 | ||
857 | /* Note! fib_semantic_match intentionally uses RCU list functions. */ | ||
857 | int fib_semantic_match(struct list_head *head, const struct flowi *flp, | 858 | int fib_semantic_match(struct list_head *head, const struct flowi *flp, |
858 | struct fib_result *res, __u32 zone, __u32 mask, | 859 | struct fib_result *res, __u32 zone, __u32 mask, |
859 | int prefixlen) | 860 | int prefixlen) |
@@ -861,7 +862,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, | |||
861 | struct fib_alias *fa; | 862 | struct fib_alias *fa; |
862 | int nh_sel = 0; | 863 | int nh_sel = 0; |
863 | 864 | ||
864 | list_for_each_entry(fa, head, fa_list) { | 865 | list_for_each_entry_rcu(fa, head, fa_list) { |
865 | int err; | 866 | int err; |
866 | 867 | ||
867 | if (fa->fa_tos && | 868 | if (fa->fa_tos && |
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index a701405fab0b..b2dea4e5da77 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c | |||
@@ -43,7 +43,7 @@ | |||
43 | * 2 of the License, or (at your option) any later version. | 43 | * 2 of the License, or (at your option) any later version. |
44 | */ | 44 | */ |
45 | 45 | ||
46 | #define VERSION "0.325" | 46 | #define VERSION "0.402" |
47 | 47 | ||
48 | #include <linux/config.h> | 48 | #include <linux/config.h> |
49 | #include <asm/uaccess.h> | 49 | #include <asm/uaccess.h> |
@@ -62,6 +62,7 @@ | |||
62 | #include <linux/netdevice.h> | 62 | #include <linux/netdevice.h> |
63 | #include <linux/if_arp.h> | 63 | #include <linux/if_arp.h> |
64 | #include <linux/proc_fs.h> | 64 | #include <linux/proc_fs.h> |
65 | #include <linux/rcupdate.h> | ||
65 | #include <linux/skbuff.h> | 66 | #include <linux/skbuff.h> |
66 | #include <linux/netlink.h> | 67 | #include <linux/netlink.h> |
67 | #include <linux/init.h> | 68 | #include <linux/init.h> |
@@ -77,56 +78,55 @@ | |||
77 | #undef CONFIG_IP_FIB_TRIE_STATS | 78 | #undef CONFIG_IP_FIB_TRIE_STATS |
78 | #define MAX_CHILDS 16384 | 79 | #define MAX_CHILDS 16384 |
79 | 80 | ||
80 | #define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n))) | ||
81 | #define KEYLENGTH (8*sizeof(t_key)) | 81 | #define KEYLENGTH (8*sizeof(t_key)) |
82 | #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) | 82 | #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) |
83 | #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) | 83 | #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) |
84 | 84 | ||
85 | static DEFINE_RWLOCK(fib_lock); | ||
86 | |||
87 | typedef unsigned int t_key; | 85 | typedef unsigned int t_key; |
88 | 86 | ||
89 | #define T_TNODE 0 | 87 | #define T_TNODE 0 |
90 | #define T_LEAF 1 | 88 | #define T_LEAF 1 |
91 | #define NODE_TYPE_MASK 0x1UL | 89 | #define NODE_TYPE_MASK 0x1UL |
92 | #define NODE_PARENT(_node) \ | 90 | #define NODE_PARENT(node) \ |
93 | ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK)) | 91 | ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK))) |
94 | #define NODE_SET_PARENT(_node, _ptr) \ | 92 | |
95 | ((_node)->_parent = (((unsigned long)(_ptr)) | \ | 93 | #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK) |
96 | ((_node)->_parent & NODE_TYPE_MASK))) | 94 | |
97 | #define NODE_INIT_PARENT(_node, _type) \ | 95 | #define NODE_SET_PARENT(node, ptr) \ |
98 | ((_node)->_parent = (_type)) | 96 | rcu_assign_pointer((node)->parent, \ |
99 | #define NODE_TYPE(_node) \ | 97 | ((unsigned long)(ptr)) | NODE_TYPE(node)) |
100 | ((_node)->_parent & NODE_TYPE_MASK) | 98 | |
101 | 99 | #define IS_TNODE(n) (!(n->parent & T_LEAF)) | |
102 | #define IS_TNODE(n) (!(n->_parent & T_LEAF)) | 100 | #define IS_LEAF(n) (n->parent & T_LEAF) |
103 | #define IS_LEAF(n) (n->_parent & T_LEAF) | ||
104 | 101 | ||
105 | struct node { | 102 | struct node { |
106 | t_key key; | 103 | t_key key; |
107 | unsigned long _parent; | 104 | unsigned long parent; |
108 | }; | 105 | }; |
109 | 106 | ||
110 | struct leaf { | 107 | struct leaf { |
111 | t_key key; | 108 | t_key key; |
112 | unsigned long _parent; | 109 | unsigned long parent; |
113 | struct hlist_head list; | 110 | struct hlist_head list; |
111 | struct rcu_head rcu; | ||
114 | }; | 112 | }; |
115 | 113 | ||
116 | struct leaf_info { | 114 | struct leaf_info { |
117 | struct hlist_node hlist; | 115 | struct hlist_node hlist; |
116 | struct rcu_head rcu; | ||
118 | int plen; | 117 | int plen; |
119 | struct list_head falh; | 118 | struct list_head falh; |
120 | }; | 119 | }; |
121 | 120 | ||
122 | struct tnode { | 121 | struct tnode { |
123 | t_key key; | 122 | t_key key; |
124 | unsigned long _parent; | 123 | unsigned long parent; |
125 | unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ | 124 | unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ |
126 | unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ | 125 | unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ |
127 | unsigned short full_children; /* KEYLENGTH bits needed */ | 126 | unsigned short full_children; /* KEYLENGTH bits needed */ |
128 | unsigned short empty_children; /* KEYLENGTH bits needed */ | 127 | unsigned short empty_children; /* KEYLENGTH bits needed */ |
129 | struct node *child[0]; | 128 | struct rcu_head rcu; |
129 | struct node *child[0]; | ||
130 | }; | 130 | }; |
131 | 131 | ||
132 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 132 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
@@ -150,77 +150,45 @@ struct trie_stat { | |||
150 | }; | 150 | }; |
151 | 151 | ||
152 | struct trie { | 152 | struct trie { |
153 | struct node *trie; | 153 | struct node *trie; |
154 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 154 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
155 | struct trie_use_stats stats; | 155 | struct trie_use_stats stats; |
156 | #endif | 156 | #endif |
157 | int size; | 157 | int size; |
158 | unsigned int revision; | 158 | unsigned int revision; |
159 | }; | 159 | }; |
160 | 160 | ||
161 | static int trie_debug = 0; | ||
162 | |||
163 | static int tnode_full(struct tnode *tn, struct node *n); | ||
164 | static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); | 161 | static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); |
165 | static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); | 162 | static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); |
166 | static int tnode_child_length(struct tnode *tn); | ||
167 | static struct node *resize(struct trie *t, struct tnode *tn); | 163 | static struct node *resize(struct trie *t, struct tnode *tn); |
168 | static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err); | 164 | static struct tnode *inflate(struct trie *t, struct tnode *tn); |
169 | static struct tnode *halve(struct trie *t, struct tnode *tn, int *err); | 165 | static struct tnode *halve(struct trie *t, struct tnode *tn); |
170 | static void tnode_free(struct tnode *tn); | 166 | static void tnode_free(struct tnode *tn); |
171 | static void trie_dump_seq(struct seq_file *seq, struct trie *t); | 167 | static void trie_dump_seq(struct seq_file *seq, struct trie *t); |
172 | extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); | ||
173 | extern int fib_detect_death(struct fib_info *fi, int order, | ||
174 | struct fib_info **last_resort, int *last_idx, int *dflt); | ||
175 | |||
176 | extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id, | ||
177 | struct nlmsghdr *n, struct netlink_skb_parms *req); | ||
178 | 168 | ||
179 | static kmem_cache_t *fn_alias_kmem; | 169 | static kmem_cache_t *fn_alias_kmem __read_mostly; |
180 | static struct trie *trie_local = NULL, *trie_main = NULL; | 170 | static struct trie *trie_local = NULL, *trie_main = NULL; |
181 | 171 | ||
182 | static void trie_bug(char *err) | 172 | |
183 | { | 173 | /* rcu_read_lock needs to be hold by caller from readside */ |
184 | printk("Trie Bug: %s\n", err); | ||
185 | BUG(); | ||
186 | } | ||
187 | 174 | ||
188 | static inline struct node *tnode_get_child(struct tnode *tn, int i) | 175 | static inline struct node *tnode_get_child(struct tnode *tn, int i) |
189 | { | 176 | { |
190 | if (i >= 1<<tn->bits) | 177 | BUG_ON(i >= 1 << tn->bits); |
191 | trie_bug("tnode_get_child"); | ||
192 | 178 | ||
193 | return tn->child[i]; | 179 | return rcu_dereference(tn->child[i]); |
194 | } | 180 | } |
195 | 181 | ||
196 | static inline int tnode_child_length(struct tnode *tn) | 182 | static inline int tnode_child_length(const struct tnode *tn) |
197 | { | 183 | { |
198 | return 1<<tn->bits; | 184 | return 1 << tn->bits; |
199 | } | 185 | } |
200 | 186 | ||
201 | /* | ||
202 | _________________________________________________________________ | ||
203 | | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | | ||
204 | ---------------------------------------------------------------- | ||
205 | 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | ||
206 | |||
207 | _________________________________________________________________ | ||
208 | | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | | ||
209 | ----------------------------------------------------------------- | ||
210 | 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | ||
211 | |||
212 | tp->pos = 7 | ||
213 | tp->bits = 3 | ||
214 | n->pos = 15 | ||
215 | n->bits=4 | ||
216 | KEYLENGTH=32 | ||
217 | */ | ||
218 | |||
219 | static inline t_key tkey_extract_bits(t_key a, int offset, int bits) | 187 | static inline t_key tkey_extract_bits(t_key a, int offset, int bits) |
220 | { | 188 | { |
221 | if (offset < KEYLENGTH) | 189 | if (offset < KEYLENGTH) |
222 | return ((t_key)(a << offset)) >> (KEYLENGTH - bits); | 190 | return ((t_key)(a << offset)) >> (KEYLENGTH - bits); |
223 | else | 191 | else |
224 | return 0; | 192 | return 0; |
225 | } | 193 | } |
226 | 194 | ||
@@ -233,8 +201,8 @@ static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) | |||
233 | { | 201 | { |
234 | if (bits == 0 || offset >= KEYLENGTH) | 202 | if (bits == 0 || offset >= KEYLENGTH) |
235 | return 1; | 203 | return 1; |
236 | bits = bits > KEYLENGTH ? KEYLENGTH : bits; | 204 | bits = bits > KEYLENGTH ? KEYLENGTH : bits; |
237 | return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; | 205 | return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; |
238 | } | 206 | } |
239 | 207 | ||
240 | static inline int tkey_mismatch(t_key a, int offset, t_key b) | 208 | static inline int tkey_mismatch(t_key a, int offset, t_key b) |
@@ -249,14 +217,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b) | |||
249 | return i; | 217 | return i; |
250 | } | 218 | } |
251 | 219 | ||
252 | /* Candiate for fib_semantics */ | ||
253 | |||
254 | static void fn_free_alias(struct fib_alias *fa) | ||
255 | { | ||
256 | fib_release_info(fa->fa_info); | ||
257 | kmem_cache_free(fn_alias_kmem, fa); | ||
258 | } | ||
259 | |||
260 | /* | 220 | /* |
261 | To understand this stuff, an understanding of keys and all their bits is | 221 | To understand this stuff, an understanding of keys and all their bits is |
262 | necessary. Every node in the trie has a key associated with it, but not | 222 | necessary. Every node in the trie has a key associated with it, but not |
@@ -295,7 +255,7 @@ static void fn_free_alias(struct fib_alias *fa) | |||
295 | tp->pos = 7 | 255 | tp->pos = 7 |
296 | tp->bits = 3 | 256 | tp->bits = 3 |
297 | n->pos = 15 | 257 | n->pos = 15 |
298 | n->bits=4 | 258 | n->bits = 4 |
299 | 259 | ||
300 | First, let's just ignore the bits that come before the parent tp, that is | 260 | First, let's just ignore the bits that come before the parent tp, that is |
301 | the bits from 0 to (tp->pos-1). They are *known* but at this point we do | 261 | the bits from 0 to (tp->pos-1). They are *known* but at this point we do |
@@ -320,60 +280,65 @@ static void fn_free_alias(struct fib_alias *fa) | |||
320 | 280 | ||
321 | */ | 281 | */ |
322 | 282 | ||
323 | static void check_tnode(struct tnode *tn) | 283 | static inline void check_tnode(const struct tnode *tn) |
324 | { | 284 | { |
325 | if (tn && tn->pos+tn->bits > 32) { | 285 | WARN_ON(tn && tn->pos+tn->bits > 32); |
326 | printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits); | ||
327 | } | ||
328 | } | 286 | } |
329 | 287 | ||
330 | static int halve_threshold = 25; | 288 | static int halve_threshold = 25; |
331 | static int inflate_threshold = 50; | 289 | static int inflate_threshold = 50; |
332 | 290 | ||
333 | static struct leaf *leaf_new(void) | 291 | |
292 | static void __alias_free_mem(struct rcu_head *head) | ||
334 | { | 293 | { |
335 | struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); | 294 | struct fib_alias *fa = container_of(head, struct fib_alias, rcu); |
336 | if (l) { | 295 | kmem_cache_free(fn_alias_kmem, fa); |
337 | NODE_INIT_PARENT(l, T_LEAF); | ||
338 | INIT_HLIST_HEAD(&l->list); | ||
339 | } | ||
340 | return l; | ||
341 | } | 296 | } |
342 | 297 | ||
343 | static struct leaf_info *leaf_info_new(int plen) | 298 | static inline void alias_free_mem_rcu(struct fib_alias *fa) |
344 | { | 299 | { |
345 | struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); | 300 | call_rcu(&fa->rcu, __alias_free_mem); |
346 | if (li) { | 301 | } |
347 | li->plen = plen; | 302 | |
348 | INIT_LIST_HEAD(&li->falh); | 303 | static void __leaf_free_rcu(struct rcu_head *head) |
349 | } | 304 | { |
350 | return li; | 305 | kfree(container_of(head, struct leaf, rcu)); |
306 | } | ||
307 | |||
308 | static inline void free_leaf(struct leaf *leaf) | ||
309 | { | ||
310 | call_rcu(&leaf->rcu, __leaf_free_rcu); | ||
351 | } | 311 | } |
352 | 312 | ||
353 | static inline void free_leaf(struct leaf *l) | 313 | static void __leaf_info_free_rcu(struct rcu_head *head) |
354 | { | 314 | { |
355 | kfree(l); | 315 | kfree(container_of(head, struct leaf_info, rcu)); |
356 | } | 316 | } |
357 | 317 | ||
358 | static inline void free_leaf_info(struct leaf_info *li) | 318 | static inline void free_leaf_info(struct leaf_info *leaf) |
359 | { | 319 | { |
360 | kfree(li); | 320 | call_rcu(&leaf->rcu, __leaf_info_free_rcu); |
361 | } | 321 | } |
362 | 322 | ||
363 | static struct tnode *tnode_alloc(unsigned int size) | 323 | static struct tnode *tnode_alloc(unsigned int size) |
364 | { | 324 | { |
365 | if (size <= PAGE_SIZE) { | 325 | struct page *pages; |
366 | return kmalloc(size, GFP_KERNEL); | 326 | |
367 | } else { | 327 | if (size <= PAGE_SIZE) |
368 | return (struct tnode *) | 328 | return kcalloc(size, 1, GFP_KERNEL); |
369 | __get_free_pages(GFP_KERNEL, get_order(size)); | 329 | |
370 | } | 330 | pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size)); |
331 | if (!pages) | ||
332 | return NULL; | ||
333 | |||
334 | return page_address(pages); | ||
371 | } | 335 | } |
372 | 336 | ||
373 | static void __tnode_free(struct tnode *tn) | 337 | static void __tnode_free_rcu(struct rcu_head *head) |
374 | { | 338 | { |
339 | struct tnode *tn = container_of(head, struct tnode, rcu); | ||
375 | unsigned int size = sizeof(struct tnode) + | 340 | unsigned int size = sizeof(struct tnode) + |
376 | (1<<tn->bits) * sizeof(struct node *); | 341 | (1 << tn->bits) * sizeof(struct node *); |
377 | 342 | ||
378 | if (size <= PAGE_SIZE) | 343 | if (size <= PAGE_SIZE) |
379 | kfree(tn); | 344 | kfree(tn); |
@@ -381,15 +346,40 @@ static void __tnode_free(struct tnode *tn) | |||
381 | free_pages((unsigned long)tn, get_order(size)); | 346 | free_pages((unsigned long)tn, get_order(size)); |
382 | } | 347 | } |
383 | 348 | ||
349 | static inline void tnode_free(struct tnode *tn) | ||
350 | { | ||
351 | call_rcu(&tn->rcu, __tnode_free_rcu); | ||
352 | } | ||
353 | |||
354 | static struct leaf *leaf_new(void) | ||
355 | { | ||
356 | struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); | ||
357 | if (l) { | ||
358 | l->parent = T_LEAF; | ||
359 | INIT_HLIST_HEAD(&l->list); | ||
360 | } | ||
361 | return l; | ||
362 | } | ||
363 | |||
364 | static struct leaf_info *leaf_info_new(int plen) | ||
365 | { | ||
366 | struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); | ||
367 | if (li) { | ||
368 | li->plen = plen; | ||
369 | INIT_LIST_HEAD(&li->falh); | ||
370 | } | ||
371 | return li; | ||
372 | } | ||
373 | |||
384 | static struct tnode* tnode_new(t_key key, int pos, int bits) | 374 | static struct tnode* tnode_new(t_key key, int pos, int bits) |
385 | { | 375 | { |
386 | int nchildren = 1<<bits; | 376 | int nchildren = 1<<bits; |
387 | int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); | 377 | int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); |
388 | struct tnode *tn = tnode_alloc(sz); | 378 | struct tnode *tn = tnode_alloc(sz); |
389 | 379 | ||
390 | if (tn) { | 380 | if (tn) { |
391 | memset(tn, 0, sz); | 381 | memset(tn, 0, sz); |
392 | NODE_INIT_PARENT(tn, T_TNODE); | 382 | tn->parent = T_TNODE; |
393 | tn->pos = pos; | 383 | tn->pos = pos; |
394 | tn->bits = bits; | 384 | tn->bits = bits; |
395 | tn->key = key; | 385 | tn->key = key; |
@@ -397,38 +387,17 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) | |||
397 | tn->empty_children = 1<<bits; | 387 | tn->empty_children = 1<<bits; |
398 | } | 388 | } |
399 | 389 | ||
400 | if (trie_debug > 0) | 390 | pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), |
401 | printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), | 391 | (unsigned int) (sizeof(struct node) * 1<<bits)); |
402 | (unsigned int) (sizeof(struct node) * 1<<bits)); | ||
403 | return tn; | 392 | return tn; |
404 | } | 393 | } |
405 | 394 | ||
406 | static void tnode_free(struct tnode *tn) | ||
407 | { | ||
408 | if (!tn) { | ||
409 | trie_bug("tnode_free\n"); | ||
410 | } | ||
411 | if (IS_LEAF(tn)) { | ||
412 | free_leaf((struct leaf *)tn); | ||
413 | if (trie_debug > 0 ) | ||
414 | printk("FL %p \n", tn); | ||
415 | } | ||
416 | else if (IS_TNODE(tn)) { | ||
417 | __tnode_free(tn); | ||
418 | if (trie_debug > 0 ) | ||
419 | printk("FT %p \n", tn); | ||
420 | } | ||
421 | else { | ||
422 | trie_bug("tnode_free\n"); | ||
423 | } | ||
424 | } | ||
425 | |||
426 | /* | 395 | /* |
427 | * Check whether a tnode 'n' is "full", i.e. it is an internal node | 396 | * Check whether a tnode 'n' is "full", i.e. it is an internal node |
428 | * and no bits are skipped. See discussion in dyntree paper p. 6 | 397 | * and no bits are skipped. See discussion in dyntree paper p. 6 |
429 | */ | 398 | */ |
430 | 399 | ||
431 | static inline int tnode_full(struct tnode *tn, struct node *n) | 400 | static inline int tnode_full(const struct tnode *tn, const struct node *n) |
432 | { | 401 | { |
433 | if (n == NULL || IS_LEAF(n)) | 402 | if (n == NULL || IS_LEAF(n)) |
434 | return 0; | 403 | return 0; |
@@ -448,15 +417,11 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod | |||
448 | 417 | ||
449 | static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) | 418 | static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) |
450 | { | 419 | { |
451 | struct node *chi; | 420 | struct node *chi = tn->child[i]; |
452 | int isfull; | 421 | int isfull; |
453 | 422 | ||
454 | if (i >= 1<<tn->bits) { | 423 | BUG_ON(i >= 1<<tn->bits); |
455 | printk("bits=%d, i=%d\n", tn->bits, i); | 424 | |
456 | trie_bug("tnode_put_child_reorg bits"); | ||
457 | } | ||
458 | write_lock_bh(&fib_lock); | ||
459 | chi = tn->child[i]; | ||
460 | 425 | ||
461 | /* update emptyChildren */ | 426 | /* update emptyChildren */ |
462 | if (n == NULL && chi != NULL) | 427 | if (n == NULL && chi != NULL) |
@@ -465,33 +430,32 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w | |||
465 | tn->empty_children--; | 430 | tn->empty_children--; |
466 | 431 | ||
467 | /* update fullChildren */ | 432 | /* update fullChildren */ |
468 | if (wasfull == -1) | 433 | if (wasfull == -1) |
469 | wasfull = tnode_full(tn, chi); | 434 | wasfull = tnode_full(tn, chi); |
470 | 435 | ||
471 | isfull = tnode_full(tn, n); | 436 | isfull = tnode_full(tn, n); |
472 | if (wasfull && !isfull) | 437 | if (wasfull && !isfull) |
473 | tn->full_children--; | 438 | tn->full_children--; |
474 | |||
475 | else if (!wasfull && isfull) | 439 | else if (!wasfull && isfull) |
476 | tn->full_children++; | 440 | tn->full_children++; |
441 | |||
477 | if (n) | 442 | if (n) |
478 | NODE_SET_PARENT(n, tn); | 443 | NODE_SET_PARENT(n, tn); |
479 | 444 | ||
480 | tn->child[i] = n; | 445 | rcu_assign_pointer(tn->child[i], n); |
481 | write_unlock_bh(&fib_lock); | ||
482 | } | 446 | } |
483 | 447 | ||
484 | static struct node *resize(struct trie *t, struct tnode *tn) | 448 | static struct node *resize(struct trie *t, struct tnode *tn) |
485 | { | 449 | { |
486 | int i; | 450 | int i; |
487 | int err = 0; | 451 | int err = 0; |
452 | struct tnode *old_tn; | ||
488 | 453 | ||
489 | if (!tn) | 454 | if (!tn) |
490 | return NULL; | 455 | return NULL; |
491 | 456 | ||
492 | if (trie_debug) | 457 | pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", |
493 | printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", | 458 | tn, inflate_threshold, halve_threshold); |
494 | tn, inflate_threshold, halve_threshold); | ||
495 | 459 | ||
496 | /* No children */ | 460 | /* No children */ |
497 | if (tn->empty_children == tnode_child_length(tn)) { | 461 | if (tn->empty_children == tnode_child_length(tn)) { |
@@ -501,20 +465,16 @@ static struct node *resize(struct trie *t, struct tnode *tn) | |||
501 | /* One child */ | 465 | /* One child */ |
502 | if (tn->empty_children == tnode_child_length(tn) - 1) | 466 | if (tn->empty_children == tnode_child_length(tn) - 1) |
503 | for (i = 0; i < tnode_child_length(tn); i++) { | 467 | for (i = 0; i < tnode_child_length(tn); i++) { |
468 | struct node *n; | ||
504 | 469 | ||
505 | write_lock_bh(&fib_lock); | 470 | n = tn->child[i]; |
506 | if (tn->child[i] != NULL) { | 471 | if (!n) |
507 | 472 | continue; | |
508 | /* compress one level */ | ||
509 | struct node *n = tn->child[i]; | ||
510 | if (n) | ||
511 | NODE_INIT_PARENT(n, NODE_TYPE(n)); | ||
512 | 473 | ||
513 | write_unlock_bh(&fib_lock); | 474 | /* compress one level */ |
514 | tnode_free(tn); | 475 | NODE_SET_PARENT(n, NULL); |
515 | return n; | 476 | tnode_free(tn); |
516 | } | 477 | return n; |
517 | write_unlock_bh(&fib_lock); | ||
518 | } | 478 | } |
519 | /* | 479 | /* |
520 | * Double as long as the resulting node has a number of | 480 | * Double as long as the resulting node has a number of |
@@ -566,16 +526,16 @@ static struct node *resize(struct trie *t, struct tnode *tn) | |||
566 | * | 526 | * |
567 | * expand not_to_be_doubled and to_be_doubled, and shorten: | 527 | * expand not_to_be_doubled and to_be_doubled, and shorten: |
568 | * 100 * (tnode_child_length(tn) - tn->empty_children + | 528 | * 100 * (tnode_child_length(tn) - tn->empty_children + |
569 | * tn->full_children ) >= inflate_threshold * new_child_length | 529 | * tn->full_children) >= inflate_threshold * new_child_length |
570 | * | 530 | * |
571 | * expand new_child_length: | 531 | * expand new_child_length: |
572 | * 100 * (tnode_child_length(tn) - tn->empty_children + | 532 | * 100 * (tnode_child_length(tn) - tn->empty_children + |
573 | * tn->full_children ) >= | 533 | * tn->full_children) >= |
574 | * inflate_threshold * tnode_child_length(tn) * 2 | 534 | * inflate_threshold * tnode_child_length(tn) * 2 |
575 | * | 535 | * |
576 | * shorten again: | 536 | * shorten again: |
577 | * 50 * (tn->full_children + tnode_child_length(tn) - | 537 | * 50 * (tn->full_children + tnode_child_length(tn) - |
578 | * tn->empty_children ) >= inflate_threshold * | 538 | * tn->empty_children) >= inflate_threshold * |
579 | * tnode_child_length(tn) | 539 | * tnode_child_length(tn) |
580 | * | 540 | * |
581 | */ | 541 | */ |
@@ -587,9 +547,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) | |||
587 | 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= | 547 | 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= |
588 | inflate_threshold * tnode_child_length(tn))) { | 548 | inflate_threshold * tnode_child_length(tn))) { |
589 | 549 | ||
590 | tn = inflate(t, tn, &err); | 550 | old_tn = tn; |
591 | 551 | tn = inflate(t, tn); | |
592 | if (err) { | 552 | if (IS_ERR(tn)) { |
553 | tn = old_tn; | ||
593 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 554 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
594 | t->stats.resize_node_skipped++; | 555 | t->stats.resize_node_skipped++; |
595 | #endif | 556 | #endif |
@@ -609,9 +570,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) | |||
609 | 100 * (tnode_child_length(tn) - tn->empty_children) < | 570 | 100 * (tnode_child_length(tn) - tn->empty_children) < |
610 | halve_threshold * tnode_child_length(tn)) { | 571 | halve_threshold * tnode_child_length(tn)) { |
611 | 572 | ||
612 | tn = halve(t, tn, &err); | 573 | old_tn = tn; |
613 | 574 | tn = halve(t, tn); | |
614 | if (err) { | 575 | if (IS_ERR(tn)) { |
576 | tn = old_tn; | ||
615 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 577 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
616 | t->stats.resize_node_skipped++; | 578 | t->stats.resize_node_skipped++; |
617 | #endif | 579 | #endif |
@@ -621,44 +583,37 @@ static struct node *resize(struct trie *t, struct tnode *tn) | |||
621 | 583 | ||
622 | 584 | ||
623 | /* Only one child remains */ | 585 | /* Only one child remains */ |
624 | |||
625 | if (tn->empty_children == tnode_child_length(tn) - 1) | 586 | if (tn->empty_children == tnode_child_length(tn) - 1) |
626 | for (i = 0; i < tnode_child_length(tn); i++) { | 587 | for (i = 0; i < tnode_child_length(tn); i++) { |
627 | 588 | struct node *n; | |
628 | write_lock_bh(&fib_lock); | 589 | |
629 | if (tn->child[i] != NULL) { | 590 | n = tn->child[i]; |
630 | /* compress one level */ | 591 | if (!n) |
631 | struct node *n = tn->child[i]; | 592 | continue; |
632 | 593 | ||
633 | if (n) | 594 | /* compress one level */ |
634 | NODE_INIT_PARENT(n, NODE_TYPE(n)); | 595 | |
635 | 596 | NODE_SET_PARENT(n, NULL); | |
636 | write_unlock_bh(&fib_lock); | 597 | tnode_free(tn); |
637 | tnode_free(tn); | 598 | return n; |
638 | return n; | ||
639 | } | ||
640 | write_unlock_bh(&fib_lock); | ||
641 | } | 599 | } |
642 | 600 | ||
643 | return (struct node *) tn; | 601 | return (struct node *) tn; |
644 | } | 602 | } |
645 | 603 | ||
646 | static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) | 604 | static struct tnode *inflate(struct trie *t, struct tnode *tn) |
647 | { | 605 | { |
648 | struct tnode *inode; | 606 | struct tnode *inode; |
649 | struct tnode *oldtnode = tn; | 607 | struct tnode *oldtnode = tn; |
650 | int olen = tnode_child_length(tn); | 608 | int olen = tnode_child_length(tn); |
651 | int i; | 609 | int i; |
652 | 610 | ||
653 | if (trie_debug) | 611 | pr_debug("In inflate\n"); |
654 | printk("In inflate\n"); | ||
655 | 612 | ||
656 | tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); | 613 | tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); |
657 | 614 | ||
658 | if (!tn) { | 615 | if (!tn) |
659 | *err = -ENOMEM; | 616 | return ERR_PTR(-ENOMEM); |
660 | return oldtnode; | ||
661 | } | ||
662 | 617 | ||
663 | /* | 618 | /* |
664 | * Preallocate and store tnodes before the actual work so we | 619 | * Preallocate and store tnodes before the actual work so we |
@@ -666,8 +621,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) | |||
666 | * fails. In case of failure we return the oldnode and inflate | 621 | * fails. In case of failure we return the oldnode and inflate |
667 | * of tnode is ignored. | 622 | * of tnode is ignored. |
668 | */ | 623 | */ |
669 | 624 | ||
670 | for(i = 0; i < olen; i++) { | 625 | for (i = 0; i < olen; i++) { |
671 | struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); | 626 | struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); |
672 | 627 | ||
673 | if (inode && | 628 | if (inode && |
@@ -675,46 +630,30 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) | |||
675 | inode->pos == oldtnode->pos + oldtnode->bits && | 630 | inode->pos == oldtnode->pos + oldtnode->bits && |
676 | inode->bits > 1) { | 631 | inode->bits > 1) { |
677 | struct tnode *left, *right; | 632 | struct tnode *left, *right; |
678 | |||
679 | t_key m = TKEY_GET_MASK(inode->pos, 1); | 633 | t_key m = TKEY_GET_MASK(inode->pos, 1); |
680 | 634 | ||
681 | left = tnode_new(inode->key&(~m), inode->pos + 1, | 635 | left = tnode_new(inode->key&(~m), inode->pos + 1, |
682 | inode->bits - 1); | 636 | inode->bits - 1); |
637 | if (!left) | ||
638 | goto nomem; | ||
683 | 639 | ||
684 | if (!left) { | ||
685 | *err = -ENOMEM; | ||
686 | break; | ||
687 | } | ||
688 | |||
689 | right = tnode_new(inode->key|m, inode->pos + 1, | 640 | right = tnode_new(inode->key|m, inode->pos + 1, |
690 | inode->bits - 1); | 641 | inode->bits - 1); |
691 | 642 | ||
692 | if (!right) { | 643 | if (!right) { |
693 | *err = -ENOMEM; | 644 | tnode_free(left); |
694 | break; | 645 | goto nomem; |
695 | } | 646 | } |
696 | 647 | ||
697 | put_child(t, tn, 2*i, (struct node *) left); | 648 | put_child(t, tn, 2*i, (struct node *) left); |
698 | put_child(t, tn, 2*i+1, (struct node *) right); | 649 | put_child(t, tn, 2*i+1, (struct node *) right); |
699 | } | 650 | } |
700 | } | 651 | } |
701 | 652 | ||
702 | if (*err) { | 653 | for (i = 0; i < olen; i++) { |
703 | int size = tnode_child_length(tn); | ||
704 | int j; | ||
705 | |||
706 | for(j = 0; j < size; j++) | ||
707 | if (tn->child[j]) | ||
708 | tnode_free((struct tnode *)tn->child[j]); | ||
709 | |||
710 | tnode_free(tn); | ||
711 | |||
712 | *err = -ENOMEM; | ||
713 | return oldtnode; | ||
714 | } | ||
715 | |||
716 | for(i = 0; i < olen; i++) { | ||
717 | struct node *node = tnode_get_child(oldtnode, i); | 654 | struct node *node = tnode_get_child(oldtnode, i); |
655 | struct tnode *left, *right; | ||
656 | int size, j; | ||
718 | 657 | ||
719 | /* An empty child */ | 658 | /* An empty child */ |
720 | if (node == NULL) | 659 | if (node == NULL) |
@@ -740,76 +679,82 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) | |||
740 | put_child(t, tn, 2*i+1, inode->child[1]); | 679 | put_child(t, tn, 2*i+1, inode->child[1]); |
741 | 680 | ||
742 | tnode_free(inode); | 681 | tnode_free(inode); |
682 | continue; | ||
743 | } | 683 | } |
744 | 684 | ||
745 | /* An internal node with more than two children */ | 685 | /* An internal node with more than two children */ |
746 | else { | 686 | |
747 | struct tnode *left, *right; | 687 | /* We will replace this node 'inode' with two new |
748 | int size, j; | 688 | * ones, 'left' and 'right', each with half of the |
749 | 689 | * original children. The two new nodes will have | |
750 | /* We will replace this node 'inode' with two new | 690 | * a position one bit further down the key and this |
751 | * ones, 'left' and 'right', each with half of the | 691 | * means that the "significant" part of their keys |
752 | * original children. The two new nodes will have | 692 | * (see the discussion near the top of this file) |
753 | * a position one bit further down the key and this | 693 | * will differ by one bit, which will be "0" in |
754 | * means that the "significant" part of their keys | 694 | * left's key and "1" in right's key. Since we are |
755 | * (see the discussion near the top of this file) | 695 | * moving the key position by one step, the bit that |
756 | * will differ by one bit, which will be "0" in | 696 | * we are moving away from - the bit at position |
757 | * left's key and "1" in right's key. Since we are | 697 | * (inode->pos) - is the one that will differ between |
758 | * moving the key position by one step, the bit that | 698 | * left and right. So... we synthesize that bit in the |
759 | * we are moving away from - the bit at position | 699 | * two new keys. |
760 | * (inode->pos) - is the one that will differ between | 700 | * The mask 'm' below will be a single "one" bit at |
761 | * left and right. So... we synthesize that bit in the | 701 | * the position (inode->pos) |
762 | * two new keys. | 702 | */ |
763 | * The mask 'm' below will be a single "one" bit at | ||
764 | * the position (inode->pos) | ||
765 | */ | ||
766 | |||
767 | /* Use the old key, but set the new significant | ||
768 | * bit to zero. | ||
769 | */ | ||
770 | 703 | ||
771 | left = (struct tnode *) tnode_get_child(tn, 2*i); | 704 | /* Use the old key, but set the new significant |
772 | put_child(t, tn, 2*i, NULL); | 705 | * bit to zero. |
706 | */ | ||
773 | 707 | ||
774 | if (!left) | 708 | left = (struct tnode *) tnode_get_child(tn, 2*i); |
775 | BUG(); | 709 | put_child(t, tn, 2*i, NULL); |
776 | 710 | ||
777 | right = (struct tnode *) tnode_get_child(tn, 2*i+1); | 711 | BUG_ON(!left); |
778 | put_child(t, tn, 2*i+1, NULL); | ||
779 | 712 | ||
780 | if (!right) | 713 | right = (struct tnode *) tnode_get_child(tn, 2*i+1); |
781 | BUG(); | 714 | put_child(t, tn, 2*i+1, NULL); |
782 | 715 | ||
783 | size = tnode_child_length(left); | 716 | BUG_ON(!right); |
784 | for(j = 0; j < size; j++) { | ||
785 | put_child(t, left, j, inode->child[j]); | ||
786 | put_child(t, right, j, inode->child[j + size]); | ||
787 | } | ||
788 | put_child(t, tn, 2*i, resize(t, left)); | ||
789 | put_child(t, tn, 2*i+1, resize(t, right)); | ||
790 | 717 | ||
791 | tnode_free(inode); | 718 | size = tnode_child_length(left); |
719 | for (j = 0; j < size; j++) { | ||
720 | put_child(t, left, j, inode->child[j]); | ||
721 | put_child(t, right, j, inode->child[j + size]); | ||
792 | } | 722 | } |
723 | put_child(t, tn, 2*i, resize(t, left)); | ||
724 | put_child(t, tn, 2*i+1, resize(t, right)); | ||
725 | |||
726 | tnode_free(inode); | ||
793 | } | 727 | } |
794 | tnode_free(oldtnode); | 728 | tnode_free(oldtnode); |
795 | return tn; | 729 | return tn; |
730 | nomem: | ||
731 | { | ||
732 | int size = tnode_child_length(tn); | ||
733 | int j; | ||
734 | |||
735 | for (j = 0; j < size; j++) | ||
736 | if (tn->child[j]) | ||
737 | tnode_free((struct tnode *)tn->child[j]); | ||
738 | |||
739 | tnode_free(tn); | ||
740 | |||
741 | return ERR_PTR(-ENOMEM); | ||
742 | } | ||
796 | } | 743 | } |
797 | 744 | ||
798 | static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) | 745 | static struct tnode *halve(struct trie *t, struct tnode *tn) |
799 | { | 746 | { |
800 | struct tnode *oldtnode = tn; | 747 | struct tnode *oldtnode = tn; |
801 | struct node *left, *right; | 748 | struct node *left, *right; |
802 | int i; | 749 | int i; |
803 | int olen = tnode_child_length(tn); | 750 | int olen = tnode_child_length(tn); |
804 | 751 | ||
805 | if (trie_debug) printk("In halve\n"); | 752 | pr_debug("In halve\n"); |
806 | 753 | ||
807 | tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); | 754 | tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); |
808 | 755 | ||
809 | if (!tn) { | 756 | if (!tn) |
810 | *err = -ENOMEM; | 757 | return ERR_PTR(-ENOMEM); |
811 | return oldtnode; | ||
812 | } | ||
813 | 758 | ||
814 | /* | 759 | /* |
815 | * Preallocate and store tnodes before the actual work so we | 760 | * Preallocate and store tnodes before the actual work so we |
@@ -818,38 +763,27 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) | |||
818 | * of tnode is ignored. | 763 | * of tnode is ignored. |
819 | */ | 764 | */ |
820 | 765 | ||
821 | for(i = 0; i < olen; i += 2) { | 766 | for (i = 0; i < olen; i += 2) { |
822 | left = tnode_get_child(oldtnode, i); | 767 | left = tnode_get_child(oldtnode, i); |
823 | right = tnode_get_child(oldtnode, i+1); | 768 | right = tnode_get_child(oldtnode, i+1); |
824 | 769 | ||
825 | /* Two nonempty children */ | 770 | /* Two nonempty children */ |
826 | if (left && right) { | 771 | if (left && right) { |
827 | struct tnode *newBinNode = | 772 | struct tnode *newn; |
828 | tnode_new(left->key, tn->pos + tn->bits, 1); | ||
829 | 773 | ||
830 | if (!newBinNode) { | 774 | newn = tnode_new(left->key, tn->pos + tn->bits, 1); |
831 | *err = -ENOMEM; | ||
832 | break; | ||
833 | } | ||
834 | put_child(t, tn, i/2, (struct node *)newBinNode); | ||
835 | } | ||
836 | } | ||
837 | 775 | ||
838 | if (*err) { | 776 | if (!newn) |
839 | int size = tnode_child_length(tn); | 777 | goto nomem; |
840 | int j; | ||
841 | 778 | ||
842 | for(j = 0; j < size; j++) | 779 | put_child(t, tn, i/2, (struct node *)newn); |
843 | if (tn->child[j]) | 780 | } |
844 | tnode_free((struct tnode *)tn->child[j]); | ||
845 | 781 | ||
846 | tnode_free(tn); | ||
847 | |||
848 | *err = -ENOMEM; | ||
849 | return oldtnode; | ||
850 | } | 782 | } |
851 | 783 | ||
852 | for(i = 0; i < olen; i += 2) { | 784 | for (i = 0; i < olen; i += 2) { |
785 | struct tnode *newBinNode; | ||
786 | |||
853 | left = tnode_get_child(oldtnode, i); | 787 | left = tnode_get_child(oldtnode, i); |
854 | right = tnode_get_child(oldtnode, i+1); | 788 | right = tnode_get_child(oldtnode, i+1); |
855 | 789 | ||
@@ -858,88 +792,99 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) | |||
858 | if (right == NULL) /* Both are empty */ | 792 | if (right == NULL) /* Both are empty */ |
859 | continue; | 793 | continue; |
860 | put_child(t, tn, i/2, right); | 794 | put_child(t, tn, i/2, right); |
861 | } else if (right == NULL) | 795 | continue; |
796 | } | ||
797 | |||
798 | if (right == NULL) { | ||
862 | put_child(t, tn, i/2, left); | 799 | put_child(t, tn, i/2, left); |
800 | continue; | ||
801 | } | ||
863 | 802 | ||
864 | /* Two nonempty children */ | 803 | /* Two nonempty children */ |
865 | else { | 804 | newBinNode = (struct tnode *) tnode_get_child(tn, i/2); |
866 | struct tnode *newBinNode = | 805 | put_child(t, tn, i/2, NULL); |
867 | (struct tnode *) tnode_get_child(tn, i/2); | 806 | put_child(t, newBinNode, 0, left); |
868 | put_child(t, tn, i/2, NULL); | 807 | put_child(t, newBinNode, 1, right); |
869 | 808 | put_child(t, tn, i/2, resize(t, newBinNode)); | |
870 | if (!newBinNode) | ||
871 | BUG(); | ||
872 | |||
873 | put_child(t, newBinNode, 0, left); | ||
874 | put_child(t, newBinNode, 1, right); | ||
875 | put_child(t, tn, i/2, resize(t, newBinNode)); | ||
876 | } | ||
877 | } | 809 | } |
878 | tnode_free(oldtnode); | 810 | tnode_free(oldtnode); |
879 | return tn; | 811 | return tn; |
812 | nomem: | ||
813 | { | ||
814 | int size = tnode_child_length(tn); | ||
815 | int j; | ||
816 | |||
817 | for (j = 0; j < size; j++) | ||
818 | if (tn->child[j]) | ||
819 | tnode_free((struct tnode *)tn->child[j]); | ||
820 | |||
821 | tnode_free(tn); | ||
822 | |||
823 | return ERR_PTR(-ENOMEM); | ||
824 | } | ||
880 | } | 825 | } |
881 | 826 | ||
882 | static void *trie_init(struct trie *t) | 827 | static void trie_init(struct trie *t) |
883 | { | 828 | { |
884 | if (t) { | 829 | if (!t) |
885 | t->size = 0; | 830 | return; |
886 | t->trie = NULL; | 831 | |
887 | t->revision = 0; | 832 | t->size = 0; |
833 | rcu_assign_pointer(t->trie, NULL); | ||
834 | t->revision = 0; | ||
888 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 835 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
889 | memset(&t->stats, 0, sizeof(struct trie_use_stats)); | 836 | memset(&t->stats, 0, sizeof(struct trie_use_stats)); |
890 | #endif | 837 | #endif |
891 | } | ||
892 | return t; | ||
893 | } | 838 | } |
894 | 839 | ||
840 | /* readside most use rcu_read_lock currently dump routines | ||
841 | via get_fa_head and dump */ | ||
842 | |||
895 | static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) | 843 | static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) |
896 | { | 844 | { |
897 | struct hlist_node *node; | 845 | struct hlist_node *node; |
898 | struct leaf_info *li; | 846 | struct leaf_info *li; |
899 | 847 | ||
900 | hlist_for_each_entry(li, node, head, hlist) { | 848 | hlist_for_each_entry_rcu(li, node, head, hlist) |
901 | if (li->plen == plen) | 849 | if (li->plen == plen) |
902 | return li; | 850 | return li; |
903 | } | 851 | |
904 | return NULL; | 852 | return NULL; |
905 | } | 853 | } |
906 | 854 | ||
907 | static inline struct list_head * get_fa_head(struct leaf *l, int plen) | 855 | static inline struct list_head * get_fa_head(struct leaf *l, int plen) |
908 | { | 856 | { |
909 | struct list_head *fa_head = NULL; | ||
910 | struct leaf_info *li = find_leaf_info(&l->list, plen); | 857 | struct leaf_info *li = find_leaf_info(&l->list, plen); |
911 | 858 | ||
912 | if (li) | 859 | if (!li) |
913 | fa_head = &li->falh; | 860 | return NULL; |
914 | 861 | ||
915 | return fa_head; | 862 | return &li->falh; |
916 | } | 863 | } |
917 | 864 | ||
918 | static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) | 865 | static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) |
919 | { | 866 | { |
920 | struct leaf_info *li = NULL, *last = NULL; | 867 | struct leaf_info *li = NULL, *last = NULL; |
921 | struct hlist_node *node, *tmp; | 868 | struct hlist_node *node; |
922 | 869 | ||
923 | write_lock_bh(&fib_lock); | 870 | if (hlist_empty(head)) { |
924 | 871 | hlist_add_head_rcu(&new->hlist, head); | |
925 | if (hlist_empty(head)) | 872 | } else { |
926 | hlist_add_head(&new->hlist, head); | 873 | hlist_for_each_entry(li, node, head, hlist) { |
927 | else { | 874 | if (new->plen > li->plen) |
928 | hlist_for_each_entry_safe(li, node, tmp, head, hlist) { | 875 | break; |
929 | 876 | ||
930 | if (new->plen > li->plen) | 877 | last = li; |
931 | break; | 878 | } |
932 | 879 | if (last) | |
933 | last = li; | 880 | hlist_add_after_rcu(&last->hlist, &new->hlist); |
934 | } | 881 | else |
935 | if (last) | 882 | hlist_add_before_rcu(&new->hlist, &li->hlist); |
936 | hlist_add_after(&last->hlist, &new->hlist); | 883 | } |
937 | else | ||
938 | hlist_add_before(&new->hlist, &li->hlist); | ||
939 | } | ||
940 | write_unlock_bh(&fib_lock); | ||
941 | } | 884 | } |
942 | 885 | ||
886 | /* rcu_read_lock needs to be hold by caller from readside */ | ||
887 | |||
943 | static struct leaf * | 888 | static struct leaf * |
944 | fib_find_node(struct trie *t, u32 key) | 889 | fib_find_node(struct trie *t, u32 key) |
945 | { | 890 | { |
@@ -948,61 +893,43 @@ fib_find_node(struct trie *t, u32 key) | |||
948 | struct node *n; | 893 | struct node *n; |
949 | 894 | ||
950 | pos = 0; | 895 | pos = 0; |
951 | n = t->trie; | 896 | n = rcu_dereference(t->trie); |
952 | 897 | ||
953 | while (n != NULL && NODE_TYPE(n) == T_TNODE) { | 898 | while (n != NULL && NODE_TYPE(n) == T_TNODE) { |
954 | tn = (struct tnode *) n; | 899 | tn = (struct tnode *) n; |
955 | 900 | ||
956 | check_tnode(tn); | 901 | check_tnode(tn); |
957 | 902 | ||
958 | if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { | 903 | if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { |
959 | pos=tn->pos + tn->bits; | 904 | pos = tn->pos + tn->bits; |
960 | n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); | 905 | n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); |
961 | } | 906 | } else |
962 | else | ||
963 | break; | 907 | break; |
964 | } | 908 | } |
965 | /* Case we have found a leaf. Compare prefixes */ | 909 | /* Case we have found a leaf. Compare prefixes */ |
966 | 910 | ||
967 | if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { | 911 | if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) |
968 | struct leaf *l = (struct leaf *) n; | 912 | return (struct leaf *)n; |
969 | return l; | 913 | |
970 | } | ||
971 | return NULL; | 914 | return NULL; |
972 | } | 915 | } |
973 | 916 | ||
974 | static struct node *trie_rebalance(struct trie *t, struct tnode *tn) | 917 | static struct node *trie_rebalance(struct trie *t, struct tnode *tn) |
975 | { | 918 | { |
976 | int i = 0; | ||
977 | int wasfull; | 919 | int wasfull; |
978 | t_key cindex, key; | 920 | t_key cindex, key; |
979 | struct tnode *tp = NULL; | 921 | struct tnode *tp = NULL; |
980 | 922 | ||
981 | if (!tn) | ||
982 | BUG(); | ||
983 | |||
984 | key = tn->key; | 923 | key = tn->key; |
985 | i = 0; | ||
986 | 924 | ||
987 | while (tn != NULL && NODE_PARENT(tn) != NULL) { | 925 | while (tn != NULL && NODE_PARENT(tn) != NULL) { |
988 | 926 | ||
989 | if (i > 10) { | ||
990 | printk("Rebalance tn=%p \n", tn); | ||
991 | if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn)); | ||
992 | |||
993 | printk("Rebalance tp=%p \n", tp); | ||
994 | if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp)); | ||
995 | } | ||
996 | |||
997 | if (i > 12) BUG(); | ||
998 | i++; | ||
999 | |||
1000 | tp = NODE_PARENT(tn); | 927 | tp = NODE_PARENT(tn); |
1001 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 928 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
1002 | wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); | 929 | wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); |
1003 | tn = (struct tnode *) resize (t, (struct tnode *)tn); | 930 | tn = (struct tnode *) resize (t, (struct tnode *)tn); |
1004 | tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); | 931 | tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); |
1005 | 932 | ||
1006 | if (!NODE_PARENT(tn)) | 933 | if (!NODE_PARENT(tn)) |
1007 | break; | 934 | break; |
1008 | 935 | ||
@@ -1015,6 +942,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) | |||
1015 | return (struct node*) tn; | 942 | return (struct node*) tn; |
1016 | } | 943 | } |
1017 | 944 | ||
945 | /* only used from updater-side */ | ||
946 | |||
1018 | static struct list_head * | 947 | static struct list_head * |
1019 | fib_insert_node(struct trie *t, int *err, u32 key, int plen) | 948 | fib_insert_node(struct trie *t, int *err, u32 key, int plen) |
1020 | { | 949 | { |
@@ -1050,20 +979,16 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) | |||
1050 | 979 | ||
1051 | while (n != NULL && NODE_TYPE(n) == T_TNODE) { | 980 | while (n != NULL && NODE_TYPE(n) == T_TNODE) { |
1052 | tn = (struct tnode *) n; | 981 | tn = (struct tnode *) n; |
1053 | 982 | ||
1054 | check_tnode(tn); | 983 | check_tnode(tn); |
1055 | 984 | ||
1056 | if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { | 985 | if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { |
1057 | tp = tn; | 986 | tp = tn; |
1058 | pos=tn->pos + tn->bits; | 987 | pos = tn->pos + tn->bits; |
1059 | n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); | 988 | n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); |
1060 | 989 | ||
1061 | if (n && NODE_PARENT(n) != tn) { | 990 | BUG_ON(n && NODE_PARENT(n) != tn); |
1062 | printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); | 991 | } else |
1063 | BUG(); | ||
1064 | } | ||
1065 | } | ||
1066 | else | ||
1067 | break; | 992 | break; |
1068 | } | 993 | } |
1069 | 994 | ||
@@ -1073,17 +998,15 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) | |||
1073 | * tp is n's (parent) ----> NULL or TNODE | 998 | * tp is n's (parent) ----> NULL or TNODE |
1074 | */ | 999 | */ |
1075 | 1000 | ||
1076 | if (tp && IS_LEAF(tp)) | 1001 | BUG_ON(tp && IS_LEAF(tp)); |
1077 | BUG(); | ||
1078 | |||
1079 | 1002 | ||
1080 | /* Case 1: n is a leaf. Compare prefixes */ | 1003 | /* Case 1: n is a leaf. Compare prefixes */ |
1081 | 1004 | ||
1082 | if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { | 1005 | if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { |
1083 | struct leaf *l = ( struct leaf *) n; | 1006 | struct leaf *l = (struct leaf *) n; |
1084 | 1007 | ||
1085 | li = leaf_info_new(plen); | 1008 | li = leaf_info_new(plen); |
1086 | 1009 | ||
1087 | if (!li) { | 1010 | if (!li) { |
1088 | *err = -ENOMEM; | 1011 | *err = -ENOMEM; |
1089 | goto err; | 1012 | goto err; |
@@ -1113,35 +1036,29 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) | |||
1113 | fa_head = &li->falh; | 1036 | fa_head = &li->falh; |
1114 | insert_leaf_info(&l->list, li); | 1037 | insert_leaf_info(&l->list, li); |
1115 | 1038 | ||
1116 | /* Case 2: n is NULL, and will just insert a new leaf */ | ||
1117 | if (t->trie && n == NULL) { | 1039 | if (t->trie && n == NULL) { |
1040 | /* Case 2: n is NULL, and will just insert a new leaf */ | ||
1118 | 1041 | ||
1119 | NODE_SET_PARENT(l, tp); | 1042 | NODE_SET_PARENT(l, tp); |
1120 | |||
1121 | if (!tp) | ||
1122 | BUG(); | ||
1123 | 1043 | ||
1124 | else { | 1044 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
1125 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1045 | put_child(t, (struct tnode *)tp, cindex, (struct node *)l); |
1126 | put_child(t, (struct tnode *)tp, cindex, (struct node *)l); | 1046 | } else { |
1127 | } | 1047 | /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ |
1128 | } | ||
1129 | /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ | ||
1130 | else { | ||
1131 | /* | 1048 | /* |
1132 | * Add a new tnode here | 1049 | * Add a new tnode here |
1133 | * first tnode need some special handling | 1050 | * first tnode need some special handling |
1134 | */ | 1051 | */ |
1135 | 1052 | ||
1136 | if (tp) | 1053 | if (tp) |
1137 | pos=tp->pos+tp->bits; | 1054 | pos = tp->pos+tp->bits; |
1138 | else | 1055 | else |
1139 | pos=0; | 1056 | pos = 0; |
1057 | |||
1140 | if (n) { | 1058 | if (n) { |
1141 | newpos = tkey_mismatch(key, pos, n->key); | 1059 | newpos = tkey_mismatch(key, pos, n->key); |
1142 | tn = tnode_new(n->key, newpos, 1); | 1060 | tn = tnode_new(n->key, newpos, 1); |
1143 | } | 1061 | } else { |
1144 | else { | ||
1145 | newpos = 0; | 1062 | newpos = 0; |
1146 | tn = tnode_new(key, newpos, 1); /* First tnode */ | 1063 | tn = tnode_new(key, newpos, 1); /* First tnode */ |
1147 | } | 1064 | } |
@@ -1151,32 +1068,33 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) | |||
1151 | tnode_free((struct tnode *) l); | 1068 | tnode_free((struct tnode *) l); |
1152 | *err = -ENOMEM; | 1069 | *err = -ENOMEM; |
1153 | goto err; | 1070 | goto err; |
1154 | } | 1071 | } |
1155 | 1072 | ||
1156 | NODE_SET_PARENT(tn, tp); | 1073 | NODE_SET_PARENT(tn, tp); |
1157 | 1074 | ||
1158 | missbit=tkey_extract_bits(key, newpos, 1); | 1075 | missbit = tkey_extract_bits(key, newpos, 1); |
1159 | put_child(t, tn, missbit, (struct node *)l); | 1076 | put_child(t, tn, missbit, (struct node *)l); |
1160 | put_child(t, tn, 1-missbit, n); | 1077 | put_child(t, tn, 1-missbit, n); |
1161 | 1078 | ||
1162 | if (tp) { | 1079 | if (tp) { |
1163 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1080 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
1164 | put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); | 1081 | put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); |
1165 | } | 1082 | } else { |
1166 | else { | 1083 | rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */ |
1167 | t->trie = (struct node*) tn; /* First tnode */ | ||
1168 | tp = tn; | 1084 | tp = tn; |
1169 | } | 1085 | } |
1170 | } | 1086 | } |
1171 | if (tp && tp->pos+tp->bits > 32) { | 1087 | |
1088 | if (tp && tp->pos + tp->bits > 32) | ||
1172 | printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", | 1089 | printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", |
1173 | tp, tp->pos, tp->bits, key, plen); | 1090 | tp, tp->pos, tp->bits, key, plen); |
1174 | } | 1091 | |
1175 | /* Rebalance the trie */ | 1092 | /* Rebalance the trie */ |
1176 | t->trie = trie_rebalance(t, tp); | 1093 | |
1094 | rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); | ||
1177 | done: | 1095 | done: |
1178 | t->revision++; | 1096 | t->revision++; |
1179 | err:; | 1097 | err: |
1180 | return fa_head; | 1098 | return fa_head; |
1181 | } | 1099 | } |
1182 | 1100 | ||
@@ -1204,17 +1122,18 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
1204 | 1122 | ||
1205 | key = ntohl(key); | 1123 | key = ntohl(key); |
1206 | 1124 | ||
1207 | if (trie_debug) | 1125 | pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); |
1208 | printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); | ||
1209 | 1126 | ||
1210 | mask = ntohl( inet_make_mask(plen) ); | 1127 | mask = ntohl(inet_make_mask(plen)); |
1211 | 1128 | ||
1212 | if (key & ~mask) | 1129 | if (key & ~mask) |
1213 | return -EINVAL; | 1130 | return -EINVAL; |
1214 | 1131 | ||
1215 | key = key & mask; | 1132 | key = key & mask; |
1216 | 1133 | ||
1217 | if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL) | 1134 | fi = fib_create_info(r, rta, nlhdr, &err); |
1135 | |||
1136 | if (!fi) | ||
1218 | goto err; | 1137 | goto err; |
1219 | 1138 | ||
1220 | l = fib_find_node(t, key); | 1139 | l = fib_find_node(t, key); |
@@ -1236,8 +1155,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
1236 | * and we need to allocate a new one of those as well. | 1155 | * and we need to allocate a new one of those as well. |
1237 | */ | 1156 | */ |
1238 | 1157 | ||
1239 | if (fa && | 1158 | if (fa && fa->fa_info->fib_priority == fi->fib_priority) { |
1240 | fa->fa_info->fib_priority == fi->fib_priority) { | ||
1241 | struct fib_alias *fa_orig; | 1159 | struct fib_alias *fa_orig; |
1242 | 1160 | ||
1243 | err = -EEXIST; | 1161 | err = -EEXIST; |
@@ -1248,22 +1166,27 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
1248 | struct fib_info *fi_drop; | 1166 | struct fib_info *fi_drop; |
1249 | u8 state; | 1167 | u8 state; |
1250 | 1168 | ||
1251 | write_lock_bh(&fib_lock); | 1169 | err = -ENOBUFS; |
1170 | new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); | ||
1171 | if (new_fa == NULL) | ||
1172 | goto out; | ||
1252 | 1173 | ||
1253 | fi_drop = fa->fa_info; | 1174 | fi_drop = fa->fa_info; |
1254 | fa->fa_info = fi; | 1175 | new_fa->fa_tos = fa->fa_tos; |
1255 | fa->fa_type = type; | 1176 | new_fa->fa_info = fi; |
1256 | fa->fa_scope = r->rtm_scope; | 1177 | new_fa->fa_type = type; |
1178 | new_fa->fa_scope = r->rtm_scope; | ||
1257 | state = fa->fa_state; | 1179 | state = fa->fa_state; |
1258 | fa->fa_state &= ~FA_S_ACCESSED; | 1180 | new_fa->fa_state &= ~FA_S_ACCESSED; |
1259 | 1181 | ||
1260 | write_unlock_bh(&fib_lock); | 1182 | list_replace_rcu(&fa->fa_list, &new_fa->fa_list); |
1183 | alias_free_mem_rcu(fa); | ||
1261 | 1184 | ||
1262 | fib_release_info(fi_drop); | 1185 | fib_release_info(fi_drop); |
1263 | if (state & FA_S_ACCESSED) | 1186 | if (state & FA_S_ACCESSED) |
1264 | rt_cache_flush(-1); | 1187 | rt_cache_flush(-1); |
1265 | 1188 | ||
1266 | goto succeeded; | 1189 | goto succeeded; |
1267 | } | 1190 | } |
1268 | /* Error if we find a perfect match which | 1191 | /* Error if we find a perfect match which |
1269 | * uses the same scope, type, and nexthop | 1192 | * uses the same scope, type, and nexthop |
@@ -1285,7 +1208,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
1285 | fa = fa_orig; | 1208 | fa = fa_orig; |
1286 | } | 1209 | } |
1287 | err = -ENOENT; | 1210 | err = -ENOENT; |
1288 | if (!(nlhdr->nlmsg_flags&NLM_F_CREATE)) | 1211 | if (!(nlhdr->nlmsg_flags & NLM_F_CREATE)) |
1289 | goto out; | 1212 | goto out; |
1290 | 1213 | ||
1291 | err = -ENOBUFS; | 1214 | err = -ENOBUFS; |
@@ -1298,9 +1221,6 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
1298 | new_fa->fa_type = type; | 1221 | new_fa->fa_type = type; |
1299 | new_fa->fa_scope = r->rtm_scope; | 1222 | new_fa->fa_scope = r->rtm_scope; |
1300 | new_fa->fa_state = 0; | 1223 | new_fa->fa_state = 0; |
1301 | #if 0 | ||
1302 | new_fa->dst = NULL; | ||
1303 | #endif | ||
1304 | /* | 1224 | /* |
1305 | * Insert new entry to the list. | 1225 | * Insert new entry to the list. |
1306 | */ | 1226 | */ |
@@ -1312,12 +1232,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
1312 | goto out_free_new_fa; | 1232 | goto out_free_new_fa; |
1313 | } | 1233 | } |
1314 | 1234 | ||
1315 | write_lock_bh(&fib_lock); | 1235 | list_add_tail_rcu(&new_fa->fa_list, |
1316 | 1236 | (fa ? &fa->fa_list : fa_head)); | |
1317 | list_add_tail(&new_fa->fa_list, | ||
1318 | (fa ? &fa->fa_list : fa_head)); | ||
1319 | |||
1320 | write_unlock_bh(&fib_lock); | ||
1321 | 1237 | ||
1322 | rt_cache_flush(-1); | 1238 | rt_cache_flush(-1); |
1323 | rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); | 1239 | rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); |
@@ -1328,38 +1244,40 @@ out_free_new_fa: | |||
1328 | kmem_cache_free(fn_alias_kmem, new_fa); | 1244 | kmem_cache_free(fn_alias_kmem, new_fa); |
1329 | out: | 1245 | out: |
1330 | fib_release_info(fi); | 1246 | fib_release_info(fi); |
1331 | err:; | 1247 | err: |
1332 | return err; | 1248 | return err; |
1333 | } | 1249 | } |
1334 | 1250 | ||
1335 | static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, | 1251 | |
1336 | struct fib_result *res, int *err) | 1252 | /* should be clalled with rcu_read_lock */ |
1253 | static inline int check_leaf(struct trie *t, struct leaf *l, | ||
1254 | t_key key, int *plen, const struct flowi *flp, | ||
1255 | struct fib_result *res) | ||
1337 | { | 1256 | { |
1338 | int i; | 1257 | int err, i; |
1339 | t_key mask; | 1258 | t_key mask; |
1340 | struct leaf_info *li; | 1259 | struct leaf_info *li; |
1341 | struct hlist_head *hhead = &l->list; | 1260 | struct hlist_head *hhead = &l->list; |
1342 | struct hlist_node *node; | 1261 | struct hlist_node *node; |
1343 | 1262 | ||
1344 | hlist_for_each_entry(li, node, hhead, hlist) { | 1263 | hlist_for_each_entry_rcu(li, node, hhead, hlist) { |
1345 | |||
1346 | i = li->plen; | 1264 | i = li->plen; |
1347 | mask = ntohl(inet_make_mask(i)); | 1265 | mask = ntohl(inet_make_mask(i)); |
1348 | if (l->key != (key & mask)) | 1266 | if (l->key != (key & mask)) |
1349 | continue; | 1267 | continue; |
1350 | 1268 | ||
1351 | if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) { | 1269 | if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) { |
1352 | *plen = i; | 1270 | *plen = i; |
1353 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 1271 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
1354 | t->stats.semantic_match_passed++; | 1272 | t->stats.semantic_match_passed++; |
1355 | #endif | 1273 | #endif |
1356 | return 1; | 1274 | return err; |
1357 | } | 1275 | } |
1358 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 1276 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
1359 | t->stats.semantic_match_miss++; | 1277 | t->stats.semantic_match_miss++; |
1360 | #endif | 1278 | #endif |
1361 | } | 1279 | } |
1362 | return 0; | 1280 | return 1; |
1363 | } | 1281 | } |
1364 | 1282 | ||
1365 | static int | 1283 | static int |
@@ -1370,13 +1288,17 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result | |||
1370 | struct node *n; | 1288 | struct node *n; |
1371 | struct tnode *pn; | 1289 | struct tnode *pn; |
1372 | int pos, bits; | 1290 | int pos, bits; |
1373 | t_key key=ntohl(flp->fl4_dst); | 1291 | t_key key = ntohl(flp->fl4_dst); |
1374 | int chopped_off; | 1292 | int chopped_off; |
1375 | t_key cindex = 0; | 1293 | t_key cindex = 0; |
1376 | int current_prefix_length = KEYLENGTH; | 1294 | int current_prefix_length = KEYLENGTH; |
1377 | n = t->trie; | 1295 | struct tnode *cn; |
1296 | t_key node_prefix, key_prefix, pref_mismatch; | ||
1297 | int mp; | ||
1298 | |||
1299 | rcu_read_lock(); | ||
1378 | 1300 | ||
1379 | read_lock(&fib_lock); | 1301 | n = rcu_dereference(t->trie); |
1380 | if (!n) | 1302 | if (!n) |
1381 | goto failed; | 1303 | goto failed; |
1382 | 1304 | ||
@@ -1386,15 +1308,14 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result | |||
1386 | 1308 | ||
1387 | /* Just a leaf? */ | 1309 | /* Just a leaf? */ |
1388 | if (IS_LEAF(n)) { | 1310 | if (IS_LEAF(n)) { |
1389 | if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) | 1311 | if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) |
1390 | goto found; | 1312 | goto found; |
1391 | goto failed; | 1313 | goto failed; |
1392 | } | 1314 | } |
1393 | pn = (struct tnode *) n; | 1315 | pn = (struct tnode *) n; |
1394 | chopped_off = 0; | 1316 | chopped_off = 0; |
1395 | 1317 | ||
1396 | while (pn) { | 1318 | while (pn) { |
1397 | |||
1398 | pos = pn->pos; | 1319 | pos = pn->pos; |
1399 | bits = pn->bits; | 1320 | bits = pn->bits; |
1400 | 1321 | ||
@@ -1410,130 +1331,129 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result | |||
1410 | goto backtrace; | 1331 | goto backtrace; |
1411 | } | 1332 | } |
1412 | 1333 | ||
1413 | if (IS_TNODE(n)) { | 1334 | if (IS_LEAF(n)) { |
1335 | if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) | ||
1336 | goto found; | ||
1337 | else | ||
1338 | goto backtrace; | ||
1339 | } | ||
1340 | |||
1414 | #define HL_OPTIMIZE | 1341 | #define HL_OPTIMIZE |
1415 | #ifdef HL_OPTIMIZE | 1342 | #ifdef HL_OPTIMIZE |
1416 | struct tnode *cn = (struct tnode *)n; | 1343 | cn = (struct tnode *)n; |
1417 | t_key node_prefix, key_prefix, pref_mismatch; | ||
1418 | int mp; | ||
1419 | 1344 | ||
1420 | /* | 1345 | /* |
1421 | * It's a tnode, and we can do some extra checks here if we | 1346 | * It's a tnode, and we can do some extra checks here if we |
1422 | * like, to avoid descending into a dead-end branch. | 1347 | * like, to avoid descending into a dead-end branch. |
1423 | * This tnode is in the parent's child array at index | 1348 | * This tnode is in the parent's child array at index |
1424 | * key[p_pos..p_pos+p_bits] but potentially with some bits | 1349 | * key[p_pos..p_pos+p_bits] but potentially with some bits |
1425 | * chopped off, so in reality the index may be just a | 1350 | * chopped off, so in reality the index may be just a |
1426 | * subprefix, padded with zero at the end. | 1351 | * subprefix, padded with zero at the end. |
1427 | * We can also take a look at any skipped bits in this | 1352 | * We can also take a look at any skipped bits in this |
1428 | * tnode - everything up to p_pos is supposed to be ok, | 1353 | * tnode - everything up to p_pos is supposed to be ok, |
1429 | * and the non-chopped bits of the index (se previous | 1354 | * and the non-chopped bits of the index (se previous |
1430 | * paragraph) are also guaranteed ok, but the rest is | 1355 | * paragraph) are also guaranteed ok, but the rest is |
1431 | * considered unknown. | 1356 | * considered unknown. |
1432 | * | 1357 | * |
1433 | * The skipped bits are key[pos+bits..cn->pos]. | 1358 | * The skipped bits are key[pos+bits..cn->pos]. |
1434 | */ | 1359 | */ |
1435 | |||
1436 | /* If current_prefix_length < pos+bits, we are already doing | ||
1437 | * actual prefix matching, which means everything from | ||
1438 | * pos+(bits-chopped_off) onward must be zero along some | ||
1439 | * branch of this subtree - otherwise there is *no* valid | ||
1440 | * prefix present. Here we can only check the skipped | ||
1441 | * bits. Remember, since we have already indexed into the | ||
1442 | * parent's child array, we know that the bits we chopped of | ||
1443 | * *are* zero. | ||
1444 | */ | ||
1445 | 1360 | ||
1446 | /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ | 1361 | /* If current_prefix_length < pos+bits, we are already doing |
1447 | 1362 | * actual prefix matching, which means everything from | |
1448 | if (current_prefix_length < pos+bits) { | 1363 | * pos+(bits-chopped_off) onward must be zero along some |
1449 | if (tkey_extract_bits(cn->key, current_prefix_length, | 1364 | * branch of this subtree - otherwise there is *no* valid |
1450 | cn->pos - current_prefix_length) != 0 || | 1365 | * prefix present. Here we can only check the skipped |
1451 | !(cn->child[0])) | 1366 | * bits. Remember, since we have already indexed into the |
1452 | goto backtrace; | 1367 | * parent's child array, we know that the bits we chopped of |
1453 | } | 1368 | * *are* zero. |
1369 | */ | ||
1454 | 1370 | ||
1455 | /* | 1371 | /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ |
1456 | * If chopped_off=0, the index is fully validated and we | ||
1457 | * only need to look at the skipped bits for this, the new, | ||
1458 | * tnode. What we actually want to do is to find out if | ||
1459 | * these skipped bits match our key perfectly, or if we will | ||
1460 | * have to count on finding a matching prefix further down, | ||
1461 | * because if we do, we would like to have some way of | ||
1462 | * verifying the existence of such a prefix at this point. | ||
1463 | */ | ||
1464 | 1372 | ||
1465 | /* The only thing we can do at this point is to verify that | 1373 | if (current_prefix_length < pos+bits) { |
1466 | * any such matching prefix can indeed be a prefix to our | 1374 | if (tkey_extract_bits(cn->key, current_prefix_length, |
1467 | * key, and if the bits in the node we are inspecting that | 1375 | cn->pos - current_prefix_length) != 0 || |
1468 | * do not match our key are not ZERO, this cannot be true. | 1376 | !(cn->child[0])) |
1469 | * Thus, find out where there is a mismatch (before cn->pos) | 1377 | goto backtrace; |
1470 | * and verify that all the mismatching bits are zero in the | 1378 | } |
1471 | * new tnode's key. | ||
1472 | */ | ||
1473 | 1379 | ||
1474 | /* Note: We aren't very concerned about the piece of the key | 1380 | /* |
1475 | * that precede pn->pos+pn->bits, since these have already been | 1381 | * If chopped_off=0, the index is fully validated and we |
1476 | * checked. The bits after cn->pos aren't checked since these are | 1382 | * only need to look at the skipped bits for this, the new, |
1477 | * by definition "unknown" at this point. Thus, what we want to | 1383 | * tnode. What we actually want to do is to find out if |
1478 | * see is if we are about to enter the "prefix matching" state, | 1384 | * these skipped bits match our key perfectly, or if we will |
1479 | * and in that case verify that the skipped bits that will prevail | 1385 | * have to count on finding a matching prefix further down, |
1480 | * throughout this subtree are zero, as they have to be if we are | 1386 | * because if we do, we would like to have some way of |
1481 | * to find a matching prefix. | 1387 | * verifying the existence of such a prefix at this point. |
1482 | */ | 1388 | */ |
1483 | 1389 | ||
1484 | node_prefix = MASK_PFX(cn->key, cn->pos); | 1390 | /* The only thing we can do at this point is to verify that |
1485 | key_prefix = MASK_PFX(key, cn->pos); | 1391 | * any such matching prefix can indeed be a prefix to our |
1486 | pref_mismatch = key_prefix^node_prefix; | 1392 | * key, and if the bits in the node we are inspecting that |
1487 | mp = 0; | 1393 | * do not match our key are not ZERO, this cannot be true. |
1394 | * Thus, find out where there is a mismatch (before cn->pos) | ||
1395 | * and verify that all the mismatching bits are zero in the | ||
1396 | * new tnode's key. | ||
1397 | */ | ||
1488 | 1398 | ||
1489 | /* In short: If skipped bits in this node do not match the search | 1399 | /* Note: We aren't very concerned about the piece of the key |
1490 | * key, enter the "prefix matching" state.directly. | 1400 | * that precede pn->pos+pn->bits, since these have already been |
1491 | */ | 1401 | * checked. The bits after cn->pos aren't checked since these are |
1492 | if (pref_mismatch) { | 1402 | * by definition "unknown" at this point. Thus, what we want to |
1493 | while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { | 1403 | * see is if we are about to enter the "prefix matching" state, |
1494 | mp++; | 1404 | * and in that case verify that the skipped bits that will prevail |
1495 | pref_mismatch = pref_mismatch <<1; | 1405 | * throughout this subtree are zero, as they have to be if we are |
1496 | } | 1406 | * to find a matching prefix. |
1497 | key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); | 1407 | */ |
1498 | 1408 | ||
1499 | if (key_prefix != 0) | 1409 | node_prefix = MASK_PFX(cn->key, cn->pos); |
1500 | goto backtrace; | 1410 | key_prefix = MASK_PFX(key, cn->pos); |
1501 | 1411 | pref_mismatch = key_prefix^node_prefix; | |
1502 | if (current_prefix_length >= cn->pos) | 1412 | mp = 0; |
1503 | current_prefix_length=mp; | 1413 | |
1504 | } | 1414 | /* In short: If skipped bits in this node do not match the search |
1505 | #endif | 1415 | * key, enter the "prefix matching" state.directly. |
1506 | pn = (struct tnode *)n; /* Descend */ | 1416 | */ |
1507 | chopped_off = 0; | 1417 | if (pref_mismatch) { |
1508 | continue; | 1418 | while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { |
1419 | mp++; | ||
1420 | pref_mismatch = pref_mismatch <<1; | ||
1421 | } | ||
1422 | key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); | ||
1423 | |||
1424 | if (key_prefix != 0) | ||
1425 | goto backtrace; | ||
1426 | |||
1427 | if (current_prefix_length >= cn->pos) | ||
1428 | current_prefix_length = mp; | ||
1509 | } | 1429 | } |
1510 | if (IS_LEAF(n)) { | 1430 | #endif |
1511 | if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) | 1431 | pn = (struct tnode *)n; /* Descend */ |
1512 | goto found; | 1432 | chopped_off = 0; |
1513 | } | 1433 | continue; |
1434 | |||
1514 | backtrace: | 1435 | backtrace: |
1515 | chopped_off++; | 1436 | chopped_off++; |
1516 | 1437 | ||
1517 | /* As zero don't change the child key (cindex) */ | 1438 | /* As zero don't change the child key (cindex) */ |
1518 | while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) { | 1439 | while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) |
1519 | chopped_off++; | 1440 | chopped_off++; |
1520 | } | ||
1521 | 1441 | ||
1522 | /* Decrease current_... with bits chopped off */ | 1442 | /* Decrease current_... with bits chopped off */ |
1523 | if (current_prefix_length > pn->pos + pn->bits - chopped_off) | 1443 | if (current_prefix_length > pn->pos + pn->bits - chopped_off) |
1524 | current_prefix_length = pn->pos + pn->bits - chopped_off; | 1444 | current_prefix_length = pn->pos + pn->bits - chopped_off; |
1525 | 1445 | ||
1526 | /* | 1446 | /* |
1527 | * Either we do the actual chop off according or if we have | 1447 | * Either we do the actual chop off according or if we have |
1528 | * chopped off all bits in this tnode walk up to our parent. | 1448 | * chopped off all bits in this tnode walk up to our parent. |
1529 | */ | 1449 | */ |
1530 | 1450 | ||
1531 | if (chopped_off <= pn->bits) | 1451 | if (chopped_off <= pn->bits) { |
1532 | cindex &= ~(1 << (chopped_off-1)); | 1452 | cindex &= ~(1 << (chopped_off-1)); |
1533 | else { | 1453 | } else { |
1534 | if (NODE_PARENT(pn) == NULL) | 1454 | if (NODE_PARENT(pn) == NULL) |
1535 | goto failed; | 1455 | goto failed; |
1536 | 1456 | ||
1537 | /* Get Child's index */ | 1457 | /* Get Child's index */ |
1538 | cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); | 1458 | cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); |
1539 | pn = NODE_PARENT(pn); | 1459 | pn = NODE_PARENT(pn); |
@@ -1548,10 +1468,11 @@ backtrace: | |||
1548 | failed: | 1468 | failed: |
1549 | ret = 1; | 1469 | ret = 1; |
1550 | found: | 1470 | found: |
1551 | read_unlock(&fib_lock); | 1471 | rcu_read_unlock(); |
1552 | return ret; | 1472 | return ret; |
1553 | } | 1473 | } |
1554 | 1474 | ||
1475 | /* only called from updater side */ | ||
1555 | static int trie_leaf_remove(struct trie *t, t_key key) | 1476 | static int trie_leaf_remove(struct trie *t, t_key key) |
1556 | { | 1477 | { |
1557 | t_key cindex; | 1478 | t_key cindex; |
@@ -1559,24 +1480,20 @@ static int trie_leaf_remove(struct trie *t, t_key key) | |||
1559 | struct node *n = t->trie; | 1480 | struct node *n = t->trie; |
1560 | struct leaf *l; | 1481 | struct leaf *l; |
1561 | 1482 | ||
1562 | if (trie_debug) | 1483 | pr_debug("entering trie_leaf_remove(%p)\n", n); |
1563 | printk("entering trie_leaf_remove(%p)\n", n); | ||
1564 | 1484 | ||
1565 | /* Note that in the case skipped bits, those bits are *not* checked! | 1485 | /* Note that in the case skipped bits, those bits are *not* checked! |
1566 | * When we finish this, we will have NULL or a T_LEAF, and the | 1486 | * When we finish this, we will have NULL or a T_LEAF, and the |
1567 | * T_LEAF may or may not match our key. | 1487 | * T_LEAF may or may not match our key. |
1568 | */ | 1488 | */ |
1569 | 1489 | ||
1570 | while (n != NULL && IS_TNODE(n)) { | 1490 | while (n != NULL && IS_TNODE(n)) { |
1571 | struct tnode *tn = (struct tnode *) n; | 1491 | struct tnode *tn = (struct tnode *) n; |
1572 | check_tnode(tn); | 1492 | check_tnode(tn); |
1573 | n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); | 1493 | n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); |
1574 | 1494 | ||
1575 | if (n && NODE_PARENT(n) != tn) { | 1495 | BUG_ON(n && NODE_PARENT(n) != tn); |
1576 | printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); | 1496 | } |
1577 | BUG(); | ||
1578 | } | ||
1579 | } | ||
1580 | l = (struct leaf *) n; | 1497 | l = (struct leaf *) n; |
1581 | 1498 | ||
1582 | if (!n || !tkey_equals(l->key, key)) | 1499 | if (!n || !tkey_equals(l->key, key)) |
@@ -1590,23 +1507,24 @@ static int trie_leaf_remove(struct trie *t, t_key key) | |||
1590 | t->revision++; | 1507 | t->revision++; |
1591 | t->size--; | 1508 | t->size--; |
1592 | 1509 | ||
1510 | preempt_disable(); | ||
1593 | tp = NODE_PARENT(n); | 1511 | tp = NODE_PARENT(n); |
1594 | tnode_free((struct tnode *) n); | 1512 | tnode_free((struct tnode *) n); |
1595 | 1513 | ||
1596 | if (tp) { | 1514 | if (tp) { |
1597 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1515 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
1598 | put_child(t, (struct tnode *)tp, cindex, NULL); | 1516 | put_child(t, (struct tnode *)tp, cindex, NULL); |
1599 | t->trie = trie_rebalance(t, tp); | 1517 | rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); |
1600 | } | 1518 | } else |
1601 | else | 1519 | rcu_assign_pointer(t->trie, NULL); |
1602 | t->trie = NULL; | 1520 | preempt_enable(); |
1603 | 1521 | ||
1604 | return 1; | 1522 | return 1; |
1605 | } | 1523 | } |
1606 | 1524 | ||
1607 | static int | 1525 | static int |
1608 | fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | 1526 | fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, |
1609 | struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) | 1527 | struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) |
1610 | { | 1528 | { |
1611 | struct trie *t = (struct trie *) tb->tb_data; | 1529 | struct trie *t = (struct trie *) tb->tb_data; |
1612 | u32 key, mask; | 1530 | u32 key, mask; |
@@ -1615,6 +1533,8 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
1615 | struct fib_alias *fa, *fa_to_delete; | 1533 | struct fib_alias *fa, *fa_to_delete; |
1616 | struct list_head *fa_head; | 1534 | struct list_head *fa_head; |
1617 | struct leaf *l; | 1535 | struct leaf *l; |
1536 | struct leaf_info *li; | ||
1537 | |||
1618 | 1538 | ||
1619 | if (plen > 32) | 1539 | if (plen > 32) |
1620 | return -EINVAL; | 1540 | return -EINVAL; |
@@ -1624,7 +1544,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
1624 | memcpy(&key, rta->rta_dst, 4); | 1544 | memcpy(&key, rta->rta_dst, 4); |
1625 | 1545 | ||
1626 | key = ntohl(key); | 1546 | key = ntohl(key); |
1627 | mask = ntohl( inet_make_mask(plen) ); | 1547 | mask = ntohl(inet_make_mask(plen)); |
1628 | 1548 | ||
1629 | if (key & ~mask) | 1549 | if (key & ~mask) |
1630 | return -EINVAL; | 1550 | return -EINVAL; |
@@ -1641,11 +1561,11 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
1641 | if (!fa) | 1561 | if (!fa) |
1642 | return -ESRCH; | 1562 | return -ESRCH; |
1643 | 1563 | ||
1644 | if (trie_debug) | 1564 | pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); |
1645 | printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); | ||
1646 | 1565 | ||
1647 | fa_to_delete = NULL; | 1566 | fa_to_delete = NULL; |
1648 | fa_head = fa->fa_list.prev; | 1567 | fa_head = fa->fa_list.prev; |
1568 | |||
1649 | list_for_each_entry(fa, fa_head, fa_list) { | 1569 | list_for_each_entry(fa, fa_head, fa_list) { |
1650 | struct fib_info *fi = fa->fa_info; | 1570 | struct fib_info *fi = fa->fa_info; |
1651 | 1571 | ||
@@ -1664,39 +1584,31 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, | |||
1664 | } | 1584 | } |
1665 | } | 1585 | } |
1666 | 1586 | ||
1667 | if (fa_to_delete) { | 1587 | if (!fa_to_delete) |
1668 | int kill_li = 0; | 1588 | return -ESRCH; |
1669 | struct leaf_info *li; | ||
1670 | |||
1671 | fa = fa_to_delete; | ||
1672 | rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); | ||
1673 | 1589 | ||
1674 | l = fib_find_node(t, key); | 1590 | fa = fa_to_delete; |
1675 | li = find_leaf_info(&l->list, plen); | 1591 | rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); |
1676 | 1592 | ||
1677 | write_lock_bh(&fib_lock); | 1593 | l = fib_find_node(t, key); |
1594 | li = find_leaf_info(&l->list, plen); | ||
1678 | 1595 | ||
1679 | list_del(&fa->fa_list); | 1596 | list_del_rcu(&fa->fa_list); |
1680 | 1597 | ||
1681 | if (list_empty(fa_head)) { | 1598 | if (list_empty(fa_head)) { |
1682 | hlist_del(&li->hlist); | 1599 | hlist_del_rcu(&li->hlist); |
1683 | kill_li = 1; | 1600 | free_leaf_info(li); |
1684 | } | 1601 | } |
1685 | write_unlock_bh(&fib_lock); | ||
1686 | |||
1687 | if (kill_li) | ||
1688 | free_leaf_info(li); | ||
1689 | 1602 | ||
1690 | if (hlist_empty(&l->list)) | 1603 | if (hlist_empty(&l->list)) |
1691 | trie_leaf_remove(t, key); | 1604 | trie_leaf_remove(t, key); |
1692 | 1605 | ||
1693 | if (fa->fa_state & FA_S_ACCESSED) | 1606 | if (fa->fa_state & FA_S_ACCESSED) |
1694 | rt_cache_flush(-1); | 1607 | rt_cache_flush(-1); |
1695 | 1608 | ||
1696 | fn_free_alias(fa); | 1609 | fib_release_info(fa->fa_info); |
1697 | return 0; | 1610 | alias_free_mem_rcu(fa); |
1698 | } | 1611 | return 0; |
1699 | return -ESRCH; | ||
1700 | } | 1612 | } |
1701 | 1613 | ||
1702 | static int trie_flush_list(struct trie *t, struct list_head *head) | 1614 | static int trie_flush_list(struct trie *t, struct list_head *head) |
@@ -1706,14 +1618,11 @@ static int trie_flush_list(struct trie *t, struct list_head *head) | |||
1706 | 1618 | ||
1707 | list_for_each_entry_safe(fa, fa_node, head, fa_list) { | 1619 | list_for_each_entry_safe(fa, fa_node, head, fa_list) { |
1708 | struct fib_info *fi = fa->fa_info; | 1620 | struct fib_info *fi = fa->fa_info; |
1709 | |||
1710 | if (fi && (fi->fib_flags&RTNH_F_DEAD)) { | ||
1711 | |||
1712 | write_lock_bh(&fib_lock); | ||
1713 | list_del(&fa->fa_list); | ||
1714 | write_unlock_bh(&fib_lock); | ||
1715 | 1621 | ||
1716 | fn_free_alias(fa); | 1622 | if (fi && (fi->fib_flags & RTNH_F_DEAD)) { |
1623 | list_del_rcu(&fa->fa_list); | ||
1624 | fib_release_info(fa->fa_info); | ||
1625 | alias_free_mem_rcu(fa); | ||
1717 | found++; | 1626 | found++; |
1718 | } | 1627 | } |
1719 | } | 1628 | } |
@@ -1728,37 +1637,34 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l) | |||
1728 | struct leaf_info *li = NULL; | 1637 | struct leaf_info *li = NULL; |
1729 | 1638 | ||
1730 | hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { | 1639 | hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { |
1731 | |||
1732 | found += trie_flush_list(t, &li->falh); | 1640 | found += trie_flush_list(t, &li->falh); |
1733 | 1641 | ||
1734 | if (list_empty(&li->falh)) { | 1642 | if (list_empty(&li->falh)) { |
1735 | 1643 | hlist_del_rcu(&li->hlist); | |
1736 | write_lock_bh(&fib_lock); | ||
1737 | hlist_del(&li->hlist); | ||
1738 | write_unlock_bh(&fib_lock); | ||
1739 | |||
1740 | free_leaf_info(li); | 1644 | free_leaf_info(li); |
1741 | } | 1645 | } |
1742 | } | 1646 | } |
1743 | return found; | 1647 | return found; |
1744 | } | 1648 | } |
1745 | 1649 | ||
1650 | /* rcu_read_lock needs to be hold by caller from readside */ | ||
1651 | |||
1746 | static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) | 1652 | static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) |
1747 | { | 1653 | { |
1748 | struct node *c = (struct node *) thisleaf; | 1654 | struct node *c = (struct node *) thisleaf; |
1749 | struct tnode *p; | 1655 | struct tnode *p; |
1750 | int idx; | 1656 | int idx; |
1657 | struct node *trie = rcu_dereference(t->trie); | ||
1751 | 1658 | ||
1752 | if (c == NULL) { | 1659 | if (c == NULL) { |
1753 | if (t->trie == NULL) | 1660 | if (trie == NULL) |
1754 | return NULL; | 1661 | return NULL; |
1755 | 1662 | ||
1756 | if (IS_LEAF(t->trie)) /* trie w. just a leaf */ | 1663 | if (IS_LEAF(trie)) /* trie w. just a leaf */ |
1757 | return (struct leaf *) t->trie; | 1664 | return (struct leaf *) trie; |
1758 | 1665 | ||
1759 | p = (struct tnode*) t->trie; /* Start */ | 1666 | p = (struct tnode*) trie; /* Start */ |
1760 | } | 1667 | } else |
1761 | else | ||
1762 | p = (struct tnode *) NODE_PARENT(c); | 1668 | p = (struct tnode *) NODE_PARENT(c); |
1763 | 1669 | ||
1764 | while (p) { | 1670 | while (p) { |
@@ -1771,29 +1677,31 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) | |||
1771 | pos = 0; | 1677 | pos = 0; |
1772 | 1678 | ||
1773 | last = 1 << p->bits; | 1679 | last = 1 << p->bits; |
1774 | for(idx = pos; idx < last ; idx++) { | 1680 | for (idx = pos; idx < last ; idx++) { |
1775 | if (p->child[idx]) { | 1681 | c = rcu_dereference(p->child[idx]); |
1776 | 1682 | ||
1777 | /* Decend if tnode */ | 1683 | if (!c) |
1778 | 1684 | continue; | |
1779 | while (IS_TNODE(p->child[idx])) { | 1685 | |
1780 | p = (struct tnode*) p->child[idx]; | 1686 | /* Decend if tnode */ |
1781 | idx = 0; | 1687 | while (IS_TNODE(c)) { |
1782 | 1688 | p = (struct tnode *) c; | |
1783 | /* Rightmost non-NULL branch */ | 1689 | idx = 0; |
1784 | if (p && IS_TNODE(p)) | 1690 | |
1785 | while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++; | 1691 | /* Rightmost non-NULL branch */ |
1786 | 1692 | if (p && IS_TNODE(p)) | |
1787 | /* Done with this tnode? */ | 1693 | while (!(c = rcu_dereference(p->child[idx])) |
1788 | if (idx >= (1 << p->bits) || p->child[idx] == NULL ) | 1694 | && idx < (1<<p->bits)) idx++; |
1789 | goto up; | 1695 | |
1790 | } | 1696 | /* Done with this tnode? */ |
1791 | return (struct leaf*) p->child[idx]; | 1697 | if (idx >= (1 << p->bits) || !c) |
1698 | goto up; | ||
1792 | } | 1699 | } |
1700 | return (struct leaf *) c; | ||
1793 | } | 1701 | } |
1794 | up: | 1702 | up: |
1795 | /* No more children go up one step */ | 1703 | /* No more children go up one step */ |
1796 | c = (struct node*) p; | 1704 | c = (struct node *) p; |
1797 | p = (struct tnode *) NODE_PARENT(p); | 1705 | p = (struct tnode *) NODE_PARENT(p); |
1798 | } | 1706 | } |
1799 | return NULL; /* Ready. Root of trie */ | 1707 | return NULL; /* Ready. Root of trie */ |
@@ -1807,23 +1715,24 @@ static int fn_trie_flush(struct fib_table *tb) | |||
1807 | 1715 | ||
1808 | t->revision++; | 1716 | t->revision++; |
1809 | 1717 | ||
1810 | for (h=0; (l = nextleaf(t, l)) != NULL; h++) { | 1718 | rcu_read_lock(); |
1719 | for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { | ||
1811 | found += trie_flush_leaf(t, l); | 1720 | found += trie_flush_leaf(t, l); |
1812 | 1721 | ||
1813 | if (ll && hlist_empty(&ll->list)) | 1722 | if (ll && hlist_empty(&ll->list)) |
1814 | trie_leaf_remove(t, ll->key); | 1723 | trie_leaf_remove(t, ll->key); |
1815 | ll = l; | 1724 | ll = l; |
1816 | } | 1725 | } |
1726 | rcu_read_unlock(); | ||
1817 | 1727 | ||
1818 | if (ll && hlist_empty(&ll->list)) | 1728 | if (ll && hlist_empty(&ll->list)) |
1819 | trie_leaf_remove(t, ll->key); | 1729 | trie_leaf_remove(t, ll->key); |
1820 | 1730 | ||
1821 | if (trie_debug) | 1731 | pr_debug("trie_flush found=%d\n", found); |
1822 | printk("trie_flush found=%d\n", found); | ||
1823 | return found; | 1732 | return found; |
1824 | } | 1733 | } |
1825 | 1734 | ||
1826 | static int trie_last_dflt=-1; | 1735 | static int trie_last_dflt = -1; |
1827 | 1736 | ||
1828 | static void | 1737 | static void |
1829 | fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) | 1738 | fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) |
@@ -1840,7 +1749,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib | |||
1840 | last_resort = NULL; | 1749 | last_resort = NULL; |
1841 | order = -1; | 1750 | order = -1; |
1842 | 1751 | ||
1843 | read_lock(&fib_lock); | 1752 | rcu_read_lock(); |
1844 | 1753 | ||
1845 | l = fib_find_node(t, 0); | 1754 | l = fib_find_node(t, 0); |
1846 | if (!l) | 1755 | if (!l) |
@@ -1853,20 +1762,20 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib | |||
1853 | if (list_empty(fa_head)) | 1762 | if (list_empty(fa_head)) |
1854 | goto out; | 1763 | goto out; |
1855 | 1764 | ||
1856 | list_for_each_entry(fa, fa_head, fa_list) { | 1765 | list_for_each_entry_rcu(fa, fa_head, fa_list) { |
1857 | struct fib_info *next_fi = fa->fa_info; | 1766 | struct fib_info *next_fi = fa->fa_info; |
1858 | 1767 | ||
1859 | if (fa->fa_scope != res->scope || | 1768 | if (fa->fa_scope != res->scope || |
1860 | fa->fa_type != RTN_UNICAST) | 1769 | fa->fa_type != RTN_UNICAST) |
1861 | continue; | 1770 | continue; |
1862 | 1771 | ||
1863 | if (next_fi->fib_priority > res->fi->fib_priority) | 1772 | if (next_fi->fib_priority > res->fi->fib_priority) |
1864 | break; | 1773 | break; |
1865 | if (!next_fi->fib_nh[0].nh_gw || | 1774 | if (!next_fi->fib_nh[0].nh_gw || |
1866 | next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) | 1775 | next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) |
1867 | continue; | 1776 | continue; |
1868 | fa->fa_state |= FA_S_ACCESSED; | 1777 | fa->fa_state |= FA_S_ACCESSED; |
1869 | 1778 | ||
1870 | if (fi == NULL) { | 1779 | if (fi == NULL) { |
1871 | if (next_fi != res->fi) | 1780 | if (next_fi != res->fi) |
1872 | break; | 1781 | break; |
@@ -1904,7 +1813,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib | |||
1904 | } | 1813 | } |
1905 | trie_last_dflt = last_idx; | 1814 | trie_last_dflt = last_idx; |
1906 | out:; | 1815 | out:; |
1907 | read_unlock(&fib_lock); | 1816 | rcu_read_unlock(); |
1908 | } | 1817 | } |
1909 | 1818 | ||
1910 | static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, | 1819 | static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, |
@@ -1913,12 +1822,14 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi | |||
1913 | int i, s_i; | 1822 | int i, s_i; |
1914 | struct fib_alias *fa; | 1823 | struct fib_alias *fa; |
1915 | 1824 | ||
1916 | u32 xkey=htonl(key); | 1825 | u32 xkey = htonl(key); |
1917 | 1826 | ||
1918 | s_i=cb->args[3]; | 1827 | s_i = cb->args[3]; |
1919 | i = 0; | 1828 | i = 0; |
1920 | 1829 | ||
1921 | list_for_each_entry(fa, fah, fa_list) { | 1830 | /* rcu_read_lock is hold by caller */ |
1831 | |||
1832 | list_for_each_entry_rcu(fa, fah, fa_list) { | ||
1922 | if (i < s_i) { | 1833 | if (i < s_i) { |
1923 | i++; | 1834 | i++; |
1924 | continue; | 1835 | continue; |
@@ -1946,10 +1857,10 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi | |||
1946 | fa->fa_info, 0) < 0) { | 1857 | fa->fa_info, 0) < 0) { |
1947 | cb->args[3] = i; | 1858 | cb->args[3] = i; |
1948 | return -1; | 1859 | return -1; |
1949 | } | 1860 | } |
1950 | i++; | 1861 | i++; |
1951 | } | 1862 | } |
1952 | cb->args[3]=i; | 1863 | cb->args[3] = i; |
1953 | return skb->len; | 1864 | return skb->len; |
1954 | } | 1865 | } |
1955 | 1866 | ||
@@ -1959,10 +1870,10 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str | |||
1959 | int h, s_h; | 1870 | int h, s_h; |
1960 | struct list_head *fa_head; | 1871 | struct list_head *fa_head; |
1961 | struct leaf *l = NULL; | 1872 | struct leaf *l = NULL; |
1962 | s_h=cb->args[2]; | ||
1963 | 1873 | ||
1964 | for (h=0; (l = nextleaf(t, l)) != NULL; h++) { | 1874 | s_h = cb->args[2]; |
1965 | 1875 | ||
1876 | for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { | ||
1966 | if (h < s_h) | 1877 | if (h < s_h) |
1967 | continue; | 1878 | continue; |
1968 | if (h > s_h) | 1879 | if (h > s_h) |
@@ -1970,7 +1881,7 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str | |||
1970 | sizeof(cb->args) - 3*sizeof(cb->args[0])); | 1881 | sizeof(cb->args) - 3*sizeof(cb->args[0])); |
1971 | 1882 | ||
1972 | fa_head = get_fa_head(l, plen); | 1883 | fa_head = get_fa_head(l, plen); |
1973 | 1884 | ||
1974 | if (!fa_head) | 1885 | if (!fa_head) |
1975 | continue; | 1886 | continue; |
1976 | 1887 | ||
@@ -1978,11 +1889,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str | |||
1978 | continue; | 1889 | continue; |
1979 | 1890 | ||
1980 | if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { | 1891 | if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { |
1981 | cb->args[2]=h; | 1892 | cb->args[2] = h; |
1982 | return -1; | 1893 | return -1; |
1983 | } | 1894 | } |
1984 | } | 1895 | } |
1985 | cb->args[2]=h; | 1896 | cb->args[2] = h; |
1986 | return skb->len; | 1897 | return skb->len; |
1987 | } | 1898 | } |
1988 | 1899 | ||
@@ -1993,25 +1904,24 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin | |||
1993 | 1904 | ||
1994 | s_m = cb->args[1]; | 1905 | s_m = cb->args[1]; |
1995 | 1906 | ||
1996 | read_lock(&fib_lock); | 1907 | rcu_read_lock(); |
1997 | for (m=0; m<=32; m++) { | 1908 | for (m = 0; m <= 32; m++) { |
1998 | |||
1999 | if (m < s_m) | 1909 | if (m < s_m) |
2000 | continue; | 1910 | continue; |
2001 | if (m > s_m) | 1911 | if (m > s_m) |
2002 | memset(&cb->args[2], 0, | 1912 | memset(&cb->args[2], 0, |
2003 | sizeof(cb->args) - 2*sizeof(cb->args[0])); | 1913 | sizeof(cb->args) - 2*sizeof(cb->args[0])); |
2004 | 1914 | ||
2005 | if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { | 1915 | if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { |
2006 | cb->args[1] = m; | 1916 | cb->args[1] = m; |
2007 | goto out; | 1917 | goto out; |
2008 | } | 1918 | } |
2009 | } | 1919 | } |
2010 | read_unlock(&fib_lock); | 1920 | rcu_read_unlock(); |
2011 | cb->args[1] = m; | 1921 | cb->args[1] = m; |
2012 | return skb->len; | 1922 | return skb->len; |
2013 | out: | 1923 | out: |
2014 | read_unlock(&fib_lock); | 1924 | rcu_read_unlock(); |
2015 | return -1; | 1925 | return -1; |
2016 | } | 1926 | } |
2017 | 1927 | ||
@@ -2051,9 +1961,9 @@ struct fib_table * __init fib_hash_init(int id) | |||
2051 | trie_init(t); | 1961 | trie_init(t); |
2052 | 1962 | ||
2053 | if (id == RT_TABLE_LOCAL) | 1963 | if (id == RT_TABLE_LOCAL) |
2054 | trie_local = t; | 1964 | trie_local = t; |
2055 | else if (id == RT_TABLE_MAIN) | 1965 | else if (id == RT_TABLE_MAIN) |
2056 | trie_main = t; | 1966 | trie_main = t; |
2057 | 1967 | ||
2058 | if (id == RT_TABLE_LOCAL) | 1968 | if (id == RT_TABLE_LOCAL) |
2059 | printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); | 1969 | printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); |
@@ -2065,7 +1975,8 @@ struct fib_table * __init fib_hash_init(int id) | |||
2065 | 1975 | ||
2066 | static void putspace_seq(struct seq_file *seq, int n) | 1976 | static void putspace_seq(struct seq_file *seq, int n) |
2067 | { | 1977 | { |
2068 | while (n--) seq_printf(seq, " "); | 1978 | while (n--) |
1979 | seq_printf(seq, " "); | ||
2069 | } | 1980 | } |
2070 | 1981 | ||
2071 | static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) | 1982 | static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) |
@@ -2086,29 +1997,22 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, | |||
2086 | seq_printf(seq, "%d/", cindex); | 1997 | seq_printf(seq, "%d/", cindex); |
2087 | printbin_seq(seq, cindex, bits); | 1998 | printbin_seq(seq, cindex, bits); |
2088 | seq_printf(seq, ": "); | 1999 | seq_printf(seq, ": "); |
2089 | } | 2000 | } else |
2090 | else | ||
2091 | seq_printf(seq, "<root>: "); | 2001 | seq_printf(seq, "<root>: "); |
2092 | seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); | 2002 | seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); |
2093 | 2003 | ||
2094 | if (IS_LEAF(n)) | ||
2095 | seq_printf(seq, "key=%d.%d.%d.%d\n", | ||
2096 | n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); | ||
2097 | else { | ||
2098 | int plen = ((struct tnode *)n)->pos; | ||
2099 | t_key prf=MASK_PFX(n->key, plen); | ||
2100 | seq_printf(seq, "key=%d.%d.%d.%d/%d\n", | ||
2101 | prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); | ||
2102 | } | ||
2103 | if (IS_LEAF(n)) { | 2004 | if (IS_LEAF(n)) { |
2104 | struct leaf *l=(struct leaf *)n; | 2005 | struct leaf *l = (struct leaf *)n; |
2105 | struct fib_alias *fa; | 2006 | struct fib_alias *fa; |
2106 | int i; | 2007 | int i; |
2107 | for (i=32; i>=0; i--) | 2008 | |
2108 | if (find_leaf_info(&l->list, i)) { | 2009 | seq_printf(seq, "key=%d.%d.%d.%d\n", |
2109 | 2010 | n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); | |
2011 | |||
2012 | for (i = 32; i >= 0; i--) | ||
2013 | if (find_leaf_info(&l->list, i)) { | ||
2110 | struct list_head *fa_head = get_fa_head(l, i); | 2014 | struct list_head *fa_head = get_fa_head(l, i); |
2111 | 2015 | ||
2112 | if (!fa_head) | 2016 | if (!fa_head) |
2113 | continue; | 2017 | continue; |
2114 | 2018 | ||
@@ -2118,17 +2022,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, | |||
2118 | putspace_seq(seq, indent+2); | 2022 | putspace_seq(seq, indent+2); |
2119 | seq_printf(seq, "{/%d...dumping}\n", i); | 2023 | seq_printf(seq, "{/%d...dumping}\n", i); |
2120 | 2024 | ||
2121 | 2025 | list_for_each_entry_rcu(fa, fa_head, fa_list) { | |
2122 | list_for_each_entry(fa, fa_head, fa_list) { | ||
2123 | putspace_seq(seq, indent+2); | 2026 | putspace_seq(seq, indent+2); |
2124 | if (fa->fa_info->fib_nh == NULL) { | ||
2125 | seq_printf(seq, "Error _fib_nh=NULL\n"); | ||
2126 | continue; | ||
2127 | } | ||
2128 | if (fa->fa_info == NULL) { | 2027 | if (fa->fa_info == NULL) { |
2129 | seq_printf(seq, "Error fa_info=NULL\n"); | 2028 | seq_printf(seq, "Error fa_info=NULL\n"); |
2130 | continue; | 2029 | continue; |
2131 | } | 2030 | } |
2031 | if (fa->fa_info->fib_nh == NULL) { | ||
2032 | seq_printf(seq, "Error _fib_nh=NULL\n"); | ||
2033 | continue; | ||
2034 | } | ||
2132 | 2035 | ||
2133 | seq_printf(seq, "{type=%d scope=%d TOS=%d}\n", | 2036 | seq_printf(seq, "{type=%d scope=%d TOS=%d}\n", |
2134 | fa->fa_type, | 2037 | fa->fa_type, |
@@ -2136,11 +2039,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, | |||
2136 | fa->fa_tos); | 2039 | fa->fa_tos); |
2137 | } | 2040 | } |
2138 | } | 2041 | } |
2139 | } | 2042 | } else { |
2140 | else if (IS_TNODE(n)) { | ||
2141 | struct tnode *tn = (struct tnode *)n; | 2043 | struct tnode *tn = (struct tnode *)n; |
2044 | int plen = ((struct tnode *)n)->pos; | ||
2045 | t_key prf = MASK_PFX(n->key, plen); | ||
2046 | |||
2047 | seq_printf(seq, "key=%d.%d.%d.%d/%d\n", | ||
2048 | prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); | ||
2049 | |||
2142 | putspace_seq(seq, indent); seq_printf(seq, "| "); | 2050 | putspace_seq(seq, indent); seq_printf(seq, "| "); |
2143 | seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos)); | 2051 | seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos)); |
2144 | printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); | 2052 | printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); |
2145 | seq_printf(seq, "}\n"); | 2053 | seq_printf(seq, "}\n"); |
2146 | putspace_seq(seq, indent); seq_printf(seq, "| "); | 2054 | putspace_seq(seq, indent); seq_printf(seq, "| "); |
@@ -2154,194 +2062,196 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, | |||
2154 | 2062 | ||
2155 | static void trie_dump_seq(struct seq_file *seq, struct trie *t) | 2063 | static void trie_dump_seq(struct seq_file *seq, struct trie *t) |
2156 | { | 2064 | { |
2157 | struct node *n = t->trie; | 2065 | struct node *n; |
2158 | int cindex=0; | 2066 | int cindex = 0; |
2159 | int indent=1; | 2067 | int indent = 1; |
2160 | int pend=0; | 2068 | int pend = 0; |
2161 | int depth = 0; | 2069 | int depth = 0; |
2070 | struct tnode *tn; | ||
2162 | 2071 | ||
2163 | read_lock(&fib_lock); | 2072 | rcu_read_lock(); |
2164 | 2073 | n = rcu_dereference(t->trie); | |
2165 | seq_printf(seq, "------ trie_dump of t=%p ------\n", t); | 2074 | seq_printf(seq, "------ trie_dump of t=%p ------\n", t); |
2166 | if (n) { | ||
2167 | printnode_seq(seq, indent, n, pend, cindex, 0); | ||
2168 | if (IS_TNODE(n)) { | ||
2169 | struct tnode *tn = (struct tnode *)n; | ||
2170 | pend = tn->pos+tn->bits; | ||
2171 | putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); | ||
2172 | indent += 3; | ||
2173 | depth++; | ||
2174 | |||
2175 | while (tn && cindex < (1 << tn->bits)) { | ||
2176 | if (tn->child[cindex]) { | ||
2177 | |||
2178 | /* Got a child */ | ||
2179 | |||
2180 | printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); | ||
2181 | if (IS_LEAF(tn->child[cindex])) { | ||
2182 | cindex++; | ||
2183 | |||
2184 | } | ||
2185 | else { | ||
2186 | /* | ||
2187 | * New tnode. Decend one level | ||
2188 | */ | ||
2189 | |||
2190 | depth++; | ||
2191 | n = tn->child[cindex]; | ||
2192 | tn = (struct tnode *)n; | ||
2193 | pend = tn->pos+tn->bits; | ||
2194 | putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); | ||
2195 | indent+=3; | ||
2196 | cindex=0; | ||
2197 | } | ||
2198 | } | ||
2199 | else | ||
2200 | cindex++; | ||
2201 | 2075 | ||
2076 | if (!n) { | ||
2077 | seq_printf(seq, "------ trie is empty\n"); | ||
2078 | |||
2079 | rcu_read_unlock(); | ||
2080 | return; | ||
2081 | } | ||
2082 | |||
2083 | printnode_seq(seq, indent, n, pend, cindex, 0); | ||
2084 | |||
2085 | if (!IS_TNODE(n)) { | ||
2086 | rcu_read_unlock(); | ||
2087 | return; | ||
2088 | } | ||
2089 | |||
2090 | tn = (struct tnode *)n; | ||
2091 | pend = tn->pos+tn->bits; | ||
2092 | putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); | ||
2093 | indent += 3; | ||
2094 | depth++; | ||
2095 | |||
2096 | while (tn && cindex < (1 << tn->bits)) { | ||
2097 | struct node *child = rcu_dereference(tn->child[cindex]); | ||
2098 | if (!child) | ||
2099 | cindex++; | ||
2100 | else { | ||
2101 | /* Got a child */ | ||
2102 | printnode_seq(seq, indent, child, pend, | ||
2103 | cindex, tn->bits); | ||
2104 | |||
2105 | if (IS_LEAF(child)) | ||
2106 | cindex++; | ||
2107 | |||
2108 | else { | ||
2202 | /* | 2109 | /* |
2203 | * Test if we are done | 2110 | * New tnode. Decend one level |
2204 | */ | 2111 | */ |
2205 | |||
2206 | while (cindex >= (1 << tn->bits)) { | ||
2207 | 2112 | ||
2208 | /* | 2113 | depth++; |
2209 | * Move upwards and test for root | 2114 | n = child; |
2210 | * pop off all traversed nodes | 2115 | tn = (struct tnode *)n; |
2211 | */ | 2116 | pend = tn->pos+tn->bits; |
2212 | 2117 | putspace_seq(seq, indent); | |
2213 | if (NODE_PARENT(tn) == NULL) { | 2118 | seq_printf(seq, "\\--\n"); |
2214 | tn = NULL; | 2119 | indent += 3; |
2215 | n = NULL; | 2120 | cindex = 0; |
2216 | break; | ||
2217 | } | ||
2218 | else { | ||
2219 | cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); | ||
2220 | tn = NODE_PARENT(tn); | ||
2221 | cindex++; | ||
2222 | n = (struct node *)tn; | ||
2223 | pend = tn->pos+tn->bits; | ||
2224 | indent-=3; | ||
2225 | depth--; | ||
2226 | } | ||
2227 | } | ||
2228 | } | 2121 | } |
2229 | } | 2122 | } |
2230 | else n = NULL; | ||
2231 | } | ||
2232 | else seq_printf(seq, "------ trie is empty\n"); | ||
2233 | 2123 | ||
2234 | read_unlock(&fib_lock); | 2124 | /* |
2125 | * Test if we are done | ||
2126 | */ | ||
2127 | |||
2128 | while (cindex >= (1 << tn->bits)) { | ||
2129 | /* | ||
2130 | * Move upwards and test for root | ||
2131 | * pop off all traversed nodes | ||
2132 | */ | ||
2133 | |||
2134 | if (NODE_PARENT(tn) == NULL) { | ||
2135 | tn = NULL; | ||
2136 | break; | ||
2137 | } | ||
2138 | |||
2139 | cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); | ||
2140 | cindex++; | ||
2141 | tn = NODE_PARENT(tn); | ||
2142 | pend = tn->pos + tn->bits; | ||
2143 | indent -= 3; | ||
2144 | depth--; | ||
2145 | } | ||
2146 | } | ||
2147 | rcu_read_unlock(); | ||
2235 | } | 2148 | } |
2236 | 2149 | ||
2237 | static struct trie_stat *trie_stat_new(void) | 2150 | static struct trie_stat *trie_stat_new(void) |
2238 | { | 2151 | { |
2239 | struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); | 2152 | struct trie_stat *s; |
2240 | int i; | 2153 | int i; |
2241 | 2154 | ||
2242 | if (s) { | 2155 | s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); |
2243 | s->totdepth = 0; | 2156 | if (!s) |
2244 | s->maxdepth = 0; | 2157 | return NULL; |
2245 | s->tnodes = 0; | 2158 | |
2246 | s->leaves = 0; | 2159 | s->totdepth = 0; |
2247 | s->nullpointers = 0; | 2160 | s->maxdepth = 0; |
2248 | 2161 | s->tnodes = 0; | |
2249 | for(i=0; i< MAX_CHILDS; i++) | 2162 | s->leaves = 0; |
2250 | s->nodesizes[i] = 0; | 2163 | s->nullpointers = 0; |
2251 | } | 2164 | |
2165 | for (i = 0; i < MAX_CHILDS; i++) | ||
2166 | s->nodesizes[i] = 0; | ||
2167 | |||
2252 | return s; | 2168 | return s; |
2253 | } | 2169 | } |
2254 | 2170 | ||
2255 | static struct trie_stat *trie_collect_stats(struct trie *t) | 2171 | static struct trie_stat *trie_collect_stats(struct trie *t) |
2256 | { | 2172 | { |
2257 | struct node *n = t->trie; | 2173 | struct node *n; |
2258 | struct trie_stat *s = trie_stat_new(); | 2174 | struct trie_stat *s = trie_stat_new(); |
2259 | int cindex = 0; | 2175 | int cindex = 0; |
2260 | int indent = 1; | ||
2261 | int pend = 0; | 2176 | int pend = 0; |
2262 | int depth = 0; | 2177 | int depth = 0; |
2263 | 2178 | ||
2264 | read_lock(&fib_lock); | 2179 | if (!s) |
2180 | return NULL; | ||
2265 | 2181 | ||
2266 | if (s) { | 2182 | rcu_read_lock(); |
2267 | if (n) { | 2183 | n = rcu_dereference(t->trie); |
2268 | if (IS_TNODE(n)) { | ||
2269 | struct tnode *tn = (struct tnode *)n; | ||
2270 | pend = tn->pos+tn->bits; | ||
2271 | indent += 3; | ||
2272 | s->nodesizes[tn->bits]++; | ||
2273 | depth++; | ||
2274 | 2184 | ||
2275 | while (tn && cindex < (1 << tn->bits)) { | 2185 | if (!n) |
2276 | if (tn->child[cindex]) { | 2186 | return s; |
2277 | /* Got a child */ | 2187 | |
2278 | 2188 | if (IS_TNODE(n)) { | |
2279 | if (IS_LEAF(tn->child[cindex])) { | 2189 | struct tnode *tn = (struct tnode *)n; |
2280 | cindex++; | 2190 | pend = tn->pos+tn->bits; |
2281 | 2191 | s->nodesizes[tn->bits]++; | |
2282 | /* stats */ | 2192 | depth++; |
2283 | if (depth > s->maxdepth) | 2193 | |
2284 | s->maxdepth = depth; | 2194 | while (tn && cindex < (1 << tn->bits)) { |
2285 | s->totdepth += depth; | 2195 | struct node *ch = rcu_dereference(tn->child[cindex]); |
2286 | s->leaves++; | 2196 | if (ch) { |
2287 | } | ||
2288 | |||
2289 | else { | ||
2290 | /* | ||
2291 | * New tnode. Decend one level | ||
2292 | */ | ||
2293 | |||
2294 | s->tnodes++; | ||
2295 | s->nodesizes[tn->bits]++; | ||
2296 | depth++; | ||
2297 | |||
2298 | n = tn->child[cindex]; | ||
2299 | tn = (struct tnode *)n; | ||
2300 | pend = tn->pos+tn->bits; | ||
2301 | |||
2302 | indent += 3; | ||
2303 | cindex = 0; | ||
2304 | } | ||
2305 | } | ||
2306 | else { | ||
2307 | cindex++; | ||
2308 | s->nullpointers++; | ||
2309 | } | ||
2310 | 2197 | ||
2198 | /* Got a child */ | ||
2199 | |||
2200 | if (IS_LEAF(tn->child[cindex])) { | ||
2201 | cindex++; | ||
2202 | |||
2203 | /* stats */ | ||
2204 | if (depth > s->maxdepth) | ||
2205 | s->maxdepth = depth; | ||
2206 | s->totdepth += depth; | ||
2207 | s->leaves++; | ||
2208 | } else { | ||
2311 | /* | 2209 | /* |
2312 | * Test if we are done | 2210 | * New tnode. Decend one level |
2313 | */ | 2211 | */ |
2314 | 2212 | ||
2315 | while (cindex >= (1 << tn->bits)) { | 2213 | s->tnodes++; |
2316 | 2214 | s->nodesizes[tn->bits]++; | |
2317 | /* | 2215 | depth++; |
2318 | * Move upwards and test for root | 2216 | |
2319 | * pop off all traversed nodes | 2217 | n = ch; |
2320 | */ | 2218 | tn = (struct tnode *)n; |
2321 | 2219 | pend = tn->pos+tn->bits; | |
2322 | 2220 | ||
2323 | if (NODE_PARENT(tn) == NULL) { | 2221 | cindex = 0; |
2324 | tn = NULL; | ||
2325 | n = NULL; | ||
2326 | break; | ||
2327 | } | ||
2328 | else { | ||
2329 | cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); | ||
2330 | tn = NODE_PARENT(tn); | ||
2331 | cindex++; | ||
2332 | n = (struct node *)tn; | ||
2333 | pend = tn->pos+tn->bits; | ||
2334 | indent -= 3; | ||
2335 | depth--; | ||
2336 | } | ||
2337 | } | ||
2338 | } | 2222 | } |
2223 | } else { | ||
2224 | cindex++; | ||
2225 | s->nullpointers++; | ||
2339 | } | 2226 | } |
2340 | else n = NULL; | 2227 | |
2228 | /* | ||
2229 | * Test if we are done | ||
2230 | */ | ||
2231 | |||
2232 | while (cindex >= (1 << tn->bits)) { | ||
2233 | /* | ||
2234 | * Move upwards and test for root | ||
2235 | * pop off all traversed nodes | ||
2236 | */ | ||
2237 | |||
2238 | if (NODE_PARENT(tn) == NULL) { | ||
2239 | tn = NULL; | ||
2240 | n = NULL; | ||
2241 | break; | ||
2242 | } | ||
2243 | |||
2244 | cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); | ||
2245 | tn = NODE_PARENT(tn); | ||
2246 | cindex++; | ||
2247 | n = (struct node *)tn; | ||
2248 | pend = tn->pos+tn->bits; | ||
2249 | depth--; | ||
2250 | } | ||
2341 | } | 2251 | } |
2342 | } | 2252 | } |
2343 | 2253 | ||
2344 | read_unlock(&fib_lock); | 2254 | rcu_read_unlock(); |
2345 | return s; | 2255 | return s; |
2346 | } | 2256 | } |
2347 | 2257 | ||
@@ -2359,17 +2269,22 @@ static struct fib_alias *fib_triestat_get_next(struct seq_file *seq) | |||
2359 | 2269 | ||
2360 | static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos) | 2270 | static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos) |
2361 | { | 2271 | { |
2362 | void *v = NULL; | 2272 | if (!ip_fib_main_table) |
2273 | return NULL; | ||
2363 | 2274 | ||
2364 | if (ip_fib_main_table) | 2275 | if (*pos) |
2365 | v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN; | 2276 | return fib_triestat_get_next(seq); |
2366 | return v; | 2277 | else |
2278 | return SEQ_START_TOKEN; | ||
2367 | } | 2279 | } |
2368 | 2280 | ||
2369 | static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos) | 2281 | static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
2370 | { | 2282 | { |
2371 | ++*pos; | 2283 | ++*pos; |
2372 | return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq); | 2284 | if (v == SEQ_START_TOKEN) |
2285 | return fib_triestat_get_first(seq); | ||
2286 | else | ||
2287 | return fib_triestat_get_next(seq); | ||
2373 | } | 2288 | } |
2374 | 2289 | ||
2375 | static void fib_triestat_seq_stop(struct seq_file *seq, void *v) | 2290 | static void fib_triestat_seq_stop(struct seq_file *seq, void *v) |
@@ -2388,22 +2303,22 @@ static void collect_and_show(struct trie *t, struct seq_file *seq) | |||
2388 | { | 2303 | { |
2389 | int bytes = 0; /* How many bytes are used, a ref is 4 bytes */ | 2304 | int bytes = 0; /* How many bytes are used, a ref is 4 bytes */ |
2390 | int i, max, pointers; | 2305 | int i, max, pointers; |
2391 | struct trie_stat *stat; | 2306 | struct trie_stat *stat; |
2392 | int avdepth; | 2307 | int avdepth; |
2393 | 2308 | ||
2394 | stat = trie_collect_stats(t); | 2309 | stat = trie_collect_stats(t); |
2395 | 2310 | ||
2396 | bytes=0; | 2311 | bytes = 0; |
2397 | seq_printf(seq, "trie=%p\n", t); | 2312 | seq_printf(seq, "trie=%p\n", t); |
2398 | 2313 | ||
2399 | if (stat) { | 2314 | if (stat) { |
2400 | if (stat->leaves) | 2315 | if (stat->leaves) |
2401 | avdepth=stat->totdepth*100 / stat->leaves; | 2316 | avdepth = stat->totdepth*100 / stat->leaves; |
2402 | else | 2317 | else |
2403 | avdepth=0; | 2318 | avdepth = 0; |
2404 | seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); | 2319 | seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100); |
2405 | seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); | 2320 | seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); |
2406 | 2321 | ||
2407 | seq_printf(seq, "Leaves: %d\n", stat->leaves); | 2322 | seq_printf(seq, "Leaves: %d\n", stat->leaves); |
2408 | bytes += sizeof(struct leaf) * stat->leaves; | 2323 | bytes += sizeof(struct leaf) * stat->leaves; |
2409 | seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); | 2324 | seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); |
@@ -2455,11 +2370,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) | |||
2455 | 2370 | ||
2456 | if (trie_main) | 2371 | if (trie_main) |
2457 | collect_and_show(trie_main, seq); | 2372 | collect_and_show(trie_main, seq); |
2458 | } | 2373 | } else { |
2459 | else { | 2374 | snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400); |
2460 | snprintf(bf, sizeof(bf), | 2375 | |
2461 | "*\t%08X\t%08X", 200, 400); | ||
2462 | |||
2463 | seq_printf(seq, "%-127s\n", bf); | 2376 | seq_printf(seq, "%-127s\n", bf); |
2464 | } | 2377 | } |
2465 | return 0; | 2378 | return 0; |
@@ -2520,22 +2433,27 @@ static struct fib_alias *fib_trie_get_next(struct seq_file *seq) | |||
2520 | 2433 | ||
2521 | static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) | 2434 | static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) |
2522 | { | 2435 | { |
2523 | void *v = NULL; | 2436 | if (!ip_fib_main_table) |
2437 | return NULL; | ||
2524 | 2438 | ||
2525 | if (ip_fib_main_table) | 2439 | if (*pos) |
2526 | v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN; | 2440 | return fib_trie_get_next(seq); |
2527 | return v; | 2441 | else |
2442 | return SEQ_START_TOKEN; | ||
2528 | } | 2443 | } |
2529 | 2444 | ||
2530 | static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) | 2445 | static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
2531 | { | 2446 | { |
2532 | ++*pos; | 2447 | ++*pos; |
2533 | return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq); | 2448 | if (v == SEQ_START_TOKEN) |
2449 | return fib_trie_get_first(seq); | ||
2450 | else | ||
2451 | return fib_trie_get_next(seq); | ||
2452 | |||
2534 | } | 2453 | } |
2535 | 2454 | ||
2536 | static void fib_trie_seq_stop(struct seq_file *seq, void *v) | 2455 | static void fib_trie_seq_stop(struct seq_file *seq, void *v) |
2537 | { | 2456 | { |
2538 | |||
2539 | } | 2457 | } |
2540 | 2458 | ||
2541 | /* | 2459 | /* |
@@ -2555,9 +2473,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) | |||
2555 | 2473 | ||
2556 | if (trie_main) | 2474 | if (trie_main) |
2557 | trie_dump_seq(seq, trie_main); | 2475 | trie_dump_seq(seq, trie_main); |
2558 | } | 2476 | } else { |
2559 | |||
2560 | else { | ||
2561 | snprintf(bf, sizeof(bf), | 2477 | snprintf(bf, sizeof(bf), |
2562 | "*\t%08X\t%08X", 200, 400); | 2478 | "*\t%08X\t%08X", 200, 400); |
2563 | seq_printf(seq, "%-127s\n", bf); | 2479 | seq_printf(seq, "%-127s\n", bf); |
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index badfc5849973..24eb56ae1b5a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c | |||
@@ -114,7 +114,7 @@ struct icmp_bxm { | |||
114 | /* | 114 | /* |
115 | * Statistics | 115 | * Statistics |
116 | */ | 116 | */ |
117 | DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); | 117 | DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly; |
118 | 118 | ||
119 | /* An array of errno for error messages from dest unreach. */ | 119 | /* An array of errno for error messages from dest unreach. */ |
120 | /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ | 120 | /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ |
@@ -627,11 +627,10 @@ static void icmp_unreach(struct sk_buff *skb) | |||
627 | break; | 627 | break; |
628 | case ICMP_FRAG_NEEDED: | 628 | case ICMP_FRAG_NEEDED: |
629 | if (ipv4_config.no_pmtu_disc) { | 629 | if (ipv4_config.no_pmtu_disc) { |
630 | LIMIT_NETDEBUG( | 630 | LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: " |
631 | printk(KERN_INFO "ICMP: %u.%u.%u.%u: " | ||
632 | "fragmentation needed " | 631 | "fragmentation needed " |
633 | "and DF set.\n", | 632 | "and DF set.\n", |
634 | NIPQUAD(iph->daddr))); | 633 | NIPQUAD(iph->daddr)); |
635 | } else { | 634 | } else { |
636 | info = ip_rt_frag_needed(iph, | 635 | info = ip_rt_frag_needed(iph, |
637 | ntohs(icmph->un.frag.mtu)); | 636 | ntohs(icmph->un.frag.mtu)); |
@@ -640,10 +639,9 @@ static void icmp_unreach(struct sk_buff *skb) | |||
640 | } | 639 | } |
641 | break; | 640 | break; |
642 | case ICMP_SR_FAILED: | 641 | case ICMP_SR_FAILED: |
643 | LIMIT_NETDEBUG( | 642 | LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source " |
644 | printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source " | ||
645 | "Route Failed.\n", | 643 | "Route Failed.\n", |
646 | NIPQUAD(iph->daddr))); | 644 | NIPQUAD(iph->daddr)); |
647 | break; | 645 | break; |
648 | default: | 646 | default: |
649 | break; | 647 | break; |
@@ -936,7 +934,7 @@ int icmp_rcv(struct sk_buff *skb) | |||
936 | case CHECKSUM_HW: | 934 | case CHECKSUM_HW: |
937 | if (!(u16)csum_fold(skb->csum)) | 935 | if (!(u16)csum_fold(skb->csum)) |
938 | break; | 936 | break; |
939 | LIMIT_NETDEBUG(printk(KERN_DEBUG "icmp v4 hw csum failure\n")); | 937 | LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); |
940 | case CHECKSUM_NONE: | 938 | case CHECKSUM_NONE: |
941 | if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) | 939 | if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) |
942 | goto error; | 940 | goto error; |
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 5088f90835ae..44607f4767b8 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c | |||
@@ -904,7 +904,7 @@ int igmp_rcv(struct sk_buff *skb) | |||
904 | case IGMP_MTRACE_RESP: | 904 | case IGMP_MTRACE_RESP: |
905 | break; | 905 | break; |
906 | default: | 906 | default: |
907 | NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); | 907 | NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); |
908 | } | 908 | } |
909 | in_dev_put(in_dev); | 909 | in_dev_put(in_dev); |
910 | kfree_skb(skb); | 910 | kfree_skb(skb); |
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c new file mode 100644 index 000000000000..fe3c6d3d0c91 --- /dev/null +++ b/net/ipv4/inet_connection_sock.c | |||
@@ -0,0 +1,641 @@ | |||
1 | /* | ||
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
3 | * operating system. INET is implemented using the BSD Socket | ||
4 | * interface as the means of communication with the user level. | ||
5 | * | ||
6 | * Support for INET connection oriented protocols. | ||
7 | * | ||
8 | * Authors: See the TCP sources | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public License | ||
12 | * as published by the Free Software Foundation; either version | ||
13 | * 2 of the License, or(at your option) any later version. | ||
14 | */ | ||
15 | |||
16 | #include <linux/config.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/jhash.h> | ||
19 | |||
20 | #include <net/inet_connection_sock.h> | ||
21 | #include <net/inet_hashtables.h> | ||
22 | #include <net/inet_timewait_sock.h> | ||
23 | #include <net/ip.h> | ||
24 | #include <net/route.h> | ||
25 | #include <net/tcp_states.h> | ||
26 | #include <net/xfrm.h> | ||
27 | |||
28 | #ifdef INET_CSK_DEBUG | ||
29 | const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; | ||
30 | EXPORT_SYMBOL(inet_csk_timer_bug_msg); | ||
31 | #endif | ||
32 | |||
33 | /* | ||
34 | * This array holds the first and last local port number. | ||
35 | * For high-usage systems, use sysctl to change this to | ||
36 | * 32768-61000 | ||
37 | */ | ||
38 | int sysctl_local_port_range[2] = { 1024, 4999 }; | ||
39 | |||
40 | static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) | ||
41 | { | ||
42 | const u32 sk_rcv_saddr = inet_rcv_saddr(sk); | ||
43 | struct sock *sk2; | ||
44 | struct hlist_node *node; | ||
45 | int reuse = sk->sk_reuse; | ||
46 | |||
47 | sk_for_each_bound(sk2, node, &tb->owners) { | ||
48 | if (sk != sk2 && | ||
49 | !inet_v6_ipv6only(sk2) && | ||
50 | (!sk->sk_bound_dev_if || | ||
51 | !sk2->sk_bound_dev_if || | ||
52 | sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { | ||
53 | if (!reuse || !sk2->sk_reuse || | ||
54 | sk2->sk_state == TCP_LISTEN) { | ||
55 | const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); | ||
56 | if (!sk2_rcv_saddr || !sk_rcv_saddr || | ||
57 | sk2_rcv_saddr == sk_rcv_saddr) | ||
58 | break; | ||
59 | } | ||
60 | } | ||
61 | } | ||
62 | return node != NULL; | ||
63 | } | ||
64 | |||
65 | /* Obtain a reference to a local port for the given sock, | ||
66 | * if snum is zero it means select any available local port. | ||
67 | */ | ||
68 | int inet_csk_get_port(struct inet_hashinfo *hashinfo, | ||
69 | struct sock *sk, unsigned short snum) | ||
70 | { | ||
71 | struct inet_bind_hashbucket *head; | ||
72 | struct hlist_node *node; | ||
73 | struct inet_bind_bucket *tb; | ||
74 | int ret; | ||
75 | |||
76 | local_bh_disable(); | ||
77 | if (!snum) { | ||
78 | int low = sysctl_local_port_range[0]; | ||
79 | int high = sysctl_local_port_range[1]; | ||
80 | int remaining = (high - low) + 1; | ||
81 | int rover; | ||
82 | |||
83 | spin_lock(&hashinfo->portalloc_lock); | ||
84 | if (hashinfo->port_rover < low) | ||
85 | rover = low; | ||
86 | else | ||
87 | rover = hashinfo->port_rover; | ||
88 | do { | ||
89 | rover++; | ||
90 | if (rover > high) | ||
91 | rover = low; | ||
92 | head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; | ||
93 | spin_lock(&head->lock); | ||
94 | inet_bind_bucket_for_each(tb, node, &head->chain) | ||
95 | if (tb->port == rover) | ||
96 | goto next; | ||
97 | break; | ||
98 | next: | ||
99 | spin_unlock(&head->lock); | ||
100 | } while (--remaining > 0); | ||
101 | hashinfo->port_rover = rover; | ||
102 | spin_unlock(&hashinfo->portalloc_lock); | ||
103 | |||
104 | /* Exhausted local port range during search? It is not | ||
105 | * possible for us to be holding one of the bind hash | ||
106 | * locks if this test triggers, because if 'remaining' | ||
107 | * drops to zero, we broke out of the do/while loop at | ||
108 | * the top level, not from the 'break;' statement. | ||
109 | */ | ||
110 | ret = 1; | ||
111 | if (remaining <= 0) | ||
112 | goto fail; | ||
113 | |||
114 | /* OK, here is the one we will use. HEAD is | ||
115 | * non-NULL and we hold it's mutex. | ||
116 | */ | ||
117 | snum = rover; | ||
118 | } else { | ||
119 | head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; | ||
120 | spin_lock(&head->lock); | ||
121 | inet_bind_bucket_for_each(tb, node, &head->chain) | ||
122 | if (tb->port == snum) | ||
123 | goto tb_found; | ||
124 | } | ||
125 | tb = NULL; | ||
126 | goto tb_not_found; | ||
127 | tb_found: | ||
128 | if (!hlist_empty(&tb->owners)) { | ||
129 | if (sk->sk_reuse > 1) | ||
130 | goto success; | ||
131 | if (tb->fastreuse > 0 && | ||
132 | sk->sk_reuse && sk->sk_state != TCP_LISTEN) { | ||
133 | goto success; | ||
134 | } else { | ||
135 | ret = 1; | ||
136 | if (inet_csk_bind_conflict(sk, tb)) | ||
137 | goto fail_unlock; | ||
138 | } | ||
139 | } | ||
140 | tb_not_found: | ||
141 | ret = 1; | ||
142 | if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) | ||
143 | goto fail_unlock; | ||
144 | if (hlist_empty(&tb->owners)) { | ||
145 | if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) | ||
146 | tb->fastreuse = 1; | ||
147 | else | ||
148 | tb->fastreuse = 0; | ||
149 | } else if (tb->fastreuse && | ||
150 | (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) | ||
151 | tb->fastreuse = 0; | ||
152 | success: | ||
153 | if (!inet_csk(sk)->icsk_bind_hash) | ||
154 | inet_bind_hash(sk, tb, snum); | ||
155 | BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); | ||
156 | ret = 0; | ||
157 | |||
158 | fail_unlock: | ||
159 | spin_unlock(&head->lock); | ||
160 | fail: | ||
161 | local_bh_enable(); | ||
162 | return ret; | ||
163 | } | ||
164 | |||
165 | EXPORT_SYMBOL_GPL(inet_csk_get_port); | ||
166 | |||
167 | /* | ||
168 | * Wait for an incoming connection, avoid race conditions. This must be called | ||
169 | * with the socket locked. | ||
170 | */ | ||
171 | static int inet_csk_wait_for_connect(struct sock *sk, long timeo) | ||
172 | { | ||
173 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
174 | DEFINE_WAIT(wait); | ||
175 | int err; | ||
176 | |||
177 | /* | ||
178 | * True wake-one mechanism for incoming connections: only | ||
179 | * one process gets woken up, not the 'whole herd'. | ||
180 | * Since we do not 'race & poll' for established sockets | ||
181 | * anymore, the common case will execute the loop only once. | ||
182 | * | ||
183 | * Subtle issue: "add_wait_queue_exclusive()" will be added | ||
184 | * after any current non-exclusive waiters, and we know that | ||
185 | * it will always _stay_ after any new non-exclusive waiters | ||
186 | * because all non-exclusive waiters are added at the | ||
187 | * beginning of the wait-queue. As such, it's ok to "drop" | ||
188 | * our exclusiveness temporarily when we get woken up without | ||
189 | * having to remove and re-insert us on the wait queue. | ||
190 | */ | ||
191 | for (;;) { | ||
192 | prepare_to_wait_exclusive(sk->sk_sleep, &wait, | ||
193 | TASK_INTERRUPTIBLE); | ||
194 | release_sock(sk); | ||
195 | if (reqsk_queue_empty(&icsk->icsk_accept_queue)) | ||
196 | timeo = schedule_timeout(timeo); | ||
197 | lock_sock(sk); | ||
198 | err = 0; | ||
199 | if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) | ||
200 | break; | ||
201 | err = -EINVAL; | ||
202 | if (sk->sk_state != TCP_LISTEN) | ||
203 | break; | ||
204 | err = sock_intr_errno(timeo); | ||
205 | if (signal_pending(current)) | ||
206 | break; | ||
207 | err = -EAGAIN; | ||
208 | if (!timeo) | ||
209 | break; | ||
210 | } | ||
211 | finish_wait(sk->sk_sleep, &wait); | ||
212 | return err; | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * This will accept the next outstanding connection. | ||
217 | */ | ||
218 | struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) | ||
219 | { | ||
220 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
221 | struct sock *newsk; | ||
222 | int error; | ||
223 | |||
224 | lock_sock(sk); | ||
225 | |||
226 | /* We need to make sure that this socket is listening, | ||
227 | * and that it has something pending. | ||
228 | */ | ||
229 | error = -EINVAL; | ||
230 | if (sk->sk_state != TCP_LISTEN) | ||
231 | goto out_err; | ||
232 | |||
233 | /* Find already established connection */ | ||
234 | if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { | ||
235 | long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); | ||
236 | |||
237 | /* If this is a non blocking socket don't sleep */ | ||
238 | error = -EAGAIN; | ||
239 | if (!timeo) | ||
240 | goto out_err; | ||
241 | |||
242 | error = inet_csk_wait_for_connect(sk, timeo); | ||
243 | if (error) | ||
244 | goto out_err; | ||
245 | } | ||
246 | |||
247 | newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); | ||
248 | BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); | ||
249 | out: | ||
250 | release_sock(sk); | ||
251 | return newsk; | ||
252 | out_err: | ||
253 | newsk = NULL; | ||
254 | *err = error; | ||
255 | goto out; | ||
256 | } | ||
257 | |||
258 | EXPORT_SYMBOL(inet_csk_accept); | ||
259 | |||
260 | /* | ||
261 | * Using different timers for retransmit, delayed acks and probes | ||
262 | * We may wish use just one timer maintaining a list of expire jiffies | ||
263 | * to optimize. | ||
264 | */ | ||
265 | void inet_csk_init_xmit_timers(struct sock *sk, | ||
266 | void (*retransmit_handler)(unsigned long), | ||
267 | void (*delack_handler)(unsigned long), | ||
268 | void (*keepalive_handler)(unsigned long)) | ||
269 | { | ||
270 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
271 | |||
272 | init_timer(&icsk->icsk_retransmit_timer); | ||
273 | init_timer(&icsk->icsk_delack_timer); | ||
274 | init_timer(&sk->sk_timer); | ||
275 | |||
276 | icsk->icsk_retransmit_timer.function = retransmit_handler; | ||
277 | icsk->icsk_delack_timer.function = delack_handler; | ||
278 | sk->sk_timer.function = keepalive_handler; | ||
279 | |||
280 | icsk->icsk_retransmit_timer.data = | ||
281 | icsk->icsk_delack_timer.data = | ||
282 | sk->sk_timer.data = (unsigned long)sk; | ||
283 | |||
284 | icsk->icsk_pending = icsk->icsk_ack.pending = 0; | ||
285 | } | ||
286 | |||
287 | EXPORT_SYMBOL(inet_csk_init_xmit_timers); | ||
288 | |||
289 | void inet_csk_clear_xmit_timers(struct sock *sk) | ||
290 | { | ||
291 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
292 | |||
293 | icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; | ||
294 | |||
295 | sk_stop_timer(sk, &icsk->icsk_retransmit_timer); | ||
296 | sk_stop_timer(sk, &icsk->icsk_delack_timer); | ||
297 | sk_stop_timer(sk, &sk->sk_timer); | ||
298 | } | ||
299 | |||
300 | EXPORT_SYMBOL(inet_csk_clear_xmit_timers); | ||
301 | |||
302 | void inet_csk_delete_keepalive_timer(struct sock *sk) | ||
303 | { | ||
304 | sk_stop_timer(sk, &sk->sk_timer); | ||
305 | } | ||
306 | |||
307 | EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); | ||
308 | |||
309 | void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) | ||
310 | { | ||
311 | sk_reset_timer(sk, &sk->sk_timer, jiffies + len); | ||
312 | } | ||
313 | |||
314 | EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); | ||
315 | |||
316 | struct dst_entry* inet_csk_route_req(struct sock *sk, | ||
317 | const struct request_sock *req) | ||
318 | { | ||
319 | struct rtable *rt; | ||
320 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
321 | struct ip_options *opt = inet_rsk(req)->opt; | ||
322 | struct flowi fl = { .oif = sk->sk_bound_dev_if, | ||
323 | .nl_u = { .ip4_u = | ||
324 | { .daddr = ((opt && opt->srr) ? | ||
325 | opt->faddr : | ||
326 | ireq->rmt_addr), | ||
327 | .saddr = ireq->loc_addr, | ||
328 | .tos = RT_CONN_FLAGS(sk) } }, | ||
329 | .proto = sk->sk_protocol, | ||
330 | .uli_u = { .ports = | ||
331 | { .sport = inet_sk(sk)->sport, | ||
332 | .dport = ireq->rmt_port } } }; | ||
333 | |||
334 | if (ip_route_output_flow(&rt, &fl, sk, 0)) { | ||
335 | IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); | ||
336 | return NULL; | ||
337 | } | ||
338 | if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { | ||
339 | ip_rt_put(rt); | ||
340 | IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); | ||
341 | return NULL; | ||
342 | } | ||
343 | return &rt->u.dst; | ||
344 | } | ||
345 | |||
346 | EXPORT_SYMBOL_GPL(inet_csk_route_req); | ||
347 | |||
348 | static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, | ||
349 | const u32 rnd, const u16 synq_hsize) | ||
350 | { | ||
351 | return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); | ||
352 | } | ||
353 | |||
354 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
355 | #define AF_INET_FAMILY(fam) ((fam) == AF_INET) | ||
356 | #else | ||
357 | #define AF_INET_FAMILY(fam) 1 | ||
358 | #endif | ||
359 | |||
360 | struct request_sock *inet_csk_search_req(const struct sock *sk, | ||
361 | struct request_sock ***prevp, | ||
362 | const __u16 rport, const __u32 raddr, | ||
363 | const __u32 laddr) | ||
364 | { | ||
365 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
366 | struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; | ||
367 | struct request_sock *req, **prev; | ||
368 | |||
369 | for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, | ||
370 | lopt->nr_table_entries)]; | ||
371 | (req = *prev) != NULL; | ||
372 | prev = &req->dl_next) { | ||
373 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
374 | |||
375 | if (ireq->rmt_port == rport && | ||
376 | ireq->rmt_addr == raddr && | ||
377 | ireq->loc_addr == laddr && | ||
378 | AF_INET_FAMILY(req->rsk_ops->family)) { | ||
379 | BUG_TRAP(!req->sk); | ||
380 | *prevp = prev; | ||
381 | break; | ||
382 | } | ||
383 | } | ||
384 | |||
385 | return req; | ||
386 | } | ||
387 | |||
388 | EXPORT_SYMBOL_GPL(inet_csk_search_req); | ||
389 | |||
390 | void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, | ||
391 | const unsigned timeout) | ||
392 | { | ||
393 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
394 | struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; | ||
395 | const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, | ||
396 | lopt->hash_rnd, lopt->nr_table_entries); | ||
397 | |||
398 | reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); | ||
399 | inet_csk_reqsk_queue_added(sk, timeout); | ||
400 | } | ||
401 | |||
402 | /* Only thing we need from tcp.h */ | ||
403 | extern int sysctl_tcp_synack_retries; | ||
404 | |||
405 | EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); | ||
406 | |||
407 | void inet_csk_reqsk_queue_prune(struct sock *parent, | ||
408 | const unsigned long interval, | ||
409 | const unsigned long timeout, | ||
410 | const unsigned long max_rto) | ||
411 | { | ||
412 | struct inet_connection_sock *icsk = inet_csk(parent); | ||
413 | struct request_sock_queue *queue = &icsk->icsk_accept_queue; | ||
414 | struct listen_sock *lopt = queue->listen_opt; | ||
415 | int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; | ||
416 | int thresh = max_retries; | ||
417 | unsigned long now = jiffies; | ||
418 | struct request_sock **reqp, *req; | ||
419 | int i, budget; | ||
420 | |||
421 | if (lopt == NULL || lopt->qlen == 0) | ||
422 | return; | ||
423 | |||
424 | /* Normally all the openreqs are young and become mature | ||
425 | * (i.e. converted to established socket) for first timeout. | ||
426 | * If synack was not acknowledged for 3 seconds, it means | ||
427 | * one of the following things: synack was lost, ack was lost, | ||
428 | * rtt is high or nobody planned to ack (i.e. synflood). | ||
429 | * When server is a bit loaded, queue is populated with old | ||
430 | * open requests, reducing effective size of queue. | ||
431 | * When server is well loaded, queue size reduces to zero | ||
432 | * after several minutes of work. It is not synflood, | ||
433 | * it is normal operation. The solution is pruning | ||
434 | * too old entries overriding normal timeout, when | ||
435 | * situation becomes dangerous. | ||
436 | * | ||
437 | * Essentially, we reserve half of room for young | ||
438 | * embrions; and abort old ones without pity, if old | ||
439 | * ones are about to clog our table. | ||
440 | */ | ||
441 | if (lopt->qlen>>(lopt->max_qlen_log-1)) { | ||
442 | int young = (lopt->qlen_young<<1); | ||
443 | |||
444 | while (thresh > 2) { | ||
445 | if (lopt->qlen < young) | ||
446 | break; | ||
447 | thresh--; | ||
448 | young <<= 1; | ||
449 | } | ||
450 | } | ||
451 | |||
452 | if (queue->rskq_defer_accept) | ||
453 | max_retries = queue->rskq_defer_accept; | ||
454 | |||
455 | budget = 2 * (lopt->nr_table_entries / (timeout / interval)); | ||
456 | i = lopt->clock_hand; | ||
457 | |||
458 | do { | ||
459 | reqp=&lopt->syn_table[i]; | ||
460 | while ((req = *reqp) != NULL) { | ||
461 | if (time_after_eq(now, req->expires)) { | ||
462 | if ((req->retrans < thresh || | ||
463 | (inet_rsk(req)->acked && req->retrans < max_retries)) | ||
464 | && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { | ||
465 | unsigned long timeo; | ||
466 | |||
467 | if (req->retrans++ == 0) | ||
468 | lopt->qlen_young--; | ||
469 | timeo = min((timeout << req->retrans), max_rto); | ||
470 | req->expires = now + timeo; | ||
471 | reqp = &req->dl_next; | ||
472 | continue; | ||
473 | } | ||
474 | |||
475 | /* Drop this request */ | ||
476 | inet_csk_reqsk_queue_unlink(parent, req, reqp); | ||
477 | reqsk_queue_removed(queue, req); | ||
478 | reqsk_free(req); | ||
479 | continue; | ||
480 | } | ||
481 | reqp = &req->dl_next; | ||
482 | } | ||
483 | |||
484 | i = (i + 1) & (lopt->nr_table_entries - 1); | ||
485 | |||
486 | } while (--budget > 0); | ||
487 | |||
488 | lopt->clock_hand = i; | ||
489 | |||
490 | if (lopt->qlen) | ||
491 | inet_csk_reset_keepalive_timer(parent, interval); | ||
492 | } | ||
493 | |||
494 | EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); | ||
495 | |||
496 | struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, | ||
497 | const unsigned int __nocast priority) | ||
498 | { | ||
499 | struct sock *newsk = sk_clone(sk, priority); | ||
500 | |||
501 | if (newsk != NULL) { | ||
502 | struct inet_connection_sock *newicsk = inet_csk(newsk); | ||
503 | |||
504 | newsk->sk_state = TCP_SYN_RECV; | ||
505 | newicsk->icsk_bind_hash = NULL; | ||
506 | |||
507 | inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; | ||
508 | newsk->sk_write_space = sk_stream_write_space; | ||
509 | |||
510 | newicsk->icsk_retransmits = 0; | ||
511 | newicsk->icsk_backoff = 0; | ||
512 | newicsk->icsk_probes_out = 0; | ||
513 | |||
514 | /* Deinitialize accept_queue to trap illegal accesses. */ | ||
515 | memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); | ||
516 | } | ||
517 | return newsk; | ||
518 | } | ||
519 | |||
520 | EXPORT_SYMBOL_GPL(inet_csk_clone); | ||
521 | |||
522 | /* | ||
523 | * At this point, there should be no process reference to this | ||
524 | * socket, and thus no user references at all. Therefore we | ||
525 | * can assume the socket waitqueue is inactive and nobody will | ||
526 | * try to jump onto it. | ||
527 | */ | ||
528 | void inet_csk_destroy_sock(struct sock *sk) | ||
529 | { | ||
530 | BUG_TRAP(sk->sk_state == TCP_CLOSE); | ||
531 | BUG_TRAP(sock_flag(sk, SOCK_DEAD)); | ||
532 | |||
533 | /* It cannot be in hash table! */ | ||
534 | BUG_TRAP(sk_unhashed(sk)); | ||
535 | |||
536 | /* If it has not 0 inet_sk(sk)->num, it must be bound */ | ||
537 | BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); | ||
538 | |||
539 | sk->sk_prot->destroy(sk); | ||
540 | |||
541 | sk_stream_kill_queues(sk); | ||
542 | |||
543 | xfrm_sk_free_policy(sk); | ||
544 | |||
545 | sk_refcnt_debug_release(sk); | ||
546 | |||
547 | atomic_dec(sk->sk_prot->orphan_count); | ||
548 | sock_put(sk); | ||
549 | } | ||
550 | |||
551 | EXPORT_SYMBOL(inet_csk_destroy_sock); | ||
552 | |||
553 | int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) | ||
554 | { | ||
555 | struct inet_sock *inet = inet_sk(sk); | ||
556 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
557 | int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); | ||
558 | |||
559 | if (rc != 0) | ||
560 | return rc; | ||
561 | |||
562 | sk->sk_max_ack_backlog = 0; | ||
563 | sk->sk_ack_backlog = 0; | ||
564 | inet_csk_delack_init(sk); | ||
565 | |||
566 | /* There is race window here: we announce ourselves listening, | ||
567 | * but this transition is still not validated by get_port(). | ||
568 | * It is OK, because this socket enters to hash table only | ||
569 | * after validation is complete. | ||
570 | */ | ||
571 | sk->sk_state = TCP_LISTEN; | ||
572 | if (!sk->sk_prot->get_port(sk, inet->num)) { | ||
573 | inet->sport = htons(inet->num); | ||
574 | |||
575 | sk_dst_reset(sk); | ||
576 | sk->sk_prot->hash(sk); | ||
577 | |||
578 | return 0; | ||
579 | } | ||
580 | |||
581 | sk->sk_state = TCP_CLOSE; | ||
582 | __reqsk_queue_destroy(&icsk->icsk_accept_queue); | ||
583 | return -EADDRINUSE; | ||
584 | } | ||
585 | |||
586 | EXPORT_SYMBOL_GPL(inet_csk_listen_start); | ||
587 | |||
588 | /* | ||
589 | * This routine closes sockets which have been at least partially | ||
590 | * opened, but not yet accepted. | ||
591 | */ | ||
592 | void inet_csk_listen_stop(struct sock *sk) | ||
593 | { | ||
594 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
595 | struct request_sock *acc_req; | ||
596 | struct request_sock *req; | ||
597 | |||
598 | inet_csk_delete_keepalive_timer(sk); | ||
599 | |||
600 | /* make all the listen_opt local to us */ | ||
601 | acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); | ||
602 | |||
603 | /* Following specs, it would be better either to send FIN | ||
604 | * (and enter FIN-WAIT-1, it is normal close) | ||
605 | * or to send active reset (abort). | ||
606 | * Certainly, it is pretty dangerous while synflood, but it is | ||
607 | * bad justification for our negligence 8) | ||
608 | * To be honest, we are not able to make either | ||
609 | * of the variants now. --ANK | ||
610 | */ | ||
611 | reqsk_queue_destroy(&icsk->icsk_accept_queue); | ||
612 | |||
613 | while ((req = acc_req) != NULL) { | ||
614 | struct sock *child = req->sk; | ||
615 | |||
616 | acc_req = req->dl_next; | ||
617 | |||
618 | local_bh_disable(); | ||
619 | bh_lock_sock(child); | ||
620 | BUG_TRAP(!sock_owned_by_user(child)); | ||
621 | sock_hold(child); | ||
622 | |||
623 | sk->sk_prot->disconnect(child, O_NONBLOCK); | ||
624 | |||
625 | sock_orphan(child); | ||
626 | |||
627 | atomic_inc(sk->sk_prot->orphan_count); | ||
628 | |||
629 | inet_csk_destroy_sock(child); | ||
630 | |||
631 | bh_unlock_sock(child); | ||
632 | local_bh_enable(); | ||
633 | sock_put(child); | ||
634 | |||
635 | sk_acceptq_removed(sk); | ||
636 | __reqsk_free(req); | ||
637 | } | ||
638 | BUG_TRAP(!sk->sk_ack_backlog); | ||
639 | } | ||
640 | |||
641 | EXPORT_SYMBOL_GPL(inet_csk_listen_stop); | ||
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c new file mode 100644 index 000000000000..71f3c7350c6e --- /dev/null +++ b/net/ipv4/inet_diag.c | |||
@@ -0,0 +1,868 @@ | |||
1 | /* | ||
2 | * inet_diag.c Module for monitoring INET transport protocols sockets. | ||
3 | * | ||
4 | * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ | ||
5 | * | ||
6 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | */ | ||
13 | |||
14 | #include <linux/config.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/types.h> | ||
17 | #include <linux/fcntl.h> | ||
18 | #include <linux/random.h> | ||
19 | #include <linux/cache.h> | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/time.h> | ||
22 | |||
23 | #include <net/icmp.h> | ||
24 | #include <net/tcp.h> | ||
25 | #include <net/ipv6.h> | ||
26 | #include <net/inet_common.h> | ||
27 | #include <net/inet_connection_sock.h> | ||
28 | #include <net/inet_hashtables.h> | ||
29 | #include <net/inet_timewait_sock.h> | ||
30 | #include <net/inet6_hashtables.h> | ||
31 | |||
32 | #include <linux/inet.h> | ||
33 | #include <linux/stddef.h> | ||
34 | |||
35 | #include <linux/inet_diag.h> | ||
36 | |||
37 | static const struct inet_diag_handler **inet_diag_table; | ||
38 | |||
39 | struct inet_diag_entry { | ||
40 | u32 *saddr; | ||
41 | u32 *daddr; | ||
42 | u16 sport; | ||
43 | u16 dport; | ||
44 | u16 family; | ||
45 | u16 userlocks; | ||
46 | }; | ||
47 | |||
48 | static struct sock *idiagnl; | ||
49 | |||
50 | #define INET_DIAG_PUT(skb, attrtype, attrlen) \ | ||
51 | RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) | ||
52 | |||
53 | static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, | ||
54 | int ext, u32 pid, u32 seq, u16 nlmsg_flags, | ||
55 | const struct nlmsghdr *unlh) | ||
56 | { | ||
57 | const struct inet_sock *inet = inet_sk(sk); | ||
58 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
59 | struct inet_diag_msg *r; | ||
60 | struct nlmsghdr *nlh; | ||
61 | void *info = NULL; | ||
62 | struct inet_diag_meminfo *minfo = NULL; | ||
63 | unsigned char *b = skb->tail; | ||
64 | const struct inet_diag_handler *handler; | ||
65 | |||
66 | handler = inet_diag_table[unlh->nlmsg_type]; | ||
67 | BUG_ON(handler == NULL); | ||
68 | |||
69 | nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); | ||
70 | nlh->nlmsg_flags = nlmsg_flags; | ||
71 | |||
72 | r = NLMSG_DATA(nlh); | ||
73 | if (sk->sk_state != TCP_TIME_WAIT) { | ||
74 | if (ext & (1 << (INET_DIAG_MEMINFO - 1))) | ||
75 | minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, | ||
76 | sizeof(*minfo)); | ||
77 | if (ext & (1 << (INET_DIAG_INFO - 1))) | ||
78 | info = INET_DIAG_PUT(skb, INET_DIAG_INFO, | ||
79 | handler->idiag_info_size); | ||
80 | |||
81 | if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { | ||
82 | size_t len = strlen(icsk->icsk_ca_ops->name); | ||
83 | strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), | ||
84 | icsk->icsk_ca_ops->name); | ||
85 | } | ||
86 | } | ||
87 | r->idiag_family = sk->sk_family; | ||
88 | r->idiag_state = sk->sk_state; | ||
89 | r->idiag_timer = 0; | ||
90 | r->idiag_retrans = 0; | ||
91 | |||
92 | r->id.idiag_if = sk->sk_bound_dev_if; | ||
93 | r->id.idiag_cookie[0] = (u32)(unsigned long)sk; | ||
94 | r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); | ||
95 | |||
96 | if (r->idiag_state == TCP_TIME_WAIT) { | ||
97 | const struct inet_timewait_sock *tw = inet_twsk(sk); | ||
98 | long tmo = tw->tw_ttd - jiffies; | ||
99 | if (tmo < 0) | ||
100 | tmo = 0; | ||
101 | |||
102 | r->id.idiag_sport = tw->tw_sport; | ||
103 | r->id.idiag_dport = tw->tw_dport; | ||
104 | r->id.idiag_src[0] = tw->tw_rcv_saddr; | ||
105 | r->id.idiag_dst[0] = tw->tw_daddr; | ||
106 | r->idiag_state = tw->tw_substate; | ||
107 | r->idiag_timer = 3; | ||
108 | r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; | ||
109 | r->idiag_rqueue = 0; | ||
110 | r->idiag_wqueue = 0; | ||
111 | r->idiag_uid = 0; | ||
112 | r->idiag_inode = 0; | ||
113 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
114 | if (r->idiag_family == AF_INET6) { | ||
115 | const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); | ||
116 | |||
117 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, | ||
118 | &tcp6tw->tw_v6_rcv_saddr); | ||
119 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, | ||
120 | &tcp6tw->tw_v6_daddr); | ||
121 | } | ||
122 | #endif | ||
123 | nlh->nlmsg_len = skb->tail - b; | ||
124 | return skb->len; | ||
125 | } | ||
126 | |||
127 | r->id.idiag_sport = inet->sport; | ||
128 | r->id.idiag_dport = inet->dport; | ||
129 | r->id.idiag_src[0] = inet->rcv_saddr; | ||
130 | r->id.idiag_dst[0] = inet->daddr; | ||
131 | |||
132 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
133 | if (r->idiag_family == AF_INET6) { | ||
134 | struct ipv6_pinfo *np = inet6_sk(sk); | ||
135 | |||
136 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, | ||
137 | &np->rcv_saddr); | ||
138 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, | ||
139 | &np->daddr); | ||
140 | } | ||
141 | #endif | ||
142 | |||
143 | #define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ | ||
144 | |||
145 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) { | ||
146 | r->idiag_timer = 1; | ||
147 | r->idiag_retrans = icsk->icsk_retransmits; | ||
148 | r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); | ||
149 | } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { | ||
150 | r->idiag_timer = 4; | ||
151 | r->idiag_retrans = icsk->icsk_probes_out; | ||
152 | r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); | ||
153 | } else if (timer_pending(&sk->sk_timer)) { | ||
154 | r->idiag_timer = 2; | ||
155 | r->idiag_retrans = icsk->icsk_probes_out; | ||
156 | r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); | ||
157 | } else { | ||
158 | r->idiag_timer = 0; | ||
159 | r->idiag_expires = 0; | ||
160 | } | ||
161 | #undef EXPIRES_IN_MS | ||
162 | |||
163 | r->idiag_uid = sock_i_uid(sk); | ||
164 | r->idiag_inode = sock_i_ino(sk); | ||
165 | |||
166 | if (minfo) { | ||
167 | minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc); | ||
168 | minfo->idiag_wmem = sk->sk_wmem_queued; | ||
169 | minfo->idiag_fmem = sk->sk_forward_alloc; | ||
170 | minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc); | ||
171 | } | ||
172 | |||
173 | handler->idiag_get_info(sk, r, info); | ||
174 | |||
175 | if (sk->sk_state < TCP_TIME_WAIT && | ||
176 | icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) | ||
177 | icsk->icsk_ca_ops->get_info(sk, ext, skb); | ||
178 | |||
179 | nlh->nlmsg_len = skb->tail - b; | ||
180 | return skb->len; | ||
181 | |||
182 | rtattr_failure: | ||
183 | nlmsg_failure: | ||
184 | skb_trim(skb, b - skb->data); | ||
185 | return -1; | ||
186 | } | ||
187 | |||
/*
 * Answer a non-dump INET_DIAG request: look up the single socket named
 * by the request's 4-tuple and unicast one inet_diag_msg back.
 * Returns 0 on success or a negative errno.
 */
static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
{
	int err;
	struct sock *sk;
	struct inet_diag_req *req = NLMSG_DATA(nlh);
	struct sk_buff *rep;
	struct inet_hashinfo *hashinfo;
	const struct inet_diag_handler *handler;

	/* inet_diag_rcv_msg() already rejected unregistered types. */
	handler = inet_diag_table[nlh->nlmsg_type];
	BUG_ON(handler == NULL);
	hashinfo = handler->idiag_hashinfo;

	/* The lookup takes a reference on the socket; it is dropped in
	 * the out: path below. */
	if (req->idiag_family == AF_INET) {
		sk = inet_lookup(hashinfo, req->id.idiag_dst[0],
				 req->id.idiag_dport, req->id.idiag_src[0],
				 req->id.idiag_sport, req->id.idiag_if);
	}
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
	else if (req->idiag_family == AF_INET6) {
		sk = inet6_lookup(hashinfo,
				  (struct in6_addr *)req->id.idiag_dst,
				  req->id.idiag_dport,
				  (struct in6_addr *)req->id.idiag_src,
				  req->id.idiag_sport,
				  req->id.idiag_if);
	}
#endif
	else {
		return -EINVAL;
	}

	if (sk == NULL)
		return -ENOENT;

	/* If a cookie was supplied it must match the socket's kernel
	 * pointer (split into two 32-bit halves; the double shift avoids
	 * an undefined 32-bit shift on 32-bit machines).  A mismatch
	 * means the socket was recycled since the cookie was issued. */
	err = -ESTALE;
	if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
	     req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
	    ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
	     (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
		goto out;

	err = -ENOMEM;
	rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
				     sizeof(struct inet_diag_meminfo) +
				     handler->idiag_info_size + 64)),
			GFP_KERNEL);
	if (!rep)
		goto out;

	/* rep was sized above to hold the full message, so a fill
	 * failure here would be a programming error. */
	if (inet_diag_fill(rep, sk, req->idiag_ext,
			   NETLINK_CB(in_skb).pid,
			   nlh->nlmsg_seq, 0, nlh) <= 0)
		BUG();

	err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
			      MSG_DONTWAIT);
	if (err > 0)
		err = 0;

out:
	if (sk) {
		/* TIME_WAIT pseudo-sockets are refcounted separately. */
		if (sk->sk_state == TCP_TIME_WAIT)
			inet_twsk_put((struct inet_timewait_sock *)sk);
		else
			sock_put(sk);
	}
	return err;
}
257 | |||
258 | static int bitstring_match(const u32 *a1, const u32 *a2, int bits) | ||
259 | { | ||
260 | int words = bits >> 5; | ||
261 | |||
262 | bits &= 0x1f; | ||
263 | |||
264 | if (words) { | ||
265 | if (memcmp(a1, a2, words << 2)) | ||
266 | return 0; | ||
267 | } | ||
268 | if (bits) { | ||
269 | __u32 w1, w2; | ||
270 | __u32 mask; | ||
271 | |||
272 | w1 = a1[words]; | ||
273 | w2 = a2[words]; | ||
274 | |||
275 | mask = htonl((0xffffffff) << (32 - bits)); | ||
276 | |||
277 | if ((w1 ^ w2) & mask) | ||
278 | return 0; | ||
279 | } | ||
280 | |||
281 | return 1; | ||
282 | } | ||
283 | |||
284 | |||
285 | static int inet_diag_bc_run(const void *bc, int len, | ||
286 | const struct inet_diag_entry *entry) | ||
287 | { | ||
288 | while (len > 0) { | ||
289 | int yes = 1; | ||
290 | const struct inet_diag_bc_op *op = bc; | ||
291 | |||
292 | switch (op->code) { | ||
293 | case INET_DIAG_BC_NOP: | ||
294 | break; | ||
295 | case INET_DIAG_BC_JMP: | ||
296 | yes = 0; | ||
297 | break; | ||
298 | case INET_DIAG_BC_S_GE: | ||
299 | yes = entry->sport >= op[1].no; | ||
300 | break; | ||
301 | case INET_DIAG_BC_S_LE: | ||
302 | yes = entry->dport <= op[1].no; | ||
303 | break; | ||
304 | case INET_DIAG_BC_D_GE: | ||
305 | yes = entry->dport >= op[1].no; | ||
306 | break; | ||
307 | case INET_DIAG_BC_D_LE: | ||
308 | yes = entry->dport <= op[1].no; | ||
309 | break; | ||
310 | case INET_DIAG_BC_AUTO: | ||
311 | yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); | ||
312 | break; | ||
313 | case INET_DIAG_BC_S_COND: | ||
314 | case INET_DIAG_BC_D_COND: { | ||
315 | struct inet_diag_hostcond *cond; | ||
316 | u32 *addr; | ||
317 | |||
318 | cond = (struct inet_diag_hostcond *)(op + 1); | ||
319 | if (cond->port != -1 && | ||
320 | cond->port != (op->code == INET_DIAG_BC_S_COND ? | ||
321 | entry->sport : entry->dport)) { | ||
322 | yes = 0; | ||
323 | break; | ||
324 | } | ||
325 | |||
326 | if (cond->prefix_len == 0) | ||
327 | break; | ||
328 | |||
329 | if (op->code == INET_DIAG_BC_S_COND) | ||
330 | addr = entry->saddr; | ||
331 | else | ||
332 | addr = entry->daddr; | ||
333 | |||
334 | if (bitstring_match(addr, cond->addr, cond->prefix_len)) | ||
335 | break; | ||
336 | if (entry->family == AF_INET6 && | ||
337 | cond->family == AF_INET) { | ||
338 | if (addr[0] == 0 && addr[1] == 0 && | ||
339 | addr[2] == htonl(0xffff) && | ||
340 | bitstring_match(addr + 3, cond->addr, | ||
341 | cond->prefix_len)) | ||
342 | break; | ||
343 | } | ||
344 | yes = 0; | ||
345 | break; | ||
346 | } | ||
347 | } | ||
348 | |||
349 | if (yes) { | ||
350 | len -= op->yes; | ||
351 | bc += op->yes; | ||
352 | } else { | ||
353 | len -= op->no; | ||
354 | bc += op->no; | ||
355 | } | ||
356 | } | ||
357 | return (len == 0); | ||
358 | } | ||
359 | |||
360 | static int valid_cc(const void *bc, int len, int cc) | ||
361 | { | ||
362 | while (len >= 0) { | ||
363 | const struct inet_diag_bc_op *op = bc; | ||
364 | |||
365 | if (cc > len) | ||
366 | return 0; | ||
367 | if (cc == len) | ||
368 | return 1; | ||
369 | if (op->yes < 4) | ||
370 | return 0; | ||
371 | len -= op->yes; | ||
372 | bc += op->yes; | ||
373 | } | ||
374 | return 0; | ||
375 | } | ||
376 | |||
377 | static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) | ||
378 | { | ||
379 | const unsigned char *bc = bytecode; | ||
380 | int len = bytecode_len; | ||
381 | |||
382 | while (len > 0) { | ||
383 | struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc; | ||
384 | |||
385 | //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); | ||
386 | switch (op->code) { | ||
387 | case INET_DIAG_BC_AUTO: | ||
388 | case INET_DIAG_BC_S_COND: | ||
389 | case INET_DIAG_BC_D_COND: | ||
390 | case INET_DIAG_BC_S_GE: | ||
391 | case INET_DIAG_BC_S_LE: | ||
392 | case INET_DIAG_BC_D_GE: | ||
393 | case INET_DIAG_BC_D_LE: | ||
394 | if (op->yes < 4 || op->yes > len + 4) | ||
395 | return -EINVAL; | ||
396 | case INET_DIAG_BC_JMP: | ||
397 | if (op->no < 4 || op->no > len + 4) | ||
398 | return -EINVAL; | ||
399 | if (op->no < len && | ||
400 | !valid_cc(bytecode, bytecode_len, len - op->no)) | ||
401 | return -EINVAL; | ||
402 | break; | ||
403 | case INET_DIAG_BC_NOP: | ||
404 | if (op->yes < 4 || op->yes > len + 4) | ||
405 | return -EINVAL; | ||
406 | break; | ||
407 | default: | ||
408 | return -EINVAL; | ||
409 | } | ||
410 | bc += op->yes; | ||
411 | len -= op->yes; | ||
412 | } | ||
413 | return len == 0 ? 0 : -EINVAL; | ||
414 | } | ||
415 | |||
416 | static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk, | ||
417 | struct netlink_callback *cb) | ||
418 | { | ||
419 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); | ||
420 | |||
421 | if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { | ||
422 | struct inet_diag_entry entry; | ||
423 | struct rtattr *bc = (struct rtattr *)(r + 1); | ||
424 | struct inet_sock *inet = inet_sk(sk); | ||
425 | |||
426 | entry.family = sk->sk_family; | ||
427 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
428 | if (entry.family == AF_INET6) { | ||
429 | struct ipv6_pinfo *np = inet6_sk(sk); | ||
430 | |||
431 | entry.saddr = np->rcv_saddr.s6_addr32; | ||
432 | entry.daddr = np->daddr.s6_addr32; | ||
433 | } else | ||
434 | #endif | ||
435 | { | ||
436 | entry.saddr = &inet->rcv_saddr; | ||
437 | entry.daddr = &inet->daddr; | ||
438 | } | ||
439 | entry.sport = inet->num; | ||
440 | entry.dport = ntohs(inet->dport); | ||
441 | entry.userlocks = sk->sk_userlocks; | ||
442 | |||
443 | if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) | ||
444 | return 0; | ||
445 | } | ||
446 | |||
447 | return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid, | ||
448 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); | ||
449 | } | ||
450 | |||
/*
 * Emit one inet_diag_msg describing an embryonic (SYN_RECV) request
 * sock @req owned by listener @sk.  Returns skb->len on success, -1 on
 * netlink message overflow (NLMSG_PUT jumps to nlmsg_failure).
 */
static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
			      struct request_sock *req,
			      u32 pid, u32 seq,
			      const struct nlmsghdr *unlh)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct inet_sock *inet = inet_sk(sk);
	unsigned char *b = skb->tail;	/* rollback point on failure */
	struct inet_diag_msg *r;
	struct nlmsghdr *nlh;
	long tmo;

	/* NLMSG_PUT branches to nlmsg_failure: when skb runs out of room. */
	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
	nlh->nlmsg_flags = NLM_F_MULTI;
	r = NLMSG_DATA(nlh);

	r->idiag_family = sk->sk_family;
	r->idiag_state = TCP_SYN_RECV;
	r->idiag_timer = 1;
	r->idiag_retrans = req->retrans;

	r->id.idiag_if = sk->sk_bound_dev_if;
	/* The cookie is the request pointer split into two 32-bit halves;
	 * the double shift avoids an undefined 32-bit shift on 32-bit. */
	r->id.idiag_cookie[0] = (u32)(unsigned long)req;
	r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);

	/* Remaining lifetime of the request, clamped at zero. */
	tmo = req->expires - jiffies;
	if (tmo < 0)
		tmo = 0;

	r->id.idiag_sport = inet->sport;
	r->id.idiag_dport = ireq->rmt_port;
	r->id.idiag_src[0] = ireq->loc_addr;
	r->id.idiag_dst[0] = ireq->rmt_addr;
	r->idiag_expires = jiffies_to_msecs(tmo);
	r->idiag_rqueue = 0;
	r->idiag_wqueue = 0;
	r->idiag_uid = sock_i_uid(sk);
	r->idiag_inode = 0;
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
	if (r->idiag_family == AF_INET6) {
		/* Overwrite the v4 addresses above with the full v6 pair. */
		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
			       &tcp6_rsk(req)->loc_addr);
		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
			       &tcp6_rsk(req)->rmt_addr);
	}
#endif
	nlh->nlmsg_len = skb->tail - b;

	return skb->len;

nlmsg_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
505 | |||
/*
 * Dump the embryonic (SYN_RECV) request socks hanging off listener @sk.
 * Resumable: cb->args[3] is the syn_table bucket (+1) and cb->args[4]
 * the position within it where the previous batch stopped.
 * Returns 0 on completion or the negative fill error that stopped us.
 */
static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
			       struct netlink_callback *cb)
{
	struct inet_diag_entry entry;
	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct listen_sock *lopt;
	struct rtattr *bc = NULL;
	struct inet_sock *inet = inet_sk(sk);
	int j, s_j;
	int reqnum, s_reqnum;
	int err = 0;

	s_j = cb->args[3];
	s_reqnum = cb->args[4];

	/* args[3] stores bucket+1 so that 0 can mean "start over". */
	if (s_j > 0)
		s_j--;

	entry.family = sk->sk_family;

	/* Freezes the SYN table while we walk it. */
	read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);

	lopt = icsk->icsk_accept_queue.listen_opt;
	if (!lopt || !lopt->qlen)
		goto out;

	/* Anything beyond the fixed request header is filter bytecode. */
	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
		bc = (struct rtattr *)(r + 1);
		entry.sport = inet->num;
		entry.userlocks = sk->sk_userlocks;
	}

	for (j = s_j; j < lopt->nr_table_entries; j++) {
		struct request_sock *req, *head = lopt->syn_table[j];

		reqnum = 0;
		for (req = head; req; reqnum++, req = req->dl_next) {
			struct inet_request_sock *ireq = inet_rsk(req);

			/* Skip entries already sent in a previous batch. */
			if (reqnum < s_reqnum)
				continue;
			/* Non-zero request dport must match. */
			if (r->id.idiag_dport != ireq->rmt_port &&
			    r->id.idiag_dport)
				continue;

			if (bc) {
				entry.saddr =
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
					(entry.family == AF_INET6) ?
					tcp6_rsk(req)->loc_addr.s6_addr32 :
#endif
					&ireq->loc_addr;
				entry.daddr =
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
					(entry.family == AF_INET6) ?
					tcp6_rsk(req)->rmt_addr.s6_addr32 :
#endif
					&ireq->rmt_addr;
				entry.dport = ntohs(ireq->rmt_port);

				if (!inet_diag_bc_run(RTA_DATA(bc),
						      RTA_PAYLOAD(bc), &entry))
					continue;
			}

			err = inet_diag_fill_req(skb, sk, req,
						 NETLINK_CB(cb->skb).pid,
						 cb->nlh->nlmsg_seq, cb->nlh);
			if (err < 0) {
				/* Out of skb space: record resume point. */
				cb->args[3] = j + 1;
				cb->args[4] = reqnum;
				goto out;
			}
		}

		/* Next bucket starts from its beginning. */
		s_reqnum = 0;
	}

out:
	read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);

	return err;
}
590 | |||
/*
 * Netlink dump callback: walk the listening hash, then the established
 * hash (including its TIME_WAIT shadow), emitting every socket that
 * matches the request's state mask and port filters.
 * Resume state across batches:
 *   cb->args[0] - phase (0 = listening hash, 1 = established hash)
 *   cb->args[1] - hash bucket index
 *   cb->args[2] - position within that bucket
 *   cb->args[3,4] - per-listener request-queue position (see
 *                   inet_diag_dump_reqs())
 */
static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, num;
	int s_i, s_num;
	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
	const struct inet_diag_handler *handler;
	struct inet_hashinfo *hashinfo;

	handler = inet_diag_table[cb->nlh->nlmsg_type];
	BUG_ON(handler == NULL);
	hashinfo = handler->idiag_hashinfo;

	s_i = cb->args[1];
	s_num = num = cb->args[2];

	if (cb->args[0] == 0) {
		/* Phase 0: listening sockets and their SYN_RECV requests. */
		if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
			goto skip_listen_ht;

		inet_listen_lock(hashinfo);
		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
			struct sock *sk;
			struct hlist_node *node;

			num = 0;
			sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
				struct inet_sock *inet = inet_sk(sk);

				/* Skip entries sent in a previous batch. */
				if (num < s_num) {
					num++;
					continue;
				}

				if (r->id.idiag_sport != inet->sport &&
				    r->id.idiag_sport)
					goto next_listen;

				/* A listener itself has no dport; args[3]>0
				 * means the last batch stopped inside this
				 * listener's request queue, so skip straight
				 * to dumping requests. */
				if (!(r->idiag_states & TCPF_LISTEN) ||
				    r->id.idiag_dport ||
				    cb->args[3] > 0)
					goto syn_recv;

				if (inet_diag_dump_sock(skb, sk, cb) < 0) {
					inet_listen_unlock(hashinfo);
					goto done;
				}

syn_recv:
				if (!(r->idiag_states & TCPF_SYN_RECV))
					goto next_listen;

				if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
					inet_listen_unlock(hashinfo);
					goto done;
				}

next_listen:
				cb->args[3] = 0;
				cb->args[4] = 0;
				++num;
			}

			s_num = 0;
			cb->args[3] = 0;
			cb->args[4] = 0;
		}
		inet_listen_unlock(hashinfo);
skip_listen_ht:
		cb->args[0] = 1;
		s_i = num = s_num = 0;
	}

	/* Done if only listening/SYN_RECV states were requested. */
	if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
		return skb->len;

	/* Phase 1: established hash. */
	for (i = s_i; i < hashinfo->ehash_size; i++) {
		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
		struct sock *sk;
		struct hlist_node *node;

		/* Only the resumed bucket keeps its saved offset. */
		if (i > s_i)
			s_num = 0;

		read_lock_bh(&head->lock);

		num = 0;
		sk_for_each(sk, node, &head->chain) {
			struct inet_sock *inet = inet_sk(sk);

			if (num < s_num)
				goto next_normal;
			if (!(r->idiag_states & (1 << sk->sk_state)))
				goto next_normal;
			if (r->id.idiag_sport != inet->sport &&
			    r->id.idiag_sport)
				goto next_normal;
			if (r->id.idiag_dport != inet->dport && r->id.idiag_dport)
				goto next_normal;
			if (inet_diag_dump_sock(skb, sk, cb) < 0) {
				read_unlock_bh(&head->lock);
				goto done;
			}
next_normal:
			++num;
		}

		if (r->idiag_states & TCPF_TIME_WAIT) {
			/* TIME_WAIT socks live in the upper half of ehash
			 * (bucket i + ehash_size), guarded by the same lock. */
			sk_for_each(sk, node,
				    &hashinfo->ehash[i + hashinfo->ehash_size].chain) {
				struct inet_sock *inet = inet_sk(sk);

				if (num < s_num)
					goto next_dying;
				if (r->id.idiag_sport != inet->sport &&
				    r->id.idiag_sport)
					goto next_dying;
				if (r->id.idiag_dport != inet->dport &&
				    r->id.idiag_dport)
					goto next_dying;
				if (inet_diag_dump_sock(skb, sk, cb) < 0) {
					read_unlock_bh(&head->lock);
					goto done;
				}
next_dying:
				++num;
			}
		}
		read_unlock_bh(&head->lock);
	}

done:
	cb->args[1] = i;
	cb->args[2] = num;
	return skb->len;
}
726 | |||
/* No per-dump state to release; netlink_dump_start() requires a done
 * callback nonetheless. */
static int inet_diag_dump_done(struct netlink_callback *cb)
{
	return 0;
}
731 | |||
732 | |||
/*
 * Validate one incoming netlink request and route it: NLM_F_DUMP
 * requests start a dump (after auditing any attached filter bytecode),
 * anything else is an exact single-socket query.
 * Returns 0 for ignored messages or a negative errno on failure.
 */
static __inline__ int
inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	/* Silently ignore non-request messages. */
	if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
		return 0;

	if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX)
		goto err_inval;

	/* No handler registered for this protocol type. */
	if (inet_diag_table[nlh->nlmsg_type] == NULL)
		return -ENOENT;

	/* Message must be long enough to hold the request header. */
	if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len)
		goto err_inval;

	if (nlh->nlmsg_flags&NLM_F_DUMP) {
		/* Optional trailing attribute: filter bytecode.  Audit it
		 * fully before it can ever be executed. */
		if (nlh->nlmsg_len >
		    (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) {
			struct rtattr *rta = (void *)(NLMSG_DATA(nlh) +
						      sizeof(struct inet_diag_req));
			if (rta->rta_type != INET_DIAG_REQ_BYTECODE ||
			    rta->rta_len < 8 ||
			    rta->rta_len >
			    (nlh->nlmsg_len -
			     NLMSG_SPACE(sizeof(struct inet_diag_req))))
				goto err_inval;
			if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
				goto err_inval;
		}
		return netlink_dump_start(idiagnl, skb, nlh,
					  inet_diag_dump,
					  inet_diag_dump_done);
	} else {
		return inet_diag_get_exact(skb, nlh);
	}

err_inval:
	return -EINVAL;
}
772 | |||
773 | |||
774 | static inline void inet_diag_rcv_skb(struct sk_buff *skb) | ||
775 | { | ||
776 | int err; | ||
777 | struct nlmsghdr * nlh; | ||
778 | |||
779 | if (skb->len >= NLMSG_SPACE(0)) { | ||
780 | nlh = (struct nlmsghdr *)skb->data; | ||
781 | if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) | ||
782 | return; | ||
783 | err = inet_diag_rcv_msg(skb, nlh); | ||
784 | if (err || nlh->nlmsg_flags & NLM_F_ACK) | ||
785 | netlink_ack(skb, nlh, err); | ||
786 | } | ||
787 | } | ||
788 | |||
789 | static void inet_diag_rcv(struct sock *sk, int len) | ||
790 | { | ||
791 | struct sk_buff *skb; | ||
792 | unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); | ||
793 | |||
794 | while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { | ||
795 | inet_diag_rcv_skb(skb); | ||
796 | kfree_skb(skb); | ||
797 | } | ||
798 | } | ||
799 | |||
800 | static DEFINE_SPINLOCK(inet_diag_register_lock); | ||
801 | |||
802 | int inet_diag_register(const struct inet_diag_handler *h) | ||
803 | { | ||
804 | const __u16 type = h->idiag_type; | ||
805 | int err = -EINVAL; | ||
806 | |||
807 | if (type >= INET_DIAG_GETSOCK_MAX) | ||
808 | goto out; | ||
809 | |||
810 | spin_lock(&inet_diag_register_lock); | ||
811 | err = -EEXIST; | ||
812 | if (inet_diag_table[type] == NULL) { | ||
813 | inet_diag_table[type] = h; | ||
814 | err = 0; | ||
815 | } | ||
816 | spin_unlock(&inet_diag_register_lock); | ||
817 | out: | ||
818 | return err; | ||
819 | } | ||
820 | EXPORT_SYMBOL_GPL(inet_diag_register); | ||
821 | |||
822 | void inet_diag_unregister(const struct inet_diag_handler *h) | ||
823 | { | ||
824 | const __u16 type = h->idiag_type; | ||
825 | |||
826 | if (type >= INET_DIAG_GETSOCK_MAX) | ||
827 | return; | ||
828 | |||
829 | spin_lock(&inet_diag_register_lock); | ||
830 | inet_diag_table[type] = NULL; | ||
831 | spin_unlock(&inet_diag_register_lock); | ||
832 | |||
833 | synchronize_rcu(); | ||
834 | } | ||
835 | EXPORT_SYMBOL_GPL(inet_diag_unregister); | ||
836 | |||
837 | static int __init inet_diag_init(void) | ||
838 | { | ||
839 | const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * | ||
840 | sizeof(struct inet_diag_handler *)); | ||
841 | int err = -ENOMEM; | ||
842 | |||
843 | inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL); | ||
844 | if (!inet_diag_table) | ||
845 | goto out; | ||
846 | |||
847 | memset(inet_diag_table, 0, inet_diag_table_size); | ||
848 | idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv, | ||
849 | THIS_MODULE); | ||
850 | if (idiagnl == NULL) | ||
851 | goto out_free_table; | ||
852 | err = 0; | ||
853 | out: | ||
854 | return err; | ||
855 | out_free_table: | ||
856 | kfree(inet_diag_table); | ||
857 | goto out; | ||
858 | } | ||
859 | |||
/* Module exit: tear down the netlink socket first so no new requests
 * arrive, then free the handler table. */
static void __exit inet_diag_exit(void)
{
	sock_release(idiagnl->sk_socket);
	kfree(inet_diag_table);
}
865 | |||
866 | module_init(inet_diag_init); | ||
867 | module_exit(inet_diag_exit); | ||
868 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c new file mode 100644 index 000000000000..e8d29fe736d2 --- /dev/null +++ b/net/ipv4/inet_hashtables.c | |||
@@ -0,0 +1,165 @@ | |||
1 | /* | ||
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
3 | * operating system. INET is implemented using the BSD Socket | ||
4 | * interface as the means of communication with the user level. | ||
5 | * | ||
6 | * Generic INET transport hashtables | ||
7 | * | ||
8 | * Authors: Lotsa people, from code originally in tcp | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public License | ||
12 | * as published by the Free Software Foundation; either version | ||
13 | * 2 of the License, or (at your option) any later version. | ||
14 | */ | ||
15 | |||
16 | #include <linux/config.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/slab.h> | ||
20 | #include <linux/wait.h> | ||
21 | |||
22 | #include <net/inet_connection_sock.h> | ||
23 | #include <net/inet_hashtables.h> | ||
24 | |||
25 | /* | ||
26 | * Allocate and initialize a new local port bind bucket. | ||
27 | * The bindhash mutex for snum's hash chain must be held here. | ||
28 | */ | ||
29 | struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, | ||
30 | struct inet_bind_hashbucket *head, | ||
31 | const unsigned short snum) | ||
32 | { | ||
33 | struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); | ||
34 | |||
35 | if (tb != NULL) { | ||
36 | tb->port = snum; | ||
37 | tb->fastreuse = 0; | ||
38 | INIT_HLIST_HEAD(&tb->owners); | ||
39 | hlist_add_head(&tb->node, &head->chain); | ||
40 | } | ||
41 | return tb; | ||
42 | } | ||
43 | |||
44 | EXPORT_SYMBOL(inet_bind_bucket_create); | ||
45 | |||
46 | /* | ||
47 | * Caller must hold hashbucket lock for this tb with local BH disabled | ||
48 | */ | ||
49 | void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) | ||
50 | { | ||
51 | if (hlist_empty(&tb->owners)) { | ||
52 | __hlist_del(&tb->node); | ||
53 | kmem_cache_free(cachep, tb); | ||
54 | } | ||
55 | } | ||
56 | |||
57 | void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, | ||
58 | const unsigned short snum) | ||
59 | { | ||
60 | inet_sk(sk)->num = snum; | ||
61 | sk_add_bind_node(sk, &tb->owners); | ||
62 | inet_csk(sk)->icsk_bind_hash = tb; | ||
63 | } | ||
64 | |||
65 | EXPORT_SYMBOL(inet_bind_hash); | ||
66 | |||
/*
 * Get rid of any references to a local port held by the given sock:
 * unlink it from its bind bucket and reclaim the bucket if it was the
 * last owner.  Caller must have BHs disabled (see inet_put_port()).
 */
static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
{
	const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);		/* drop sk from the bucket's owner list */
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->num = 0;
	/* Frees the bucket when sk was its last owner. */
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}
84 | |||
/* BH-safe wrapper: release @sk's local port with bottom halves disabled,
 * as __inet_put_port() requires. */
void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(hashinfo, sk);
	local_bh_enable();
}

EXPORT_SYMBOL(inet_put_port);
93 | |||
/*
 * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up each
 * exclusive lock release). It should be ifdefed really.
 */
void inet_listen_wlock(struct inet_hashinfo *hashinfo)
{
	write_lock(&hashinfo->lhash_lock);

	if (atomic_read(&hashinfo->lhash_users)) {
		DEFINE_WAIT(wait);

		/* Readers still active: sleep (exclusively, so only one
		 * waiting writer wakes per release) until lhash_users
		 * drops to zero, dropping and re-taking the lock around
		 * each schedule().
		 * NOTE(review): the lock is taken without _bh above but
		 * re-taken with _bh inside the loop - presumably callers
		 * enter with BHs already disabled; confirm at call sites. */
		for (;;) {
			prepare_to_wait_exclusive(&hashinfo->lhash_wait,
						  &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&hashinfo->lhash_users))
				break;
			write_unlock_bh(&hashinfo->lhash_lock);
			schedule();
			write_lock_bh(&hashinfo->lhash_lock);
		}

		finish_wait(&hashinfo->lhash_wait, &wait);
	}
}

EXPORT_SYMBOL(inet_listen_wlock);
123 | |||
124 | /* | ||
125 | * Don't inline this cruft. Here are some nice properties to exploit here. The | ||
126 | * BSD API does not allow a listening sock to specify the remote port nor the | ||
127 | * remote address for the connection. So always assume those are both | ||
128 | * wildcarded during the search since they can never be otherwise. | ||
129 | */ | ||
130 | struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, | ||
131 | const unsigned short hnum, const int dif) | ||
132 | { | ||
133 | struct sock *result = NULL, *sk; | ||
134 | const struct hlist_node *node; | ||
135 | int hiscore = -1; | ||
136 | |||
137 | sk_for_each(sk, node, head) { | ||
138 | const struct inet_sock *inet = inet_sk(sk); | ||
139 | |||
140 | if (inet->num == hnum && !ipv6_only_sock(sk)) { | ||
141 | const __u32 rcv_saddr = inet->rcv_saddr; | ||
142 | int score = sk->sk_family == PF_INET ? 1 : 0; | ||
143 | |||
144 | if (rcv_saddr) { | ||
145 | if (rcv_saddr != daddr) | ||
146 | continue; | ||
147 | score += 2; | ||
148 | } | ||
149 | if (sk->sk_bound_dev_if) { | ||
150 | if (sk->sk_bound_dev_if != dif) | ||
151 | continue; | ||
152 | score += 2; | ||
153 | } | ||
154 | if (score == 5) | ||
155 | return sk; | ||
156 | if (score > hiscore) { | ||
157 | hiscore = score; | ||
158 | result = sk; | ||
159 | } | ||
160 | } | ||
161 | } | ||
162 | return result; | ||
163 | } | ||
164 | |||
165 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); | ||
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c new file mode 100644 index 000000000000..4d1502a49852 --- /dev/null +++ b/net/ipv4/inet_timewait_sock.c | |||
@@ -0,0 +1,384 @@ | |||
1 | /* | ||
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
3 | * operating system. INET is implemented using the BSD Socket | ||
4 | * interface as the means of communication with the user level. | ||
5 | * | ||
6 | * Generic TIME_WAIT sockets functions | ||
7 | * | ||
 * From code originally in TCP
9 | */ | ||
10 | |||
11 | #include <linux/config.h> | ||
12 | |||
13 | #include <net/inet_hashtables.h> | ||
14 | #include <net/inet_timewait_sock.h> | ||
15 | #include <net/ip.h> | ||
16 | |||
/* Must be called with locally disabled BHs.
 *
 * Unhashes @tw from both the established hash and its bind bucket, then
 * drops the reference those hash tables held on it.
 */
void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo)
{
	struct inet_bind_hashbucket *bhead;
	struct inet_bind_bucket *tb;
	/* Unlink from established hashes. */
	struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent];

	write_lock(&ehead->lock);
	if (hlist_unhashed(&tw->tw_node)) {
		/* Someone else already unhashed this twsk; nothing to do. */
		write_unlock(&ehead->lock);
		return;
	}
	__hlist_del(&tw->tw_node);
	sk_node_init(&tw->tw_node);
	write_unlock(&ehead->lock);

	/* Disassociate with bind bucket. */
	bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
	spin_lock(&bhead->lock);
	tb = tw->tw_tb;
	__hlist_del(&tw->tw_bind_node);
	tw->tw_tb = NULL;
	/* Frees the bind bucket if this twsk was its last owner. */
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&bhead->lock);
#ifdef SOCK_REFCNT_DEBUG
	if (atomic_read(&tw->tw_refcnt) != 1) {
		printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
		       tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
	}
#endif
	/* Drop the reference that the hash tables held. */
	inet_twsk_put(tw);
}

EXPORT_SYMBOL_GPL(__inet_twsk_kill);
52 | |||
/*
 * Enter the time wait state. This is called with locally disabled BH.
 * Essentially we whip up a timewait bucket, copy the relevant info into it
 * from the SK, and mess with hash chains and list linkage.
 */
void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
			   struct inet_hashinfo *hashinfo)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent];
	struct inet_bind_hashbucket *bhead;
	/* Step 1: Put TW into bind hash. Original socket stays there too.
	   Note, that any socket with inet->num != 0 MUST be bound in
	   binding cache, even if it is closed.
	 */
	bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)];
	spin_lock(&bhead->lock);
	tw->tw_tb = icsk->icsk_bind_hash;
	BUG_TRAP(icsk->icsk_bind_hash);
	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
	spin_unlock(&bhead->lock);

	write_lock(&ehead->lock);

	/* Step 2: Remove SK from established hash. */
	if (__sk_del_node_init(sk))
		sock_prot_dec_use(sk->sk_prot);

	/* Step 3: Hash TW into TIMEWAIT half of established hash table.
	 * (ehead + ehash_size) indexes one full table length past the
	 * regular chain, i.e. the TIMEWAIT half. */
	inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
	/* The established hash now holds its own reference on tw. */
	atomic_inc(&tw->tw_refcnt);

	write_unlock(&ehead->lock);
}

EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
90 | |||
91 | struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) | ||
92 | { | ||
93 | struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, | ||
94 | SLAB_ATOMIC); | ||
95 | if (tw != NULL) { | ||
96 | const struct inet_sock *inet = inet_sk(sk); | ||
97 | |||
98 | /* Give us an identity. */ | ||
99 | tw->tw_daddr = inet->daddr; | ||
100 | tw->tw_rcv_saddr = inet->rcv_saddr; | ||
101 | tw->tw_bound_dev_if = sk->sk_bound_dev_if; | ||
102 | tw->tw_num = inet->num; | ||
103 | tw->tw_state = TCP_TIME_WAIT; | ||
104 | tw->tw_substate = state; | ||
105 | tw->tw_sport = inet->sport; | ||
106 | tw->tw_dport = inet->dport; | ||
107 | tw->tw_family = sk->sk_family; | ||
108 | tw->tw_reuse = sk->sk_reuse; | ||
109 | tw->tw_hashent = sk->sk_hashent; | ||
110 | tw->tw_ipv6only = 0; | ||
111 | tw->tw_prot = sk->sk_prot_creator; | ||
112 | atomic_set(&tw->tw_refcnt, 1); | ||
113 | inet_twsk_dead_node_init(tw); | ||
114 | } | ||
115 | |||
116 | return tw; | ||
117 | } | ||
118 | |||
119 | EXPORT_SYMBOL_GPL(inet_twsk_alloc); | ||
120 | |||
/* Returns non-zero if quota exceeded.
 *
 * Reaps up to INET_TWDR_TWKILL_QUOTA+1 timewait sockets from one death-row
 * cell.  Called with twdr->death_lock held; the lock is dropped around each
 * __inet_twsk_kill() and reacquired afterwards.
 */
static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
				    const int slot)
{
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	unsigned int killed;
	int ret;

	/* NOTE: compare this to previous version where lock
	 * was released after detaching chain. It was racy,
	 * because tw buckets are scheduled in not serialized context
	 * in 2.3 (with netfilter), and with softnet it is common, because
	 * soft irqs are not sequenced.
	 */
	killed = 0;
	ret = 0;
rescan:
	inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
		__inet_twsk_del_dead_node(tw);
		/* death_lock is dropped across the unhash + put; see the
		 * rescan note below for why that is safe. */
		spin_unlock(&twdr->death_lock);
		__inet_twsk_kill(tw, twdr->hashinfo);
		inet_twsk_put(tw);
		killed++;
		spin_lock(&twdr->death_lock);
		if (killed > INET_TWDR_TWKILL_QUOTA) {
			/* Quota exhausted: stop here, caller reschedules. */
			ret = 1;
			break;
		}

		/* While we dropped twdr->death_lock, another cpu may have
		 * killed off the next TW bucket in the list, therefore
		 * do a fresh re-read of the hlist head node with the
		 * lock reacquired.  We still use the hlist traversal
		 * macro in order to get the prefetches.
		 */
		goto rescan;
	}

	twdr->tw_count -= killed;
	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);

	return ret;
}
165 | |||
166 | void inet_twdr_hangman(unsigned long data) | ||
167 | { | ||
168 | struct inet_timewait_death_row *twdr; | ||
169 | int unsigned need_timer; | ||
170 | |||
171 | twdr = (struct inet_timewait_death_row *)data; | ||
172 | spin_lock(&twdr->death_lock); | ||
173 | |||
174 | if (twdr->tw_count == 0) | ||
175 | goto out; | ||
176 | |||
177 | need_timer = 0; | ||
178 | if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { | ||
179 | twdr->thread_slots |= (1 << twdr->slot); | ||
180 | mb(); | ||
181 | schedule_work(&twdr->twkill_work); | ||
182 | need_timer = 1; | ||
183 | } else { | ||
184 | /* We purged the entire slot, anything left? */ | ||
185 | if (twdr->tw_count) | ||
186 | need_timer = 1; | ||
187 | } | ||
188 | twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); | ||
189 | if (need_timer) | ||
190 | mod_timer(&twdr->tw_timer, jiffies + twdr->period); | ||
191 | out: | ||
192 | spin_unlock(&twdr->death_lock); | ||
193 | } | ||
194 | |||
195 | EXPORT_SYMBOL_GPL(inet_twdr_hangman); | ||
196 | |||
/* Deliberately has no definition: referencing it turns a failed sanity
 * check below into a link-time error instead of a runtime one. */
extern void twkill_slots_invalid(void);

/*
 * Workqueue handler: finish reaping the death-row slots that
 * inet_twdr_hangman() marked in thread_slots after hitting its quota.
 */
void inet_twdr_twkill_work(void *data)
{
	struct inet_timewait_death_row *twdr = data;
	int i;

	/* Sanity check: thread_slots needs one bit per kill slot. */
	if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
		twkill_slots_invalid();

	while (twdr->thread_slots) {
		spin_lock_bh(&twdr->death_lock);
		for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
			if (!(twdr->thread_slots & (1 << i)))
				continue;

			/* Keep reaping this slot until it fits under quota,
			 * yielding the CPU between passes if needed. */
			while (inet_twdr_do_twkill_work(twdr, i) != 0) {
				if (need_resched()) {
					spin_unlock_bh(&twdr->death_lock);
					schedule();
					spin_lock_bh(&twdr->death_lock);
				}
			}

			twdr->thread_slots &= ~(1 << i);
		}
		spin_unlock_bh(&twdr->death_lock);
	}
}

EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
228 | |||
/* These are always called from BH context.  See callers in
 * tcp_input.c to verify this.
 */

/* This is for handling early-kills of TIME_WAIT sockets.
 * Removes @tw from its death row (if present) and unhashes it immediately,
 * without waiting for the timer to expire.
 */
void inet_twsk_deschedule(struct inet_timewait_sock *tw,
			  struct inet_timewait_death_row *twdr)
{
	spin_lock(&twdr->death_lock);
	if (inet_twsk_del_dead_node(tw)) {
		/* It was on a death row: drop that row's reference and
		 * stop the slow timer once the last twsk is gone. */
		inet_twsk_put(tw);
		if (--twdr->tw_count == 0)
			del_timer(&twdr->tw_timer);
	}
	spin_unlock(&twdr->death_lock);
	/* Unhash from ehash/bhash outside death_lock. */
	__inet_twsk_kill(tw, twdr->hashinfo);
}

EXPORT_SYMBOL(inet_twsk_deschedule);
248 | |||
/*
 * Schedule @tw for destruction after @timeo jiffies.  Short timeouts go on
 * the fine-grained "recycle" wheel (twcal_row), long ones on the coarse
 * per-period death-row cells swept by inet_twdr_hangman().
 */
void inet_twsk_schedule(struct inet_timewait_sock *tw,
			struct inet_timewait_death_row *twdr,
			const int timeo, const int timewait_len)
{
	struct hlist_head *list;
	int slot;

	/* timeout := RTO * 3.5
	 *
	 * 3.5 = 1+2+0.5 to wait for two retransmits.
	 *
	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
	 * FINs (or previous segments) are lost (probability of such event
	 * is p^(N+1), where p is probability to lose single packet and
	 * time to detect the loss is about RTO*(2^N - 1) with exponential
	 * backoff). Normal timewait length is calculated so, that we
	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
	 * [ BTW Linux, following BSD, violates this requirement waiting
	 *   only for 60sec, we should wait at least for 240 secs.
	 *   Well, 240 consumes too much of resources 8)
	 * ]
	 * This interval is not reduced to catch old duplicates and
	 * responses to our wandering segments living for two MSLs.
	 * However, if we use PAWS to detect
	 * old duplicates, we can reduce the interval to bounds required
	 * by RTO, rather than MSL. So, if peer understands PAWS, we
	 * kill tw bucket after 3.5*RTO (it is important that this number
	 * is greater than TS tick!) and detect old duplicates with help
	 * of PAWS.
	 */
	/* Round the timeout up to whole recycle ticks. */
	slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;

	spin_lock(&twdr->death_lock);

	/* Unlink it, if it was scheduled */
	if (inet_twsk_del_dead_node(tw))
		twdr->tw_count--;
	else
		atomic_inc(&tw->tw_refcnt);	/* death row takes a ref */

	if (slot >= INET_TWDR_RECYCLE_SLOTS) {
		/* Schedule to slow timer */
		if (timeo >= timewait_len) {
			slot = INET_TWDR_TWKILL_SLOTS - 1;
		} else {
			/* One cell per hangman period, clamped to the last. */
			slot = (timeo + twdr->period - 1) / twdr->period;
			if (slot >= INET_TWDR_TWKILL_SLOTS)
				slot = INET_TWDR_TWKILL_SLOTS - 1;
		}
		tw->tw_ttd = jiffies + timeo;
		/* Cell index is relative to the hangman's current hand. */
		slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
		list = &twdr->cells[slot];
	} else {
		tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);

		if (twdr->twcal_hand < 0) {
			/* Recycle wheel idle: start it for this entry. */
			twdr->twcal_hand = 0;
			twdr->twcal_jiffie = jiffies;
			twdr->twcal_timer.expires = twdr->twcal_jiffie +
					      (slot << INET_TWDR_RECYCLE_TICK);
			add_timer(&twdr->twcal_timer);
		} else {
			/* Pull the timer forward if this entry expires
			 * before the currently programmed expiry. */
			if (time_after(twdr->twcal_timer.expires,
				       jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
				mod_timer(&twdr->twcal_timer,
					  jiffies + (slot << INET_TWDR_RECYCLE_TICK));
			slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
		}
		list = &twdr->twcal_row[slot];
	}

	hlist_add_head(&tw->tw_death_node, list);

	/* First entry on any row arms the slow hangman timer. */
	if (twdr->tw_count++ == 0)
		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
	spin_unlock(&twdr->death_lock);
}

EXPORT_SYMBOL_GPL(inet_twsk_schedule);
329 | |||
/*
 * Recycle-wheel timer handler: sweep twcal_row starting at the current
 * hand, killing every inmate in slots whose deadline has passed, then
 * either re-arm for the first unexpired non-empty slot or mark the wheel
 * idle (twcal_hand = -1).
 */
void inet_twdr_twcal_tick(unsigned long data)
{
	struct inet_timewait_death_row *twdr;
	int n, slot;
	unsigned long j;
	unsigned long now = jiffies;
	int killed = 0;
	int adv = 0;

	twdr = (struct inet_timewait_death_row *)data;

	spin_lock(&twdr->death_lock);
	if (twdr->twcal_hand < 0)
		goto out;	/* wheel not armed */

	slot = twdr->twcal_hand;
	j = twdr->twcal_jiffie;

	for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
		if (time_before_eq(j, now)) {
			struct hlist_node *node, *safe;
			struct inet_timewait_sock *tw;

			/* Slot deadline passed: kill every inmate on it. */
			inet_twsk_for_each_inmate_safe(tw, node, safe,
						       &twdr->twcal_row[slot]) {
				__inet_twsk_del_dead_node(tw);
				__inet_twsk_kill(tw, twdr->hashinfo);
				inet_twsk_put(tw);
				killed++;
			}
		} else {
			if (!adv) {
				/* First unexpired slot becomes the new
				 * hand position. */
				adv = 1;
				twdr->twcal_jiffie = j;
				twdr->twcal_hand = slot;
			}

			if (!hlist_empty(&twdr->twcal_row[slot])) {
				/* Re-arm for this slot's deadline and stop. */
				mod_timer(&twdr->twcal_timer, j);
				goto out;
			}
		}
		j += 1 << INET_TWDR_RECYCLE_TICK;
		slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
	}
	/* Entire wheel drained: mark it idle. */
	twdr->twcal_hand = -1;

out:
	/* Account the kills; stop the slow timer if nothing is left. */
	if ((twdr->tw_count -= killed) == 0)
		del_timer(&twdr->tw_timer);
	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
	spin_unlock(&twdr->death_lock);
}

EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index ab18a853d7ce..f84ba9c96551 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/kernel.h> | 20 | #include <linux/kernel.h> |
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/net.h> | 22 | #include <linux/net.h> |
23 | #include <net/ip.h> | ||
23 | #include <net/inetpeer.h> | 24 | #include <net/inetpeer.h> |
24 | 25 | ||
25 | /* | 26 | /* |
@@ -72,7 +73,7 @@ | |||
72 | /* Exported for inet_getid inline function. */ | 73 | /* Exported for inet_getid inline function. */ |
73 | DEFINE_SPINLOCK(inet_peer_idlock); | 74 | DEFINE_SPINLOCK(inet_peer_idlock); |
74 | 75 | ||
75 | static kmem_cache_t *peer_cachep; | 76 | static kmem_cache_t *peer_cachep __read_mostly; |
76 | 77 | ||
77 | #define node_height(x) x->avl_height | 78 | #define node_height(x) x->avl_height |
78 | static struct inet_peer peer_fake_node = { | 79 | static struct inet_peer peer_fake_node = { |
@@ -459,5 +460,3 @@ static void peer_check_expire(unsigned long dummy) | |||
459 | peer_total / inet_peer_threshold * HZ; | 460 | peer_total / inet_peer_threshold * HZ; |
460 | add_timer(&peer_periodic_timer); | 461 | add_timer(&peer_periodic_timer); |
461 | } | 462 | } |
462 | |||
463 | EXPORT_SYMBOL(inet_peer_idlock); | ||
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 77094aac6c28..0923add122b4 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c | |||
@@ -76,16 +76,12 @@ int ip_forward(struct sk_buff *skb) | |||
76 | * that reaches zero, we must reply an ICMP control message telling | 76 | * that reaches zero, we must reply an ICMP control message telling |
77 | * that the packet's lifetime expired. | 77 | * that the packet's lifetime expired. |
78 | */ | 78 | */ |
79 | 79 | if (skb->nh.iph->ttl <= 1) | |
80 | iph = skb->nh.iph; | ||
81 | |||
82 | if (iph->ttl <= 1) | ||
83 | goto too_many_hops; | 80 | goto too_many_hops; |
84 | 81 | ||
85 | if (!xfrm4_route_forward(skb)) | 82 | if (!xfrm4_route_forward(skb)) |
86 | goto drop; | 83 | goto drop; |
87 | 84 | ||
88 | iph = skb->nh.iph; | ||
89 | rt = (struct rtable*)skb->dst; | 85 | rt = (struct rtable*)skb->dst; |
90 | 86 | ||
91 | if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) | 87 | if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) |
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index eb377ae15305..9e6e683cc34d 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c | |||
@@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) | |||
377 | return ip_frag_intern(hash, qp); | 377 | return ip_frag_intern(hash, qp); |
378 | 378 | ||
379 | out_nomem: | 379 | out_nomem: |
380 | LIMIT_NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n")); | 380 | LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); |
381 | return NULL; | 381 | return NULL; |
382 | } | 382 | } |
383 | 383 | ||
@@ -533,7 +533,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) | |||
533 | if (skb->dev) | 533 | if (skb->dev) |
534 | qp->iif = skb->dev->ifindex; | 534 | qp->iif = skb->dev->ifindex; |
535 | skb->dev = NULL; | 535 | skb->dev = NULL; |
536 | qp->stamp = skb->stamp; | 536 | skb_get_timestamp(skb, &qp->stamp); |
537 | qp->meat += skb->len; | 537 | qp->meat += skb->len; |
538 | atomic_add(skb->truesize, &ip_frag_mem); | 538 | atomic_add(skb->truesize, &ip_frag_mem); |
539 | if (offset == 0) | 539 | if (offset == 0) |
@@ -615,7 +615,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) | |||
615 | 615 | ||
616 | head->next = NULL; | 616 | head->next = NULL; |
617 | head->dev = dev; | 617 | head->dev = dev; |
618 | head->stamp = qp->stamp; | 618 | skb_set_timestamp(head, &qp->stamp); |
619 | 619 | ||
620 | iph = head->nh.iph; | 620 | iph = head->nh.iph; |
621 | iph->frag_off = 0; | 621 | iph->frag_off = 0; |
@@ -625,8 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) | |||
625 | return head; | 625 | return head; |
626 | 626 | ||
627 | out_nomem: | 627 | out_nomem: |
628 | LIMIT_NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing " | 628 | LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " |
629 | "queue %p\n", qp)); | 629 | "queue %p\n", qp); |
630 | goto out_fail; | 630 | goto out_fail; |
631 | out_oversize: | 631 | out_oversize: |
632 | if (net_ratelimit()) | 632 | if (net_ratelimit()) |
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c703528e0bcd..473d0f2b2e0d 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c | |||
@@ -150,7 +150,7 @@ | |||
150 | * SNMP management statistics | 150 | * SNMP management statistics |
151 | */ | 151 | */ |
152 | 152 | ||
153 | DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics); | 153 | DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly; |
154 | 154 | ||
155 | /* | 155 | /* |
156 | * Process Router Attention IP option | 156 | * Process Router Attention IP option |
@@ -225,8 +225,8 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb) | |||
225 | /* If there maybe a raw socket we must check - if not we | 225 | /* If there maybe a raw socket we must check - if not we |
226 | * don't care less | 226 | * don't care less |
227 | */ | 227 | */ |
228 | if (raw_sk) | 228 | if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash)) |
229 | raw_v4_input(skb, skb->nh.iph, hash); | 229 | raw_sk = NULL; |
230 | 230 | ||
231 | if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { | 231 | if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { |
232 | int ret; | 232 | int ret; |
@@ -279,18 +279,70 @@ int ip_local_deliver(struct sk_buff *skb) | |||
279 | ip_local_deliver_finish); | 279 | ip_local_deliver_finish); |
280 | } | 280 | } |
281 | 281 | ||
282 | static inline int ip_rcv_finish(struct sk_buff *skb) | 282 | static inline int ip_rcv_options(struct sk_buff *skb) |
283 | { | 283 | { |
284 | struct ip_options *opt; | ||
285 | struct iphdr *iph; | ||
284 | struct net_device *dev = skb->dev; | 286 | struct net_device *dev = skb->dev; |
287 | |||
288 | /* It looks as overkill, because not all | ||
289 | IP options require packet mangling. | ||
290 | But it is the easiest for now, especially taking | ||
291 | into account that combination of IP options | ||
292 | and running sniffer is extremely rare condition. | ||
293 | --ANK (980813) | ||
294 | */ | ||
295 | if (skb_cow(skb, skb_headroom(skb))) { | ||
296 | IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); | ||
297 | goto drop; | ||
298 | } | ||
299 | |||
300 | iph = skb->nh.iph; | ||
301 | |||
302 | if (ip_options_compile(NULL, skb)) { | ||
303 | IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); | ||
304 | goto drop; | ||
305 | } | ||
306 | |||
307 | opt = &(IPCB(skb)->opt); | ||
308 | if (unlikely(opt->srr)) { | ||
309 | struct in_device *in_dev = in_dev_get(dev); | ||
310 | if (in_dev) { | ||
311 | if (!IN_DEV_SOURCE_ROUTE(in_dev)) { | ||
312 | if (IN_DEV_LOG_MARTIANS(in_dev) && | ||
313 | net_ratelimit()) | ||
314 | printk(KERN_INFO "source route option " | ||
315 | "%u.%u.%u.%u -> %u.%u.%u.%u\n", | ||
316 | NIPQUAD(iph->saddr), | ||
317 | NIPQUAD(iph->daddr)); | ||
318 | in_dev_put(in_dev); | ||
319 | goto drop; | ||
320 | } | ||
321 | |||
322 | in_dev_put(in_dev); | ||
323 | } | ||
324 | |||
325 | if (ip_options_rcv_srr(skb)) | ||
326 | goto drop; | ||
327 | } | ||
328 | |||
329 | return 0; | ||
330 | drop: | ||
331 | return -1; | ||
332 | } | ||
333 | |||
334 | static inline int ip_rcv_finish(struct sk_buff *skb) | ||
335 | { | ||
285 | struct iphdr *iph = skb->nh.iph; | 336 | struct iphdr *iph = skb->nh.iph; |
286 | int err; | ||
287 | 337 | ||
288 | /* | 338 | /* |
289 | * Initialise the virtual path cache for the packet. It describes | 339 | * Initialise the virtual path cache for the packet. It describes |
290 | * how the packet travels inside Linux networking. | 340 | * how the packet travels inside Linux networking. |
291 | */ | 341 | */ |
292 | if (skb->dst == NULL) { | 342 | if (likely(skb->dst == NULL)) { |
293 | if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { | 343 | int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, |
344 | skb->dev); | ||
345 | if (unlikely(err)) { | ||
294 | if (err == -EHOSTUNREACH) | 346 | if (err == -EHOSTUNREACH) |
295 | IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); | 347 | IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); |
296 | goto drop; | 348 | goto drop; |
@@ -298,7 +350,7 @@ static inline int ip_rcv_finish(struct sk_buff *skb) | |||
298 | } | 350 | } |
299 | 351 | ||
300 | #ifdef CONFIG_NET_CLS_ROUTE | 352 | #ifdef CONFIG_NET_CLS_ROUTE |
301 | if (skb->dst->tclassid) { | 353 | if (unlikely(skb->dst->tclassid)) { |
302 | struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); | 354 | struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); |
303 | u32 idx = skb->dst->tclassid; | 355 | u32 idx = skb->dst->tclassid; |
304 | st[idx&0xFF].o_packets++; | 356 | st[idx&0xFF].o_packets++; |
@@ -308,48 +360,11 @@ static inline int ip_rcv_finish(struct sk_buff *skb) | |||
308 | } | 360 | } |
309 | #endif | 361 | #endif |
310 | 362 | ||
311 | if (iph->ihl > 5) { | 363 | if (iph->ihl > 5 && ip_rcv_options(skb)) |
312 | struct ip_options *opt; | 364 | goto drop; |
313 | |||
314 | /* It looks as overkill, because not all | ||
315 | IP options require packet mangling. | ||
316 | But it is the easiest for now, especially taking | ||
317 | into account that combination of IP options | ||
318 | and running sniffer is extremely rare condition. | ||
319 | --ANK (980813) | ||
320 | */ | ||
321 | |||
322 | if (skb_cow(skb, skb_headroom(skb))) { | ||
323 | IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); | ||
324 | goto drop; | ||
325 | } | ||
326 | iph = skb->nh.iph; | ||
327 | |||
328 | if (ip_options_compile(NULL, skb)) | ||
329 | goto inhdr_error; | ||
330 | |||
331 | opt = &(IPCB(skb)->opt); | ||
332 | if (opt->srr) { | ||
333 | struct in_device *in_dev = in_dev_get(dev); | ||
334 | if (in_dev) { | ||
335 | if (!IN_DEV_SOURCE_ROUTE(in_dev)) { | ||
336 | if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) | ||
337 | printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n", | ||
338 | NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); | ||
339 | in_dev_put(in_dev); | ||
340 | goto drop; | ||
341 | } | ||
342 | in_dev_put(in_dev); | ||
343 | } | ||
344 | if (ip_options_rcv_srr(skb)) | ||
345 | goto drop; | ||
346 | } | ||
347 | } | ||
348 | 365 | ||
349 | return dst_input(skb); | 366 | return dst_input(skb); |
350 | 367 | ||
351 | inhdr_error: | ||
352 | IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); | ||
353 | drop: | 368 | drop: |
354 | kfree_skb(skb); | 369 | kfree_skb(skb); |
355 | return NET_RX_DROP; | 370 | return NET_RX_DROP; |
@@ -358,9 +373,10 @@ drop: | |||
358 | /* | 373 | /* |
359 | * Main IP Receive routine. | 374 | * Main IP Receive routine. |
360 | */ | 375 | */ |
361 | int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 376 | int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
362 | { | 377 | { |
363 | struct iphdr *iph; | 378 | struct iphdr *iph; |
379 | u32 len; | ||
364 | 380 | ||
365 | /* When the interface is in promisc. mode, drop all the crap | 381 | /* When the interface is in promisc. mode, drop all the crap |
366 | * that it receives, do not try to analyse it. | 382 | * that it receives, do not try to analyse it. |
@@ -392,29 +408,27 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | |||
392 | */ | 408 | */ |
393 | 409 | ||
394 | if (iph->ihl < 5 || iph->version != 4) | 410 | if (iph->ihl < 5 || iph->version != 4) |
395 | goto inhdr_error; | 411 | goto inhdr_error; |
396 | 412 | ||
397 | if (!pskb_may_pull(skb, iph->ihl*4)) | 413 | if (!pskb_may_pull(skb, iph->ihl*4)) |
398 | goto inhdr_error; | 414 | goto inhdr_error; |
399 | 415 | ||
400 | iph = skb->nh.iph; | 416 | iph = skb->nh.iph; |
401 | 417 | ||
402 | if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) | 418 | if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) |
403 | goto inhdr_error; | 419 | goto inhdr_error; |
404 | 420 | ||
405 | { | 421 | len = ntohs(iph->tot_len); |
406 | __u32 len = ntohs(iph->tot_len); | 422 | if (skb->len < len || len < (iph->ihl*4)) |
407 | if (skb->len < len || len < (iph->ihl<<2)) | 423 | goto inhdr_error; |
408 | goto inhdr_error; | ||
409 | 424 | ||
410 | /* Our transport medium may have padded the buffer out. Now we know it | 425 | /* Our transport medium may have padded the buffer out. Now we know it |
411 | * is IP we can trim to the true length of the frame. | 426 | * is IP we can trim to the true length of the frame. |
412 | * Note this now means skb->len holds ntohs(iph->tot_len). | 427 | * Note this now means skb->len holds ntohs(iph->tot_len). |
413 | */ | 428 | */ |
414 | if (pskb_trim_rcsum(skb, len)) { | 429 | if (pskb_trim_rcsum(skb, len)) { |
415 | IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); | 430 | IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); |
416 | goto drop; | 431 | goto drop; |
417 | } | ||
418 | } | 432 | } |
419 | 433 | ||
420 | return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, | 434 | return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, |
@@ -428,5 +442,4 @@ out: | |||
428 | return NET_RX_DROP; | 442 | return NET_RX_DROP; |
429 | } | 443 | } |
430 | 444 | ||
431 | EXPORT_SYMBOL(ip_rcv); | ||
432 | EXPORT_SYMBOL(ip_statistics); | 445 | EXPORT_SYMBOL(ip_statistics); |
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 6d89f3f3e701..bce4e875193b 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c | |||
@@ -489,23 +489,18 @@ void ip_options_undo(struct ip_options * opt) | |||
489 | } | 489 | } |
490 | } | 490 | } |
491 | 491 | ||
492 | int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user) | 492 | static struct ip_options *ip_options_get_alloc(const int optlen) |
493 | { | 493 | { |
494 | struct ip_options *opt; | 494 | struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3), |
495 | GFP_KERNEL); | ||
496 | if (opt) | ||
497 | memset(opt, 0, sizeof(*opt)); | ||
498 | return opt; | ||
499 | } | ||
495 | 500 | ||
496 | opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL); | 501 | static int ip_options_get_finish(struct ip_options **optp, |
497 | if (!opt) | 502 | struct ip_options *opt, int optlen) |
498 | return -ENOMEM; | 503 | { |
499 | memset(opt, 0, sizeof(struct ip_options)); | ||
500 | if (optlen) { | ||
501 | if (user) { | ||
502 | if (copy_from_user(opt->__data, data, optlen)) { | ||
503 | kfree(opt); | ||
504 | return -EFAULT; | ||
505 | } | ||
506 | } else | ||
507 | memcpy(opt->__data, data, optlen); | ||
508 | } | ||
509 | while (optlen & 3) | 504 | while (optlen & 3) |
510 | opt->__data[optlen++] = IPOPT_END; | 505 | opt->__data[optlen++] = IPOPT_END; |
511 | opt->optlen = optlen; | 506 | opt->optlen = optlen; |
@@ -521,6 +516,30 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in | |||
521 | return 0; | 516 | return 0; |
522 | } | 517 | } |
523 | 518 | ||
519 | int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen) | ||
520 | { | ||
521 | struct ip_options *opt = ip_options_get_alloc(optlen); | ||
522 | |||
523 | if (!opt) | ||
524 | return -ENOMEM; | ||
525 | if (optlen && copy_from_user(opt->__data, data, optlen)) { | ||
526 | kfree(opt); | ||
527 | return -EFAULT; | ||
528 | } | ||
529 | return ip_options_get_finish(optp, opt, optlen); | ||
530 | } | ||
531 | |||
532 | int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen) | ||
533 | { | ||
534 | struct ip_options *opt = ip_options_get_alloc(optlen); | ||
535 | |||
536 | if (!opt) | ||
537 | return -ENOMEM; | ||
538 | if (optlen) | ||
539 | memcpy(opt->__data, data, optlen); | ||
540 | return ip_options_get_finish(optp, opt, optlen); | ||
541 | } | ||
542 | |||
524 | void ip_forward_options(struct sk_buff *skb) | 543 | void ip_forward_options(struct sk_buff *skb) |
525 | { | 544 | { |
526 | struct ip_options * opt = &(IPCB(skb)->opt); | 545 | struct ip_options * opt = &(IPCB(skb)->opt); |
@@ -620,6 +639,3 @@ int ip_options_rcv_srr(struct sk_buff *skb) | |||
620 | } | 639 | } |
621 | return 0; | 640 | return 0; |
622 | } | 641 | } |
623 | |||
624 | EXPORT_SYMBOL(ip_options_compile); | ||
625 | EXPORT_SYMBOL(ip_options_undo); | ||
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 80d13103b2b0..3f1a263e1249 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -69,13 +69,10 @@ | |||
69 | #include <net/ip.h> | 69 | #include <net/ip.h> |
70 | #include <net/protocol.h> | 70 | #include <net/protocol.h> |
71 | #include <net/route.h> | 71 | #include <net/route.h> |
72 | #include <net/tcp.h> | ||
73 | #include <net/udp.h> | ||
74 | #include <linux/skbuff.h> | 72 | #include <linux/skbuff.h> |
75 | #include <net/sock.h> | 73 | #include <net/sock.h> |
76 | #include <net/arp.h> | 74 | #include <net/arp.h> |
77 | #include <net/icmp.h> | 75 | #include <net/icmp.h> |
78 | #include <net/raw.h> | ||
79 | #include <net/checksum.h> | 76 | #include <net/checksum.h> |
80 | #include <net/inetpeer.h> | 77 | #include <net/inetpeer.h> |
81 | #include <net/checksum.h> | 78 | #include <net/checksum.h> |
@@ -84,12 +81,8 @@ | |||
84 | #include <linux/netfilter_bridge.h> | 81 | #include <linux/netfilter_bridge.h> |
85 | #include <linux/mroute.h> | 82 | #include <linux/mroute.h> |
86 | #include <linux/netlink.h> | 83 | #include <linux/netlink.h> |
84 | #include <linux/tcp.h> | ||
87 | 85 | ||
88 | /* | ||
89 | * Shall we try to damage output packets if routing dev changes? | ||
90 | */ | ||
91 | |||
92 | int sysctl_ip_dynaddr; | ||
93 | int sysctl_ip_default_ttl = IPDEFTTL; | 86 | int sysctl_ip_default_ttl = IPDEFTTL; |
94 | 87 | ||
95 | /* Generate a checksum for an outgoing IP datagram. */ | 88 | /* Generate a checksum for an outgoing IP datagram. */ |
@@ -165,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, | |||
165 | dst_output); | 158 | dst_output); |
166 | } | 159 | } |
167 | 160 | ||
161 | EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); | ||
162 | |||
168 | static inline int ip_finish_output2(struct sk_buff *skb) | 163 | static inline int ip_finish_output2(struct sk_buff *skb) |
169 | { | 164 | { |
170 | struct dst_entry *dst = skb->dst; | 165 | struct dst_entry *dst = skb->dst; |
@@ -205,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) | |||
205 | return -EINVAL; | 200 | return -EINVAL; |
206 | } | 201 | } |
207 | 202 | ||
208 | int ip_finish_output(struct sk_buff *skb) | 203 | static inline int ip_finish_output(struct sk_buff *skb) |
209 | { | 204 | { |
210 | struct net_device *dev = skb->dst->dev; | 205 | struct net_device *dev = skb->dst->dev; |
211 | 206 | ||
@@ -329,8 +324,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) | |||
329 | if (ip_route_output_flow(&rt, &fl, sk, 0)) | 324 | if (ip_route_output_flow(&rt, &fl, sk, 0)) |
330 | goto no_route; | 325 | goto no_route; |
331 | } | 326 | } |
332 | __sk_dst_set(sk, &rt->u.dst); | 327 | sk_setup_caps(sk, &rt->u.dst); |
333 | tcp_v4_setup_caps(sk, &rt->u.dst); | ||
334 | } | 328 | } |
335 | skb->dst = dst_clone(&rt->u.dst); | 329 | skb->dst = dst_clone(&rt->u.dst); |
336 | 330 | ||
@@ -392,7 +386,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) | |||
392 | #endif | 386 | #endif |
393 | #ifdef CONFIG_NETFILTER | 387 | #ifdef CONFIG_NETFILTER |
394 | to->nfmark = from->nfmark; | 388 | to->nfmark = from->nfmark; |
395 | to->nfcache = from->nfcache; | ||
396 | /* Connection association is same as pre-frag packet */ | 389 | /* Connection association is same as pre-frag packet */ |
397 | nf_conntrack_put(to->nfct); | 390 | nf_conntrack_put(to->nfct); |
398 | to->nfct = from->nfct; | 391 | to->nfct = from->nfct; |
@@ -580,7 +573,7 @@ slow_path: | |||
580 | */ | 573 | */ |
581 | 574 | ||
582 | if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { | 575 | if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { |
583 | NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); | 576 | NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); |
584 | err = -ENOMEM; | 577 | err = -ENOMEM; |
585 | goto fail; | 578 | goto fail; |
586 | } | 579 | } |
@@ -1329,12 +1322,7 @@ void __init ip_init(void) | |||
1329 | #endif | 1322 | #endif |
1330 | } | 1323 | } |
1331 | 1324 | ||
1332 | EXPORT_SYMBOL(ip_finish_output); | ||
1333 | EXPORT_SYMBOL(ip_fragment); | 1325 | EXPORT_SYMBOL(ip_fragment); |
1334 | EXPORT_SYMBOL(ip_generic_getfrag); | 1326 | EXPORT_SYMBOL(ip_generic_getfrag); |
1335 | EXPORT_SYMBOL(ip_queue_xmit); | 1327 | EXPORT_SYMBOL(ip_queue_xmit); |
1336 | EXPORT_SYMBOL(ip_send_check); | 1328 | EXPORT_SYMBOL(ip_send_check); |
1337 | |||
1338 | #ifdef CONFIG_SYSCTL | ||
1339 | EXPORT_SYMBOL(sysctl_ip_default_ttl); | ||
1340 | #endif | ||
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ff4bd067b397..2f0b47da5b37 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c | |||
@@ -153,7 +153,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) | |||
153 | switch (cmsg->cmsg_type) { | 153 | switch (cmsg->cmsg_type) { |
154 | case IP_RETOPTS: | 154 | case IP_RETOPTS: |
155 | err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); | 155 | err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); |
156 | err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0); | 156 | err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); |
157 | if (err) | 157 | if (err) |
158 | return err; | 158 | return err; |
159 | break; | 159 | break; |
@@ -425,7 +425,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
425 | struct ip_options * opt = NULL; | 425 | struct ip_options * opt = NULL; |
426 | if (optlen > 40 || optlen < 0) | 426 | if (optlen > 40 || optlen < 0) |
427 | goto e_inval; | 427 | goto e_inval; |
428 | err = ip_options_get(&opt, optval, optlen, 1); | 428 | err = ip_options_get_from_user(&opt, optval, optlen); |
429 | if (err) | 429 | if (err) |
430 | break; | 430 | break; |
431 | if (sk->sk_type == SOCK_STREAM) { | 431 | if (sk->sk_type == SOCK_STREAM) { |
@@ -614,7 +614,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
614 | } | 614 | } |
615 | case IP_MSFILTER: | 615 | case IP_MSFILTER: |
616 | { | 616 | { |
617 | extern int sysctl_optmem_max; | ||
618 | extern int sysctl_igmp_max_msf; | 617 | extern int sysctl_igmp_max_msf; |
619 | struct ip_msfilter *msf; | 618 | struct ip_msfilter *msf; |
620 | 619 | ||
@@ -769,7 +768,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
769 | } | 768 | } |
770 | case MCAST_MSFILTER: | 769 | case MCAST_MSFILTER: |
771 | { | 770 | { |
772 | extern int sysctl_optmem_max; | ||
773 | extern int sysctl_igmp_max_msf; | 771 | extern int sysctl_igmp_max_msf; |
774 | struct sockaddr_in *psin; | 772 | struct sockaddr_in *psin; |
775 | struct ip_msfilter *msf = NULL; | 773 | struct ip_msfilter *msf = NULL; |
@@ -1090,7 +1088,5 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
1090 | 1088 | ||
1091 | EXPORT_SYMBOL(ip_cmsg_recv); | 1089 | EXPORT_SYMBOL(ip_cmsg_recv); |
1092 | 1090 | ||
1093 | #ifdef CONFIG_IP_SCTP_MODULE | ||
1094 | EXPORT_SYMBOL(ip_getsockopt); | 1091 | EXPORT_SYMBOL(ip_getsockopt); |
1095 | EXPORT_SYMBOL(ip_setsockopt); | 1092 | EXPORT_SYMBOL(ip_setsockopt); |
1096 | #endif | ||
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 7ded6e60f43a..dcb7ee6c4858 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c | |||
@@ -214,8 +214,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) | |||
214 | spi, IPPROTO_COMP, AF_INET); | 214 | spi, IPPROTO_COMP, AF_INET); |
215 | if (!x) | 215 | if (!x) |
216 | return; | 216 | return; |
217 | NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", | 217 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", |
218 | spi, NIPQUAD(iph->daddr))); | 218 | spi, NIPQUAD(iph->daddr)); |
219 | xfrm_state_put(x); | 219 | xfrm_state_put(x); |
220 | } | 220 | } |
221 | 221 | ||
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index d2bf8e1930a3..63e106605f28 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c | |||
@@ -393,7 +393,7 @@ static int __init ic_defaults(void) | |||
393 | 393 | ||
394 | #ifdef IPCONFIG_RARP | 394 | #ifdef IPCONFIG_RARP |
395 | 395 | ||
396 | static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); | 396 | static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); |
397 | 397 | ||
398 | static struct packet_type rarp_packet_type __initdata = { | 398 | static struct packet_type rarp_packet_type __initdata = { |
399 | .type = __constant_htons(ETH_P_RARP), | 399 | .type = __constant_htons(ETH_P_RARP), |
@@ -414,7 +414,7 @@ static inline void ic_rarp_cleanup(void) | |||
414 | * Process received RARP packet. | 414 | * Process received RARP packet. |
415 | */ | 415 | */ |
416 | static int __init | 416 | static int __init |
417 | ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 417 | ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
418 | { | 418 | { |
419 | struct arphdr *rarp; | 419 | struct arphdr *rarp; |
420 | unsigned char *rarp_ptr; | 420 | unsigned char *rarp_ptr; |
@@ -555,7 +555,7 @@ struct bootp_pkt { /* BOOTP packet format */ | |||
555 | #define DHCPRELEASE 7 | 555 | #define DHCPRELEASE 7 |
556 | #define DHCPINFORM 8 | 556 | #define DHCPINFORM 8 |
557 | 557 | ||
558 | static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); | 558 | static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); |
559 | 559 | ||
560 | static struct packet_type bootp_packet_type __initdata = { | 560 | static struct packet_type bootp_packet_type __initdata = { |
561 | .type = __constant_htons(ETH_P_IP), | 561 | .type = __constant_htons(ETH_P_IP), |
@@ -823,7 +823,7 @@ static void __init ic_do_bootp_ext(u8 *ext) | |||
823 | /* | 823 | /* |
824 | * Receive BOOTP reply. | 824 | * Receive BOOTP reply. |
825 | */ | 825 | */ |
826 | static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 826 | static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
827 | { | 827 | { |
828 | struct bootp_pkt *b; | 828 | struct bootp_pkt *b; |
829 | struct iphdr *h; | 829 | struct iphdr *h; |
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index dc806b578427..9dbf5909f3a6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
@@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock); | |||
103 | In this case data path is free of exclusive locks at all. | 103 | In this case data path is free of exclusive locks at all. |
104 | */ | 104 | */ |
105 | 105 | ||
106 | static kmem_cache_t *mrt_cachep; | 106 | static kmem_cache_t *mrt_cachep __read_mostly; |
107 | 107 | ||
108 | static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); | 108 | static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); |
109 | static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); | 109 | static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); |
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index d9212addd193..6e092dadb388 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/in.h> | 26 | #include <linux/in.h> |
27 | #include <linux/ip.h> | 27 | #include <linux/ip.h> |
28 | #include <net/protocol.h> | 28 | #include <net/protocol.h> |
29 | #include <net/tcp.h> | ||
29 | #include <asm/system.h> | 30 | #include <asm/system.h> |
30 | #include <linux/stat.h> | 31 | #include <linux/stat.h> |
31 | #include <linux/proc_fs.h> | 32 | #include <linux/proc_fs.h> |
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index d0145a8b1551..e11952ea17af 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c | |||
@@ -40,7 +40,7 @@ | |||
40 | static struct list_head *ip_vs_conn_tab; | 40 | static struct list_head *ip_vs_conn_tab; |
41 | 41 | ||
42 | /* SLAB cache for IPVS connections */ | 42 | /* SLAB cache for IPVS connections */ |
43 | static kmem_cache_t *ip_vs_conn_cachep; | 43 | static kmem_cache_t *ip_vs_conn_cachep __read_mostly; |
44 | 44 | ||
45 | /* counter for current IPVS connections */ | 45 | /* counter for current IPVS connections */ |
46 | static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); | 46 | static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); |
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 5fb257dd07cb..3ac7eeca04ac 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c | |||
@@ -22,6 +22,7 @@ | |||
22 | * | 22 | * |
23 | * Changes: | 23 | * Changes: |
24 | * Paul `Rusty' Russell properly handle non-linear skbs | 24 | * Paul `Rusty' Russell properly handle non-linear skbs |
25 | * Harald Welte don't use nfcache | ||
25 | * | 26 | * |
26 | */ | 27 | */ |
27 | 28 | ||
@@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum, | |||
529 | const struct net_device *out, | 530 | const struct net_device *out, |
530 | int (*okfn)(struct sk_buff *)) | 531 | int (*okfn)(struct sk_buff *)) |
531 | { | 532 | { |
532 | if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY)) | 533 | if (!((*pskb)->ipvs_property)) |
533 | return NF_ACCEPT; | 534 | return NF_ACCEPT; |
534 | 535 | ||
535 | /* The packet was sent from IPVS, exit this chain */ | 536 | /* The packet was sent from IPVS, exit this chain */ |
@@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) | |||
701 | /* do the statistics and put it back */ | 702 | /* do the statistics and put it back */ |
702 | ip_vs_out_stats(cp, skb); | 703 | ip_vs_out_stats(cp, skb); |
703 | 704 | ||
704 | skb->nfcache |= NFC_IPVS_PROPERTY; | 705 | skb->ipvs_property = 1; |
705 | verdict = NF_ACCEPT; | 706 | verdict = NF_ACCEPT; |
706 | 707 | ||
707 | out: | 708 | out: |
@@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, | |||
739 | 740 | ||
740 | EnterFunction(11); | 741 | EnterFunction(11); |
741 | 742 | ||
742 | if (skb->nfcache & NFC_IPVS_PROPERTY) | 743 | if (skb->ipvs_property) |
743 | return NF_ACCEPT; | 744 | return NF_ACCEPT; |
744 | 745 | ||
745 | iph = skb->nh.iph; | 746 | iph = skb->nh.iph; |
@@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, | |||
821 | ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); | 822 | ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); |
822 | ip_vs_conn_put(cp); | 823 | ip_vs_conn_put(cp); |
823 | 824 | ||
824 | skb->nfcache |= NFC_IPVS_PROPERTY; | 825 | skb->ipvs_property = 1; |
825 | 826 | ||
826 | LeaveFunction(11); | 827 | LeaveFunction(11); |
827 | return NF_ACCEPT; | 828 | return NF_ACCEPT; |
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 7d99ede2ef79..2d66848e7aa0 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c | |||
@@ -1598,7 +1598,7 @@ static ctl_table vs_table[] = { | |||
1598 | { .ctl_name = 0 } | 1598 | { .ctl_name = 0 } |
1599 | }; | 1599 | }; |
1600 | 1600 | ||
1601 | static ctl_table ipv4_table[] = { | 1601 | static ctl_table ipvs_ipv4_table[] = { |
1602 | { | 1602 | { |
1603 | .ctl_name = NET_IPV4, | 1603 | .ctl_name = NET_IPV4, |
1604 | .procname = "ipv4", | 1604 | .procname = "ipv4", |
@@ -1613,7 +1613,7 @@ static ctl_table vs_root_table[] = { | |||
1613 | .ctl_name = CTL_NET, | 1613 | .ctl_name = CTL_NET, |
1614 | .procname = "net", | 1614 | .procname = "net", |
1615 | .mode = 0555, | 1615 | .mode = 0555, |
1616 | .child = ipv4_table, | 1616 | .child = ipvs_ipv4_table, |
1617 | }, | 1617 | }, |
1618 | { .ctl_name = 0 } | 1618 | { .ctl_name = 0 } |
1619 | }; | 1619 | }; |
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index c035838b780a..561cda326fa8 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c | |||
@@ -131,7 +131,7 @@ static ctl_table vs_table[] = { | |||
131 | { .ctl_name = 0 } | 131 | { .ctl_name = 0 } |
132 | }; | 132 | }; |
133 | 133 | ||
134 | static ctl_table ipv4_table[] = { | 134 | static ctl_table ipvs_ipv4_table[] = { |
135 | { | 135 | { |
136 | .ctl_name = NET_IPV4, | 136 | .ctl_name = NET_IPV4, |
137 | .procname = "ipv4", | 137 | .procname = "ipv4", |
@@ -146,7 +146,7 @@ static ctl_table lblc_root_table[] = { | |||
146 | .ctl_name = CTL_NET, | 146 | .ctl_name = CTL_NET, |
147 | .procname = "net", | 147 | .procname = "net", |
148 | .mode = 0555, | 148 | .mode = 0555, |
149 | .child = ipv4_table | 149 | .child = ipvs_ipv4_table |
150 | }, | 150 | }, |
151 | { .ctl_name = 0 } | 151 | { .ctl_name = 0 } |
152 | }; | 152 | }; |
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 22b5dd55d271..ce456dbf09a5 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c | |||
@@ -320,7 +320,7 @@ static ctl_table vs_table[] = { | |||
320 | { .ctl_name = 0 } | 320 | { .ctl_name = 0 } |
321 | }; | 321 | }; |
322 | 322 | ||
323 | static ctl_table ipv4_table[] = { | 323 | static ctl_table ipvs_ipv4_table[] = { |
324 | { | 324 | { |
325 | .ctl_name = NET_IPV4, | 325 | .ctl_name = NET_IPV4, |
326 | .procname = "ipv4", | 326 | .procname = "ipv4", |
@@ -335,7 +335,7 @@ static ctl_table lblcr_root_table[] = { | |||
335 | .ctl_name = CTL_NET, | 335 | .ctl_name = CTL_NET, |
336 | .procname = "net", | 336 | .procname = "net", |
337 | .mode = 0555, | 337 | .mode = 0555, |
338 | .child = ipv4_table | 338 | .child = ipvs_ipv4_table |
339 | }, | 339 | }, |
340 | { .ctl_name = 0 } | 340 | { .ctl_name = 0 } |
341 | }; | 341 | }; |
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index e65de675da74..c19408973c09 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c | |||
@@ -604,14 +604,14 @@ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) | |||
604 | } | 604 | } |
605 | 605 | ||
606 | 606 | ||
607 | static void tcp_init(struct ip_vs_protocol *pp) | 607 | static void ip_vs_tcp_init(struct ip_vs_protocol *pp) |
608 | { | 608 | { |
609 | IP_VS_INIT_HASH_TABLE(tcp_apps); | 609 | IP_VS_INIT_HASH_TABLE(tcp_apps); |
610 | pp->timeout_table = tcp_timeouts; | 610 | pp->timeout_table = tcp_timeouts; |
611 | } | 611 | } |
612 | 612 | ||
613 | 613 | ||
614 | static void tcp_exit(struct ip_vs_protocol *pp) | 614 | static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) |
615 | { | 615 | { |
616 | } | 616 | } |
617 | 617 | ||
@@ -621,8 +621,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { | |||
621 | .protocol = IPPROTO_TCP, | 621 | .protocol = IPPROTO_TCP, |
622 | .dont_defrag = 0, | 622 | .dont_defrag = 0, |
623 | .appcnt = ATOMIC_INIT(0), | 623 | .appcnt = ATOMIC_INIT(0), |
624 | .init = tcp_init, | 624 | .init = ip_vs_tcp_init, |
625 | .exit = tcp_exit, | 625 | .exit = ip_vs_tcp_exit, |
626 | .register_app = tcp_register_app, | 626 | .register_app = tcp_register_app, |
627 | .unregister_app = tcp_unregister_app, | 627 | .unregister_app = tcp_unregister_app, |
628 | .conn_schedule = tcp_conn_schedule, | 628 | .conn_schedule = tcp_conn_schedule, |
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index a8512a3fd08a..3b87482049cf 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c | |||
@@ -127,7 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) | |||
127 | 127 | ||
128 | #define IP_VS_XMIT(skb, rt) \ | 128 | #define IP_VS_XMIT(skb, rt) \ |
129 | do { \ | 129 | do { \ |
130 | (skb)->nfcache |= NFC_IPVS_PROPERTY; \ | 130 | (skb)->ipvs_property = 1; \ |
131 | (skb)->ip_summed = CHECKSUM_NONE; \ | 131 | (skb)->ip_summed = CHECKSUM_NONE; \ |
132 | NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ | 132 | NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ |
133 | (rt)->u.dst.dev, dst_output); \ | 133 | (rt)->u.dst.dev, dst_output); \ |
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c index c9cf8726051d..db67373f9b34 100644 --- a/net/ipv4/multipath_drr.c +++ b/net/ipv4/multipath_drr.c | |||
@@ -107,7 +107,7 @@ static int drr_dev_event(struct notifier_block *this, | |||
107 | return NOTIFY_DONE; | 107 | return NOTIFY_DONE; |
108 | } | 108 | } |
109 | 109 | ||
110 | struct notifier_block drr_dev_notifier = { | 110 | static struct notifier_block drr_dev_notifier = { |
111 | .notifier_call = drr_dev_event, | 111 | .notifier_call = drr_dev_event, |
112 | }; | 112 | }; |
113 | 113 | ||
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c new file mode 100644 index 000000000000..ae0779d82c5d --- /dev/null +++ b/net/ipv4/netfilter.c | |||
@@ -0,0 +1,139 @@ | |||
1 | /* IPv4 specific functions of netfilter core */ | ||
2 | |||
3 | #include <linux/config.h> | ||
4 | #ifdef CONFIG_NETFILTER | ||
5 | |||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/netfilter.h> | ||
8 | #include <linux/netfilter_ipv4.h> | ||
9 | |||
10 | #include <linux/tcp.h> | ||
11 | #include <linux/udp.h> | ||
12 | #include <linux/icmp.h> | ||
13 | #include <net/route.h> | ||
14 | #include <linux/ip.h> | ||
15 | |||
16 | /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ | ||
17 | int ip_route_me_harder(struct sk_buff **pskb) | ||
18 | { | ||
19 | struct iphdr *iph = (*pskb)->nh.iph; | ||
20 | struct rtable *rt; | ||
21 | struct flowi fl = {}; | ||
22 | struct dst_entry *odst; | ||
23 | unsigned int hh_len; | ||
24 | |||
25 | /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause | ||
26 | * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. | ||
27 | */ | ||
28 | if (inet_addr_type(iph->saddr) == RTN_LOCAL) { | ||
29 | fl.nl_u.ip4_u.daddr = iph->daddr; | ||
30 | fl.nl_u.ip4_u.saddr = iph->saddr; | ||
31 | fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); | ||
32 | fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0; | ||
33 | #ifdef CONFIG_IP_ROUTE_FWMARK | ||
34 | fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; | ||
35 | #endif | ||
36 | fl.proto = iph->protocol; | ||
37 | if (ip_route_output_key(&rt, &fl) != 0) | ||
38 | return -1; | ||
39 | |||
40 | /* Drop old route. */ | ||
41 | dst_release((*pskb)->dst); | ||
42 | (*pskb)->dst = &rt->u.dst; | ||
43 | } else { | ||
44 | /* non-local src, find valid iif to satisfy | ||
45 | * rp-filter when calling ip_route_input. */ | ||
46 | fl.nl_u.ip4_u.daddr = iph->saddr; | ||
47 | if (ip_route_output_key(&rt, &fl) != 0) | ||
48 | return -1; | ||
49 | |||
50 | odst = (*pskb)->dst; | ||
51 | if (ip_route_input(*pskb, iph->daddr, iph->saddr, | ||
52 | RT_TOS(iph->tos), rt->u.dst.dev) != 0) { | ||
53 | dst_release(&rt->u.dst); | ||
54 | return -1; | ||
55 | } | ||
56 | dst_release(&rt->u.dst); | ||
57 | dst_release(odst); | ||
58 | } | ||
59 | |||
60 | if ((*pskb)->dst->error) | ||
61 | return -1; | ||
62 | |||
63 | /* Change in oif may mean change in hh_len. */ | ||
64 | hh_len = (*pskb)->dst->dev->hard_header_len; | ||
65 | if (skb_headroom(*pskb) < hh_len) { | ||
66 | struct sk_buff *nskb; | ||
67 | |||
68 | nskb = skb_realloc_headroom(*pskb, hh_len); | ||
69 | if (!nskb) | ||
70 | return -1; | ||
71 | if ((*pskb)->sk) | ||
72 | skb_set_owner_w(nskb, (*pskb)->sk); | ||
73 | kfree_skb(*pskb); | ||
74 | *pskb = nskb; | ||
75 | } | ||
76 | |||
77 | return 0; | ||
78 | } | ||
79 | EXPORT_SYMBOL(ip_route_me_harder); | ||
80 | |||
81 | /* | ||
82 | * Extra routing may be needed on local out, as the QUEUE target never | ||
83 | * returns control to the table. | ||
84 | */ | ||
85 | |||
86 | struct ip_rt_info { | ||
87 | u_int32_t daddr; | ||
88 | u_int32_t saddr; | ||
89 | u_int8_t tos; | ||
90 | }; | ||
91 | |||
92 | static void queue_save(const struct sk_buff *skb, struct nf_info *info) | ||
93 | { | ||
94 | struct ip_rt_info *rt_info = nf_info_reroute(info); | ||
95 | |||
96 | if (info->hook == NF_IP_LOCAL_OUT) { | ||
97 | const struct iphdr *iph = skb->nh.iph; | ||
98 | |||
99 | rt_info->tos = iph->tos; | ||
100 | rt_info->daddr = iph->daddr; | ||
101 | rt_info->saddr = iph->saddr; | ||
102 | } | ||
103 | } | ||
104 | |||
105 | static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info) | ||
106 | { | ||
107 | const struct ip_rt_info *rt_info = nf_info_reroute(info); | ||
108 | |||
109 | if (info->hook == NF_IP_LOCAL_OUT) { | ||
110 | struct iphdr *iph = (*pskb)->nh.iph; | ||
111 | |||
112 | if (!(iph->tos == rt_info->tos | ||
113 | && iph->daddr == rt_info->daddr | ||
114 | && iph->saddr == rt_info->saddr)) | ||
115 | return ip_route_me_harder(pskb); | ||
116 | } | ||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | static struct nf_queue_rerouter ip_reroute = { | ||
121 | .rer_size = sizeof(struct ip_rt_info), | ||
122 | .save = queue_save, | ||
123 | .reroute = queue_reroute, | ||
124 | }; | ||
125 | |||
126 | static int init(void) | ||
127 | { | ||
128 | return nf_register_queue_rerouter(PF_INET, &ip_reroute); | ||
129 | } | ||
130 | |||
131 | static void fini(void) | ||
132 | { | ||
133 | nf_unregister_queue_rerouter(PF_INET); | ||
134 | } | ||
135 | |||
136 | module_init(init); | ||
137 | module_exit(fini); | ||
138 | |||
139 | #endif /* CONFIG_NETFILTER */ | ||
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 46d4cb1c06f0..e046f5521814 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig | |||
@@ -40,6 +40,16 @@ config IP_NF_CONNTRACK_MARK | |||
40 | of packets, but this mark value is kept in the conntrack session | 40 | of packets, but this mark value is kept in the conntrack session |
41 | instead of the individual packets. | 41 | instead of the individual packets. |
42 | 42 | ||
43 | config IP_NF_CONNTRACK_EVENTS | ||
44 | bool "Connection tracking events" | ||
45 | depends on IP_NF_CONNTRACK | ||
46 | help | ||
47 | If this option is enabled, the connection tracking code will | ||
48 | provide a notifier chain that can be used by other kernel code | ||
49 | to get notified about changes in the connection tracking state. | ||
50 | |||
51 | If unsure, say `N'. | ||
52 | |||
43 | config IP_NF_CT_PROTO_SCTP | 53 | config IP_NF_CT_PROTO_SCTP |
44 | tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' | 54 | tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' |
45 | depends on IP_NF_CONNTRACK && EXPERIMENTAL | 55 | depends on IP_NF_CONNTRACK && EXPERIMENTAL |
@@ -100,11 +110,15 @@ config IP_NF_AMANDA | |||
100 | To compile it as a module, choose M here. If unsure, say Y. | 110 | To compile it as a module, choose M here. If unsure, say Y. |
101 | 111 | ||
102 | config IP_NF_QUEUE | 112 | config IP_NF_QUEUE |
103 | tristate "Userspace queueing via NETLINK" | 113 | tristate "IP Userspace queueing via NETLINK (OBSOLETE)" |
104 | help | 114 | help |
105 | Netfilter has the ability to queue packets to user space: the | 115 | Netfilter has the ability to queue packets to user space: the |
106 | netlink device can be used to access them using this driver. | 116 | netlink device can be used to access them using this driver. |
107 | 117 | ||
118 | This option enables the old IPv4-only "ip_queue" implementation | ||
119 | which has been obsoleted by the new "nfnetlink_queue" code (see | ||
120 | CONFIG_NETFILTER_NETLINK_QUEUE). | ||
121 | |||
108 | To compile it as a module, choose M here. If unsure, say N. | 122 | To compile it as a module, choose M here. If unsure, say N. |
109 | 123 | ||
110 | config IP_NF_IPTABLES | 124 | config IP_NF_IPTABLES |
@@ -340,6 +354,17 @@ config IP_NF_MATCH_SCTP | |||
340 | If you want to compile it as a module, say M here and read | 354 | If you want to compile it as a module, say M here and read |
341 | <file:Documentation/modules.txt>. If unsure, say `N'. | 355 | <file:Documentation/modules.txt>. If unsure, say `N'. |
342 | 356 | ||
357 | config IP_NF_MATCH_DCCP | ||
358 | tristate 'DCCP protocol match support' | ||
359 | depends on IP_NF_IPTABLES | ||
360 | help | ||
361 | With this option enabled, you will be able to use the iptables | ||
362 | `dccp' match in order to match on DCCP source/destination ports | ||
363 | and DCCP flags. | ||
364 | |||
365 | If you want to compile it as a module, say M here and read | ||
366 | <file:Documentation/modules.txt>. If unsure, say `N'. | ||
367 | |||
343 | config IP_NF_MATCH_COMMENT | 368 | config IP_NF_MATCH_COMMENT |
344 | tristate 'comment match support' | 369 | tristate 'comment match support' |
345 | depends on IP_NF_IPTABLES | 370 | depends on IP_NF_IPTABLES |
@@ -361,6 +386,16 @@ config IP_NF_MATCH_CONNMARK | |||
361 | <file:Documentation/modules.txt>. The module will be called | 386 | <file:Documentation/modules.txt>. The module will be called |
362 | ipt_connmark.o. If unsure, say `N'. | 387 | ipt_connmark.o. If unsure, say `N'. |
363 | 388 | ||
389 | config IP_NF_MATCH_CONNBYTES | ||
390 | tristate 'Connection byte/packet counter match support' | ||
391 | depends on IP_NF_CT_ACCT && IP_NF_IPTABLES | ||
392 | help | ||
393 | This option adds a `connbytes' match, which allows you to match the | ||
394 | number of bytes and/or packets for each direction within a connection. | ||
395 | |||
396 | If you want to compile it as a module, say M here and read | ||
397 | <file:Documentation/modules.txt>. If unsure, say `N'. | ||
398 | |||
364 | config IP_NF_MATCH_HASHLIMIT | 399 | config IP_NF_MATCH_HASHLIMIT |
365 | tristate 'hashlimit match support' | 400 | tristate 'hashlimit match support' |
366 | depends on IP_NF_IPTABLES | 401 | depends on IP_NF_IPTABLES |
@@ -375,6 +410,19 @@ config IP_NF_MATCH_HASHLIMIT | |||
375 | destination IP' or `500pps from any given source IP' with a single | 410 | destination IP' or `500pps from any given source IP' with a single |
376 | IPtables rule. | 411 | IPtables rule. |
377 | 412 | ||
413 | config IP_NF_MATCH_STRING | ||
414 | tristate 'string match support' | ||
415 | depends on IP_NF_IPTABLES | ||
416 | select TEXTSEARCH | ||
417 | select TEXTSEARCH_KMP | ||
418 | select TEXTSEARCH_BM | ||
419 | select TEXTSEARCH_FSM | ||
420 | help | ||
421 | This option adds a `string' match, which allows you to look for | ||
422 | pattern matchings in packets. | ||
423 | |||
424 | To compile it as a module, choose M here. If unsure, say N. | ||
425 | |||
378 | # `filter', generic and specific targets | 426 | # `filter', generic and specific targets |
379 | config IP_NF_FILTER | 427 | config IP_NF_FILTER |
380 | tristate "Packet filtering" | 428 | tristate "Packet filtering" |
@@ -616,6 +664,20 @@ config IP_NF_TARGET_CLASSIFY | |||
616 | 664 | ||
617 | To compile it as a module, choose M here. If unsure, say N. | 665 | To compile it as a module, choose M here. If unsure, say N. |
618 | 666 | ||
667 | config IP_NF_TARGET_TTL | ||
668 | tristate 'TTL target support' | ||
669 | depends on IP_NF_MANGLE | ||
670 | help | ||
671 | This option adds a `TTL' target, which enables the user to modify | ||
672 | the TTL value of the IP header. | ||
673 | |||
674 | While it is safe to decrement/lower the TTL, this target also enables | ||
675 | functionality to increment and set the TTL value of the IP header to | ||
676 | arbitrary values. This is EXTREMELY DANGEROUS since you can easily | ||
677 | create immortal packets that loop forever on the network. | ||
678 | |||
679 | To compile it as a module, choose M here. If unsure, say N. | ||
680 | |||
619 | config IP_NF_TARGET_CONNMARK | 681 | config IP_NF_TARGET_CONNMARK |
620 | tristate 'CONNMARK target support' | 682 | tristate 'CONNMARK target support' |
621 | depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE | 683 | depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE |
@@ -692,5 +754,11 @@ config IP_NF_ARP_MANGLE | |||
692 | Allows altering the ARP packet payload: source and destination | 754 | Allows altering the ARP packet payload: source and destination |
693 | hardware and network addresses. | 755 | hardware and network addresses. |
694 | 756 | ||
757 | config IP_NF_CONNTRACK_NETLINK | ||
758 | tristate 'Connection tracking netlink interface' | ||
759 | depends on IP_NF_CONNTRACK && NETFILTER_NETLINK | ||
760 | help | ||
761 | This option enables support for a netlink-based userspace interface. | ||
762 | |||
695 | endmenu | 763 | endmenu |
696 | 764 | ||
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 45796d5924dd..a7bd38f50522 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile | |||
@@ -9,6 +9,10 @@ iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helpe | |||
9 | # connection tracking | 9 | # connection tracking |
10 | obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o | 10 | obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o |
11 | 11 | ||
12 | # conntrack netlink interface | ||
13 | obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o | ||
14 | |||
15 | |||
12 | # SCTP protocol connection tracking | 16 | # SCTP protocol connection tracking |
13 | obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o | 17 | obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o |
14 | 18 | ||
@@ -38,6 +42,7 @@ obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o | |||
38 | obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o | 42 | obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o |
39 | obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o | 43 | obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o |
40 | obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o | 44 | obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o |
45 | obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o | ||
41 | obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o | 46 | obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o |
42 | obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o | 47 | obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o |
43 | obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o | 48 | obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o |
@@ -54,11 +59,13 @@ obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o | |||
54 | obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o | 59 | obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o |
55 | obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o | 60 | obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o |
56 | obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o | 61 | obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o |
62 | obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o | ||
57 | obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o | 63 | obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o |
58 | obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o | 64 | obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o |
59 | obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o | 65 | obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o |
60 | obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o | 66 | obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o |
61 | obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o | 67 | obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o |
68 | obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o | ||
62 | 69 | ||
63 | # targets | 70 | # targets |
64 | obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o | 71 | obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o |
@@ -78,6 +85,7 @@ obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o | |||
78 | obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o | 85 | obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o |
79 | obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o | 86 | obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o |
80 | obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o | 87 | obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o |
88 | obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o | ||
81 | 89 | ||
82 | # generic ARP tables | 90 | # generic ARP tables |
83 | obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o | 91 | obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o |
@@ -87,3 +95,4 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o | |||
87 | obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o | 95 | obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o |
88 | 96 | ||
89 | obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o | 97 | obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o |
98 | obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ipt_NFQUEUE.o | ||
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index 01e1b58322a9..be4c9eb3243f 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c | |||
@@ -40,7 +40,7 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); | |||
40 | static char *conns[] = { "DATA ", "MESG ", "INDEX " }; | 40 | static char *conns[] = { "DATA ", "MESG ", "INDEX " }; |
41 | 41 | ||
42 | /* This is slow, but it's simple. --RR */ | 42 | /* This is slow, but it's simple. --RR */ |
43 | static char amanda_buffer[65536]; | 43 | static char *amanda_buffer; |
44 | static DEFINE_SPINLOCK(amanda_buffer_lock); | 44 | static DEFINE_SPINLOCK(amanda_buffer_lock); |
45 | 45 | ||
46 | unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, | 46 | unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, |
@@ -153,11 +153,25 @@ static struct ip_conntrack_helper amanda_helper = { | |||
153 | static void __exit fini(void) | 153 | static void __exit fini(void) |
154 | { | 154 | { |
155 | ip_conntrack_helper_unregister(&amanda_helper); | 155 | ip_conntrack_helper_unregister(&amanda_helper); |
156 | kfree(amanda_buffer); | ||
156 | } | 157 | } |
157 | 158 | ||
158 | static int __init init(void) | 159 | static int __init init(void) |
159 | { | 160 | { |
160 | return ip_conntrack_helper_register(&amanda_helper); | 161 | int ret; |
162 | |||
163 | amanda_buffer = kmalloc(65536, GFP_KERNEL); | ||
164 | if (!amanda_buffer) | ||
165 | return -ENOMEM; | ||
166 | |||
167 | ret = ip_conntrack_helper_register(&amanda_helper); | ||
168 | if (ret < 0) { | ||
169 | kfree(amanda_buffer); | ||
170 | return ret; | ||
171 | } | ||
172 | return 0; | ||
173 | |||
174 | |||
161 | } | 175 | } |
162 | 176 | ||
163 | module_init(init); | 177 | module_init(init); |
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index a7f0c821a9b2..a0648600190e 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/err.h> | 37 | #include <linux/err.h> |
38 | #include <linux/percpu.h> | 38 | #include <linux/percpu.h> |
39 | #include <linux/moduleparam.h> | 39 | #include <linux/moduleparam.h> |
40 | #include <linux/notifier.h> | ||
40 | 41 | ||
41 | /* ip_conntrack_lock protects the main hash table, protocol/helper/expected | 42 | /* ip_conntrack_lock protects the main hash table, protocol/helper/expected |
42 | registrations, conntrack timers*/ | 43 | registrations, conntrack timers*/ |
@@ -49,7 +50,7 @@ | |||
49 | #include <linux/netfilter_ipv4/ip_conntrack_core.h> | 50 | #include <linux/netfilter_ipv4/ip_conntrack_core.h> |
50 | #include <linux/netfilter_ipv4/listhelp.h> | 51 | #include <linux/netfilter_ipv4/listhelp.h> |
51 | 52 | ||
52 | #define IP_CONNTRACK_VERSION "2.1" | 53 | #define IP_CONNTRACK_VERSION "2.3" |
53 | 54 | ||
54 | #if 0 | 55 | #if 0 |
55 | #define DEBUGP printk | 56 | #define DEBUGP printk |
@@ -69,22 +70,81 @@ static LIST_HEAD(helpers); | |||
69 | unsigned int ip_conntrack_htable_size = 0; | 70 | unsigned int ip_conntrack_htable_size = 0; |
70 | int ip_conntrack_max; | 71 | int ip_conntrack_max; |
71 | struct list_head *ip_conntrack_hash; | 72 | struct list_head *ip_conntrack_hash; |
72 | static kmem_cache_t *ip_conntrack_cachep; | 73 | static kmem_cache_t *ip_conntrack_cachep __read_mostly; |
73 | static kmem_cache_t *ip_conntrack_expect_cachep; | 74 | static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly; |
74 | struct ip_conntrack ip_conntrack_untracked; | 75 | struct ip_conntrack ip_conntrack_untracked; |
75 | unsigned int ip_ct_log_invalid; | 76 | unsigned int ip_ct_log_invalid; |
76 | static LIST_HEAD(unconfirmed); | 77 | static LIST_HEAD(unconfirmed); |
77 | static int ip_conntrack_vmalloc; | 78 | static int ip_conntrack_vmalloc; |
78 | 79 | ||
79 | DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); | 80 | static unsigned int ip_conntrack_next_id = 1; |
81 | static unsigned int ip_conntrack_expect_next_id = 1; | ||
82 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
83 | struct notifier_block *ip_conntrack_chain; | ||
84 | struct notifier_block *ip_conntrack_expect_chain; | ||
85 | |||
86 | DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); | ||
80 | 87 | ||
81 | void | 88 | /* deliver cached events and clear cache entry - must be called with locally |
82 | ip_conntrack_put(struct ip_conntrack *ct) | 89 | * disabled softirqs */ |
90 | static inline void | ||
91 | __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) | ||
83 | { | 92 | { |
84 | IP_NF_ASSERT(ct); | 93 | DEBUGP("ecache: delivering events for %p\n", ecache->ct); |
85 | nf_conntrack_put(&ct->ct_general); | 94 | if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events) |
95 | notifier_call_chain(&ip_conntrack_chain, ecache->events, | ||
96 | ecache->ct); | ||
97 | ecache->events = 0; | ||
98 | ip_conntrack_put(ecache->ct); | ||
99 | ecache->ct = NULL; | ||
86 | } | 100 | } |
87 | 101 | ||
102 | /* Deliver all cached events for a particular conntrack. This is called | ||
103 | * by code prior to async packet handling or freeing the skb */ | ||
104 | void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) | ||
105 | { | ||
106 | struct ip_conntrack_ecache *ecache; | ||
107 | |||
108 | local_bh_disable(); | ||
109 | ecache = &__get_cpu_var(ip_conntrack_ecache); | ||
110 | if (ecache->ct == ct) | ||
111 | __ip_ct_deliver_cached_events(ecache); | ||
112 | local_bh_enable(); | ||
113 | } | ||
114 | |||
115 | void __ip_ct_event_cache_init(struct ip_conntrack *ct) | ||
116 | { | ||
117 | struct ip_conntrack_ecache *ecache; | ||
118 | |||
119 | /* take care of delivering potentially old events */ | ||
120 | ecache = &__get_cpu_var(ip_conntrack_ecache); | ||
121 | BUG_ON(ecache->ct == ct); | ||
122 | if (ecache->ct) | ||
123 | __ip_ct_deliver_cached_events(ecache); | ||
124 | /* initialize for this conntrack/packet */ | ||
125 | ecache->ct = ct; | ||
126 | nf_conntrack_get(&ct->ct_general); | ||
127 | } | ||
128 | |||
129 | /* flush the event cache - touches other CPU's data and must not be called while | ||
130 | * packets are still passing through the code */ | ||
131 | static void ip_ct_event_cache_flush(void) | ||
132 | { | ||
133 | struct ip_conntrack_ecache *ecache; | ||
134 | int cpu; | ||
135 | |||
136 | for_each_cpu(cpu) { | ||
137 | ecache = &per_cpu(ip_conntrack_ecache, cpu); | ||
138 | if (ecache->ct) | ||
139 | ip_conntrack_put(ecache->ct); | ||
140 | } | ||
141 | } | ||
142 | #else | ||
143 | static inline void ip_ct_event_cache_flush(void) {} | ||
144 | #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ | ||
145 | |||
146 | DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); | ||
147 | |||
88 | static int ip_conntrack_hash_rnd_initted; | 148 | static int ip_conntrack_hash_rnd_initted; |
89 | static unsigned int ip_conntrack_hash_rnd; | 149 | static unsigned int ip_conntrack_hash_rnd; |
90 | 150 | ||
@@ -144,6 +204,13 @@ static void unlink_expect(struct ip_conntrack_expect *exp) | |||
144 | list_del(&exp->list); | 204 | list_del(&exp->list); |
145 | CONNTRACK_STAT_INC(expect_delete); | 205 | CONNTRACK_STAT_INC(expect_delete); |
146 | exp->master->expecting--; | 206 | exp->master->expecting--; |
207 | ip_conntrack_expect_put(exp); | ||
208 | } | ||
209 | |||
210 | void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp) | ||
211 | { | ||
212 | unlink_expect(exp); | ||
213 | ip_conntrack_expect_put(exp); | ||
147 | } | 214 | } |
148 | 215 | ||
149 | static void expectation_timed_out(unsigned long ul_expect) | 216 | static void expectation_timed_out(unsigned long ul_expect) |
@@ -156,6 +223,33 @@ static void expectation_timed_out(unsigned long ul_expect) | |||
156 | ip_conntrack_expect_put(exp); | 223 | ip_conntrack_expect_put(exp); |
157 | } | 224 | } |
158 | 225 | ||
226 | struct ip_conntrack_expect * | ||
227 | __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple) | ||
228 | { | ||
229 | struct ip_conntrack_expect *i; | ||
230 | |||
231 | list_for_each_entry(i, &ip_conntrack_expect_list, list) { | ||
232 | if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { | ||
233 | atomic_inc(&i->use); | ||
234 | return i; | ||
235 | } | ||
236 | } | ||
237 | return NULL; | ||
238 | } | ||
239 | |||
240 | /* Just find a expectation corresponding to a tuple. */ | ||
241 | struct ip_conntrack_expect * | ||
242 | ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple) | ||
243 | { | ||
244 | struct ip_conntrack_expect *i; | ||
245 | |||
246 | read_lock_bh(&ip_conntrack_lock); | ||
247 | i = __ip_conntrack_expect_find(tuple); | ||
248 | read_unlock_bh(&ip_conntrack_lock); | ||
249 | |||
250 | return i; | ||
251 | } | ||
252 | |||
159 | /* If an expectation for this connection is found, it gets delete from | 253 | /* If an expectation for this connection is found, it gets delete from |
160 | * global list then returned. */ | 254 | * global list then returned. */ |
161 | static struct ip_conntrack_expect * | 255 | static struct ip_conntrack_expect * |
@@ -180,7 +274,7 @@ find_expectation(const struct ip_conntrack_tuple *tuple) | |||
180 | } | 274 | } |
181 | 275 | ||
182 | /* delete all expectations for this conntrack */ | 276 | /* delete all expectations for this conntrack */ |
183 | static void remove_expectations(struct ip_conntrack *ct) | 277 | void ip_ct_remove_expectations(struct ip_conntrack *ct) |
184 | { | 278 | { |
185 | struct ip_conntrack_expect *i, *tmp; | 279 | struct ip_conntrack_expect *i, *tmp; |
186 | 280 | ||
@@ -210,7 +304,7 @@ clean_from_lists(struct ip_conntrack *ct) | |||
210 | LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); | 304 | LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); |
211 | 305 | ||
212 | /* Destroy all pending expectations */ | 306 | /* Destroy all pending expectations */ |
213 | remove_expectations(ct); | 307 | ip_ct_remove_expectations(ct); |
214 | } | 308 | } |
215 | 309 | ||
216 | static void | 310 | static void |
@@ -223,10 +317,13 @@ destroy_conntrack(struct nf_conntrack *nfct) | |||
223 | IP_NF_ASSERT(atomic_read(&nfct->use) == 0); | 317 | IP_NF_ASSERT(atomic_read(&nfct->use) == 0); |
224 | IP_NF_ASSERT(!timer_pending(&ct->timeout)); | 318 | IP_NF_ASSERT(!timer_pending(&ct->timeout)); |
225 | 319 | ||
320 | ip_conntrack_event(IPCT_DESTROY, ct); | ||
321 | set_bit(IPS_DYING_BIT, &ct->status); | ||
322 | |||
226 | /* To make sure we don't get any weird locking issues here: | 323 | /* To make sure we don't get any weird locking issues here: |
227 | * destroy_conntrack() MUST NOT be called with a write lock | 324 | * destroy_conntrack() MUST NOT be called with a write lock |
228 | * to ip_conntrack_lock!!! -HW */ | 325 | * to ip_conntrack_lock!!! -HW */ |
229 | proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); | 326 | proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); |
230 | if (proto && proto->destroy) | 327 | if (proto && proto->destroy) |
231 | proto->destroy(ct); | 328 | proto->destroy(ct); |
232 | 329 | ||
@@ -238,7 +335,7 @@ destroy_conntrack(struct nf_conntrack *nfct) | |||
238 | * except TFTP can create an expectation on the first packet, | 335 | * except TFTP can create an expectation on the first packet, |
239 | * before connection is in the list, so we need to clean here, | 336 | * before connection is in the list, so we need to clean here, |
240 | * too. */ | 337 | * too. */ |
241 | remove_expectations(ct); | 338 | ip_ct_remove_expectations(ct); |
242 | 339 | ||
243 | /* We overload first tuple to link into unconfirmed list. */ | 340 | /* We overload first tuple to link into unconfirmed list. */ |
244 | if (!is_confirmed(ct)) { | 341 | if (!is_confirmed(ct)) { |
@@ -253,8 +350,7 @@ destroy_conntrack(struct nf_conntrack *nfct) | |||
253 | ip_conntrack_put(ct->master); | 350 | ip_conntrack_put(ct->master); |
254 | 351 | ||
255 | DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); | 352 | DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); |
256 | kmem_cache_free(ip_conntrack_cachep, ct); | 353 | ip_conntrack_free(ct); |
257 | atomic_dec(&ip_conntrack_count); | ||
258 | } | 354 | } |
259 | 355 | ||
260 | static void death_by_timeout(unsigned long ul_conntrack) | 356 | static void death_by_timeout(unsigned long ul_conntrack) |
@@ -280,7 +376,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, | |||
280 | && ip_ct_tuple_equal(tuple, &i->tuple); | 376 | && ip_ct_tuple_equal(tuple, &i->tuple); |
281 | } | 377 | } |
282 | 378 | ||
283 | static struct ip_conntrack_tuple_hash * | 379 | struct ip_conntrack_tuple_hash * |
284 | __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, | 380 | __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, |
285 | const struct ip_conntrack *ignored_conntrack) | 381 | const struct ip_conntrack *ignored_conntrack) |
286 | { | 382 | { |
@@ -315,6 +411,29 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, | |||
315 | return h; | 411 | return h; |
316 | } | 412 | } |
317 | 413 | ||
414 | static void __ip_conntrack_hash_insert(struct ip_conntrack *ct, | ||
415 | unsigned int hash, | ||
416 | unsigned int repl_hash) | ||
417 | { | ||
418 | ct->id = ++ip_conntrack_next_id; | ||
419 | list_prepend(&ip_conntrack_hash[hash], | ||
420 | &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); | ||
421 | list_prepend(&ip_conntrack_hash[repl_hash], | ||
422 | &ct->tuplehash[IP_CT_DIR_REPLY].list); | ||
423 | } | ||
424 | |||
425 | void ip_conntrack_hash_insert(struct ip_conntrack *ct) | ||
426 | { | ||
427 | unsigned int hash, repl_hash; | ||
428 | |||
429 | hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); | ||
430 | repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); | ||
431 | |||
432 | write_lock_bh(&ip_conntrack_lock); | ||
433 | __ip_conntrack_hash_insert(ct, hash, repl_hash); | ||
434 | write_unlock_bh(&ip_conntrack_lock); | ||
435 | } | ||
436 | |||
318 | /* Confirm a connection given skb; places it in hash table */ | 437 | /* Confirm a connection given skb; places it in hash table */ |
319 | int | 438 | int |
320 | __ip_conntrack_confirm(struct sk_buff **pskb) | 439 | __ip_conntrack_confirm(struct sk_buff **pskb) |
@@ -361,10 +480,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb) | |||
361 | /* Remove from unconfirmed list */ | 480 | /* Remove from unconfirmed list */ |
362 | list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); | 481 | list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); |
363 | 482 | ||
364 | list_prepend(&ip_conntrack_hash[hash], | 483 | __ip_conntrack_hash_insert(ct, hash, repl_hash); |
365 | &ct->tuplehash[IP_CT_DIR_ORIGINAL]); | ||
366 | list_prepend(&ip_conntrack_hash[repl_hash], | ||
367 | &ct->tuplehash[IP_CT_DIR_REPLY]); | ||
368 | /* Timer relative to confirmation time, not original | 484 | /* Timer relative to confirmation time, not original |
369 | setting time, otherwise we'd get timer wrap in | 485 | setting time, otherwise we'd get timer wrap in |
370 | weird delay cases. */ | 486 | weird delay cases. */ |
@@ -374,6 +490,16 @@ __ip_conntrack_confirm(struct sk_buff **pskb) | |||
374 | set_bit(IPS_CONFIRMED_BIT, &ct->status); | 490 | set_bit(IPS_CONFIRMED_BIT, &ct->status); |
375 | CONNTRACK_STAT_INC(insert); | 491 | CONNTRACK_STAT_INC(insert); |
376 | write_unlock_bh(&ip_conntrack_lock); | 492 | write_unlock_bh(&ip_conntrack_lock); |
493 | if (ct->helper) | ||
494 | ip_conntrack_event_cache(IPCT_HELPER, *pskb); | ||
495 | #ifdef CONFIG_IP_NF_NAT_NEEDED | ||
496 | if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || | ||
497 | test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) | ||
498 | ip_conntrack_event_cache(IPCT_NATINFO, *pskb); | ||
499 | #endif | ||
500 | ip_conntrack_event_cache(master_ct(ct) ? | ||
501 | IPCT_RELATED : IPCT_NEW, *pskb); | ||
502 | |||
377 | return NF_ACCEPT; | 503 | return NF_ACCEPT; |
378 | } | 504 | } |
379 | 505 | ||
@@ -438,34 +564,84 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i, | |||
438 | return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); | 564 | return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); |
439 | } | 565 | } |
440 | 566 | ||
441 | static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) | 567 | static struct ip_conntrack_helper * |
568 | __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) | ||
442 | { | 569 | { |
443 | return LIST_FIND(&helpers, helper_cmp, | 570 | return LIST_FIND(&helpers, helper_cmp, |
444 | struct ip_conntrack_helper *, | 571 | struct ip_conntrack_helper *, |
445 | tuple); | 572 | tuple); |
446 | } | 573 | } |
447 | 574 | ||
448 | /* Allocate a new conntrack: we return -ENOMEM if classification | 575 | struct ip_conntrack_helper * |
449 | failed due to stress. Otherwise it really is unclassifiable. */ | 576 | ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple) |
450 | static struct ip_conntrack_tuple_hash * | 577 | { |
451 | init_conntrack(const struct ip_conntrack_tuple *tuple, | 578 | struct ip_conntrack_helper *helper; |
452 | struct ip_conntrack_protocol *protocol, | 579 | |
453 | struct sk_buff *skb) | 580 | /* need ip_conntrack_lock to assure that helper exists until |
581 | * try_module_get() is called */ | ||
582 | read_lock_bh(&ip_conntrack_lock); | ||
583 | |||
584 | helper = __ip_conntrack_helper_find(tuple); | ||
585 | if (helper) { | ||
586 | /* need to increase module usage count to assure helper will | ||
587 | * not go away while the caller is e.g. busy putting a | ||
588 | * conntrack in the hash that uses the helper */ | ||
589 | if (!try_module_get(helper->me)) | ||
590 | helper = NULL; | ||
591 | } | ||
592 | |||
593 | read_unlock_bh(&ip_conntrack_lock); | ||
594 | |||
595 | return helper; | ||
596 | } | ||
597 | |||
598 | void ip_conntrack_helper_put(struct ip_conntrack_helper *helper) | ||
599 | { | ||
600 | module_put(helper->me); | ||
601 | } | ||
602 | |||
603 | struct ip_conntrack_protocol * | ||
604 | __ip_conntrack_proto_find(u_int8_t protocol) | ||
605 | { | ||
606 | return ip_ct_protos[protocol]; | ||
607 | } | ||
608 | |||
609 | /* this is guaranteed to always return a valid protocol helper, since | ||
610 | * it falls back to generic_protocol */ | ||
611 | struct ip_conntrack_protocol * | ||
612 | ip_conntrack_proto_find_get(u_int8_t protocol) | ||
613 | { | ||
614 | struct ip_conntrack_protocol *p; | ||
615 | |||
616 | preempt_disable(); | ||
617 | p = __ip_conntrack_proto_find(protocol); | ||
618 | if (p) { | ||
619 | if (!try_module_get(p->me)) | ||
620 | p = &ip_conntrack_generic_protocol; | ||
621 | } | ||
622 | preempt_enable(); | ||
623 | |||
624 | return p; | ||
625 | } | ||
626 | |||
627 | void ip_conntrack_proto_put(struct ip_conntrack_protocol *p) | ||
628 | { | ||
629 | module_put(p->me); | ||
630 | } | ||
631 | |||
632 | struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, | ||
633 | struct ip_conntrack_tuple *repl) | ||
454 | { | 634 | { |
455 | struct ip_conntrack *conntrack; | 635 | struct ip_conntrack *conntrack; |
456 | struct ip_conntrack_tuple repl_tuple; | ||
457 | size_t hash; | ||
458 | struct ip_conntrack_expect *exp; | ||
459 | 636 | ||
460 | if (!ip_conntrack_hash_rnd_initted) { | 637 | if (!ip_conntrack_hash_rnd_initted) { |
461 | get_random_bytes(&ip_conntrack_hash_rnd, 4); | 638 | get_random_bytes(&ip_conntrack_hash_rnd, 4); |
462 | ip_conntrack_hash_rnd_initted = 1; | 639 | ip_conntrack_hash_rnd_initted = 1; |
463 | } | 640 | } |
464 | 641 | ||
465 | hash = hash_conntrack(tuple); | ||
466 | |||
467 | if (ip_conntrack_max | 642 | if (ip_conntrack_max |
468 | && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { | 643 | && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { |
644 | unsigned int hash = hash_conntrack(orig); | ||
469 | /* Try dropping from this hash chain. */ | 645 | /* Try dropping from this hash chain. */ |
470 | if (!early_drop(&ip_conntrack_hash[hash])) { | 646 | if (!early_drop(&ip_conntrack_hash[hash])) { |
471 | if (net_ratelimit()) | 647 | if (net_ratelimit()) |
@@ -476,11 +652,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, | |||
476 | } | 652 | } |
477 | } | 653 | } |
478 | 654 | ||
479 | if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { | ||
480 | DEBUGP("Can't invert tuple.\n"); | ||
481 | return NULL; | ||
482 | } | ||
483 | |||
484 | conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); | 655 | conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); |
485 | if (!conntrack) { | 656 | if (!conntrack) { |
486 | DEBUGP("Can't allocate conntrack.\n"); | 657 | DEBUGP("Can't allocate conntrack.\n"); |
@@ -490,17 +661,50 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, | |||
490 | memset(conntrack, 0, sizeof(*conntrack)); | 661 | memset(conntrack, 0, sizeof(*conntrack)); |
491 | atomic_set(&conntrack->ct_general.use, 1); | 662 | atomic_set(&conntrack->ct_general.use, 1); |
492 | conntrack->ct_general.destroy = destroy_conntrack; | 663 | conntrack->ct_general.destroy = destroy_conntrack; |
493 | conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; | 664 | conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; |
494 | conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; | 665 | conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; |
495 | if (!protocol->new(conntrack, skb)) { | ||
496 | kmem_cache_free(ip_conntrack_cachep, conntrack); | ||
497 | return NULL; | ||
498 | } | ||
499 | /* Don't set timer yet: wait for confirmation */ | 666 | /* Don't set timer yet: wait for confirmation */ |
500 | init_timer(&conntrack->timeout); | 667 | init_timer(&conntrack->timeout); |
501 | conntrack->timeout.data = (unsigned long)conntrack; | 668 | conntrack->timeout.data = (unsigned long)conntrack; |
502 | conntrack->timeout.function = death_by_timeout; | 669 | conntrack->timeout.function = death_by_timeout; |
503 | 670 | ||
671 | atomic_inc(&ip_conntrack_count); | ||
672 | |||
673 | return conntrack; | ||
674 | } | ||
675 | |||
676 | void | ||
677 | ip_conntrack_free(struct ip_conntrack *conntrack) | ||
678 | { | ||
679 | atomic_dec(&ip_conntrack_count); | ||
680 | kmem_cache_free(ip_conntrack_cachep, conntrack); | ||
681 | } | ||
682 | |||
683 | /* Allocate a new conntrack: we return -ENOMEM if classification | ||
684 | * failed due to stress. Otherwise it really is unclassifiable */ | ||
685 | static struct ip_conntrack_tuple_hash * | ||
686 | init_conntrack(struct ip_conntrack_tuple *tuple, | ||
687 | struct ip_conntrack_protocol *protocol, | ||
688 | struct sk_buff *skb) | ||
689 | { | ||
690 | struct ip_conntrack *conntrack; | ||
691 | struct ip_conntrack_tuple repl_tuple; | ||
692 | struct ip_conntrack_expect *exp; | ||
693 | |||
694 | if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { | ||
695 | DEBUGP("Can't invert tuple.\n"); | ||
696 | return NULL; | ||
697 | } | ||
698 | |||
699 | conntrack = ip_conntrack_alloc(tuple, &repl_tuple); | ||
700 | if (conntrack == NULL || IS_ERR(conntrack)) | ||
701 | return (struct ip_conntrack_tuple_hash *)conntrack; | ||
702 | |||
703 | if (!protocol->new(conntrack, skb)) { | ||
704 | ip_conntrack_free(conntrack); | ||
705 | return NULL; | ||
706 | } | ||
707 | |||
504 | write_lock_bh(&ip_conntrack_lock); | 708 | write_lock_bh(&ip_conntrack_lock); |
505 | exp = find_expectation(tuple); | 709 | exp = find_expectation(tuple); |
506 | 710 | ||
@@ -521,7 +725,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, | |||
521 | nf_conntrack_get(&conntrack->master->ct_general); | 725 | nf_conntrack_get(&conntrack->master->ct_general); |
522 | CONNTRACK_STAT_INC(expect_new); | 726 | CONNTRACK_STAT_INC(expect_new); |
523 | } else { | 727 | } else { |
524 | conntrack->helper = ip_ct_find_helper(&repl_tuple); | 728 | conntrack->helper = __ip_conntrack_helper_find(&repl_tuple); |
525 | 729 | ||
526 | CONNTRACK_STAT_INC(new); | 730 | CONNTRACK_STAT_INC(new); |
527 | } | 731 | } |
@@ -529,7 +733,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, | |||
529 | /* Overload tuple linked list to put us in unconfirmed list. */ | 733 | /* Overload tuple linked list to put us in unconfirmed list. */ |
530 | list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); | 734 | list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); |
531 | 735 | ||
532 | atomic_inc(&ip_conntrack_count); | ||
533 | write_unlock_bh(&ip_conntrack_lock); | 736 | write_unlock_bh(&ip_conntrack_lock); |
534 | 737 | ||
535 | if (exp) { | 738 | if (exp) { |
@@ -607,7 +810,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum, | |||
607 | struct ip_conntrack *ct; | 810 | struct ip_conntrack *ct; |
608 | enum ip_conntrack_info ctinfo; | 811 | enum ip_conntrack_info ctinfo; |
609 | struct ip_conntrack_protocol *proto; | 812 | struct ip_conntrack_protocol *proto; |
610 | int set_reply; | 813 | int set_reply = 0; |
611 | int ret; | 814 | int ret; |
612 | 815 | ||
613 | /* Previously seen (loopback or untracked)? Ignore. */ | 816 | /* Previously seen (loopback or untracked)? Ignore. */ |
@@ -625,9 +828,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum, | |||
625 | return NF_DROP; | 828 | return NF_DROP; |
626 | } | 829 | } |
627 | 830 | ||
628 | /* FIXME: Do this right please. --RR */ | ||
629 | (*pskb)->nfcache |= NFC_UNKNOWN; | ||
630 | |||
631 | /* Doesn't cover locally-generated broadcast, so not worth it. */ | 831 | /* Doesn't cover locally-generated broadcast, so not worth it. */ |
632 | #if 0 | 832 | #if 0 |
633 | /* Ignore broadcast: no `connection'. */ | 833 | /* Ignore broadcast: no `connection'. */ |
@@ -643,7 +843,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum, | |||
643 | } | 843 | } |
644 | #endif | 844 | #endif |
645 | 845 | ||
646 | proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); | 846 | proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol); |
647 | 847 | ||
648 | /* It may be an special packet, error, unclean... | 848 | /* It may be an special packet, error, unclean... |
649 | * inverse of the return code tells to the netfilter | 849 | * inverse of the return code tells to the netfilter |
@@ -679,8 +879,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum, | |||
679 | return -ret; | 879 | return -ret; |
680 | } | 880 | } |
681 | 881 | ||
682 | if (set_reply) | 882 | if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) |
683 | set_bit(IPS_SEEN_REPLY_BIT, &ct->status); | 883 | ip_conntrack_event_cache(IPCT_STATUS, *pskb); |
684 | 884 | ||
685 | return ret; | 885 | return ret; |
686 | } | 886 | } |
@@ -689,7 +889,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse, | |||
689 | const struct ip_conntrack_tuple *orig) | 889 | const struct ip_conntrack_tuple *orig) |
690 | { | 890 | { |
691 | return ip_ct_invert_tuple(inverse, orig, | 891 | return ip_ct_invert_tuple(inverse, orig, |
692 | ip_ct_find_proto(orig->dst.protonum)); | 892 | __ip_conntrack_proto_find(orig->dst.protonum)); |
693 | } | 893 | } |
694 | 894 | ||
695 | /* Would two expected things clash? */ | 895 | /* Would two expected things clash? */ |
@@ -769,6 +969,8 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) | |||
769 | exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; | 969 | exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; |
770 | add_timer(&exp->timeout); | 970 | add_timer(&exp->timeout); |
771 | 971 | ||
972 | exp->id = ++ip_conntrack_expect_next_id; | ||
973 | atomic_inc(&exp->use); | ||
772 | CONNTRACK_STAT_INC(expect_create); | 974 | CONNTRACK_STAT_INC(expect_create); |
773 | } | 975 | } |
774 | 976 | ||
@@ -827,6 +1029,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) | |||
827 | evict_oldest_expect(expect->master); | 1029 | evict_oldest_expect(expect->master); |
828 | 1030 | ||
829 | ip_conntrack_expect_insert(expect); | 1031 | ip_conntrack_expect_insert(expect); |
1032 | ip_conntrack_expect_event(IPEXP_NEW, expect); | ||
830 | ret = 0; | 1033 | ret = 0; |
831 | out: | 1034 | out: |
832 | write_unlock_bh(&ip_conntrack_lock); | 1035 | write_unlock_bh(&ip_conntrack_lock); |
@@ -847,7 +1050,7 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, | |||
847 | 1050 | ||
848 | conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; | 1051 | conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; |
849 | if (!conntrack->master && conntrack->expecting == 0) | 1052 | if (!conntrack->master && conntrack->expecting == 0) |
850 | conntrack->helper = ip_ct_find_helper(newreply); | 1053 | conntrack->helper = __ip_conntrack_helper_find(newreply); |
851 | write_unlock_bh(&ip_conntrack_lock); | 1054 | write_unlock_bh(&ip_conntrack_lock); |
852 | } | 1055 | } |
853 | 1056 | ||
@@ -861,11 +1064,26 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me) | |||
861 | return 0; | 1064 | return 0; |
862 | } | 1065 | } |
863 | 1066 | ||
1067 | struct ip_conntrack_helper * | ||
1068 | __ip_conntrack_helper_find_byname(const char *name) | ||
1069 | { | ||
1070 | struct ip_conntrack_helper *h; | ||
1071 | |||
1072 | list_for_each_entry(h, &helpers, list) { | ||
1073 | if (!strcmp(h->name, name)) | ||
1074 | return h; | ||
1075 | } | ||
1076 | |||
1077 | return NULL; | ||
1078 | } | ||
1079 | |||
864 | static inline int unhelp(struct ip_conntrack_tuple_hash *i, | 1080 | static inline int unhelp(struct ip_conntrack_tuple_hash *i, |
865 | const struct ip_conntrack_helper *me) | 1081 | const struct ip_conntrack_helper *me) |
866 | { | 1082 | { |
867 | if (tuplehash_to_ctrack(i)->helper == me) | 1083 | if (tuplehash_to_ctrack(i)->helper == me) { |
1084 | ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i)); | ||
868 | tuplehash_to_ctrack(i)->helper = NULL; | 1085 | tuplehash_to_ctrack(i)->helper = NULL; |
1086 | } | ||
869 | return 0; | 1087 | return 0; |
870 | } | 1088 | } |
871 | 1089 | ||
@@ -927,12 +1145,46 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct, | |||
927 | if (del_timer(&ct->timeout)) { | 1145 | if (del_timer(&ct->timeout)) { |
928 | ct->timeout.expires = jiffies + extra_jiffies; | 1146 | ct->timeout.expires = jiffies + extra_jiffies; |
929 | add_timer(&ct->timeout); | 1147 | add_timer(&ct->timeout); |
1148 | ip_conntrack_event_cache(IPCT_REFRESH, skb); | ||
930 | } | 1149 | } |
931 | ct_add_counters(ct, ctinfo, skb); | 1150 | ct_add_counters(ct, ctinfo, skb); |
932 | write_unlock_bh(&ip_conntrack_lock); | 1151 | write_unlock_bh(&ip_conntrack_lock); |
933 | } | 1152 | } |
934 | } | 1153 | } |
935 | 1154 | ||
1155 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
1156 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
1157 | /* Generic function for tcp/udp/sctp/dccp and alike. This needs to be | ||
1158 | * in ip_conntrack_core, since we don't want the protocols to autoload | ||
1159 | * or depend on ctnetlink */ | ||
1160 | int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb, | ||
1161 | const struct ip_conntrack_tuple *tuple) | ||
1162 | { | ||
1163 | NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t), | ||
1164 | &tuple->src.u.tcp.port); | ||
1165 | NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t), | ||
1166 | &tuple->dst.u.tcp.port); | ||
1167 | return 0; | ||
1168 | |||
1169 | nfattr_failure: | ||
1170 | return -1; | ||
1171 | } | ||
1172 | |||
1173 | int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], | ||
1174 | struct ip_conntrack_tuple *t) | ||
1175 | { | ||
1176 | if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) | ||
1177 | return -EINVAL; | ||
1178 | |||
1179 | t->src.u.tcp.port = | ||
1180 | *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); | ||
1181 | t->dst.u.tcp.port = | ||
1182 | *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); | ||
1183 | |||
1184 | return 0; | ||
1185 | } | ||
1186 | #endif | ||
1187 | |||
936 | /* Returns new sk_buff, or NULL */ | 1188 | /* Returns new sk_buff, or NULL */ |
937 | struct sk_buff * | 1189 | struct sk_buff * |
938 | ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) | 1190 | ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) |
@@ -943,10 +1195,8 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) | |||
943 | skb = ip_defrag(skb, user); | 1195 | skb = ip_defrag(skb, user); |
944 | local_bh_enable(); | 1196 | local_bh_enable(); |
945 | 1197 | ||
946 | if (skb) { | 1198 | if (skb) |
947 | ip_send_check(skb->nh.iph); | 1199 | ip_send_check(skb->nh.iph); |
948 | skb->nfcache |= NFC_ALTERED; | ||
949 | } | ||
950 | return skb; | 1200 | return skb; |
951 | } | 1201 | } |
952 | 1202 | ||
@@ -1096,16 +1346,14 @@ static void free_conntrack_hash(void) | |||
1096 | * ip_conntrack_htable_size)); | 1346 | * ip_conntrack_htable_size)); |
1097 | } | 1347 | } |
1098 | 1348 | ||
1099 | /* Mishearing the voices in his head, our hero wonders how he's | 1349 | void ip_conntrack_flush() |
1100 | supposed to kill the mall. */ | ||
1101 | void ip_conntrack_cleanup(void) | ||
1102 | { | 1350 | { |
1103 | ip_ct_attach = NULL; | ||
1104 | /* This makes sure all current packets have passed through | 1351 | /* This makes sure all current packets have passed through |
1105 | netfilter framework. Roll on, two-stage module | 1352 | netfilter framework. Roll on, two-stage module |
1106 | delete... */ | 1353 | delete... */ |
1107 | synchronize_net(); | 1354 | synchronize_net(); |
1108 | 1355 | ||
1356 | ip_ct_event_cache_flush(); | ||
1109 | i_see_dead_people: | 1357 | i_see_dead_people: |
1110 | ip_ct_iterate_cleanup(kill_all, NULL); | 1358 | ip_ct_iterate_cleanup(kill_all, NULL); |
1111 | if (atomic_read(&ip_conntrack_count) != 0) { | 1359 | if (atomic_read(&ip_conntrack_count) != 0) { |
@@ -1115,7 +1363,14 @@ void ip_conntrack_cleanup(void) | |||
1115 | /* wait until all references to ip_conntrack_untracked are dropped */ | 1363 | /* wait until all references to ip_conntrack_untracked are dropped */ |
1116 | while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) | 1364 | while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) |
1117 | schedule(); | 1365 | schedule(); |
1366 | } | ||
1118 | 1367 | ||
1368 | /* Mishearing the voices in his head, our hero wonders how he's | ||
1369 | supposed to kill the mall. */ | ||
1370 | void ip_conntrack_cleanup(void) | ||
1371 | { | ||
1372 | ip_ct_attach = NULL; | ||
1373 | ip_conntrack_flush(); | ||
1119 | kmem_cache_destroy(ip_conntrack_cachep); | 1374 | kmem_cache_destroy(ip_conntrack_cachep); |
1120 | kmem_cache_destroy(ip_conntrack_expect_cachep); | 1375 | kmem_cache_destroy(ip_conntrack_expect_cachep); |
1121 | free_conntrack_hash(); | 1376 | free_conntrack_hash(); |
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 7a3b773be3f9..3a2627db1729 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c | |||
@@ -25,8 +25,7 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); | |||
25 | MODULE_DESCRIPTION("ftp connection tracking helper"); | 25 | MODULE_DESCRIPTION("ftp connection tracking helper"); |
26 | 26 | ||
27 | /* This is slow, but it's simple. --RR */ | 27 | /* This is slow, but it's simple. --RR */ |
28 | static char ftp_buffer[65536]; | 28 | static char *ftp_buffer; |
29 | |||
30 | static DEFINE_SPINLOCK(ip_ftp_lock); | 29 | static DEFINE_SPINLOCK(ip_ftp_lock); |
31 | 30 | ||
32 | #define MAX_PORTS 8 | 31 | #define MAX_PORTS 8 |
@@ -262,7 +261,8 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir) | |||
262 | } | 261 | } |
263 | 262 | ||
264 | /* We don't update if it's older than what we have. */ | 263 | /* We don't update if it's older than what we have. */ |
265 | static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir) | 264 | static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir, |
265 | struct sk_buff *skb) | ||
266 | { | 266 | { |
267 | unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; | 267 | unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; |
268 | 268 | ||
@@ -276,10 +276,13 @@ static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir) | |||
276 | oldest = i; | 276 | oldest = i; |
277 | } | 277 | } |
278 | 278 | ||
279 | if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) | 279 | if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { |
280 | info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; | 280 | info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; |
281 | else if (oldest != NUM_SEQ_TO_REMEMBER) | 281 | ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); |
282 | } else if (oldest != NUM_SEQ_TO_REMEMBER) { | ||
282 | info->seq_aft_nl[dir][oldest] = nl_seq; | 283 | info->seq_aft_nl[dir][oldest] = nl_seq; |
284 | ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); | ||
285 | } | ||
283 | } | 286 | } |
284 | 287 | ||
285 | static int help(struct sk_buff **pskb, | 288 | static int help(struct sk_buff **pskb, |
@@ -439,7 +442,7 @@ out_update_nl: | |||
439 | /* Now if this ends in \n, update ftp info. Seq may have been | 442 | /* Now if this ends in \n, update ftp info. Seq may have been |
440 | * adjusted by NAT code. */ | 443 | * adjusted by NAT code. */ |
441 | if (ends_in_nl) | 444 | if (ends_in_nl) |
442 | update_nl_seq(seq, ct_ftp_info,dir); | 445 | update_nl_seq(seq, ct_ftp_info,dir, *pskb); |
443 | out: | 446 | out: |
444 | spin_unlock_bh(&ip_ftp_lock); | 447 | spin_unlock_bh(&ip_ftp_lock); |
445 | return ret; | 448 | return ret; |
@@ -457,6 +460,8 @@ static void fini(void) | |||
457 | ports[i]); | 460 | ports[i]); |
458 | ip_conntrack_helper_unregister(&ftp[i]); | 461 | ip_conntrack_helper_unregister(&ftp[i]); |
459 | } | 462 | } |
463 | |||
464 | kfree(ftp_buffer); | ||
460 | } | 465 | } |
461 | 466 | ||
462 | static int __init init(void) | 467 | static int __init init(void) |
@@ -464,6 +469,10 @@ static int __init init(void) | |||
464 | int i, ret; | 469 | int i, ret; |
465 | char *tmpname; | 470 | char *tmpname; |
466 | 471 | ||
472 | ftp_buffer = kmalloc(65536, GFP_KERNEL); | ||
473 | if (!ftp_buffer) | ||
474 | return -ENOMEM; | ||
475 | |||
467 | if (ports_c == 0) | 476 | if (ports_c == 0) |
468 | ports[ports_c++] = FTP_PORT; | 477 | ports[ports_c++] = FTP_PORT; |
469 | 478 | ||
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 4a28f297d502..25438eec21a1 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c | |||
@@ -39,7 +39,7 @@ static int ports_c; | |||
39 | static int max_dcc_channels = 8; | 39 | static int max_dcc_channels = 8; |
40 | static unsigned int dcc_timeout = 300; | 40 | static unsigned int dcc_timeout = 300; |
41 | /* This is slow, but it's simple. --RR */ | 41 | /* This is slow, but it's simple. --RR */ |
42 | static char irc_buffer[65536]; | 42 | static char *irc_buffer; |
43 | static DEFINE_SPINLOCK(irc_buffer_lock); | 43 | static DEFINE_SPINLOCK(irc_buffer_lock); |
44 | 44 | ||
45 | unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, | 45 | unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, |
@@ -257,6 +257,10 @@ static int __init init(void) | |||
257 | printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n"); | 257 | printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n"); |
258 | return -EBUSY; | 258 | return -EBUSY; |
259 | } | 259 | } |
260 | |||
261 | irc_buffer = kmalloc(65536, GFP_KERNEL); | ||
262 | if (!irc_buffer) | ||
263 | return -ENOMEM; | ||
260 | 264 | ||
261 | /* If no port given, default to standard irc port */ | 265 | /* If no port given, default to standard irc port */ |
262 | if (ports_c == 0) | 266 | if (ports_c == 0) |
@@ -304,6 +308,7 @@ static void fini(void) | |||
304 | ports[i]); | 308 | ports[i]); |
305 | ip_conntrack_helper_unregister(&irc_helpers[i]); | 309 | ip_conntrack_helper_unregister(&irc_helpers[i]); |
306 | } | 310 | } |
311 | kfree(irc_buffer); | ||
307 | } | 312 | } |
308 | 313 | ||
309 | module_init(init); | 314 | module_init(init); |
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c new file mode 100644 index 000000000000..a4e9278db4ed --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c | |||
@@ -0,0 +1,1579 @@ | |||
1 | /* Connection tracking via netlink socket. Allows for user space | ||
2 | * protocol helpers and general trouble making from userspace. | ||
3 | * | ||
4 | * (C) 2001 by Jay Schulist <jschlst@samba.org> | ||
5 | * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> | ||
6 | * (C) 2003 by Patrick Mchardy <kaber@trash.net> | ||
7 | * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net> | ||
8 | * | ||
9 | * I've reworked this stuff to use attributes instead of conntrack | ||
10 | * structures. 5.44 am. I need more tea. --pablo 05/07/11. | ||
11 | * | ||
12 | * Initial connection tracking via netlink development funded and | ||
13 | * generally made possible by Network Robots, Inc. (www.networkrobots.com) | ||
14 | * | ||
15 | * Further development of this code funded by Astaro AG (http://www.astaro.com) | ||
16 | * | ||
17 | * This software may be used and distributed according to the terms | ||
18 | * of the GNU General Public License, incorporated herein by reference. | ||
19 | */ | ||
20 | |||
21 | #include <linux/init.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/timer.h> | ||
26 | #include <linux/skbuff.h> | ||
27 | #include <linux/errno.h> | ||
28 | #include <linux/netlink.h> | ||
29 | #include <linux/spinlock.h> | ||
30 | #include <linux/notifier.h> | ||
31 | #include <linux/rtnetlink.h> | ||
32 | |||
33 | #include <linux/netfilter.h> | ||
34 | #include <linux/netfilter_ipv4.h> | ||
35 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
36 | #include <linux/netfilter_ipv4/ip_conntrack.h> | ||
37 | #include <linux/netfilter_ipv4/ip_conntrack_core.h> | ||
38 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> | ||
39 | #include <linux/netfilter_ipv4/ip_conntrack_protocol.h> | ||
40 | #include <linux/netfilter_ipv4/ip_nat_protocol.h> | ||
41 | |||
42 | #include <linux/netfilter/nfnetlink.h> | ||
43 | #include <linux/netfilter/nfnetlink_conntrack.h> | ||
44 | |||
45 | MODULE_LICENSE("GPL"); | ||
46 | |||
47 | static char __initdata version[] = "0.90"; | ||
48 | |||
49 | #if 0 | ||
50 | #define DEBUGP printk | ||
51 | #else | ||
52 | #define DEBUGP(format, args...) | ||
53 | #endif | ||
54 | |||
55 | |||
56 | static inline int | ||
57 | ctnetlink_dump_tuples_proto(struct sk_buff *skb, | ||
58 | const struct ip_conntrack_tuple *tuple) | ||
59 | { | ||
60 | struct ip_conntrack_protocol *proto; | ||
61 | |||
62 | NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); | ||
63 | |||
64 | proto = ip_conntrack_proto_find_get(tuple->dst.protonum); | ||
65 | if (proto && proto->tuple_to_nfattr) | ||
66 | return proto->tuple_to_nfattr(skb, tuple); | ||
67 | |||
68 | return 0; | ||
69 | |||
70 | nfattr_failure: | ||
71 | return -1; | ||
72 | } | ||
73 | |||
74 | static inline int | ||
75 | ctnetlink_dump_tuples(struct sk_buff *skb, | ||
76 | const struct ip_conntrack_tuple *tuple) | ||
77 | { | ||
78 | struct nfattr *nest_parms; | ||
79 | |||
80 | nest_parms = NFA_NEST(skb, CTA_TUPLE_IP); | ||
81 | NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip); | ||
82 | NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip); | ||
83 | NFA_NEST_END(skb, nest_parms); | ||
84 | |||
85 | nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO); | ||
86 | ctnetlink_dump_tuples_proto(skb, tuple); | ||
87 | NFA_NEST_END(skb, nest_parms); | ||
88 | |||
89 | return 0; | ||
90 | |||
91 | nfattr_failure: | ||
92 | return -1; | ||
93 | } | ||
94 | |||
95 | static inline int | ||
96 | ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
97 | { | ||
98 | u_int32_t status = htonl((u_int32_t) ct->status); | ||
99 | NFA_PUT(skb, CTA_STATUS, sizeof(status), &status); | ||
100 | return 0; | ||
101 | |||
102 | nfattr_failure: | ||
103 | return -1; | ||
104 | } | ||
105 | |||
106 | static inline int | ||
107 | ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
108 | { | ||
109 | long timeout_l = ct->timeout.expires - jiffies; | ||
110 | u_int32_t timeout; | ||
111 | |||
112 | if (timeout_l < 0) | ||
113 | timeout = 0; | ||
114 | else | ||
115 | timeout = htonl(timeout_l / HZ); | ||
116 | |||
117 | NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout); | ||
118 | return 0; | ||
119 | |||
120 | nfattr_failure: | ||
121 | return -1; | ||
122 | } | ||
123 | |||
124 | static inline int | ||
125 | ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
126 | { | ||
127 | struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); | ||
128 | |||
129 | struct nfattr *nest_proto; | ||
130 | int ret; | ||
131 | |||
132 | if (!proto || !proto->to_nfattr) | ||
133 | return 0; | ||
134 | |||
135 | nest_proto = NFA_NEST(skb, CTA_PROTOINFO); | ||
136 | |||
137 | ret = proto->to_nfattr(skb, nest_proto, ct); | ||
138 | |||
139 | ip_conntrack_proto_put(proto); | ||
140 | |||
141 | NFA_NEST_END(skb, nest_proto); | ||
142 | |||
143 | return ret; | ||
144 | |||
145 | nfattr_failure: | ||
146 | return -1; | ||
147 | } | ||
148 | |||
149 | static inline int | ||
150 | ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
151 | { | ||
152 | struct nfattr *nest_helper; | ||
153 | |||
154 | if (!ct->helper) | ||
155 | return 0; | ||
156 | |||
157 | nest_helper = NFA_NEST(skb, CTA_HELP); | ||
158 | NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, &ct->helper->name); | ||
159 | |||
160 | if (ct->helper->to_nfattr) | ||
161 | ct->helper->to_nfattr(skb, ct); | ||
162 | |||
163 | NFA_NEST_END(skb, nest_helper); | ||
164 | |||
165 | return 0; | ||
166 | |||
167 | nfattr_failure: | ||
168 | return -1; | ||
169 | } | ||
170 | |||
171 | #ifdef CONFIG_IP_NF_CT_ACCT | ||
172 | static inline int | ||
173 | ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, | ||
174 | enum ip_conntrack_dir dir) | ||
175 | { | ||
176 | enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; | ||
177 | struct nfattr *nest_count = NFA_NEST(skb, type); | ||
178 | u_int64_t tmp; | ||
179 | |||
180 | tmp = cpu_to_be64(ct->counters[dir].packets); | ||
181 | NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(u_int64_t), &tmp); | ||
182 | |||
183 | tmp = cpu_to_be64(ct->counters[dir].bytes); | ||
184 | NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(u_int64_t), &tmp); | ||
185 | |||
186 | NFA_NEST_END(skb, nest_count); | ||
187 | |||
188 | return 0; | ||
189 | |||
190 | nfattr_failure: | ||
191 | return -1; | ||
192 | } | ||
193 | #else | ||
194 | #define ctnetlink_dump_counters(a, b, c) (0) | ||
195 | #endif | ||
196 | |||
197 | #ifdef CONFIG_IP_NF_CONNTRACK_MARK | ||
198 | static inline int | ||
199 | ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
200 | { | ||
201 | u_int32_t mark = htonl(ct->mark); | ||
202 | |||
203 | NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark); | ||
204 | return 0; | ||
205 | |||
206 | nfattr_failure: | ||
207 | return -1; | ||
208 | } | ||
209 | #else | ||
210 | #define ctnetlink_dump_mark(a, b) (0) | ||
211 | #endif | ||
212 | |||
213 | static inline int | ||
214 | ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
215 | { | ||
216 | u_int32_t id = htonl(ct->id); | ||
217 | NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id); | ||
218 | return 0; | ||
219 | |||
220 | nfattr_failure: | ||
221 | return -1; | ||
222 | } | ||
223 | |||
224 | static inline int | ||
225 | ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct) | ||
226 | { | ||
227 | unsigned int use = htonl(atomic_read(&ct->ct_general.use)); | ||
228 | |||
229 | NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use); | ||
230 | return 0; | ||
231 | |||
232 | nfattr_failure: | ||
233 | return -1; | ||
234 | } | ||
235 | |||
236 | #define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) | ||
237 | |||
238 | static int | ||
239 | ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, | ||
240 | int event, int nowait, | ||
241 | const struct ip_conntrack *ct) | ||
242 | { | ||
243 | struct nlmsghdr *nlh; | ||
244 | struct nfgenmsg *nfmsg; | ||
245 | struct nfattr *nest_parms; | ||
246 | unsigned char *b; | ||
247 | |||
248 | b = skb->tail; | ||
249 | |||
250 | event |= NFNL_SUBSYS_CTNETLINK << 8; | ||
251 | nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); | ||
252 | nfmsg = NLMSG_DATA(nlh); | ||
253 | |||
254 | nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; | ||
255 | nfmsg->nfgen_family = AF_INET; | ||
256 | nfmsg->version = NFNETLINK_V0; | ||
257 | nfmsg->res_id = 0; | ||
258 | |||
259 | nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); | ||
260 | if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) | ||
261 | goto nfattr_failure; | ||
262 | NFA_NEST_END(skb, nest_parms); | ||
263 | |||
264 | nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); | ||
265 | if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) | ||
266 | goto nfattr_failure; | ||
267 | NFA_NEST_END(skb, nest_parms); | ||
268 | |||
269 | if (ctnetlink_dump_status(skb, ct) < 0 || | ||
270 | ctnetlink_dump_timeout(skb, ct) < 0 || | ||
271 | ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || | ||
272 | ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || | ||
273 | ctnetlink_dump_protoinfo(skb, ct) < 0 || | ||
274 | ctnetlink_dump_helpinfo(skb, ct) < 0 || | ||
275 | ctnetlink_dump_mark(skb, ct) < 0 || | ||
276 | ctnetlink_dump_id(skb, ct) < 0 || | ||
277 | ctnetlink_dump_use(skb, ct) < 0) | ||
278 | goto nfattr_failure; | ||
279 | |||
280 | nlh->nlmsg_len = skb->tail - b; | ||
281 | return skb->len; | ||
282 | |||
283 | nlmsg_failure: | ||
284 | nfattr_failure: | ||
285 | skb_trim(skb, b - skb->data); | ||
286 | return -1; | ||
287 | } | ||
288 | |||
289 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
290 | static int ctnetlink_conntrack_event(struct notifier_block *this, | ||
291 | unsigned long events, void *ptr) | ||
292 | { | ||
293 | struct nlmsghdr *nlh; | ||
294 | struct nfgenmsg *nfmsg; | ||
295 | struct nfattr *nest_parms; | ||
296 | struct ip_conntrack *ct = (struct ip_conntrack *)ptr; | ||
297 | struct sk_buff *skb; | ||
298 | unsigned int type; | ||
299 | unsigned char *b; | ||
300 | unsigned int flags = 0, group; | ||
301 | |||
302 | /* ignore our fake conntrack entry */ | ||
303 | if (ct == &ip_conntrack_untracked) | ||
304 | return NOTIFY_DONE; | ||
305 | |||
306 | if (events & IPCT_DESTROY) { | ||
307 | type = IPCTNL_MSG_CT_DELETE; | ||
308 | group = NFNLGRP_CONNTRACK_DESTROY; | ||
309 | goto alloc_skb; | ||
310 | } | ||
311 | if (events & (IPCT_NEW | IPCT_RELATED)) { | ||
312 | type = IPCTNL_MSG_CT_NEW; | ||
313 | flags = NLM_F_CREATE|NLM_F_EXCL; | ||
314 | /* dump everything */ | ||
315 | events = ~0UL; | ||
316 | group = NFNLGRP_CONNTRACK_NEW; | ||
317 | goto alloc_skb; | ||
318 | } | ||
319 | if (events & (IPCT_STATUS | | ||
320 | IPCT_PROTOINFO | | ||
321 | IPCT_HELPER | | ||
322 | IPCT_HELPINFO | | ||
323 | IPCT_NATINFO)) { | ||
324 | type = IPCTNL_MSG_CT_NEW; | ||
325 | group = NFNLGRP_CONNTRACK_UPDATE; | ||
326 | goto alloc_skb; | ||
327 | } | ||
328 | |||
329 | return NOTIFY_DONE; | ||
330 | |||
331 | alloc_skb: | ||
332 | /* FIXME: Check if there are any listeners before, don't hurt performance */ | ||
333 | |||
334 | skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); | ||
335 | if (!skb) | ||
336 | return NOTIFY_DONE; | ||
337 | |||
338 | b = skb->tail; | ||
339 | |||
340 | type |= NFNL_SUBSYS_CTNETLINK << 8; | ||
341 | nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); | ||
342 | nfmsg = NLMSG_DATA(nlh); | ||
343 | |||
344 | nlh->nlmsg_flags = flags; | ||
345 | nfmsg->nfgen_family = AF_INET; | ||
346 | nfmsg->version = NFNETLINK_V0; | ||
347 | nfmsg->res_id = 0; | ||
348 | |||
349 | nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); | ||
350 | if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) | ||
351 | goto nfattr_failure; | ||
352 | NFA_NEST_END(skb, nest_parms); | ||
353 | |||
354 | nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); | ||
355 | if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) | ||
356 | goto nfattr_failure; | ||
357 | NFA_NEST_END(skb, nest_parms); | ||
358 | |||
359 | /* NAT stuff is now a status flag */ | ||
360 | if ((events & IPCT_STATUS || events & IPCT_NATINFO) | ||
361 | && ctnetlink_dump_status(skb, ct) < 0) | ||
362 | goto nfattr_failure; | ||
363 | if (events & IPCT_REFRESH | ||
364 | && ctnetlink_dump_timeout(skb, ct) < 0) | ||
365 | goto nfattr_failure; | ||
366 | if (events & IPCT_PROTOINFO | ||
367 | && ctnetlink_dump_protoinfo(skb, ct) < 0) | ||
368 | goto nfattr_failure; | ||
369 | if (events & IPCT_HELPINFO | ||
370 | && ctnetlink_dump_helpinfo(skb, ct) < 0) | ||
371 | goto nfattr_failure; | ||
372 | |||
373 | if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || | ||
374 | ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) | ||
375 | goto nfattr_failure; | ||
376 | |||
377 | nlh->nlmsg_len = skb->tail - b; | ||
378 | nfnetlink_send(skb, 0, group, 0); | ||
379 | return NOTIFY_DONE; | ||
380 | |||
381 | nlmsg_failure: | ||
382 | nfattr_failure: | ||
383 | kfree_skb(skb); | ||
384 | return NOTIFY_DONE; | ||
385 | } | ||
386 | #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ | ||
387 | |||
388 | static int ctnetlink_done(struct netlink_callback *cb) | ||
389 | { | ||
390 | DEBUGP("entered %s\n", __FUNCTION__); | ||
391 | return 0; | ||
392 | } | ||
393 | |||
394 | static int | ||
395 | ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) | ||
396 | { | ||
397 | struct ip_conntrack *ct = NULL; | ||
398 | struct ip_conntrack_tuple_hash *h; | ||
399 | struct list_head *i; | ||
400 | u_int32_t *id = (u_int32_t *) &cb->args[1]; | ||
401 | |||
402 | DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, | ||
403 | cb->args[0], *id); | ||
404 | |||
405 | read_lock_bh(&ip_conntrack_lock); | ||
406 | for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { | ||
407 | list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { | ||
408 | h = (struct ip_conntrack_tuple_hash *) i; | ||
409 | if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) | ||
410 | continue; | ||
411 | ct = tuplehash_to_ctrack(h); | ||
412 | if (ct->id <= *id) | ||
413 | continue; | ||
414 | if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, | ||
415 | cb->nlh->nlmsg_seq, | ||
416 | IPCTNL_MSG_CT_NEW, | ||
417 | 1, ct) < 0) | ||
418 | goto out; | ||
419 | *id = ct->id; | ||
420 | } | ||
421 | } | ||
422 | out: | ||
423 | read_unlock_bh(&ip_conntrack_lock); | ||
424 | |||
425 | DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); | ||
426 | |||
427 | return skb->len; | ||
428 | } | ||
429 | |||
430 | #ifdef CONFIG_IP_NF_CT_ACCT | ||
431 | static int | ||
432 | ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb) | ||
433 | { | ||
434 | struct ip_conntrack *ct = NULL; | ||
435 | struct ip_conntrack_tuple_hash *h; | ||
436 | struct list_head *i; | ||
437 | u_int32_t *id = (u_int32_t *) &cb->args[1]; | ||
438 | |||
439 | DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__, | ||
440 | cb->args[0], *id); | ||
441 | |||
442 | write_lock_bh(&ip_conntrack_lock); | ||
443 | for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { | ||
444 | list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { | ||
445 | h = (struct ip_conntrack_tuple_hash *) i; | ||
446 | if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) | ||
447 | continue; | ||
448 | ct = tuplehash_to_ctrack(h); | ||
449 | if (ct->id <= *id) | ||
450 | continue; | ||
451 | if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, | ||
452 | cb->nlh->nlmsg_seq, | ||
453 | IPCTNL_MSG_CT_NEW, | ||
454 | 1, ct) < 0) | ||
455 | goto out; | ||
456 | *id = ct->id; | ||
457 | |||
458 | memset(&ct->counters, 0, sizeof(ct->counters)); | ||
459 | } | ||
460 | } | ||
461 | out: | ||
462 | write_unlock_bh(&ip_conntrack_lock); | ||
463 | |||
464 | DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); | ||
465 | |||
466 | return skb->len; | ||
467 | } | ||
468 | #endif | ||
469 | |||
470 | static const int cta_min_ip[CTA_IP_MAX] = { | ||
471 | [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), | ||
472 | [CTA_IP_V4_DST-1] = sizeof(u_int32_t), | ||
473 | }; | ||
474 | |||
475 | static inline int | ||
476 | ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) | ||
477 | { | ||
478 | struct nfattr *tb[CTA_IP_MAX]; | ||
479 | |||
480 | DEBUGP("entered %s\n", __FUNCTION__); | ||
481 | |||
482 | |||
483 | if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) | ||
484 | goto nfattr_failure; | ||
485 | |||
486 | if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) | ||
487 | return -EINVAL; | ||
488 | |||
489 | if (!tb[CTA_IP_V4_SRC-1]) | ||
490 | return -EINVAL; | ||
491 | tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]); | ||
492 | |||
493 | if (!tb[CTA_IP_V4_DST-1]) | ||
494 | return -EINVAL; | ||
495 | tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]); | ||
496 | |||
497 | DEBUGP("leaving\n"); | ||
498 | |||
499 | return 0; | ||
500 | |||
501 | nfattr_failure: | ||
502 | return -1; | ||
503 | } | ||
504 | |||
505 | static const int cta_min_proto[CTA_PROTO_MAX] = { | ||
506 | [CTA_PROTO_NUM-1] = sizeof(u_int16_t), | ||
507 | [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), | ||
508 | [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), | ||
509 | [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), | ||
510 | [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t), | ||
511 | [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t), | ||
512 | }; | ||
513 | |||
514 | static inline int | ||
515 | ctnetlink_parse_tuple_proto(struct nfattr *attr, | ||
516 | struct ip_conntrack_tuple *tuple) | ||
517 | { | ||
518 | struct nfattr *tb[CTA_PROTO_MAX]; | ||
519 | struct ip_conntrack_protocol *proto; | ||
520 | int ret = 0; | ||
521 | |||
522 | DEBUGP("entered %s\n", __FUNCTION__); | ||
523 | |||
524 | if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) | ||
525 | goto nfattr_failure; | ||
526 | |||
527 | if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) | ||
528 | return -EINVAL; | ||
529 | |||
530 | if (!tb[CTA_PROTO_NUM-1]) | ||
531 | return -EINVAL; | ||
532 | tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); | ||
533 | |||
534 | proto = ip_conntrack_proto_find_get(tuple->dst.protonum); | ||
535 | |||
536 | if (likely(proto && proto->nfattr_to_tuple)) { | ||
537 | ret = proto->nfattr_to_tuple(tb, tuple); | ||
538 | ip_conntrack_proto_put(proto); | ||
539 | } | ||
540 | |||
541 | return ret; | ||
542 | |||
543 | nfattr_failure: | ||
544 | return -1; | ||
545 | } | ||
546 | |||
547 | static inline int | ||
548 | ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, | ||
549 | enum ctattr_tuple type) | ||
550 | { | ||
551 | struct nfattr *tb[CTA_TUPLE_MAX]; | ||
552 | int err; | ||
553 | |||
554 | DEBUGP("entered %s\n", __FUNCTION__); | ||
555 | |||
556 | memset(tuple, 0, sizeof(*tuple)); | ||
557 | |||
558 | if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) | ||
559 | goto nfattr_failure; | ||
560 | |||
561 | if (!tb[CTA_TUPLE_IP-1]) | ||
562 | return -EINVAL; | ||
563 | |||
564 | err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple); | ||
565 | if (err < 0) | ||
566 | return err; | ||
567 | |||
568 | if (!tb[CTA_TUPLE_PROTO-1]) | ||
569 | return -EINVAL; | ||
570 | |||
571 | err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple); | ||
572 | if (err < 0) | ||
573 | return err; | ||
574 | |||
575 | /* orig and expect tuples get DIR_ORIGINAL */ | ||
576 | if (type == CTA_TUPLE_REPLY) | ||
577 | tuple->dst.dir = IP_CT_DIR_REPLY; | ||
578 | else | ||
579 | tuple->dst.dir = IP_CT_DIR_ORIGINAL; | ||
580 | |||
581 | DUMP_TUPLE(tuple); | ||
582 | |||
583 | DEBUGP("leaving\n"); | ||
584 | |||
585 | return 0; | ||
586 | |||
587 | nfattr_failure: | ||
588 | return -1; | ||
589 | } | ||
590 | |||
591 | #ifdef CONFIG_IP_NF_NAT_NEEDED | ||
592 | static const int cta_min_protonat[CTA_PROTONAT_MAX] = { | ||
593 | [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), | ||
594 | [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), | ||
595 | }; | ||
596 | |||
597 | static int ctnetlink_parse_nat_proto(struct nfattr *attr, | ||
598 | const struct ip_conntrack *ct, | ||
599 | struct ip_nat_range *range) | ||
600 | { | ||
601 | struct nfattr *tb[CTA_PROTONAT_MAX]; | ||
602 | struct ip_nat_protocol *npt; | ||
603 | |||
604 | DEBUGP("entered %s\n", __FUNCTION__); | ||
605 | |||
606 | if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) | ||
607 | goto nfattr_failure; | ||
608 | |||
609 | if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) | ||
610 | goto nfattr_failure; | ||
611 | |||
612 | npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); | ||
613 | if (!npt) | ||
614 | return 0; | ||
615 | |||
616 | if (!npt->nfattr_to_range) { | ||
617 | ip_nat_proto_put(npt); | ||
618 | return 0; | ||
619 | } | ||
620 | |||
621 | /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */ | ||
622 | if (npt->nfattr_to_range(tb, range) > 0) | ||
623 | range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; | ||
624 | |||
625 | ip_nat_proto_put(npt); | ||
626 | |||
627 | DEBUGP("leaving\n"); | ||
628 | return 0; | ||
629 | |||
630 | nfattr_failure: | ||
631 | return -1; | ||
632 | } | ||
633 | |||
634 | static inline int | ||
635 | ctnetlink_parse_nat(struct nfattr *cda[], | ||
636 | const struct ip_conntrack *ct, struct ip_nat_range *range) | ||
637 | { | ||
638 | struct nfattr *tb[CTA_NAT_MAX]; | ||
639 | int err; | ||
640 | |||
641 | DEBUGP("entered %s\n", __FUNCTION__); | ||
642 | |||
643 | memset(range, 0, sizeof(*range)); | ||
644 | |||
645 | if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) | ||
646 | goto nfattr_failure; | ||
647 | |||
648 | if (tb[CTA_NAT_MINIP-1]) | ||
649 | range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); | ||
650 | |||
651 | if (!tb[CTA_NAT_MAXIP-1]) | ||
652 | range->max_ip = range->min_ip; | ||
653 | else | ||
654 | range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]); | ||
655 | |||
656 | if (range->min_ip) | ||
657 | range->flags |= IP_NAT_RANGE_MAP_IPS; | ||
658 | |||
659 | if (!tb[CTA_NAT_PROTO-1]) | ||
660 | return 0; | ||
661 | |||
662 | err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range); | ||
663 | if (err < 0) | ||
664 | return err; | ||
665 | |||
666 | DEBUGP("leaving\n"); | ||
667 | return 0; | ||
668 | |||
669 | nfattr_failure: | ||
670 | return -1; | ||
671 | } | ||
672 | #endif | ||
673 | |||
674 | static inline int | ||
675 | ctnetlink_parse_help(struct nfattr *attr, char **helper_name) | ||
676 | { | ||
677 | struct nfattr *tb[CTA_HELP_MAX]; | ||
678 | |||
679 | DEBUGP("entered %s\n", __FUNCTION__); | ||
680 | |||
681 | if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) | ||
682 | goto nfattr_failure; | ||
683 | |||
684 | if (!tb[CTA_HELP_NAME-1]) | ||
685 | return -EINVAL; | ||
686 | |||
687 | *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); | ||
688 | |||
689 | return 0; | ||
690 | |||
691 | nfattr_failure: | ||
692 | return -1; | ||
693 | } | ||
694 | |||
/* IPCTNL_MSG_CT_DELETE handler: delete one conntrack identified by its
 * original or reply tuple (optionally cross-checked against CTA_ID),
 * or flush the entire conntrack table when no tuple is supplied. */
static int
ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
{
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack_tuple tuple;
	struct ip_conntrack *ct;
	int err = 0;

	DEBUGP("entered %s\n", __FUNCTION__);

	if (cda[CTA_TUPLE_ORIG-1])
		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
	else if (cda[CTA_TUPLE_REPLY-1])
		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
	else {
		/* Flush the whole table */
		ip_conntrack_flush();
		return 0;
	}

	if (err < 0)
		return err;

	/* find_get takes a reference that we must drop on every path below */
	h = ip_conntrack_find_get(&tuple, NULL);
	if (!h) {
		DEBUGP("tuple not found in conntrack hash\n");
		return -ENOENT;
	}

	ct = tuplehash_to_ctrack(h);

	if (cda[CTA_ID-1]) {
		/* optional id check guards against deleting a recycled entry */
		u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1]));
		if (ct->id != id) {
			ip_conntrack_put(ct);
			return -ENOENT;
		}
	}
	/* Winning del_timer() means we own destruction: run the death
	 * function by hand to kill the entry immediately.
	 * NOTE(review): our reference is dropped *before* calling
	 * timeout.function — presumably the hash-table reference keeps ct
	 * alive across that call; confirm against death_by_timeout(). */
	if (del_timer(&ct->timeout)) {
		ip_conntrack_put(ct);
		ct->timeout.function((unsigned long)ct);
		return 0;
	}
	ip_conntrack_put(ct);
	DEBUGP("leaving\n");

	return 0;
}
744 | |||
/* IPCTNL_MSG_CT_GET / CT_GET_CTRZERO handler.  With NLM_F_DUMP it
 * kicks off a netlink table dump (counter-zeroing variant requires
 * CONFIG_IP_NF_CT_ACCT); otherwise it looks up a single conntrack by
 * tuple and unicasts it back to the requester. */
static int
ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
{
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack_tuple tuple;
	struct ip_conntrack *ct;
	struct sk_buff *skb2 = NULL;
	int err = 0;

	DEBUGP("entered %s\n", __FUNCTION__);

	if (nlh->nlmsg_flags & NLM_F_DUMP) {
		struct nfgenmsg *msg = NLMSG_DATA(nlh);
		u32 rlen;

		if (msg->nfgen_family != AF_INET)
			return -EAFNOSUPPORT;

		if (NFNL_MSG_TYPE(nlh->nlmsg_type) ==
		    IPCTNL_MSG_CT_GET_CTRZERO) {
#ifdef CONFIG_IP_NF_CT_ACCT
			/* dump-and-zero-counters needs accounting support */
			if ((*errp = netlink_dump_start(ctnl, skb, nlh,
						        ctnetlink_dump_table_w,
							ctnetlink_done)) != 0)
				return -EINVAL;
#else
			return -ENOTSUPP;
#endif
		} else {
			if ((*errp = netlink_dump_start(ctnl, skb, nlh,
				      			ctnetlink_dump_table,
						        ctnetlink_done)) != 0)
				return -EINVAL;
		}

		/* consume this request message from the receive queue */
		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
		if (rlen > skb->len)
			rlen = skb->len;
		skb_pull(skb, rlen);
		return 0;
	}

	if (cda[CTA_TUPLE_ORIG-1])
		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
	else if (cda[CTA_TUPLE_REPLY-1])
		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
	else
		return -EINVAL;

	if (err < 0)
		return err;

	/* takes a reference on the found conntrack */
	h = ip_conntrack_find_get(&tuple, NULL);
	if (!h) {
		DEBUGP("tuple not found in conntrack hash");
		return -ENOENT;
	}
	DEBUGP("tuple found\n");
	ct = tuplehash_to_ctrack(h);

	err = -ENOMEM;
	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
	if (!skb2) {
		ip_conntrack_put(ct);
		return -ENOMEM;
	}
	NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;

	err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq,
				  IPCTNL_MSG_CT_NEW, 1, ct);
	ip_conntrack_put(ct);
	if (err <= 0)
		goto out;

	/* netlink_unicast consumes skb2 on success */
	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
	if (err < 0)
		goto out;

	DEBUGP("leaving\n");
	return 0;

out:
	/* NOTE(review): returns a bare -1 (-EPERM to userspace) instead of
	 * propagating err — looks unintentional, confirm before changing. */
	if (skb2)
		kfree_skb(skb2);
	return -1;
}
832 | |||
833 | static inline int | ||
834 | ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) | ||
835 | { | ||
836 | unsigned long d, status = *(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]); | ||
837 | d = ct->status ^ status; | ||
838 | |||
839 | if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) | ||
840 | /* unchangeable */ | ||
841 | return -EINVAL; | ||
842 | |||
843 | if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) | ||
844 | /* SEEN_REPLY bit can only be set */ | ||
845 | return -EINVAL; | ||
846 | |||
847 | |||
848 | if (d & IPS_ASSURED && !(status & IPS_ASSURED)) | ||
849 | /* ASSURED bit can only be set */ | ||
850 | return -EINVAL; | ||
851 | |||
852 | if (cda[CTA_NAT-1]) { | ||
853 | #ifndef CONFIG_IP_NF_NAT_NEEDED | ||
854 | return -EINVAL; | ||
855 | #else | ||
856 | unsigned int hooknum; | ||
857 | struct ip_nat_range range; | ||
858 | |||
859 | if (ctnetlink_parse_nat(cda, ct, &range) < 0) | ||
860 | return -EINVAL; | ||
861 | |||
862 | DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", | ||
863 | NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), | ||
864 | htons(range.min.all), htons(range.max.all)); | ||
865 | |||
866 | /* This is tricky but it works. ip_nat_setup_info needs the | ||
867 | * hook number as parameter, so let's do the correct | ||
868 | * conversion and run away */ | ||
869 | if (status & IPS_SRC_NAT_DONE) | ||
870 | hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ | ||
871 | else if (status & IPS_DST_NAT_DONE) | ||
872 | hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ | ||
873 | else | ||
874 | return -EINVAL; /* Missing NAT flags */ | ||
875 | |||
876 | DEBUGP("NAT status: %lu\n", | ||
877 | status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); | ||
878 | |||
879 | if (ip_nat_initialized(ct, hooknum)) | ||
880 | return -EEXIST; | ||
881 | ip_nat_setup_info(ct, &range, hooknum); | ||
882 | |||
883 | DEBUGP("NAT status after setup_info: %lu\n", | ||
884 | ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); | ||
885 | #endif | ||
886 | } | ||
887 | |||
888 | /* Be careful here, modifying NAT bits can screw up things, | ||
889 | * so don't let users modify them directly if they don't pass | ||
890 | * ip_nat_range. */ | ||
891 | ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); | ||
892 | return 0; | ||
893 | } | ||
894 | |||
895 | |||
/* Assign, replace, or clear the helper of an existing conntrack from a
 * CTA_HELP attribute.  An empty helper name means "no helper"; an
 * unknown name is rejected.  Sibling (expected) connections keep the
 * helper chosen by their master. */
static inline int
ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[])
{
	struct ip_conntrack_helper *helper;
	char *helpname;
	int err;

	DEBUGP("entered %s\n", __FUNCTION__);

	/* don't change helper of sibling connections */
	if (ct->master)
		return -EINVAL;

	err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname);
	if (err < 0)
		return err;

	helper = __ip_conntrack_helper_find_byname(helpname);
	if (!helper) {
		/* empty name explicitly requests helper removal; any other
		 * unknown name is an error */
		if (!strcmp(helpname, ""))
			helper = NULL;
		else
			return -EINVAL;
	}

	if (ct->helper) {
		if (!helper) {
			/* we had a helper before ... */
			ip_ct_remove_expectations(ct);
			ct->helper = NULL;
		} else {
			/* need to zero data of old helper */
			memset(&ct->help, 0, sizeof(ct->help));
		}
	}

	ct->helper = helper;

	return 0;
}
936 | |||
937 | static inline int | ||
938 | ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[]) | ||
939 | { | ||
940 | u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); | ||
941 | |||
942 | if (!del_timer(&ct->timeout)) | ||
943 | return -ETIME; | ||
944 | |||
945 | ct->timeout.expires = jiffies + timeout * HZ; | ||
946 | add_timer(&ct->timeout); | ||
947 | |||
948 | return 0; | ||
949 | } | ||
950 | |||
951 | static int | ||
952 | ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) | ||
953 | { | ||
954 | int err; | ||
955 | |||
956 | DEBUGP("entered %s\n", __FUNCTION__); | ||
957 | |||
958 | if (cda[CTA_HELP-1]) { | ||
959 | err = ctnetlink_change_helper(ct, cda); | ||
960 | if (err < 0) | ||
961 | return err; | ||
962 | } | ||
963 | |||
964 | if (cda[CTA_TIMEOUT-1]) { | ||
965 | err = ctnetlink_change_timeout(ct, cda); | ||
966 | if (err < 0) | ||
967 | return err; | ||
968 | } | ||
969 | |||
970 | if (cda[CTA_STATUS-1]) { | ||
971 | err = ctnetlink_change_status(ct, cda); | ||
972 | if (err < 0) | ||
973 | return err; | ||
974 | } | ||
975 | |||
976 | DEBUGP("all done\n"); | ||
977 | return 0; | ||
978 | } | ||
979 | |||
/* Allocate and insert a brand-new conntrack from userspace-supplied
 * orig/reply tuples.  CTA_TIMEOUT is mandatory; status bits are applied
 * via ctnetlink_change_status() and the entry is inserted pre-CONFIRMED
 * (it never traverses the packet path). */
static int
ctnetlink_create_conntrack(struct nfattr *cda[],
			   struct ip_conntrack_tuple *otuple,
			   struct ip_conntrack_tuple *rtuple)
{
	struct ip_conntrack *ct;
	int err = -EINVAL;

	DEBUGP("entered %s\n", __FUNCTION__);

	ct = ip_conntrack_alloc(otuple, rtuple);
	if (ct == NULL || IS_ERR(ct))
		return -ENOMEM;

	if (!cda[CTA_TIMEOUT-1])
		goto err;
	/* expires temporarily holds the raw seconds value, then is
	 * converted to an absolute jiffies deadline on the next line */
	ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));

	ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
	ct->status |= IPS_CONFIRMED;

	err = ctnetlink_change_status(ct, cda);
	if (err < 0)
		goto err;

	/* helper lookup takes a module reference, dropped after insert */
	ct->helper = ip_conntrack_helper_find_get(rtuple);

	add_timer(&ct->timeout);
	ip_conntrack_hash_insert(ct);

	if (ct->helper)
		ip_conntrack_helper_put(ct->helper);

	DEBUGP("conntrack with id %u inserted\n", ct->id);
	return 0;

err:
	ip_conntrack_free(ct);
	return err;
}
1020 | |||
/* IPCTNL_MSG_CT_NEW handler: create a conntrack (NLM_F_CREATE) when no
 * matching entry exists, otherwise update the existing one unless
 * NLM_F_EXCL forbids it.  NAT configuration is only accepted for newly
 * created entries. */
static int
ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
{
	struct ip_conntrack_tuple otuple, rtuple;
	struct ip_conntrack_tuple_hash *h = NULL;
	int err = 0;

	DEBUGP("entered %s\n", __FUNCTION__);

	if (cda[CTA_TUPLE_ORIG-1]) {
		err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG);
		if (err < 0)
			return err;
	}

	if (cda[CTA_TUPLE_REPLY-1]) {
		err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY);
		if (err < 0)
			return err;
	}

	/* lookup under the table lock; __ip_conntrack_find takes no
	 * reference, so the lock must be held for as long as h is used */
	write_lock_bh(&ip_conntrack_lock);
	if (cda[CTA_TUPLE_ORIG-1])
		h = __ip_conntrack_find(&otuple, NULL);
	else if (cda[CTA_TUPLE_REPLY-1])
		h = __ip_conntrack_find(&rtuple, NULL);

	if (h == NULL) {
		write_unlock_bh(&ip_conntrack_lock);
		DEBUGP("no such conntrack, create new\n");
		err = -ENOENT;
		if (nlh->nlmsg_flags & NLM_F_CREATE)
			err = ctnetlink_create_conntrack(cda, &otuple, &rtuple);
		return err;
	}
	/* implicit 'else' */

	/* we only allow nat config for new conntracks */
	if (cda[CTA_NAT-1]) {
		err = -EINVAL;
		goto out_unlock;
	}

	/* We manipulate the conntrack inside the global conntrack table lock,
	 * so there's no need to increase the refcount */
	DEBUGP("conntrack found\n");
	err = -EEXIST;
	if (!(nlh->nlmsg_flags & NLM_F_EXCL))
		err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda);

out_unlock:
	write_unlock_bh(&ip_conntrack_lock);
	return err;
}
1076 | |||
1077 | /*********************************************************************** | ||
1078 | * EXPECT | ||
1079 | ***********************************************************************/ | ||
1080 | |||
/* Emit one tuple as a nested attribute of the given expectation
 * attribute type.  The nfattr_failure label is the jump target used
 * internally by the NFA_NEST/NFA_PUT macros when the skb runs out of
 * tailroom; the caller is expected to trim the skb. */
static inline int
ctnetlink_exp_dump_tuple(struct sk_buff *skb,
			 const struct ip_conntrack_tuple *tuple,
			 enum ctattr_expect type)
{
	struct nfattr *nest_parms = NFA_NEST(skb, type);

	if (ctnetlink_dump_tuples(skb, tuple) < 0)
		goto nfattr_failure;

	NFA_NEST_END(skb, nest_parms);

	return 0;

nfattr_failure:
	return -1;
}
1098 | |||
/* Serialize one expectation into skb: its tuple, mask, the master
 * conntrack's original tuple, the remaining timeout (seconds) and the
 * expectation id — timeout and id in network byte order.  Returns -1
 * if the skb runs out of room (via the NFA_PUT goto). */
static inline int
ctnetlink_exp_dump_expect(struct sk_buff *skb,
			  const struct ip_conntrack_expect *exp)
{
	struct ip_conntrack *master = exp->master;
	u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ);
	u_int32_t id = htonl(exp->id);

	if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
		goto nfattr_failure;
	if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0)
		goto nfattr_failure;
	if (ctnetlink_exp_dump_tuple(skb,
				     &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				     CTA_EXPECT_MASTER) < 0)
		goto nfattr_failure;

	NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout);
	NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id);

	return 0;

nfattr_failure:
	return -1;
}
1124 | |||
/* Build a complete expectation netlink message (header + nfgenmsg +
 * expectation attributes) into skb.  Returns the new skb length on
 * success, -1 on overrun (the nlmsg_failure/nfattr_failure labels are
 * the goto targets of the NLMSG_PUT/NFA_PUT macros; the partially
 * written message is trimmed away before returning). */
static int
ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
			int event,
			int nowait,
			const struct ip_conntrack_expect *exp)
{
	struct nlmsghdr *nlh;
	struct nfgenmsg *nfmsg;
	unsigned char *b;

	/* remember the start so we can compute the final length / trim */
	b = skb->tail;

	event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
	nfmsg = NLMSG_DATA(nlh);

	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
	nfmsg->nfgen_family = AF_INET;
	nfmsg->version = NFNETLINK_V0;
	nfmsg->res_id = 0;

	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
		goto nfattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
nfattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
1157 | |||
1158 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
1159 | static int ctnetlink_expect_event(struct notifier_block *this, | ||
1160 | unsigned long events, void *ptr) | ||
1161 | { | ||
1162 | struct nlmsghdr *nlh; | ||
1163 | struct nfgenmsg *nfmsg; | ||
1164 | struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr; | ||
1165 | struct sk_buff *skb; | ||
1166 | unsigned int type; | ||
1167 | unsigned char *b; | ||
1168 | int flags = 0; | ||
1169 | u16 proto; | ||
1170 | |||
1171 | if (events & IPEXP_NEW) { | ||
1172 | type = IPCTNL_MSG_EXP_NEW; | ||
1173 | flags = NLM_F_CREATE|NLM_F_EXCL; | ||
1174 | } else | ||
1175 | return NOTIFY_DONE; | ||
1176 | |||
1177 | skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); | ||
1178 | if (!skb) | ||
1179 | return NOTIFY_DONE; | ||
1180 | |||
1181 | b = skb->tail; | ||
1182 | |||
1183 | type |= NFNL_SUBSYS_CTNETLINK << 8; | ||
1184 | nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); | ||
1185 | nfmsg = NLMSG_DATA(nlh); | ||
1186 | |||
1187 | nlh->nlmsg_flags = flags; | ||
1188 | nfmsg->nfgen_family = AF_INET; | ||
1189 | nfmsg->version = NFNETLINK_V0; | ||
1190 | nfmsg->res_id = 0; | ||
1191 | |||
1192 | if (ctnetlink_exp_dump_expect(skb, exp) < 0) | ||
1193 | goto nfattr_failure; | ||
1194 | |||
1195 | nlh->nlmsg_len = skb->tail - b; | ||
1196 | proto = exp->tuple.dst.protonum; | ||
1197 | nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0); | ||
1198 | return NOTIFY_DONE; | ||
1199 | |||
1200 | nlmsg_failure: | ||
1201 | nfattr_failure: | ||
1202 | kfree_skb(skb); | ||
1203 | return NOTIFY_DONE; | ||
1204 | } | ||
1205 | #endif | ||
1206 | |||
1207 | static int | ||
1208 | ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) | ||
1209 | { | ||
1210 | struct ip_conntrack_expect *exp = NULL; | ||
1211 | struct list_head *i; | ||
1212 | u_int32_t *id = (u_int32_t *) &cb->args[0]; | ||
1213 | |||
1214 | DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id); | ||
1215 | |||
1216 | read_lock_bh(&ip_conntrack_lock); | ||
1217 | list_for_each_prev(i, &ip_conntrack_expect_list) { | ||
1218 | exp = (struct ip_conntrack_expect *) i; | ||
1219 | if (exp->id <= *id) | ||
1220 | continue; | ||
1221 | if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid, | ||
1222 | cb->nlh->nlmsg_seq, | ||
1223 | IPCTNL_MSG_EXP_NEW, | ||
1224 | 1, exp) < 0) | ||
1225 | goto out; | ||
1226 | *id = exp->id; | ||
1227 | } | ||
1228 | out: | ||
1229 | read_unlock_bh(&ip_conntrack_lock); | ||
1230 | |||
1231 | DEBUGP("leaving, last id=%llu\n", *id); | ||
1232 | |||
1233 | return skb->len; | ||
1234 | } | ||
1235 | |||
/* IPCTNL_MSG_EXP_GET handler: with NLM_F_DUMP, start a full table
 * dump; otherwise look up a single expectation by its master tuple
 * and unicast it back to the requester. */
static int
ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
{
	struct ip_conntrack_tuple tuple;
	struct ip_conntrack_expect *exp;
	struct sk_buff *skb2;
	int err = 0;

	DEBUGP("entered %s\n", __FUNCTION__);

	if (nlh->nlmsg_flags & NLM_F_DUMP) {
		struct nfgenmsg *msg = NLMSG_DATA(nlh);
		u32 rlen;

		if (msg->nfgen_family != AF_INET)
			return -EAFNOSUPPORT;

		if ((*errp = netlink_dump_start(ctnl, skb, nlh,
						ctnetlink_exp_dump_table,
						ctnetlink_done)) != 0)
			return -EINVAL;
		/* consume the request message from the receive queue */
		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
		if (rlen > skb->len)
			rlen = skb->len;
		skb_pull(skb, rlen);
		return 0;
	}

	if (cda[CTA_EXPECT_MASTER-1])
		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER);
	else
		return -EINVAL;

	if (err < 0)
		return err;

	/* takes a reference on the expectation */
	exp = ip_conntrack_expect_find_get(&tuple);
	if (!exp)
		return -ENOENT;

	err = -ENOMEM;
	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb2)
		goto out;
	NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;

	err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
				      nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
				      1, exp);
	if (err <= 0)
		goto out;

	/* reference dropped before unicast: skb2 no longer needs exp */
	ip_conntrack_expect_put(exp);

	/* netlink_unicast consumes skb2 even on failure */
	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
	if (err < 0)
		goto free;

	return err;

out:
	ip_conntrack_expect_put(exp);
free:
	if (skb2)
		kfree_skb(skb2);
	return err;
}
1304 | |||
1305 | static int | ||
1306 | ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, | ||
1307 | struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) | ||
1308 | { | ||
1309 | struct ip_conntrack_expect *exp, *tmp; | ||
1310 | struct ip_conntrack_tuple tuple; | ||
1311 | struct ip_conntrack_helper *h; | ||
1312 | int err; | ||
1313 | |||
1314 | if (cda[CTA_EXPECT_TUPLE-1]) { | ||
1315 | /* delete a single expect by tuple */ | ||
1316 | err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); | ||
1317 | if (err < 0) | ||
1318 | return err; | ||
1319 | |||
1320 | /* bump usage count to 2 */ | ||
1321 | exp = ip_conntrack_expect_find_get(&tuple); | ||
1322 | if (!exp) | ||
1323 | return -ENOENT; | ||
1324 | |||
1325 | if (cda[CTA_EXPECT_ID-1]) { | ||
1326 | u_int32_t id = | ||
1327 | *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); | ||
1328 | if (exp->id != ntohl(id)) { | ||
1329 | ip_conntrack_expect_put(exp); | ||
1330 | return -ENOENT; | ||
1331 | } | ||
1332 | } | ||
1333 | |||
1334 | /* after list removal, usage count == 1 */ | ||
1335 | ip_conntrack_unexpect_related(exp); | ||
1336 | /* have to put what we 'get' above. | ||
1337 | * after this line usage count == 0 */ | ||
1338 | ip_conntrack_expect_put(exp); | ||
1339 | } else if (cda[CTA_EXPECT_HELP_NAME-1]) { | ||
1340 | char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]); | ||
1341 | |||
1342 | /* delete all expectations for this helper */ | ||
1343 | write_lock_bh(&ip_conntrack_lock); | ||
1344 | h = __ip_conntrack_helper_find_byname(name); | ||
1345 | if (!h) { | ||
1346 | write_unlock_bh(&ip_conntrack_lock); | ||
1347 | return -EINVAL; | ||
1348 | } | ||
1349 | list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, | ||
1350 | list) { | ||
1351 | if (exp->master->helper == h | ||
1352 | && del_timer(&exp->timeout)) | ||
1353 | __ip_ct_expect_unlink_destroy(exp); | ||
1354 | } | ||
1355 | write_unlock(&ip_conntrack_lock); | ||
1356 | } else { | ||
1357 | /* This basically means we have to flush everything*/ | ||
1358 | write_lock_bh(&ip_conntrack_lock); | ||
1359 | list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, | ||
1360 | list) { | ||
1361 | if (del_timer(&exp->timeout)) | ||
1362 | __ip_ct_expect_unlink_destroy(exp); | ||
1363 | } | ||
1364 | write_unlock_bh(&ip_conntrack_lock); | ||
1365 | } | ||
1366 | |||
1367 | return 0; | ||
1368 | } | ||
/* Modifying an existing expectation is not supported; userspace must
 * delete and re-create instead. */
static int
ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[])
{
	return -EOPNOTSUPP;
}
1374 | |||
/* Create a new expectation from CTA_EXPECT_TUPLE/MASK/MASTER (the
 * caller has already verified all three are present).  The master
 * conntrack must exist and have a helper assigned. */
static int
ctnetlink_create_expect(struct nfattr *cda[])
{
	struct ip_conntrack_tuple tuple, mask, master_tuple;
	struct ip_conntrack_tuple_hash *h = NULL;
	struct ip_conntrack_expect *exp;
	struct ip_conntrack *ct;
	int err = 0;

	DEBUGP("entered %s\n", __FUNCTION__);

	/* caller guarantees that those three CTA_EXPECT_* exist */
	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
	if (err < 0)
		return err;
	err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK);
	if (err < 0)
		return err;
	err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER);
	if (err < 0)
		return err;

	/* Look for master conntrack of this expectation; takes a
	 * reference, dropped at 'out' */
	h = ip_conntrack_find_get(&master_tuple, NULL);
	if (!h)
		return -ENOENT;
	ct = tuplehash_to_ctrack(h);

	if (!ct->helper) {
		/* such conntrack hasn't got any helper, abort */
		err = -EINVAL;
		goto out;
	}

	exp = ip_conntrack_expect_alloc(ct);
	if (!exp) {
		err = -ENOMEM;
		goto out;
	}

	exp->expectfn = NULL;
	exp->master = ct;
	memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple));
	memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple));

	/* expect_related takes its own reference on success; ours from
	 * the alloc is dropped either way */
	err = ip_conntrack_expect_related(exp);
	ip_conntrack_expect_put(exp);

out:
	ip_conntrack_put(tuplehash_to_ctrack(h));
	return err;
}
1427 | |||
/* IPCTNL_MSG_EXP_NEW handler: create an expectation (NLM_F_CREATE)
 * when none matches the given tuple, otherwise attempt to change the
 * existing one (currently always -EOPNOTSUPP) unless NLM_F_EXCL makes
 * that an -EEXIST error. */
static int
ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
{
	struct ip_conntrack_tuple tuple;
	struct ip_conntrack_expect *exp;
	int err = 0;

	DEBUGP("entered %s\n", __FUNCTION__);

	if (!cda[CTA_EXPECT_TUPLE-1]
	    || !cda[CTA_EXPECT_MASK-1]
	    || !cda[CTA_EXPECT_MASTER-1])
		return -EINVAL;

	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
	if (err < 0)
		return err;

	/* __ip_conntrack_expect_find takes no reference: keep the table
	 * lock held while exp is in use */
	write_lock_bh(&ip_conntrack_lock);
	exp = __ip_conntrack_expect_find(&tuple);

	if (!exp) {
		write_unlock_bh(&ip_conntrack_lock);
		err = -ENOENT;
		if (nlh->nlmsg_flags & NLM_F_CREATE)
			err = ctnetlink_create_expect(cda);
		return err;
	}

	err = -EEXIST;
	if (!(nlh->nlmsg_flags & NLM_F_EXCL))
		err = ctnetlink_change_expect(exp, cda);
	write_unlock_bh(&ip_conntrack_lock);

	DEBUGP("leaving\n");

	return err;
}
1467 | |||
1468 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
/* Notifier blocks that feed conntrack and expectation events into the
 * netlink broadcast path (registered in ctnetlink_init when
 * CONFIG_IP_NF_CONNTRACK_EVENTS is enabled). */
static struct notifier_block ctnl_notifier = {
	.notifier_call	= ctnetlink_conntrack_event,
};

static struct notifier_block ctnl_notifier_exp = {
	.notifier_call	= ctnetlink_expect_event,
};
1476 | #endif | ||
1477 | |||
/* Message-type → handler dispatch table for the conntrack subsystem;
 * every operation requires CAP_NET_ADMIN. */
static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
	[IPCTNL_MSG_CT_NEW]		= { .call = ctnetlink_new_conntrack,
					    .attr_count = CTA_MAX,
					    .cap_required = CAP_NET_ADMIN },
	[IPCTNL_MSG_CT_GET] 		= { .call = ctnetlink_get_conntrack,
					    .attr_count = CTA_MAX,
					    .cap_required = CAP_NET_ADMIN },
	[IPCTNL_MSG_CT_DELETE]  	= { .call = ctnetlink_del_conntrack,
					    .attr_count = CTA_MAX,
					    .cap_required = CAP_NET_ADMIN },
	[IPCTNL_MSG_CT_GET_CTRZERO] 	= { .call = ctnetlink_get_conntrack,
					    .attr_count = CTA_MAX,
					    .cap_required = CAP_NET_ADMIN },
};

/* Dispatch table for the expectation subsystem. */
static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
	[IPCTNL_MSG_EXP_GET]		= { .call = ctnetlink_get_expect,
					    .attr_count = CTA_EXPECT_MAX,
					    .cap_required = CAP_NET_ADMIN },
	[IPCTNL_MSG_EXP_NEW]		= { .call = ctnetlink_new_expect,
					    .attr_count = CTA_EXPECT_MAX,
					    .cap_required = CAP_NET_ADMIN },
	[IPCTNL_MSG_EXP_DELETE]		= { .call = ctnetlink_del_expect,
					    .attr_count = CTA_EXPECT_MAX,
					    .cap_required = CAP_NET_ADMIN },
};

/* nfnetlink subsystem descriptors registered at module init. */
static struct nfnetlink_subsystem ctnl_subsys = {
	.name				= "conntrack",
	.subsys_id			= NFNL_SUBSYS_CTNETLINK,
	.cb_count			= IPCTNL_MSG_MAX,
	.cb				= ctnl_cb,
};

static struct nfnetlink_subsystem ctnl_exp_subsys = {
	.name				= "conntrack_expect",
	.subsys_id			= NFNL_SUBSYS_CTNETLINK_EXP,
	.cb_count			= IPCTNL_MSG_EXP_MAX,
	.cb				= ctnl_exp_cb,
};
1518 | |||
/* Module init: register both nfnetlink subsystems and, when event
 * support is compiled in, the conntrack/expectation notifiers.  On any
 * failure, unwind everything registered so far (classic goto-cleanup
 * ladder; the event labels only exist when the notifiers do). */
static int __init ctnetlink_init(void)
{
	int ret;

	printk("ctnetlink v%s: registering with nfnetlink.\n", version);
	ret = nfnetlink_subsys_register(&ctnl_subsys);
	if (ret < 0) {
		printk("ctnetlink_init: cannot register with nfnetlink.\n");
		goto err_out;
	}

	ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
	if (ret < 0) {
		printk("ctnetlink_init: cannot register exp with nfnetlink.\n");
		goto err_unreg_subsys;
	}

#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
	ret = ip_conntrack_register_notifier(&ctnl_notifier);
	if (ret < 0) {
		printk("ctnetlink_init: cannot register notifier.\n");
		goto err_unreg_exp_subsys;
	}

	ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp);
	if (ret < 0) {
		printk("ctnetlink_init: cannot expect register notifier.\n");
		goto err_unreg_notifier;
	}
#endif

	return 0;

#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
err_unreg_notifier:
	ip_conntrack_unregister_notifier(&ctnl_notifier);
err_unreg_exp_subsys:
	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
#endif
err_unreg_subsys:
	nfnetlink_subsys_unregister(&ctnl_subsys);
err_out:
	return ret;
}
1563 | |||
1564 | static void __exit ctnetlink_exit(void) | ||
1565 | { | ||
1566 | printk("ctnetlink: unregistering from nfnetlink.\n"); | ||
1567 | |||
1568 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
1569 | ip_conntrack_unregister_notifier(&ctnl_notifier_exp); | ||
1570 | ip_conntrack_unregister_notifier(&ctnl_notifier); | ||
1571 | #endif | ||
1572 | |||
1573 | nfnetlink_subsys_unregister(&ctnl_exp_subsys); | ||
1574 | nfnetlink_subsys_unregister(&ctnl_subsys); | ||
1575 | return; | ||
1576 | } | ||
1577 | |||
1578 | module_init(ctnetlink_init); | ||
1579 | module_exit(ctnetlink_exit); | ||
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 602c74db3252..838d1d69b36e 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c | |||
@@ -102,22 +102,24 @@ static int icmp_packet(struct ip_conntrack *ct, | |||
102 | ct->timeout.function((unsigned long)ct); | 102 | ct->timeout.function((unsigned long)ct); |
103 | } else { | 103 | } else { |
104 | atomic_inc(&ct->proto.icmp.count); | 104 | atomic_inc(&ct->proto.icmp.count); |
105 | ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); | ||
105 | ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); | 106 | ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); |
106 | } | 107 | } |
107 | 108 | ||
108 | return NF_ACCEPT; | 109 | return NF_ACCEPT; |
109 | } | 110 | } |
110 | 111 | ||
112 | static u_int8_t valid_new[] = { | ||
113 | [ICMP_ECHO] = 1, | ||
114 | [ICMP_TIMESTAMP] = 1, | ||
115 | [ICMP_INFO_REQUEST] = 1, | ||
116 | [ICMP_ADDRESS] = 1 | ||
117 | }; | ||
118 | |||
111 | /* Called when a new connection for this protocol found. */ | 119 | /* Called when a new connection for this protocol found. */ |
112 | static int icmp_new(struct ip_conntrack *conntrack, | 120 | static int icmp_new(struct ip_conntrack *conntrack, |
113 | const struct sk_buff *skb) | 121 | const struct sk_buff *skb) |
114 | { | 122 | { |
115 | static u_int8_t valid_new[] | ||
116 | = { [ICMP_ECHO] = 1, | ||
117 | [ICMP_TIMESTAMP] = 1, | ||
118 | [ICMP_INFO_REQUEST] = 1, | ||
119 | [ICMP_ADDRESS] = 1 }; | ||
120 | |||
121 | if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) | 123 | if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) |
122 | || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { | 124 | || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { |
123 | /* Can't create a new ICMP `conn' with this. */ | 125 | /* Can't create a new ICMP `conn' with this. */ |
@@ -158,11 +160,12 @@ icmp_error_message(struct sk_buff *skb, | |||
158 | return NF_ACCEPT; | 160 | return NF_ACCEPT; |
159 | } | 161 | } |
160 | 162 | ||
161 | innerproto = ip_ct_find_proto(inside->ip.protocol); | 163 | innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); |
162 | dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; | 164 | dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; |
163 | /* Are they talking about one of our connections? */ | 165 | /* Are they talking about one of our connections? */ |
164 | if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { | 166 | if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { |
165 | DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); | 167 | DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); |
168 | ip_conntrack_proto_put(innerproto); | ||
166 | return NF_ACCEPT; | 169 | return NF_ACCEPT; |
167 | } | 170 | } |
168 | 171 | ||
@@ -170,8 +173,10 @@ icmp_error_message(struct sk_buff *skb, | |||
170 | been preserved inside the ICMP. */ | 173 | been preserved inside the ICMP. */ |
171 | if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { | 174 | if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { |
172 | DEBUGP("icmp_error_track: Can't invert tuple\n"); | 175 | DEBUGP("icmp_error_track: Can't invert tuple\n"); |
176 | ip_conntrack_proto_put(innerproto); | ||
173 | return NF_ACCEPT; | 177 | return NF_ACCEPT; |
174 | } | 178 | } |
179 | ip_conntrack_proto_put(innerproto); | ||
175 | 180 | ||
176 | *ctinfo = IP_CT_RELATED; | 181 | *ctinfo = IP_CT_RELATED; |
177 | 182 | ||
@@ -212,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
212 | icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); | 217 | icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); |
213 | if (icmph == NULL) { | 218 | if (icmph == NULL) { |
214 | if (LOG_INVALID(IPPROTO_ICMP)) | 219 | if (LOG_INVALID(IPPROTO_ICMP)) |
215 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 220 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
216 | "ip_ct_icmp: short packet "); | 221 | "ip_ct_icmp: short packet "); |
217 | return -NF_ACCEPT; | 222 | return -NF_ACCEPT; |
218 | } | 223 | } |
@@ -226,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
226 | if (!(u16)csum_fold(skb->csum)) | 231 | if (!(u16)csum_fold(skb->csum)) |
227 | break; | 232 | break; |
228 | if (LOG_INVALID(IPPROTO_ICMP)) | 233 | if (LOG_INVALID(IPPROTO_ICMP)) |
229 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 234 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
230 | "ip_ct_icmp: bad HW ICMP checksum "); | 235 | "ip_ct_icmp: bad HW ICMP checksum "); |
231 | return -NF_ACCEPT; | 236 | return -NF_ACCEPT; |
232 | case CHECKSUM_NONE: | 237 | case CHECKSUM_NONE: |
233 | if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { | 238 | if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { |
234 | if (LOG_INVALID(IPPROTO_ICMP)) | 239 | if (LOG_INVALID(IPPROTO_ICMP)) |
235 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 240 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
236 | "ip_ct_icmp: bad ICMP checksum "); | 241 | "ip_ct_icmp: bad ICMP checksum "); |
237 | return -NF_ACCEPT; | 242 | return -NF_ACCEPT; |
238 | } | 243 | } |
@@ -249,7 +254,7 @@ checksum_skipped: | |||
249 | */ | 254 | */ |
250 | if (icmph->type > NR_ICMP_TYPES) { | 255 | if (icmph->type > NR_ICMP_TYPES) { |
251 | if (LOG_INVALID(IPPROTO_ICMP)) | 256 | if (LOG_INVALID(IPPROTO_ICMP)) |
252 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 257 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
253 | "ip_ct_icmp: invalid ICMP type "); | 258 | "ip_ct_icmp: invalid ICMP type "); |
254 | return -NF_ACCEPT; | 259 | return -NF_ACCEPT; |
255 | } | 260 | } |
@@ -265,6 +270,47 @@ checksum_skipped: | |||
265 | return icmp_error_message(skb, ctinfo, hooknum); | 270 | return icmp_error_message(skb, ctinfo, hooknum); |
266 | } | 271 | } |
267 | 272 | ||
273 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
274 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
275 | static int icmp_tuple_to_nfattr(struct sk_buff *skb, | ||
276 | const struct ip_conntrack_tuple *t) | ||
277 | { | ||
278 | NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), | ||
279 | &t->src.u.icmp.id); | ||
280 | NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), | ||
281 | &t->dst.u.icmp.type); | ||
282 | NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), | ||
283 | &t->dst.u.icmp.code); | ||
284 | |||
285 | if (t->dst.u.icmp.type >= sizeof(valid_new) | ||
286 | || !valid_new[t->dst.u.icmp.type]) | ||
287 | return -EINVAL; | ||
288 | |||
289 | return 0; | ||
290 | |||
291 | nfattr_failure: | ||
292 | return -1; | ||
293 | } | ||
294 | |||
295 | static int icmp_nfattr_to_tuple(struct nfattr *tb[], | ||
296 | struct ip_conntrack_tuple *tuple) | ||
297 | { | ||
298 | if (!tb[CTA_PROTO_ICMP_TYPE-1] | ||
299 | || !tb[CTA_PROTO_ICMP_CODE-1] | ||
300 | || !tb[CTA_PROTO_ICMP_ID-1]) | ||
301 | return -1; | ||
302 | |||
303 | tuple->dst.u.icmp.type = | ||
304 | *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); | ||
305 | tuple->dst.u.icmp.code = | ||
306 | *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); | ||
307 | tuple->src.u.icmp.id = | ||
308 | *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); | ||
309 | |||
310 | return 0; | ||
311 | } | ||
312 | #endif | ||
313 | |||
268 | struct ip_conntrack_protocol ip_conntrack_protocol_icmp = | 314 | struct ip_conntrack_protocol ip_conntrack_protocol_icmp = |
269 | { | 315 | { |
270 | .proto = IPPROTO_ICMP, | 316 | .proto = IPPROTO_ICMP, |
@@ -276,4 +322,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_icmp = | |||
276 | .packet = icmp_packet, | 322 | .packet = icmp_packet, |
277 | .new = icmp_new, | 323 | .new = icmp_new, |
278 | .error = icmp_error, | 324 | .error = icmp_error, |
325 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
326 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
327 | .tuple_to_nfattr = icmp_tuple_to_nfattr, | ||
328 | .nfattr_to_tuple = icmp_nfattr_to_tuple, | ||
329 | #endif | ||
279 | }; | 330 | }; |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 31d75390bf12..a875f35e576d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c | |||
@@ -404,6 +404,8 @@ static int sctp_packet(struct ip_conntrack *conntrack, | |||
404 | } | 404 | } |
405 | 405 | ||
406 | conntrack->proto.sctp.state = newconntrack; | 406 | conntrack->proto.sctp.state = newconntrack; |
407 | if (oldsctpstate != newconntrack) | ||
408 | ip_conntrack_event_cache(IPCT_PROTOINFO, skb); | ||
407 | write_unlock_bh(&sctp_lock); | 409 | write_unlock_bh(&sctp_lock); |
408 | } | 410 | } |
409 | 411 | ||
@@ -503,7 +505,12 @@ static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = { | |||
503 | .packet = sctp_packet, | 505 | .packet = sctp_packet, |
504 | .new = sctp_new, | 506 | .new = sctp_new, |
505 | .destroy = NULL, | 507 | .destroy = NULL, |
506 | .me = THIS_MODULE | 508 | .me = THIS_MODULE, |
509 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
510 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
511 | .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, | ||
512 | .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, | ||
513 | #endif | ||
507 | }; | 514 | }; |
508 | 515 | ||
509 | #ifdef CONFIG_SYSCTL | 516 | #ifdef CONFIG_SYSCTL |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 809dfed766d4..f23ef1f88c46 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c | |||
@@ -336,6 +336,23 @@ static int tcp_print_conntrack(struct seq_file *s, | |||
336 | return seq_printf(s, "%s ", tcp_conntrack_names[state]); | 336 | return seq_printf(s, "%s ", tcp_conntrack_names[state]); |
337 | } | 337 | } |
338 | 338 | ||
339 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
340 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
341 | static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, | ||
342 | const struct ip_conntrack *ct) | ||
343 | { | ||
344 | read_lock_bh(&tcp_lock); | ||
345 | NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), | ||
346 | &ct->proto.tcp.state); | ||
347 | read_unlock_bh(&tcp_lock); | ||
348 | |||
349 | return 0; | ||
350 | |||
351 | nfattr_failure: | ||
352 | return -1; | ||
353 | } | ||
354 | #endif | ||
355 | |||
339 | static unsigned int get_conntrack_index(const struct tcphdr *tcph) | 356 | static unsigned int get_conntrack_index(const struct tcphdr *tcph) |
340 | { | 357 | { |
341 | if (tcph->rst) return TCP_RST_SET; | 358 | if (tcph->rst) return TCP_RST_SET; |
@@ -699,7 +716,7 @@ static int tcp_in_window(struct ip_ct_tcp *state, | |||
699 | res = 1; | 716 | res = 1; |
700 | } else { | 717 | } else { |
701 | if (LOG_INVALID(IPPROTO_TCP)) | 718 | if (LOG_INVALID(IPPROTO_TCP)) |
702 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 719 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
703 | "ip_ct_tcp: %s ", | 720 | "ip_ct_tcp: %s ", |
704 | before(seq, sender->td_maxend + 1) ? | 721 | before(seq, sender->td_maxend + 1) ? |
705 | after(end, sender->td_end - receiver->td_maxwin - 1) ? | 722 | after(end, sender->td_end - receiver->td_maxwin - 1) ? |
@@ -798,7 +815,7 @@ static int tcp_error(struct sk_buff *skb, | |||
798 | sizeof(_tcph), &_tcph); | 815 | sizeof(_tcph), &_tcph); |
799 | if (th == NULL) { | 816 | if (th == NULL) { |
800 | if (LOG_INVALID(IPPROTO_TCP)) | 817 | if (LOG_INVALID(IPPROTO_TCP)) |
801 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 818 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
802 | "ip_ct_tcp: short packet "); | 819 | "ip_ct_tcp: short packet "); |
803 | return -NF_ACCEPT; | 820 | return -NF_ACCEPT; |
804 | } | 821 | } |
@@ -806,7 +823,7 @@ static int tcp_error(struct sk_buff *skb, | |||
806 | /* Not whole TCP header or malformed packet */ | 823 | /* Not whole TCP header or malformed packet */ |
807 | if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { | 824 | if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { |
808 | if (LOG_INVALID(IPPROTO_TCP)) | 825 | if (LOG_INVALID(IPPROTO_TCP)) |
809 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 826 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
810 | "ip_ct_tcp: truncated/malformed packet "); | 827 | "ip_ct_tcp: truncated/malformed packet "); |
811 | return -NF_ACCEPT; | 828 | return -NF_ACCEPT; |
812 | } | 829 | } |
@@ -823,7 +840,7 @@ static int tcp_error(struct sk_buff *skb, | |||
823 | skb->ip_summed == CHECKSUM_HW ? skb->csum | 840 | skb->ip_summed == CHECKSUM_HW ? skb->csum |
824 | : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { | 841 | : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { |
825 | if (LOG_INVALID(IPPROTO_TCP)) | 842 | if (LOG_INVALID(IPPROTO_TCP)) |
826 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 843 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
827 | "ip_ct_tcp: bad TCP checksum "); | 844 | "ip_ct_tcp: bad TCP checksum "); |
828 | return -NF_ACCEPT; | 845 | return -NF_ACCEPT; |
829 | } | 846 | } |
@@ -832,7 +849,7 @@ static int tcp_error(struct sk_buff *skb, | |||
832 | tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); | 849 | tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); |
833 | if (!tcp_valid_flags[tcpflags]) { | 850 | if (!tcp_valid_flags[tcpflags]) { |
834 | if (LOG_INVALID(IPPROTO_TCP)) | 851 | if (LOG_INVALID(IPPROTO_TCP)) |
835 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 852 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
836 | "ip_ct_tcp: invalid TCP flag combination "); | 853 | "ip_ct_tcp: invalid TCP flag combination "); |
837 | return -NF_ACCEPT; | 854 | return -NF_ACCEPT; |
838 | } | 855 | } |
@@ -880,8 +897,9 @@ static int tcp_packet(struct ip_conntrack *conntrack, | |||
880 | */ | 897 | */ |
881 | write_unlock_bh(&tcp_lock); | 898 | write_unlock_bh(&tcp_lock); |
882 | if (LOG_INVALID(IPPROTO_TCP)) | 899 | if (LOG_INVALID(IPPROTO_TCP)) |
883 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 900 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, |
884 | "ip_ct_tcp: killing out of sync session "); | 901 | NULL, "ip_ct_tcp: " |
902 | "killing out of sync session "); | ||
885 | if (del_timer(&conntrack->timeout)) | 903 | if (del_timer(&conntrack->timeout)) |
886 | conntrack->timeout.function((unsigned long) | 904 | conntrack->timeout.function((unsigned long) |
887 | conntrack); | 905 | conntrack); |
@@ -895,7 +913,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, | |||
895 | 913 | ||
896 | write_unlock_bh(&tcp_lock); | 914 | write_unlock_bh(&tcp_lock); |
897 | if (LOG_INVALID(IPPROTO_TCP)) | 915 | if (LOG_INVALID(IPPROTO_TCP)) |
898 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 916 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
899 | "ip_ct_tcp: invalid packet ignored "); | 917 | "ip_ct_tcp: invalid packet ignored "); |
900 | return NF_ACCEPT; | 918 | return NF_ACCEPT; |
901 | case TCP_CONNTRACK_MAX: | 919 | case TCP_CONNTRACK_MAX: |
@@ -905,7 +923,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, | |||
905 | old_state); | 923 | old_state); |
906 | write_unlock_bh(&tcp_lock); | 924 | write_unlock_bh(&tcp_lock); |
907 | if (LOG_INVALID(IPPROTO_TCP)) | 925 | if (LOG_INVALID(IPPROTO_TCP)) |
908 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 926 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
909 | "ip_ct_tcp: invalid state "); | 927 | "ip_ct_tcp: invalid state "); |
910 | return -NF_ACCEPT; | 928 | return -NF_ACCEPT; |
911 | case TCP_CONNTRACK_SYN_SENT: | 929 | case TCP_CONNTRACK_SYN_SENT: |
@@ -926,7 +944,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, | |||
926 | write_unlock_bh(&tcp_lock); | 944 | write_unlock_bh(&tcp_lock); |
927 | if (LOG_INVALID(IPPROTO_TCP)) | 945 | if (LOG_INVALID(IPPROTO_TCP)) |
928 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 946 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, |
929 | "ip_ct_tcp: invalid SYN"); | 947 | NULL, "ip_ct_tcp: invalid SYN"); |
930 | return -NF_ACCEPT; | 948 | return -NF_ACCEPT; |
931 | } | 949 | } |
932 | case TCP_CONNTRACK_CLOSE: | 950 | case TCP_CONNTRACK_CLOSE: |
@@ -973,6 +991,10 @@ static int tcp_packet(struct ip_conntrack *conntrack, | |||
973 | ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; | 991 | ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; |
974 | write_unlock_bh(&tcp_lock); | 992 | write_unlock_bh(&tcp_lock); |
975 | 993 | ||
994 | ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); | ||
995 | if (new_state != old_state) | ||
996 | ip_conntrack_event_cache(IPCT_PROTOINFO, skb); | ||
997 | |||
976 | if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { | 998 | if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { |
977 | /* If only reply is a RST, we can consider ourselves not to | 999 | /* If only reply is a RST, we can consider ourselves not to |
978 | have an established connection: this is a fairly common | 1000 | have an established connection: this is a fairly common |
@@ -1096,4 +1118,10 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp = | |||
1096 | .packet = tcp_packet, | 1118 | .packet = tcp_packet, |
1097 | .new = tcp_new, | 1119 | .new = tcp_new, |
1098 | .error = tcp_error, | 1120 | .error = tcp_error, |
1121 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
1122 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
1123 | .to_nfattr = tcp_to_nfattr, | ||
1124 | .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, | ||
1125 | .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, | ||
1126 | #endif | ||
1099 | }; | 1127 | }; |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 8c1eaba098d4..f2dcac7c7660 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c | |||
@@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrack *conntrack, | |||
73 | ip_ct_refresh_acct(conntrack, ctinfo, skb, | 73 | ip_ct_refresh_acct(conntrack, ctinfo, skb, |
74 | ip_ct_udp_timeout_stream); | 74 | ip_ct_udp_timeout_stream); |
75 | /* Also, more likely to be important, and not a probe */ | 75 | /* Also, more likely to be important, and not a probe */ |
76 | set_bit(IPS_ASSURED_BIT, &conntrack->status); | 76 | if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) |
77 | ip_conntrack_event_cache(IPCT_STATUS, skb); | ||
77 | } else | 78 | } else |
78 | ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); | 79 | ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); |
79 | 80 | ||
@@ -97,7 +98,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
97 | hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); | 98 | hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); |
98 | if (hdr == NULL) { | 99 | if (hdr == NULL) { |
99 | if (LOG_INVALID(IPPROTO_UDP)) | 100 | if (LOG_INVALID(IPPROTO_UDP)) |
100 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 101 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
101 | "ip_ct_udp: short packet "); | 102 | "ip_ct_udp: short packet "); |
102 | return -NF_ACCEPT; | 103 | return -NF_ACCEPT; |
103 | } | 104 | } |
@@ -105,7 +106,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
105 | /* Truncated/malformed packets */ | 106 | /* Truncated/malformed packets */ |
106 | if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { | 107 | if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { |
107 | if (LOG_INVALID(IPPROTO_UDP)) | 108 | if (LOG_INVALID(IPPROTO_UDP)) |
108 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 109 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
109 | "ip_ct_udp: truncated/malformed packet "); | 110 | "ip_ct_udp: truncated/malformed packet "); |
110 | return -NF_ACCEPT; | 111 | return -NF_ACCEPT; |
111 | } | 112 | } |
@@ -125,7 +126,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
125 | skb->ip_summed == CHECKSUM_HW ? skb->csum | 126 | skb->ip_summed == CHECKSUM_HW ? skb->csum |
126 | : skb_checksum(skb, iph->ihl*4, udplen, 0))) { | 127 | : skb_checksum(skb, iph->ihl*4, udplen, 0))) { |
127 | if (LOG_INVALID(IPPROTO_UDP)) | 128 | if (LOG_INVALID(IPPROTO_UDP)) |
128 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, | 129 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
129 | "ip_ct_udp: bad UDP checksum "); | 130 | "ip_ct_udp: bad UDP checksum "); |
130 | return -NF_ACCEPT; | 131 | return -NF_ACCEPT; |
131 | } | 132 | } |
@@ -144,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_udp = | |||
144 | .packet = udp_packet, | 145 | .packet = udp_packet, |
145 | .new = udp_new, | 146 | .new = udp_new, |
146 | .error = udp_error, | 147 | .error = udp_error, |
148 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
149 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
150 | .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, | ||
151 | .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, | ||
152 | #endif | ||
147 | }; | 153 | }; |
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 61798c46e91d..ee5895afd0c3 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c | |||
@@ -5,7 +5,7 @@ | |||
5 | */ | 5 | */ |
6 | 6 | ||
7 | /* (C) 1999-2001 Paul `Rusty' Russell | 7 | /* (C) 1999-2001 Paul `Rusty' Russell |
8 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | 8 | * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org> |
9 | * | 9 | * |
10 | * This program is free software; you can redistribute it and/or modify | 10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2 as | 11 | * it under the terms of the GNU General Public License version 2 as |
@@ -147,8 +147,7 @@ static int ct_seq_show(struct seq_file *s, void *v) | |||
147 | if (DIRECTION(hash)) | 147 | if (DIRECTION(hash)) |
148 | return 0; | 148 | return 0; |
149 | 149 | ||
150 | proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] | 150 | proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); |
151 | .tuple.dst.protonum); | ||
152 | IP_NF_ASSERT(proto); | 151 | IP_NF_ASSERT(proto); |
153 | 152 | ||
154 | if (seq_printf(s, "%-8s %u %ld ", | 153 | if (seq_printf(s, "%-8s %u %ld ", |
@@ -185,7 +184,7 @@ static int ct_seq_show(struct seq_file *s, void *v) | |||
185 | return -ENOSPC; | 184 | return -ENOSPC; |
186 | 185 | ||
187 | #if defined(CONFIG_IP_NF_CONNTRACK_MARK) | 186 | #if defined(CONFIG_IP_NF_CONNTRACK_MARK) |
188 | if (seq_printf(s, "mark=%lu ", conntrack->mark)) | 187 | if (seq_printf(s, "mark=%u ", conntrack->mark)) |
189 | return -ENOSPC; | 188 | return -ENOSPC; |
190 | #endif | 189 | #endif |
191 | 190 | ||
@@ -283,7 +282,7 @@ static int exp_seq_show(struct seq_file *s, void *v) | |||
283 | seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); | 282 | seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); |
284 | 283 | ||
285 | print_tuple(s, &expect->tuple, | 284 | print_tuple(s, &expect->tuple, |
286 | ip_ct_find_proto(expect->tuple.dst.protonum)); | 285 | __ip_conntrack_proto_find(expect->tuple.dst.protonum)); |
287 | return seq_putc(s, '\n'); | 286 | return seq_putc(s, '\n'); |
288 | } | 287 | } |
289 | 288 | ||
@@ -889,6 +888,7 @@ static int init_or_cleanup(int init) | |||
889 | return ret; | 888 | return ret; |
890 | 889 | ||
891 | cleanup: | 890 | cleanup: |
891 | synchronize_net(); | ||
892 | #ifdef CONFIG_SYSCTL | 892 | #ifdef CONFIG_SYSCTL |
893 | unregister_sysctl_table(ip_ct_sysctl_header); | 893 | unregister_sysctl_table(ip_ct_sysctl_header); |
894 | cleanup_localinops: | 894 | cleanup_localinops: |
@@ -971,6 +971,14 @@ void need_ip_conntrack(void) | |||
971 | { | 971 | { |
972 | } | 972 | } |
973 | 973 | ||
974 | #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS | ||
975 | EXPORT_SYMBOL_GPL(ip_conntrack_chain); | ||
976 | EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain); | ||
977 | EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier); | ||
978 | EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier); | ||
979 | EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init); | ||
980 | EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); | ||
981 | #endif | ||
974 | EXPORT_SYMBOL(ip_conntrack_protocol_register); | 982 | EXPORT_SYMBOL(ip_conntrack_protocol_register); |
975 | EXPORT_SYMBOL(ip_conntrack_protocol_unregister); | 983 | EXPORT_SYMBOL(ip_conntrack_protocol_unregister); |
976 | EXPORT_SYMBOL(ip_ct_get_tuple); | 984 | EXPORT_SYMBOL(ip_ct_get_tuple); |
@@ -982,12 +990,16 @@ EXPORT_SYMBOL(ip_conntrack_helper_register); | |||
982 | EXPORT_SYMBOL(ip_conntrack_helper_unregister); | 990 | EXPORT_SYMBOL(ip_conntrack_helper_unregister); |
983 | EXPORT_SYMBOL(ip_ct_iterate_cleanup); | 991 | EXPORT_SYMBOL(ip_ct_iterate_cleanup); |
984 | EXPORT_SYMBOL(ip_ct_refresh_acct); | 992 | EXPORT_SYMBOL(ip_ct_refresh_acct); |
985 | EXPORT_SYMBOL(ip_ct_protos); | 993 | |
986 | EXPORT_SYMBOL(ip_ct_find_proto); | ||
987 | EXPORT_SYMBOL(ip_conntrack_expect_alloc); | 994 | EXPORT_SYMBOL(ip_conntrack_expect_alloc); |
988 | EXPORT_SYMBOL(ip_conntrack_expect_put); | 995 | EXPORT_SYMBOL(ip_conntrack_expect_put); |
996 | EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get); | ||
989 | EXPORT_SYMBOL(ip_conntrack_expect_related); | 997 | EXPORT_SYMBOL(ip_conntrack_expect_related); |
990 | EXPORT_SYMBOL(ip_conntrack_unexpect_related); | 998 | EXPORT_SYMBOL(ip_conntrack_unexpect_related); |
999 | EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); | ||
1000 | EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find); | ||
1001 | EXPORT_SYMBOL_GPL(__ip_ct_expect_unlink_destroy); | ||
1002 | |||
991 | EXPORT_SYMBOL(ip_conntrack_tuple_taken); | 1003 | EXPORT_SYMBOL(ip_conntrack_tuple_taken); |
992 | EXPORT_SYMBOL(ip_ct_gather_frags); | 1004 | EXPORT_SYMBOL(ip_ct_gather_frags); |
993 | EXPORT_SYMBOL(ip_conntrack_htable_size); | 1005 | EXPORT_SYMBOL(ip_conntrack_htable_size); |
@@ -995,7 +1007,28 @@ EXPORT_SYMBOL(ip_conntrack_lock); | |||
995 | EXPORT_SYMBOL(ip_conntrack_hash); | 1007 | EXPORT_SYMBOL(ip_conntrack_hash); |
996 | EXPORT_SYMBOL(ip_conntrack_untracked); | 1008 | EXPORT_SYMBOL(ip_conntrack_untracked); |
997 | EXPORT_SYMBOL_GPL(ip_conntrack_find_get); | 1009 | EXPORT_SYMBOL_GPL(ip_conntrack_find_get); |
998 | EXPORT_SYMBOL_GPL(ip_conntrack_put); | ||
999 | #ifdef CONFIG_IP_NF_NAT_NEEDED | 1010 | #ifdef CONFIG_IP_NF_NAT_NEEDED |
1000 | EXPORT_SYMBOL(ip_conntrack_tcp_update); | 1011 | EXPORT_SYMBOL(ip_conntrack_tcp_update); |
1001 | #endif | 1012 | #endif |
1013 | |||
1014 | EXPORT_SYMBOL_GPL(ip_conntrack_flush); | ||
1015 | EXPORT_SYMBOL_GPL(__ip_conntrack_find); | ||
1016 | |||
1017 | EXPORT_SYMBOL_GPL(ip_conntrack_alloc); | ||
1018 | EXPORT_SYMBOL_GPL(ip_conntrack_free); | ||
1019 | EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert); | ||
1020 | |||
1021 | EXPORT_SYMBOL_GPL(ip_ct_remove_expectations); | ||
1022 | |||
1023 | EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get); | ||
1024 | EXPORT_SYMBOL_GPL(ip_conntrack_helper_put); | ||
1025 | EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); | ||
1026 | |||
1027 | EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); | ||
1028 | EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); | ||
1029 | EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); | ||
1030 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
1031 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
1032 | EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); | ||
1033 | EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple); | ||
1034 | #endif | ||
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 739b6dde1c82..1adedb743f60 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c | |||
@@ -47,8 +47,39 @@ DEFINE_RWLOCK(ip_nat_lock); | |||
47 | static unsigned int ip_nat_htable_size; | 47 | static unsigned int ip_nat_htable_size; |
48 | 48 | ||
49 | static struct list_head *bysource; | 49 | static struct list_head *bysource; |
50 | |||
51 | #define MAX_IP_NAT_PROTO 256 | ||
50 | struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; | 52 | struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; |
51 | 53 | ||
54 | static inline struct ip_nat_protocol * | ||
55 | __ip_nat_proto_find(u_int8_t protonum) | ||
56 | { | ||
57 | return ip_nat_protos[protonum]; | ||
58 | } | ||
59 | |||
60 | struct ip_nat_protocol * | ||
61 | ip_nat_proto_find_get(u_int8_t protonum) | ||
62 | { | ||
63 | struct ip_nat_protocol *p; | ||
64 | |||
65 | /* we need to disable preemption to make sure 'p' doesn't get | ||
66 | * removed until we've grabbed the reference */ | ||
67 | preempt_disable(); | ||
68 | p = __ip_nat_proto_find(protonum); | ||
69 | if (p) { | ||
70 | if (!try_module_get(p->me)) | ||
71 | p = &ip_nat_unknown_protocol; | ||
72 | } | ||
73 | preempt_enable(); | ||
74 | |||
75 | return p; | ||
76 | } | ||
77 | |||
78 | void | ||
79 | ip_nat_proto_put(struct ip_nat_protocol *p) | ||
80 | { | ||
81 | module_put(p->me); | ||
82 | } | ||
52 | 83 | ||
53 | /* We keep an extra hash for each conntrack, for fast searching. */ | 84 | /* We keep an extra hash for each conntrack, for fast searching. */ |
54 | static inline unsigned int | 85 | static inline unsigned int |
@@ -103,7 +134,8 @@ static int | |||
103 | in_range(const struct ip_conntrack_tuple *tuple, | 134 | in_range(const struct ip_conntrack_tuple *tuple, |
104 | const struct ip_nat_range *range) | 135 | const struct ip_nat_range *range) |
105 | { | 136 | { |
106 | struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum); | 137 | struct ip_nat_protocol *proto = |
138 | __ip_nat_proto_find(tuple->dst.protonum); | ||
107 | 139 | ||
108 | /* If we are supposed to map IPs, then we must be in the | 140 | /* If we are supposed to map IPs, then we must be in the |
109 | range specified, otherwise let this drag us onto a new src IP. */ | 141 | range specified, otherwise let this drag us onto a new src IP. */ |
@@ -216,8 +248,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, | |||
216 | struct ip_conntrack *conntrack, | 248 | struct ip_conntrack *conntrack, |
217 | enum ip_nat_manip_type maniptype) | 249 | enum ip_nat_manip_type maniptype) |
218 | { | 250 | { |
219 | struct ip_nat_protocol *proto | 251 | struct ip_nat_protocol *proto; |
220 | = ip_nat_find_proto(orig_tuple->dst.protonum); | ||
221 | 252 | ||
222 | /* 1) If this srcip/proto/src-proto-part is currently mapped, | 253 | /* 1) If this srcip/proto/src-proto-part is currently mapped, |
223 | and that same mapping gives a unique tuple within the given | 254 | and that same mapping gives a unique tuple within the given |
@@ -242,14 +273,20 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, | |||
242 | /* 3) The per-protocol part of the manip is made to map into | 273 | /* 3) The per-protocol part of the manip is made to map into |
243 | the range to make a unique tuple. */ | 274 | the range to make a unique tuple. */ |
244 | 275 | ||
276 | proto = ip_nat_proto_find_get(orig_tuple->dst.protonum); | ||
277 | |||
245 | /* Only bother mapping if it's not already in range and unique */ | 278 | /* Only bother mapping if it's not already in range and unique */ |
246 | if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) | 279 | if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) |
247 | || proto->in_range(tuple, maniptype, &range->min, &range->max)) | 280 | || proto->in_range(tuple, maniptype, &range->min, &range->max)) |
248 | && !ip_nat_used_tuple(tuple, conntrack)) | 281 | && !ip_nat_used_tuple(tuple, conntrack)) { |
282 | ip_nat_proto_put(proto); | ||
249 | return; | 283 | return; |
284 | } | ||
250 | 285 | ||
251 | /* Last change: get protocol to try to obtain unique tuple. */ | 286 | /* Last change: get protocol to try to obtain unique tuple. */ |
252 | proto->unique_tuple(tuple, range, maniptype, conntrack); | 287 | proto->unique_tuple(tuple, range, maniptype, conntrack); |
288 | |||
289 | ip_nat_proto_put(proto); | ||
253 | } | 290 | } |
254 | 291 | ||
255 | unsigned int | 292 | unsigned int |
@@ -320,17 +357,20 @@ manip_pkt(u_int16_t proto, | |||
320 | enum ip_nat_manip_type maniptype) | 357 | enum ip_nat_manip_type maniptype) |
321 | { | 358 | { |
322 | struct iphdr *iph; | 359 | struct iphdr *iph; |
360 | struct ip_nat_protocol *p; | ||
323 | 361 | ||
324 | (*pskb)->nfcache |= NFC_ALTERED; | 362 | if (!skb_make_writable(pskb, iphdroff + sizeof(*iph))) |
325 | if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph))) | ||
326 | return 0; | 363 | return 0; |
327 | 364 | ||
328 | iph = (void *)(*pskb)->data + iphdroff; | 365 | iph = (void *)(*pskb)->data + iphdroff; |
329 | 366 | ||
330 | /* Manipulate protcol part. */ | 367 | /* Manipulate protcol part. */ |
331 | if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff, | 368 | p = ip_nat_proto_find_get(proto); |
332 | target, maniptype)) | 369 | if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) { |
370 | ip_nat_proto_put(p); | ||
333 | return 0; | 371 | return 0; |
372 | } | ||
373 | ip_nat_proto_put(p); | ||
334 | 374 | ||
335 | iph = (void *)(*pskb)->data + iphdroff; | 375 | iph = (void *)(*pskb)->data + iphdroff; |
336 | 376 | ||
@@ -391,7 +431,7 @@ int icmp_reply_translation(struct sk_buff **pskb, | |||
391 | struct ip_conntrack_tuple inner, target; | 431 | struct ip_conntrack_tuple inner, target; |
392 | int hdrlen = (*pskb)->nh.iph->ihl * 4; | 432 | int hdrlen = (*pskb)->nh.iph->ihl * 4; |
393 | 433 | ||
394 | if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside))) | 434 | if (!skb_make_writable(pskb, hdrlen + sizeof(*inside))) |
395 | return 0; | 435 | return 0; |
396 | 436 | ||
397 | inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; | 437 | inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; |
@@ -426,7 +466,8 @@ int icmp_reply_translation(struct sk_buff **pskb, | |||
426 | 466 | ||
427 | if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + | 467 | if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + |
428 | sizeof(struct icmphdr) + inside->ip.ihl*4, | 468 | sizeof(struct icmphdr) + inside->ip.ihl*4, |
429 | &inner, ip_ct_find_proto(inside->ip.protocol))) | 469 | &inner, |
470 | __ip_conntrack_proto_find(inside->ip.protocol))) | ||
430 | return 0; | 471 | return 0; |
431 | 472 | ||
432 | /* Change inner back to look like incoming packet. We do the | 473 | /* Change inner back to look like incoming packet. We do the |
@@ -496,6 +537,49 @@ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) | |||
496 | synchronize_net(); | 537 | synchronize_net(); |
497 | } | 538 | } |
498 | 539 | ||
540 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
541 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
542 | int | ||
543 | ip_nat_port_range_to_nfattr(struct sk_buff *skb, | ||
544 | const struct ip_nat_range *range) | ||
545 | { | ||
546 | NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t), | ||
547 | &range->min.tcp.port); | ||
548 | NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t), | ||
549 | &range->max.tcp.port); | ||
550 | |||
551 | return 0; | ||
552 | |||
553 | nfattr_failure: | ||
554 | return -1; | ||
555 | } | ||
556 | |||
557 | int | ||
558 | ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range) | ||
559 | { | ||
560 | int ret = 0; | ||
561 | |||
562 | /* we have to return whether we actually parsed something or not */ | ||
563 | |||
564 | if (tb[CTA_PROTONAT_PORT_MIN-1]) { | ||
565 | ret = 1; | ||
566 | range->min.tcp.port = | ||
567 | *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]); | ||
568 | } | ||
569 | |||
570 | if (!tb[CTA_PROTONAT_PORT_MAX-1]) { | ||
571 | if (ret) | ||
572 | range->max.tcp.port = range->min.tcp.port; | ||
573 | } else { | ||
574 | ret = 1; | ||
575 | range->max.tcp.port = | ||
576 | *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]); | ||
577 | } | ||
578 | |||
579 | return ret; | ||
580 | } | ||
581 | #endif | ||
582 | |||
499 | int __init ip_nat_init(void) | 583 | int __init ip_nat_init(void) |
500 | { | 584 | { |
501 | size_t i; | 585 | size_t i; |
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c index 158f34f32c04..d2dd5d313556 100644 --- a/net/ipv4/netfilter/ip_nat_helper.c +++ b/net/ipv4/netfilter/ip_nat_helper.c | |||
@@ -168,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb, | |||
168 | struct tcphdr *tcph; | 168 | struct tcphdr *tcph; |
169 | int datalen; | 169 | int datalen; |
170 | 170 | ||
171 | if (!skb_ip_make_writable(pskb, (*pskb)->len)) | 171 | if (!skb_make_writable(pskb, (*pskb)->len)) |
172 | return 0; | 172 | return 0; |
173 | 173 | ||
174 | if (rep_len > match_len | 174 | if (rep_len > match_len |
@@ -228,7 +228,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb, | |||
228 | match_offset + match_len) | 228 | match_offset + match_len) |
229 | return 0; | 229 | return 0; |
230 | 230 | ||
231 | if (!skb_ip_make_writable(pskb, (*pskb)->len)) | 231 | if (!skb_make_writable(pskb, (*pskb)->len)) |
232 | return 0; | 232 | return 0; |
233 | 233 | ||
234 | if (rep_len > match_len | 234 | if (rep_len > match_len |
@@ -315,7 +315,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb, | |||
315 | optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); | 315 | optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); |
316 | optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; | 316 | optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; |
317 | 317 | ||
318 | if (!skb_ip_make_writable(pskb, optend)) | 318 | if (!skb_make_writable(pskb, optend)) |
319 | return 0; | 319 | return 0; |
320 | 320 | ||
321 | dir = CTINFO2DIR(ctinfo); | 321 | dir = CTINFO2DIR(ctinfo); |
@@ -363,7 +363,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb, | |||
363 | this_way = &ct->nat.info.seq[dir]; | 363 | this_way = &ct->nat.info.seq[dir]; |
364 | other_way = &ct->nat.info.seq[!dir]; | 364 | other_way = &ct->nat.info.seq[!dir]; |
365 | 365 | ||
366 | if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) | 366 | if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) |
367 | return 0; | 367 | return 0; |
368 | 368 | ||
369 | tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; | 369 | tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; |
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c index 6596c9ee1655..938719043999 100644 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c | |||
@@ -62,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb, | |||
62 | struct icmphdr *hdr; | 62 | struct icmphdr *hdr; |
63 | unsigned int hdroff = iphdroff + iph->ihl*4; | 63 | unsigned int hdroff = iphdroff + iph->ihl*4; |
64 | 64 | ||
65 | if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) | 65 | if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) |
66 | return 0; | 66 | return 0; |
67 | 67 | ||
68 | hdr = (struct icmphdr *)((*pskb)->data + hdroff); | 68 | hdr = (struct icmphdr *)((*pskb)->data + hdroff); |
@@ -106,11 +106,18 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range) | |||
106 | else return 0; | 106 | else return 0; |
107 | } | 107 | } |
108 | 108 | ||
109 | struct ip_nat_protocol ip_nat_protocol_icmp | 109 | struct ip_nat_protocol ip_nat_protocol_icmp = { |
110 | = { "ICMP", IPPROTO_ICMP, | 110 | .name = "ICMP", |
111 | icmp_manip_pkt, | 111 | .protonum = IPPROTO_ICMP, |
112 | icmp_in_range, | 112 | .me = THIS_MODULE, |
113 | icmp_unique_tuple, | 113 | .manip_pkt = icmp_manip_pkt, |
114 | icmp_print, | 114 | .in_range = icmp_in_range, |
115 | icmp_print_range | 115 | .unique_tuple = icmp_unique_tuple, |
116 | .print = icmp_print, | ||
117 | .print_range = icmp_print_range, | ||
118 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
119 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
120 | .range_to_nfattr = ip_nat_port_range_to_nfattr, | ||
121 | .nfattr_to_range = ip_nat_port_nfattr_to_range, | ||
122 | #endif | ||
116 | }; | 123 | }; |
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c index a98e36d2b3c6..1d381bf68574 100644 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/ip.h> | 12 | #include <linux/ip.h> |
13 | #include <linux/tcp.h> | 13 | #include <linux/tcp.h> |
14 | #include <linux/if.h> | 14 | #include <linux/if.h> |
15 | #include <linux/netfilter/nfnetlink_conntrack.h> | ||
15 | #include <linux/netfilter_ipv4/ip_nat.h> | 16 | #include <linux/netfilter_ipv4/ip_nat.h> |
16 | #include <linux/netfilter_ipv4/ip_nat_rule.h> | 17 | #include <linux/netfilter_ipv4/ip_nat_rule.h> |
17 | #include <linux/netfilter_ipv4/ip_nat_protocol.h> | 18 | #include <linux/netfilter_ipv4/ip_nat_protocol.h> |
@@ -102,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb, | |||
102 | if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) | 103 | if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) |
103 | hdrsize = sizeof(struct tcphdr); | 104 | hdrsize = sizeof(struct tcphdr); |
104 | 105 | ||
105 | if (!skb_ip_make_writable(pskb, hdroff + hdrsize)) | 106 | if (!skb_make_writable(pskb, hdroff + hdrsize)) |
106 | return 0; | 107 | return 0; |
107 | 108 | ||
108 | iph = (struct iphdr *)((*pskb)->data + iphdroff); | 109 | iph = (struct iphdr *)((*pskb)->data + iphdroff); |
@@ -169,11 +170,18 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range) | |||
169 | else return 0; | 170 | else return 0; |
170 | } | 171 | } |
171 | 172 | ||
172 | struct ip_nat_protocol ip_nat_protocol_tcp | 173 | struct ip_nat_protocol ip_nat_protocol_tcp = { |
173 | = { "TCP", IPPROTO_TCP, | 174 | .name = "TCP", |
174 | tcp_manip_pkt, | 175 | .protonum = IPPROTO_TCP, |
175 | tcp_in_range, | 176 | .me = THIS_MODULE, |
176 | tcp_unique_tuple, | 177 | .manip_pkt = tcp_manip_pkt, |
177 | tcp_print, | 178 | .in_range = tcp_in_range, |
178 | tcp_print_range | 179 | .unique_tuple = tcp_unique_tuple, |
180 | .print = tcp_print, | ||
181 | .print_range = tcp_print_range, | ||
182 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
183 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
184 | .range_to_nfattr = ip_nat_port_range_to_nfattr, | ||
185 | .nfattr_to_range = ip_nat_port_nfattr_to_range, | ||
186 | #endif | ||
179 | }; | 187 | }; |
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c index 9f66e5625664..c4906e1aa24a 100644 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c | |||
@@ -94,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb, | |||
94 | u32 oldip, newip; | 94 | u32 oldip, newip; |
95 | u16 *portptr, newport; | 95 | u16 *portptr, newport; |
96 | 96 | ||
97 | if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) | 97 | if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) |
98 | return 0; | 98 | return 0; |
99 | 99 | ||
100 | iph = (struct iphdr *)((*pskb)->data + iphdroff); | 100 | iph = (struct iphdr *)((*pskb)->data + iphdroff); |
@@ -156,11 +156,18 @@ udp_print_range(char *buffer, const struct ip_nat_range *range) | |||
156 | else return 0; | 156 | else return 0; |
157 | } | 157 | } |
158 | 158 | ||
159 | struct ip_nat_protocol ip_nat_protocol_udp | 159 | struct ip_nat_protocol ip_nat_protocol_udp = { |
160 | = { "UDP", IPPROTO_UDP, | 160 | .name = "UDP", |
161 | udp_manip_pkt, | 161 | .protonum = IPPROTO_UDP, |
162 | udp_in_range, | 162 | .me = THIS_MODULE, |
163 | udp_unique_tuple, | 163 | .manip_pkt = udp_manip_pkt, |
164 | udp_print, | 164 | .in_range = udp_in_range, |
165 | udp_print_range | 165 | .unique_tuple = udp_unique_tuple, |
166 | .print = udp_print, | ||
167 | .print_range = udp_print_range, | ||
168 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | ||
169 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | ||
170 | .range_to_nfattr = ip_nat_port_range_to_nfattr, | ||
171 | .nfattr_to_range = ip_nat_port_nfattr_to_range, | ||
172 | #endif | ||
166 | }; | 173 | }; |
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index f5525bd58d16..99bbef56f84e 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c | |||
@@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range) | |||
61 | } | 61 | } |
62 | 62 | ||
63 | struct ip_nat_protocol ip_nat_unknown_protocol = { | 63 | struct ip_nat_protocol ip_nat_unknown_protocol = { |
64 | "unknown", 0, | 64 | .name = "unknown", |
65 | unknown_manip_pkt, | 65 | .me = THIS_MODULE, |
66 | unknown_in_range, | 66 | .manip_pkt = unknown_manip_pkt, |
67 | unknown_unique_tuple, | 67 | .in_range = unknown_in_range, |
68 | unknown_print, | 68 | .unique_tuple = unknown_unique_tuple, |
69 | unknown_print_range | 69 | .print = unknown_print, |
70 | .print_range = unknown_print_range | ||
70 | }; | 71 | }; |
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 2a48b6e635ae..93b2c5111bb2 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c | |||
@@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb, | |||
1275 | return NF_DROP; | 1275 | return NF_DROP; |
1276 | } | 1276 | } |
1277 | 1277 | ||
1278 | if (!skb_ip_make_writable(pskb, (*pskb)->len)) | 1278 | if (!skb_make_writable(pskb, (*pskb)->len)) |
1279 | return NF_DROP; | 1279 | return NF_DROP; |
1280 | 1280 | ||
1281 | spin_lock_bh(&snmp_lock); | 1281 | spin_lock_bh(&snmp_lock); |
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 91d5ea1dbbc9..89db052add81 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c | |||
@@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum, | |||
73 | IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off | 73 | IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off |
74 | & htons(IP_MF|IP_OFFSET))); | 74 | & htons(IP_MF|IP_OFFSET))); |
75 | 75 | ||
76 | (*pskb)->nfcache |= NFC_UNKNOWN; | ||
77 | |||
78 | /* If we had a hardware checksum before, it's now invalid */ | 76 | /* If we had a hardware checksum before, it's now invalid */ |
79 | if ((*pskb)->ip_summed == CHECKSUM_HW) | 77 | if ((*pskb)->ip_summed == CHECKSUM_HW) |
80 | if (skb_checksum_help(*pskb, (out == NULL))) | 78 | if (skb_checksum_help(*pskb, (out == NULL))) |
@@ -396,6 +394,8 @@ module_exit(fini); | |||
396 | EXPORT_SYMBOL(ip_nat_setup_info); | 394 | EXPORT_SYMBOL(ip_nat_setup_info); |
397 | EXPORT_SYMBOL(ip_nat_protocol_register); | 395 | EXPORT_SYMBOL(ip_nat_protocol_register); |
398 | EXPORT_SYMBOL(ip_nat_protocol_unregister); | 396 | EXPORT_SYMBOL(ip_nat_protocol_unregister); |
397 | EXPORT_SYMBOL_GPL(ip_nat_proto_find_get); | ||
398 | EXPORT_SYMBOL_GPL(ip_nat_proto_put); | ||
399 | EXPORT_SYMBOL(ip_nat_cheat_check); | 399 | EXPORT_SYMBOL(ip_nat_cheat_check); |
400 | EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); | 400 | EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); |
401 | EXPORT_SYMBOL(ip_nat_mangle_udp_packet); | 401 | EXPORT_SYMBOL(ip_nat_mangle_udp_packet); |
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index c6baa8174389..d54f14d926f6 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c | |||
@@ -43,17 +43,10 @@ | |||
43 | #define NET_IPQ_QMAX 2088 | 43 | #define NET_IPQ_QMAX 2088 |
44 | #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" | 44 | #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" |
45 | 45 | ||
46 | struct ipq_rt_info { | ||
47 | __u8 tos; | ||
48 | __u32 daddr; | ||
49 | __u32 saddr; | ||
50 | }; | ||
51 | |||
52 | struct ipq_queue_entry { | 46 | struct ipq_queue_entry { |
53 | struct list_head list; | 47 | struct list_head list; |
54 | struct nf_info *info; | 48 | struct nf_info *info; |
55 | struct sk_buff *skb; | 49 | struct sk_buff *skb; |
56 | struct ipq_rt_info rt_info; | ||
57 | }; | 50 | }; |
58 | 51 | ||
59 | typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); | 52 | typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); |
@@ -247,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) | |||
247 | 240 | ||
248 | pmsg->packet_id = (unsigned long )entry; | 241 | pmsg->packet_id = (unsigned long )entry; |
249 | pmsg->data_len = data_len; | 242 | pmsg->data_len = data_len; |
250 | pmsg->timestamp_sec = entry->skb->stamp.tv_sec; | 243 | pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec; |
251 | pmsg->timestamp_usec = entry->skb->stamp.tv_usec; | 244 | pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec; |
252 | pmsg->mark = entry->skb->nfmark; | 245 | pmsg->mark = entry->skb->nfmark; |
253 | pmsg->hook = entry->info->hook; | 246 | pmsg->hook = entry->info->hook; |
254 | pmsg->hw_protocol = entry->skb->protocol; | 247 | pmsg->hw_protocol = entry->skb->protocol; |
@@ -287,7 +280,8 @@ nlmsg_failure: | |||
287 | } | 280 | } |
288 | 281 | ||
289 | static int | 282 | static int |
290 | ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) | 283 | ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, |
284 | unsigned int queuenum, void *data) | ||
291 | { | 285 | { |
292 | int status = -EINVAL; | 286 | int status = -EINVAL; |
293 | struct sk_buff *nskb; | 287 | struct sk_buff *nskb; |
@@ -305,14 +299,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) | |||
305 | entry->info = info; | 299 | entry->info = info; |
306 | entry->skb = skb; | 300 | entry->skb = skb; |
307 | 301 | ||
308 | if (entry->info->hook == NF_IP_LOCAL_OUT) { | ||
309 | struct iphdr *iph = skb->nh.iph; | ||
310 | |||
311 | entry->rt_info.tos = iph->tos; | ||
312 | entry->rt_info.daddr = iph->daddr; | ||
313 | entry->rt_info.saddr = iph->saddr; | ||
314 | } | ||
315 | |||
316 | nskb = ipq_build_packet_message(entry, &status); | 302 | nskb = ipq_build_packet_message(entry, &status); |
317 | if (nskb == NULL) | 303 | if (nskb == NULL) |
318 | goto err_out_free; | 304 | goto err_out_free; |
@@ -388,24 +374,11 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) | |||
388 | } | 374 | } |
389 | skb_put(e->skb, diff); | 375 | skb_put(e->skb, diff); |
390 | } | 376 | } |
391 | if (!skb_ip_make_writable(&e->skb, v->data_len)) | 377 | if (!skb_make_writable(&e->skb, v->data_len)) |
392 | return -ENOMEM; | 378 | return -ENOMEM; |
393 | memcpy(e->skb->data, v->payload, v->data_len); | 379 | memcpy(e->skb->data, v->payload, v->data_len); |
394 | e->skb->ip_summed = CHECKSUM_NONE; | 380 | e->skb->ip_summed = CHECKSUM_NONE; |
395 | e->skb->nfcache |= NFC_ALTERED; | 381 | |
396 | |||
397 | /* | ||
398 | * Extra routing may needed on local out, as the QUEUE target never | ||
399 | * returns control to the table. | ||
400 | */ | ||
401 | if (e->info->hook == NF_IP_LOCAL_OUT) { | ||
402 | struct iphdr *iph = e->skb->nh.iph; | ||
403 | |||
404 | if (!(iph->tos == e->rt_info.tos | ||
405 | && iph->daddr == e->rt_info.daddr | ||
406 | && iph->saddr == e->rt_info.saddr)) | ||
407 | return ip_route_me_harder(&e->skb); | ||
408 | } | ||
409 | return 0; | 382 | return 0; |
410 | } | 383 | } |
411 | 384 | ||
@@ -683,6 +656,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) | |||
683 | } | 656 | } |
684 | #endif /* CONFIG_PROC_FS */ | 657 | #endif /* CONFIG_PROC_FS */ |
685 | 658 | ||
659 | static struct nf_queue_handler nfqh = { | ||
660 | .name = "ip_queue", | ||
661 | .outfn = &ipq_enqueue_packet, | ||
662 | }; | ||
663 | |||
686 | static int | 664 | static int |
687 | init_or_cleanup(int init) | 665 | init_or_cleanup(int init) |
688 | { | 666 | { |
@@ -693,7 +671,8 @@ init_or_cleanup(int init) | |||
693 | goto cleanup; | 671 | goto cleanup; |
694 | 672 | ||
695 | netlink_register_notifier(&ipq_nl_notifier); | 673 | netlink_register_notifier(&ipq_nl_notifier); |
696 | ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk); | 674 | ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk, |
675 | THIS_MODULE); | ||
697 | if (ipqnl == NULL) { | 676 | if (ipqnl == NULL) { |
698 | printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); | 677 | printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); |
699 | goto cleanup_netlink_notifier; | 678 | goto cleanup_netlink_notifier; |
@@ -710,7 +689,7 @@ init_or_cleanup(int init) | |||
710 | register_netdevice_notifier(&ipq_dev_notifier); | 689 | register_netdevice_notifier(&ipq_dev_notifier); |
711 | ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); | 690 | ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); |
712 | 691 | ||
713 | status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL); | 692 | status = nf_register_queue_handler(PF_INET, &nfqh); |
714 | if (status < 0) { | 693 | if (status < 0) { |
715 | printk(KERN_ERR "ip_queue: failed to register queue handler\n"); | 694 | printk(KERN_ERR "ip_queue: failed to register queue handler\n"); |
716 | goto cleanup_sysctl; | 695 | goto cleanup_sysctl; |
@@ -718,7 +697,7 @@ init_or_cleanup(int init) | |||
718 | return status; | 697 | return status; |
719 | 698 | ||
720 | cleanup: | 699 | cleanup: |
721 | nf_unregister_queue_handler(PF_INET); | 700 | nf_unregister_queue_handlers(&nfqh); |
722 | synchronize_net(); | 701 | synchronize_net(); |
723 | ipq_flush(NF_DROP); | 702 | ipq_flush(NF_DROP); |
724 | 703 | ||
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c88dfcd38c56..eef99a1b5de6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c | |||
@@ -312,7 +312,6 @@ ipt_do_table(struct sk_buff **pskb, | |||
312 | do { | 312 | do { |
313 | IP_NF_ASSERT(e); | 313 | IP_NF_ASSERT(e); |
314 | IP_NF_ASSERT(back); | 314 | IP_NF_ASSERT(back); |
315 | (*pskb)->nfcache |= e->nfcache; | ||
316 | if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { | 315 | if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { |
317 | struct ipt_entry_target *t; | 316 | struct ipt_entry_target *t; |
318 | 317 | ||
@@ -341,8 +340,8 @@ ipt_do_table(struct sk_buff **pskb, | |||
341 | back->comefrom); | 340 | back->comefrom); |
342 | continue; | 341 | continue; |
343 | } | 342 | } |
344 | if (table_base + v | 343 | if (table_base + v != (void *)e + e->next_offset |
345 | != (void *)e + e->next_offset) { | 344 | && !(e->ip.flags & IPT_F_GOTO)) { |
346 | /* Save old back ptr in next entry */ | 345 | /* Save old back ptr in next entry */ |
347 | struct ipt_entry *next | 346 | struct ipt_entry *next |
348 | = (void *)e + e->next_offset; | 347 | = (void *)e + e->next_offset; |
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c index 9842e6e23184..dab78d8bd494 100644 --- a/net/ipv4/netfilter/ipt_CLASSIFY.c +++ b/net/ipv4/netfilter/ipt_CLASSIFY.c | |||
@@ -32,10 +32,8 @@ target(struct sk_buff **pskb, | |||
32 | { | 32 | { |
33 | const struct ipt_classify_target_info *clinfo = targinfo; | 33 | const struct ipt_classify_target_info *clinfo = targinfo; |
34 | 34 | ||
35 | if((*pskb)->priority != clinfo->priority) { | 35 | if((*pskb)->priority != clinfo->priority) |
36 | (*pskb)->priority = clinfo->priority; | 36 | (*pskb)->priority = clinfo->priority; |
37 | (*pskb)->nfcache |= NFC_ALTERED; | ||
38 | } | ||
39 | 37 | ||
40 | return IPT_CONTINUE; | 38 | return IPT_CONTINUE; |
41 | } | 39 | } |
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 6706d3a1bc4f..2d05cafec221 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c | |||
@@ -367,7 +367,7 @@ target(struct sk_buff **pskb, | |||
367 | #ifdef DEBUG_CLUSTERP | 367 | #ifdef DEBUG_CLUSTERP |
368 | DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); | 368 | DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); |
369 | #endif | 369 | #endif |
370 | DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark); | 370 | DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); |
371 | if (!clusterip_responsible(cipinfo->config, hash)) { | 371 | if (!clusterip_responsible(cipinfo->config, hash)) { |
372 | DEBUGP("not responsible\n"); | 372 | DEBUGP("not responsible\n"); |
373 | return NF_DROP; | 373 | return NF_DROP; |
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 30ddd3e18eb7..134638021339 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c | |||
@@ -40,9 +40,9 @@ target(struct sk_buff **pskb, | |||
40 | void *userinfo) | 40 | void *userinfo) |
41 | { | 41 | { |
42 | const struct ipt_connmark_target_info *markinfo = targinfo; | 42 | const struct ipt_connmark_target_info *markinfo = targinfo; |
43 | unsigned long diff; | 43 | u_int32_t diff; |
44 | unsigned long nfmark; | 44 | u_int32_t nfmark; |
45 | unsigned long newmark; | 45 | u_int32_t newmark; |
46 | 46 | ||
47 | enum ip_conntrack_info ctinfo; | 47 | enum ip_conntrack_info ctinfo; |
48 | struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); | 48 | struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); |
@@ -61,10 +61,8 @@ target(struct sk_buff **pskb, | |||
61 | case IPT_CONNMARK_RESTORE: | 61 | case IPT_CONNMARK_RESTORE: |
62 | nfmark = (*pskb)->nfmark; | 62 | nfmark = (*pskb)->nfmark; |
63 | diff = (ct->mark ^ nfmark) & markinfo->mask; | 63 | diff = (ct->mark ^ nfmark) & markinfo->mask; |
64 | if (diff != 0) { | 64 | if (diff != 0) |
65 | (*pskb)->nfmark = nfmark ^ diff; | 65 | (*pskb)->nfmark = nfmark ^ diff; |
66 | (*pskb)->nfcache |= NFC_ALTERED; | ||
67 | } | ||
68 | break; | 66 | break; |
69 | } | 67 | } |
70 | } | 68 | } |
@@ -94,6 +92,11 @@ checkentry(const char *tablename, | |||
94 | } | 92 | } |
95 | } | 93 | } |
96 | 94 | ||
95 | if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) { | ||
96 | printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n"); | ||
97 | return 0; | ||
98 | } | ||
99 | |||
97 | return 1; | 100 | return 1; |
98 | } | 101 | } |
99 | 102 | ||
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c index 3ea4509099f9..6e319570a28c 100644 --- a/net/ipv4/netfilter/ipt_DSCP.c +++ b/net/ipv4/netfilter/ipt_DSCP.c | |||
@@ -39,7 +39,7 @@ target(struct sk_buff **pskb, | |||
39 | if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { | 39 | if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { |
40 | u_int16_t diffs[2]; | 40 | u_int16_t diffs[2]; |
41 | 41 | ||
42 | if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) | 42 | if (!skb_make_writable(pskb, sizeof(struct iphdr))) |
43 | return NF_DROP; | 43 | return NF_DROP; |
44 | 44 | ||
45 | diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; | 45 | diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; |
@@ -51,7 +51,6 @@ target(struct sk_buff **pskb, | |||
51 | sizeof(diffs), | 51 | sizeof(diffs), |
52 | (*pskb)->nh.iph->check | 52 | (*pskb)->nh.iph->check |
53 | ^ 0xFFFF)); | 53 | ^ 0xFFFF)); |
54 | (*pskb)->nfcache |= NFC_ALTERED; | ||
55 | } | 54 | } |
56 | return IPT_CONTINUE; | 55 | return IPT_CONTINUE; |
57 | } | 56 | } |
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 94a0ce1c1c9d..a1319693f648 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c | |||
@@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) | |||
31 | != (einfo->ip_ect & IPT_ECN_IP_MASK)) { | 31 | != (einfo->ip_ect & IPT_ECN_IP_MASK)) { |
32 | u_int16_t diffs[2]; | 32 | u_int16_t diffs[2]; |
33 | 33 | ||
34 | if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) | 34 | if (!skb_make_writable(pskb, sizeof(struct iphdr))) |
35 | return 0; | 35 | return 0; |
36 | 36 | ||
37 | diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; | 37 | diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; |
@@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) | |||
43 | sizeof(diffs), | 43 | sizeof(diffs), |
44 | (*pskb)->nh.iph->check | 44 | (*pskb)->nh.iph->check |
45 | ^0xFFFF)); | 45 | ^0xFFFF)); |
46 | (*pskb)->nfcache |= NFC_ALTERED; | ||
47 | } | 46 | } |
48 | return 1; | 47 | return 1; |
49 | } | 48 | } |
@@ -67,7 +66,7 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) | |||
67 | tcph->cwr == einfo->proto.tcp.cwr))) | 66 | tcph->cwr == einfo->proto.tcp.cwr))) |
68 | return 1; | 67 | return 1; |
69 | 68 | ||
70 | if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) | 69 | if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) |
71 | return 0; | 70 | return 0; |
72 | tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; | 71 | tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; |
73 | 72 | ||
@@ -87,7 +86,6 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) | |||
87 | tcph->check = csum_fold(csum_partial((char *)diffs, | 86 | tcph->check = csum_fold(csum_partial((char *)diffs, |
88 | sizeof(diffs), | 87 | sizeof(diffs), |
89 | tcph->check^0xFFFF)); | 88 | tcph->check^0xFFFF)); |
90 | (*pskb)->nfcache |= NFC_ALTERED; | ||
91 | return 1; | 89 | return 1; |
92 | } | 90 | } |
93 | 91 | ||
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index ef08733d26da..92ed050fac69 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c | |||
@@ -27,10 +27,6 @@ MODULE_LICENSE("GPL"); | |||
27 | MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); | 27 | MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); |
28 | MODULE_DESCRIPTION("iptables syslog logging module"); | 28 | MODULE_DESCRIPTION("iptables syslog logging module"); |
29 | 29 | ||
30 | static unsigned int nflog = 1; | ||
31 | module_param(nflog, int, 0400); | ||
32 | MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); | ||
33 | |||
34 | #if 0 | 30 | #if 0 |
35 | #define DEBUGP printk | 31 | #define DEBUGP printk |
36 | #else | 32 | #else |
@@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); | |||
41 | static DEFINE_SPINLOCK(log_lock); | 37 | static DEFINE_SPINLOCK(log_lock); |
42 | 38 | ||
43 | /* One level of recursion won't kill us */ | 39 | /* One level of recursion won't kill us */ |
44 | static void dump_packet(const struct ipt_log_info *info, | 40 | static void dump_packet(const struct nf_loginfo *info, |
45 | const struct sk_buff *skb, | 41 | const struct sk_buff *skb, |
46 | unsigned int iphoff) | 42 | unsigned int iphoff) |
47 | { | 43 | { |
48 | struct iphdr _iph, *ih; | 44 | struct iphdr _iph, *ih; |
45 | unsigned int logflags; | ||
46 | |||
47 | if (info->type == NF_LOG_TYPE_LOG) | ||
48 | logflags = info->u.log.logflags; | ||
49 | else | ||
50 | logflags = NF_LOG_MASK; | ||
49 | 51 | ||
50 | ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); | 52 | ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); |
51 | if (ih == NULL) { | 53 | if (ih == NULL) { |
@@ -76,7 +78,7 @@ static void dump_packet(const struct ipt_log_info *info, | |||
76 | if (ntohs(ih->frag_off) & IP_OFFSET) | 78 | if (ntohs(ih->frag_off) & IP_OFFSET) |
77 | printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); | 79 | printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); |
78 | 80 | ||
79 | if ((info->logflags & IPT_LOG_IPOPT) | 81 | if ((logflags & IPT_LOG_IPOPT) |
80 | && ih->ihl * 4 > sizeof(struct iphdr)) { | 82 | && ih->ihl * 4 > sizeof(struct iphdr)) { |
81 | unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; | 83 | unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; |
82 | unsigned int i, optsize; | 84 | unsigned int i, optsize; |
@@ -119,7 +121,7 @@ static void dump_packet(const struct ipt_log_info *info, | |||
119 | printk("SPT=%u DPT=%u ", | 121 | printk("SPT=%u DPT=%u ", |
120 | ntohs(th->source), ntohs(th->dest)); | 122 | ntohs(th->source), ntohs(th->dest)); |
121 | /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ | 123 | /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ |
122 | if (info->logflags & IPT_LOG_TCPSEQ) | 124 | if (logflags & IPT_LOG_TCPSEQ) |
123 | printk("SEQ=%u ACK=%u ", | 125 | printk("SEQ=%u ACK=%u ", |
124 | ntohl(th->seq), ntohl(th->ack_seq)); | 126 | ntohl(th->seq), ntohl(th->ack_seq)); |
125 | /* Max length: 13 "WINDOW=65535 " */ | 127 | /* Max length: 13 "WINDOW=65535 " */ |
@@ -146,7 +148,7 @@ static void dump_packet(const struct ipt_log_info *info, | |||
146 | /* Max length: 11 "URGP=65535 " */ | 148 | /* Max length: 11 "URGP=65535 " */ |
147 | printk("URGP=%u ", ntohs(th->urg_ptr)); | 149 | printk("URGP=%u ", ntohs(th->urg_ptr)); |
148 | 150 | ||
149 | if ((info->logflags & IPT_LOG_TCPOPT) | 151 | if ((logflags & IPT_LOG_TCPOPT) |
150 | && th->doff * 4 > sizeof(struct tcphdr)) { | 152 | && th->doff * 4 > sizeof(struct tcphdr)) { |
151 | unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; | 153 | unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; |
152 | unsigned char *op; | 154 | unsigned char *op; |
@@ -328,7 +330,7 @@ static void dump_packet(const struct ipt_log_info *info, | |||
328 | } | 330 | } |
329 | 331 | ||
330 | /* Max length: 15 "UID=4294967295 " */ | 332 | /* Max length: 15 "UID=4294967295 " */ |
331 | if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) { | 333 | if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { |
332 | read_lock_bh(&skb->sk->sk_callback_lock); | 334 | read_lock_bh(&skb->sk->sk_callback_lock); |
333 | if (skb->sk->sk_socket && skb->sk->sk_socket->file) | 335 | if (skb->sk->sk_socket && skb->sk->sk_socket->file) |
334 | printk("UID=%u ", skb->sk->sk_socket->file->f_uid); | 336 | printk("UID=%u ", skb->sk->sk_socket->file->f_uid); |
@@ -349,19 +351,31 @@ static void dump_packet(const struct ipt_log_info *info, | |||
349 | /* maxlen = 230+ 91 + 230 + 252 = 803 */ | 351 | /* maxlen = 230+ 91 + 230 + 252 = 803 */ |
350 | } | 352 | } |
351 | 353 | ||
354 | struct nf_loginfo default_loginfo = { | ||
355 | .type = NF_LOG_TYPE_LOG, | ||
356 | .u = { | ||
357 | .log = { | ||
358 | .level = 0, | ||
359 | .logflags = NF_LOG_MASK, | ||
360 | }, | ||
361 | }, | ||
362 | }; | ||
363 | |||
352 | static void | 364 | static void |
353 | ipt_log_packet(unsigned int hooknum, | 365 | ipt_log_packet(unsigned int pf, |
366 | unsigned int hooknum, | ||
354 | const struct sk_buff *skb, | 367 | const struct sk_buff *skb, |
355 | const struct net_device *in, | 368 | const struct net_device *in, |
356 | const struct net_device *out, | 369 | const struct net_device *out, |
357 | const struct ipt_log_info *loginfo, | 370 | const struct nf_loginfo *loginfo, |
358 | const char *level_string, | ||
359 | const char *prefix) | 371 | const char *prefix) |
360 | { | 372 | { |
373 | if (!loginfo) | ||
374 | loginfo = &default_loginfo; | ||
375 | |||
361 | spin_lock_bh(&log_lock); | 376 | spin_lock_bh(&log_lock); |
362 | printk(level_string); | 377 | printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, |
363 | printk("%sIN=%s OUT=%s ", | 378 | prefix, |
364 | prefix == NULL ? loginfo->prefix : prefix, | ||
365 | in ? in->name : "", | 379 | in ? in->name : "", |
366 | out ? out->name : ""); | 380 | out ? out->name : ""); |
367 | #ifdef CONFIG_BRIDGE_NETFILTER | 381 | #ifdef CONFIG_BRIDGE_NETFILTER |
@@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb, | |||
405 | void *userinfo) | 419 | void *userinfo) |
406 | { | 420 | { |
407 | const struct ipt_log_info *loginfo = targinfo; | 421 | const struct ipt_log_info *loginfo = targinfo; |
408 | char level_string[4] = "< >"; | 422 | struct nf_loginfo li; |
409 | 423 | ||
410 | level_string[1] = '0' + (loginfo->level % 8); | 424 | li.type = NF_LOG_TYPE_LOG; |
411 | ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); | 425 | li.u.log.level = loginfo->level; |
426 | li.u.log.logflags = loginfo->logflags; | ||
412 | 427 | ||
413 | return IPT_CONTINUE; | 428 | nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix); |
414 | } | ||
415 | 429 | ||
416 | static void | 430 | return IPT_CONTINUE; |
417 | ipt_logfn(unsigned int hooknum, | ||
418 | const struct sk_buff *skb, | ||
419 | const struct net_device *in, | ||
420 | const struct net_device *out, | ||
421 | const char *prefix) | ||
422 | { | ||
423 | struct ipt_log_info loginfo = { | ||
424 | .level = 0, | ||
425 | .logflags = IPT_LOG_MASK, | ||
426 | .prefix = "" | ||
427 | }; | ||
428 | |||
429 | ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); | ||
430 | } | 431 | } |
431 | 432 | ||
432 | static int ipt_log_checkentry(const char *tablename, | 433 | static int ipt_log_checkentry(const char *tablename, |
@@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = { | |||
464 | .me = THIS_MODULE, | 465 | .me = THIS_MODULE, |
465 | }; | 466 | }; |
466 | 467 | ||
468 | static struct nf_logger ipt_log_logger ={ | ||
469 | .name = "ipt_LOG", | ||
470 | .logfn = &ipt_log_packet, | ||
471 | .me = THIS_MODULE, | ||
472 | }; | ||
473 | |||
467 | static int __init init(void) | 474 | static int __init init(void) |
468 | { | 475 | { |
469 | if (ipt_register_target(&ipt_log_reg)) | 476 | if (ipt_register_target(&ipt_log_reg)) |
470 | return -EINVAL; | 477 | return -EINVAL; |
471 | if (nflog) | 478 | if (nf_log_register(PF_INET, &ipt_log_logger) < 0) { |
472 | nf_log_register(PF_INET, &ipt_logfn); | 479 | printk(KERN_WARNING "ipt_LOG: not logging via system console " |
480 | "since somebody else already registered for PF_INET\n"); | ||
481 | /* we cannot make module load fail here, since otherwise | ||
482 | * iptables userspace would abort */ | ||
483 | } | ||
473 | 484 | ||
474 | return 0; | 485 | return 0; |
475 | } | 486 | } |
476 | 487 | ||
477 | static void __exit fini(void) | 488 | static void __exit fini(void) |
478 | { | 489 | { |
479 | if (nflog) | 490 | nf_log_unregister_logger(&ipt_log_logger); |
480 | nf_log_unregister(PF_INET, &ipt_logfn); | ||
481 | ipt_unregister_target(&ipt_log_reg); | 491 | ipt_unregister_target(&ipt_log_reg); |
482 | } | 492 | } |
483 | 493 | ||
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c index 33c6f9b63b8d..52b4f2c296bf 100644 --- a/net/ipv4/netfilter/ipt_MARK.c +++ b/net/ipv4/netfilter/ipt_MARK.c | |||
@@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb, | |||
29 | { | 29 | { |
30 | const struct ipt_mark_target_info *markinfo = targinfo; | 30 | const struct ipt_mark_target_info *markinfo = targinfo; |
31 | 31 | ||
32 | if((*pskb)->nfmark != markinfo->mark) { | 32 | if((*pskb)->nfmark != markinfo->mark) |
33 | (*pskb)->nfmark = markinfo->mark; | 33 | (*pskb)->nfmark = markinfo->mark; |
34 | (*pskb)->nfcache |= NFC_ALTERED; | 34 | |
35 | } | ||
36 | return IPT_CONTINUE; | 35 | return IPT_CONTINUE; |
37 | } | 36 | } |
38 | 37 | ||
@@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb, | |||
61 | break; | 60 | break; |
62 | } | 61 | } |
63 | 62 | ||
64 | if((*pskb)->nfmark != mark) { | 63 | if((*pskb)->nfmark != mark) |
65 | (*pskb)->nfmark = mark; | 64 | (*pskb)->nfmark = mark; |
66 | (*pskb)->nfcache |= NFC_ALTERED; | 65 | |
67 | } | ||
68 | return IPT_CONTINUE; | 66 | return IPT_CONTINUE; |
69 | } | 67 | } |
70 | 68 | ||
@@ -76,6 +74,8 @@ checkentry_v0(const char *tablename, | |||
76 | unsigned int targinfosize, | 74 | unsigned int targinfosize, |
77 | unsigned int hook_mask) | 75 | unsigned int hook_mask) |
78 | { | 76 | { |
77 | struct ipt_mark_target_info *markinfo = targinfo; | ||
78 | |||
79 | if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { | 79 | if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { |
80 | printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", | 80 | printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", |
81 | targinfosize, | 81 | targinfosize, |
@@ -88,6 +88,11 @@ checkentry_v0(const char *tablename, | |||
88 | return 0; | 88 | return 0; |
89 | } | 89 | } |
90 | 90 | ||
91 | if (markinfo->mark > 0xffffffff) { | ||
92 | printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); | ||
93 | return 0; | ||
94 | } | ||
95 | |||
91 | return 1; | 96 | return 1; |
92 | } | 97 | } |
93 | 98 | ||
@@ -120,6 +125,11 @@ checkentry_v1(const char *tablename, | |||
120 | return 0; | 125 | return 0; |
121 | } | 126 | } |
122 | 127 | ||
128 | if (markinfo->mark > 0xffffffff) { | ||
129 | printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); | ||
130 | return 0; | ||
131 | } | ||
132 | |||
123 | return 1; | 133 | return 1; |
124 | } | 134 | } |
125 | 135 | ||
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 91e74502c3d3..2f3e181c8e97 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c | |||
@@ -86,11 +86,6 @@ masquerade_target(struct sk_buff **pskb, | |||
86 | 86 | ||
87 | IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); | 87 | IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); |
88 | 88 | ||
89 | /* FIXME: For the moment, don't do local packets, breaks | ||
90 | testsuite for 2.3.49 --RR */ | ||
91 | if ((*pskb)->sk) | ||
92 | return NF_ACCEPT; | ||
93 | |||
94 | ct = ip_conntrack_get(*pskb, &ctinfo); | 89 | ct = ip_conntrack_get(*pskb, &ctinfo); |
95 | IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED | 90 | IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED |
96 | || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); | 91 | || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); |
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index 06254b29d034..e6e7b6095363 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c | |||
@@ -46,7 +46,8 @@ check(const char *tablename, | |||
46 | DEBUGP(MODULENAME":check: size %u.\n", targinfosize); | 46 | DEBUGP(MODULENAME":check: size %u.\n", targinfosize); |
47 | return 0; | 47 | return 0; |
48 | } | 48 | } |
49 | if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) { | 49 | if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) | |
50 | (1 << NF_IP_LOCAL_OUT))) { | ||
50 | DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask); | 51 | DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask); |
51 | return 0; | 52 | return 0; |
52 | } | 53 | } |
@@ -76,12 +77,13 @@ target(struct sk_buff **pskb, | |||
76 | struct ip_nat_range newrange; | 77 | struct ip_nat_range newrange; |
77 | 78 | ||
78 | IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING | 79 | IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING |
79 | || hooknum == NF_IP_POST_ROUTING); | 80 | || hooknum == NF_IP_POST_ROUTING |
81 | || hooknum == NF_IP_LOCAL_OUT); | ||
80 | ct = ip_conntrack_get(*pskb, &ctinfo); | 82 | ct = ip_conntrack_get(*pskb, &ctinfo); |
81 | 83 | ||
82 | netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); | 84 | netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); |
83 | 85 | ||
84 | if (hooknum == NF_IP_PRE_ROUTING) | 86 | if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT) |
85 | new_ip = (*pskb)->nh.iph->daddr & ~netmask; | 87 | new_ip = (*pskb)->nh.iph->daddr & ~netmask; |
86 | else | 88 | else |
87 | new_ip = (*pskb)->nh.iph->saddr & ~netmask; | 89 | new_ip = (*pskb)->nh.iph->saddr & ~netmask; |
diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c new file mode 100644 index 000000000000..3cedc9be8807 --- /dev/null +++ b/net/ipv4/netfilter/ipt_NFQUEUE.c | |||
@@ -0,0 +1,70 @@ | |||
1 | /* iptables module for using new netfilter netlink queue | ||
2 | * | ||
3 | * (C) 2005 by Harald Welte <laforge@netfilter.org> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License version 2 as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | */ | ||
10 | |||
11 | #include <linux/module.h> | ||
12 | #include <linux/skbuff.h> | ||
13 | |||
14 | #include <linux/netfilter.h> | ||
15 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
16 | #include <linux/netfilter_ipv4/ipt_NFQUEUE.h> | ||
17 | |||
18 | MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | ||
19 | MODULE_DESCRIPTION("iptables NFQUEUE target"); | ||
20 | MODULE_LICENSE("GPL"); | ||
21 | |||
22 | static unsigned int | ||
23 | target(struct sk_buff **pskb, | ||
24 | const struct net_device *in, | ||
25 | const struct net_device *out, | ||
26 | unsigned int hooknum, | ||
27 | const void *targinfo, | ||
28 | void *userinfo) | ||
29 | { | ||
30 | const struct ipt_NFQ_info *tinfo = targinfo; | ||
31 | |||
32 | return NF_QUEUE_NR(tinfo->queuenum); | ||
33 | } | ||
34 | |||
35 | static int | ||
36 | checkentry(const char *tablename, | ||
37 | const struct ipt_entry *e, | ||
38 | void *targinfo, | ||
39 | unsigned int targinfosize, | ||
40 | unsigned int hook_mask) | ||
41 | { | ||
42 | if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) { | ||
43 | printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", | ||
44 | targinfosize, | ||
45 | IPT_ALIGN(sizeof(struct ipt_NFQ_info))); | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | return 1; | ||
50 | } | ||
51 | |||
52 | static struct ipt_target ipt_NFQ_reg = { | ||
53 | .name = "NFQUEUE", | ||
54 | .target = target, | ||
55 | .checkentry = checkentry, | ||
56 | .me = THIS_MODULE, | ||
57 | }; | ||
58 | |||
59 | static int __init init(void) | ||
60 | { | ||
61 | return ipt_register_target(&ipt_NFQ_reg); | ||
62 | } | ||
63 | |||
64 | static void __exit fini(void) | ||
65 | { | ||
66 | ipt_unregister_target(&ipt_NFQ_reg); | ||
67 | } | ||
68 | |||
69 | module_init(init); | ||
70 | module_exit(fini); | ||
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 915696446020..f115a84a4ac6 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c | |||
@@ -156,7 +156,6 @@ static void send_reset(struct sk_buff *oldskb, int hook) | |||
156 | 156 | ||
157 | /* This packet will not be the same as the other: clear nf fields */ | 157 | /* This packet will not be the same as the other: clear nf fields */ |
158 | nf_reset(nskb); | 158 | nf_reset(nskb); |
159 | nskb->nfcache = 0; | ||
160 | nskb->nfmark = 0; | 159 | nskb->nfmark = 0; |
161 | #ifdef CONFIG_BRIDGE_NETFILTER | 160 | #ifdef CONFIG_BRIDGE_NETFILTER |
162 | nf_bridge_put(nskb->nf_bridge); | 161 | nf_bridge_put(nskb->nf_bridge); |
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c index 7b84a254440e..8db70d6908c3 100644 --- a/net/ipv4/netfilter/ipt_TCPMSS.c +++ b/net/ipv4/netfilter/ipt_TCPMSS.c | |||
@@ -58,7 +58,7 @@ ipt_tcpmss_target(struct sk_buff **pskb, | |||
58 | unsigned int i; | 58 | unsigned int i; |
59 | u_int8_t *opt; | 59 | u_int8_t *opt; |
60 | 60 | ||
61 | if (!skb_ip_make_writable(pskb, (*pskb)->len)) | 61 | if (!skb_make_writable(pskb, (*pskb)->len)) |
62 | return NF_DROP; | 62 | return NF_DROP; |
63 | 63 | ||
64 | if ((*pskb)->ip_summed == CHECKSUM_HW && | 64 | if ((*pskb)->ip_summed == CHECKSUM_HW && |
@@ -190,7 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb, | |||
190 | newmss); | 190 | newmss); |
191 | 191 | ||
192 | retmodified: | 192 | retmodified: |
193 | (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; | ||
194 | return IPT_CONTINUE; | 193 | return IPT_CONTINUE; |
195 | } | 194 | } |
196 | 195 | ||
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c index 85c70d240f8b..deadb36d4428 100644 --- a/net/ipv4/netfilter/ipt_TOS.c +++ b/net/ipv4/netfilter/ipt_TOS.c | |||
@@ -33,7 +33,7 @@ target(struct sk_buff **pskb, | |||
33 | if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { | 33 | if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { |
34 | u_int16_t diffs[2]; | 34 | u_int16_t diffs[2]; |
35 | 35 | ||
36 | if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) | 36 | if (!skb_make_writable(pskb, sizeof(struct iphdr))) |
37 | return NF_DROP; | 37 | return NF_DROP; |
38 | 38 | ||
39 | diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; | 39 | diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; |
@@ -46,7 +46,6 @@ target(struct sk_buff **pskb, | |||
46 | sizeof(diffs), | 46 | sizeof(diffs), |
47 | (*pskb)->nh.iph->check | 47 | (*pskb)->nh.iph->check |
48 | ^0xFFFF)); | 48 | ^0xFFFF)); |
49 | (*pskb)->nfcache |= NFC_ALTERED; | ||
50 | } | 49 | } |
51 | return IPT_CONTINUE; | 50 | return IPT_CONTINUE; |
52 | } | 51 | } |
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c new file mode 100644 index 000000000000..b9ae6a9382f3 --- /dev/null +++ b/net/ipv4/netfilter/ipt_TTL.c | |||
@@ -0,0 +1,119 @@ | |||
1 | /* TTL modification target for IP tables | ||
2 | * (C) 2000,2005 by Harald Welte <laforge@netfilter.org> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | */ | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/skbuff.h> | ||
12 | #include <linux/ip.h> | ||
13 | #include <net/checksum.h> | ||
14 | |||
15 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
16 | #include <linux/netfilter_ipv4/ipt_TTL.h> | ||
17 | |||
18 | MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | ||
19 | MODULE_DESCRIPTION("IP tables TTL modification module"); | ||
20 | MODULE_LICENSE("GPL"); | ||
21 | |||
22 | static unsigned int | ||
23 | ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in, | ||
24 | const struct net_device *out, unsigned int hooknum, | ||
25 | const void *targinfo, void *userinfo) | ||
26 | { | ||
27 | struct iphdr *iph; | ||
28 | const struct ipt_TTL_info *info = targinfo; | ||
29 | u_int16_t diffs[2]; | ||
30 | int new_ttl; | ||
31 | |||
32 | if (!skb_make_writable(pskb, (*pskb)->len)) | ||
33 | return NF_DROP; | ||
34 | |||
35 | iph = (*pskb)->nh.iph; | ||
36 | |||
37 | switch (info->mode) { | ||
38 | case IPT_TTL_SET: | ||
39 | new_ttl = info->ttl; | ||
40 | break; | ||
41 | case IPT_TTL_INC: | ||
42 | new_ttl = iph->ttl + info->ttl; | ||
43 | if (new_ttl > 255) | ||
44 | new_ttl = 255; | ||
45 | break; | ||
46 | case IPT_TTL_DEC: | ||
47 | new_ttl = iph->ttl - info->ttl; | ||
48 | if (new_ttl < 0) | ||
49 | new_ttl = 0; | ||
50 | break; | ||
51 | default: | ||
52 | new_ttl = iph->ttl; | ||
53 | break; | ||
54 | } | ||
55 | |||
56 | if (new_ttl != iph->ttl) { | ||
57 | diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF; | ||
58 | iph->ttl = new_ttl; | ||
59 | diffs[1] = htons(((unsigned)iph->ttl) << 8); | ||
60 | iph->check = csum_fold(csum_partial((char *)diffs, | ||
61 | sizeof(diffs), | ||
62 | iph->check^0xFFFF)); | ||
63 | } | ||
64 | |||
65 | return IPT_CONTINUE; | ||
66 | } | ||
67 | |||
68 | static int ipt_ttl_checkentry(const char *tablename, | ||
69 | const struct ipt_entry *e, | ||
70 | void *targinfo, | ||
71 | unsigned int targinfosize, | ||
72 | unsigned int hook_mask) | ||
73 | { | ||
74 | struct ipt_TTL_info *info = targinfo; | ||
75 | |||
76 | if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) { | ||
77 | printk(KERN_WARNING "ipt_TTL: targinfosize %u != %Zu\n", | ||
78 | targinfosize, | ||
79 | IPT_ALIGN(sizeof(struct ipt_TTL_info))); | ||
80 | return 0; | ||
81 | } | ||
82 | |||
83 | if (strcmp(tablename, "mangle")) { | ||
84 | printk(KERN_WARNING "ipt_TTL: can only be called from " | ||
85 | "\"mangle\" table, not \"%s\"\n", tablename); | ||
86 | return 0; | ||
87 | } | ||
88 | |||
89 | if (info->mode > IPT_TTL_MAXMODE) { | ||
90 | printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", | ||
91 | info->mode); | ||
92 | return 0; | ||
93 | } | ||
94 | |||
95 | if ((info->mode != IPT_TTL_SET) && (info->ttl == 0)) | ||
96 | return 0; | ||
97 | |||
98 | return 1; | ||
99 | } | ||
100 | |||
101 | static struct ipt_target ipt_TTL = { | ||
102 | .name = "TTL", | ||
103 | .target = ipt_ttl_target, | ||
104 | .checkentry = ipt_ttl_checkentry, | ||
105 | .me = THIS_MODULE, | ||
106 | }; | ||
107 | |||
108 | static int __init init(void) | ||
109 | { | ||
110 | return ipt_register_target(&ipt_TTL); | ||
111 | } | ||
112 | |||
113 | static void __exit fini(void) | ||
114 | { | ||
115 | ipt_unregister_target(&ipt_TTL); | ||
116 | } | ||
117 | |||
118 | module_init(init); | ||
119 | module_exit(fini); | ||
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 52a0076302a7..e2c14f3cb2fc 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c | |||
@@ -62,6 +62,7 @@ | |||
62 | MODULE_LICENSE("GPL"); | 62 | MODULE_LICENSE("GPL"); |
63 | MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); | 63 | MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); |
64 | MODULE_DESCRIPTION("iptables userspace logging module"); | 64 | MODULE_DESCRIPTION("iptables userspace logging module"); |
65 | MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); | ||
65 | 66 | ||
66 | #define ULOG_NL_EVENT 111 /* Harald's favorite number */ | 67 | #define ULOG_NL_EVENT 111 /* Harald's favorite number */ |
67 | #define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ | 68 | #define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ |
@@ -115,10 +116,10 @@ static void ulog_send(unsigned int nlgroupnum) | |||
115 | if (ub->qlen > 1) | 116 | if (ub->qlen > 1) |
116 | ub->lastnlh->nlmsg_type = NLMSG_DONE; | 117 | ub->lastnlh->nlmsg_type = NLMSG_DONE; |
117 | 118 | ||
118 | NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum); | 119 | NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; |
119 | DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n", | 120 | DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n", |
120 | ub->qlen, nlgroupnum); | 121 | ub->qlen, nlgroupnum + 1); |
121 | netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC); | 122 | netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC); |
122 | 123 | ||
123 | ub->qlen = 0; | 124 | ub->qlen = 0; |
124 | ub->skb = NULL; | 125 | ub->skb = NULL; |
@@ -219,13 +220,13 @@ static void ipt_ulog_packet(unsigned int hooknum, | |||
219 | pm = NLMSG_DATA(nlh); | 220 | pm = NLMSG_DATA(nlh); |
220 | 221 | ||
221 | /* We might not have a timestamp, get one */ | 222 | /* We might not have a timestamp, get one */ |
222 | if (skb->stamp.tv_sec == 0) | 223 | if (skb->tstamp.off_sec == 0) |
223 | do_gettimeofday((struct timeval *)&skb->stamp); | 224 | __net_timestamp((struct sk_buff *)skb); |
224 | 225 | ||
225 | /* copy hook, prefix, timestamp, payload, etc. */ | 226 | /* copy hook, prefix, timestamp, payload, etc. */ |
226 | pm->data_len = copy_len; | 227 | pm->data_len = copy_len; |
227 | pm->timestamp_sec = skb->stamp.tv_sec; | 228 | pm->timestamp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec; |
228 | pm->timestamp_usec = skb->stamp.tv_usec; | 229 | pm->timestamp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec; |
229 | pm->mark = skb->nfmark; | 230 | pm->mark = skb->nfmark; |
230 | pm->hook = hooknum; | 231 | pm->hook = hooknum; |
231 | if (prefix != NULL) | 232 | if (prefix != NULL) |
@@ -303,18 +304,27 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb, | |||
303 | return IPT_CONTINUE; | 304 | return IPT_CONTINUE; |
304 | } | 305 | } |
305 | 306 | ||
306 | static void ipt_logfn(unsigned int hooknum, | 307 | static void ipt_logfn(unsigned int pf, |
308 | unsigned int hooknum, | ||
307 | const struct sk_buff *skb, | 309 | const struct sk_buff *skb, |
308 | const struct net_device *in, | 310 | const struct net_device *in, |
309 | const struct net_device *out, | 311 | const struct net_device *out, |
312 | const struct nf_loginfo *li, | ||
310 | const char *prefix) | 313 | const char *prefix) |
311 | { | 314 | { |
312 | struct ipt_ulog_info loginfo = { | 315 | struct ipt_ulog_info loginfo; |
313 | .nl_group = ULOG_DEFAULT_NLGROUP, | 316 | |
314 | .copy_range = 0, | 317 | if (!li || li->type != NF_LOG_TYPE_ULOG) { |
315 | .qthreshold = ULOG_DEFAULT_QTHRESHOLD, | 318 | loginfo.nl_group = ULOG_DEFAULT_NLGROUP; |
316 | .prefix = "" | 319 | loginfo.copy_range = 0; |
317 | }; | 320 | loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD; |
321 | loginfo.prefix[0] = '\0'; | ||
322 | } else { | ||
323 | loginfo.nl_group = li->u.ulog.group; | ||
324 | loginfo.copy_range = li->u.ulog.copy_len; | ||
325 | loginfo.qthreshold = li->u.ulog.qthreshold; | ||
326 | strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); | ||
327 | } | ||
318 | 328 | ||
319 | ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); | 329 | ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); |
320 | } | 330 | } |
@@ -354,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = { | |||
354 | .me = THIS_MODULE, | 364 | .me = THIS_MODULE, |
355 | }; | 365 | }; |
356 | 366 | ||
367 | static struct nf_logger ipt_ulog_logger = { | ||
368 | .name = "ipt_ULOG", | ||
369 | .logfn = &ipt_logfn, | ||
370 | .me = THIS_MODULE, | ||
371 | }; | ||
372 | |||
357 | static int __init init(void) | 373 | static int __init init(void) |
358 | { | 374 | { |
359 | int i; | 375 | int i; |
@@ -372,7 +388,8 @@ static int __init init(void) | |||
372 | ulog_buffers[i].timer.data = i; | 388 | ulog_buffers[i].timer.data = i; |
373 | } | 389 | } |
374 | 390 | ||
375 | nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL); | 391 | nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, |
392 | THIS_MODULE); | ||
376 | if (!nflognl) | 393 | if (!nflognl) |
377 | return -ENOMEM; | 394 | return -ENOMEM; |
378 | 395 | ||
@@ -381,7 +398,7 @@ static int __init init(void) | |||
381 | return -EINVAL; | 398 | return -EINVAL; |
382 | } | 399 | } |
383 | if (nflog) | 400 | if (nflog) |
384 | nf_log_register(PF_INET, &ipt_logfn); | 401 | nf_log_register(PF_INET, &ipt_ulog_logger); |
385 | 402 | ||
386 | return 0; | 403 | return 0; |
387 | } | 404 | } |
@@ -394,7 +411,7 @@ static void __exit fini(void) | |||
394 | DEBUGP("ipt_ULOG: cleanup_module\n"); | 411 | DEBUGP("ipt_ULOG: cleanup_module\n"); |
395 | 412 | ||
396 | if (nflog) | 413 | if (nflog) |
397 | nf_log_unregister(PF_INET, &ipt_logfn); | 414 | nf_log_unregister_logger(&ipt_ulog_logger); |
398 | ipt_unregister_target(&ipt_ulog_reg); | 415 | ipt_unregister_target(&ipt_ulog_reg); |
399 | sock_release(nflognl->sk_socket); | 416 | sock_release(nflognl->sk_socket); |
400 | 417 | ||
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c new file mode 100644 index 000000000000..df4a42c6da22 --- /dev/null +++ b/net/ipv4/netfilter/ipt_connbytes.c | |||
@@ -0,0 +1,162 @@ | |||
1 | /* Kernel module to match connection tracking byte counter. | ||
2 | * GPL (C) 2002 Martin Devera (devik@cdi.cz). | ||
3 | * | ||
4 | * 2004-07-20 Harald Welte <laforge@netfilter.org> | ||
5 | * - reimplemented to use per-connection accounting counters | ||
6 | * - add functionality to match number of packets | ||
7 | * - add functionality to match average packet size | ||
8 | * - add support to match directions seperately | ||
9 | * | ||
10 | */ | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/skbuff.h> | ||
13 | #include <linux/netfilter_ipv4/ip_conntrack.h> | ||
14 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
15 | #include <linux/netfilter_ipv4/ipt_connbytes.h> | ||
16 | |||
17 | #include <asm/div64.h> | ||
18 | #include <asm/bitops.h> | ||
19 | |||
20 | MODULE_LICENSE("GPL"); | ||
21 | MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | ||
22 | MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection"); | ||
23 | |||
24 | /* 64bit divisor, dividend and result. dynamic precision */ | ||
25 | static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) | ||
26 | { | ||
27 | u_int32_t d = divisor; | ||
28 | |||
29 | if (divisor > 0xffffffffULL) { | ||
30 | unsigned int shift = fls(divisor >> 32); | ||
31 | |||
32 | d = divisor >> shift; | ||
33 | dividend >>= shift; | ||
34 | } | ||
35 | |||
36 | do_div(dividend, d); | ||
37 | return dividend; | ||
38 | } | ||
39 | |||
40 | static int | ||
41 | match(const struct sk_buff *skb, | ||
42 | const struct net_device *in, | ||
43 | const struct net_device *out, | ||
44 | const void *matchinfo, | ||
45 | int offset, | ||
46 | int *hotdrop) | ||
47 | { | ||
48 | const struct ipt_connbytes_info *sinfo = matchinfo; | ||
49 | enum ip_conntrack_info ctinfo; | ||
50 | struct ip_conntrack *ct; | ||
51 | u_int64_t what = 0; /* initialize to make gcc happy */ | ||
52 | |||
53 | if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo))) | ||
54 | return 0; /* no match */ | ||
55 | |||
56 | switch (sinfo->what) { | ||
57 | case IPT_CONNBYTES_PKTS: | ||
58 | switch (sinfo->direction) { | ||
59 | case IPT_CONNBYTES_DIR_ORIGINAL: | ||
60 | what = ct->counters[IP_CT_DIR_ORIGINAL].packets; | ||
61 | break; | ||
62 | case IPT_CONNBYTES_DIR_REPLY: | ||
63 | what = ct->counters[IP_CT_DIR_REPLY].packets; | ||
64 | break; | ||
65 | case IPT_CONNBYTES_DIR_BOTH: | ||
66 | what = ct->counters[IP_CT_DIR_ORIGINAL].packets; | ||
67 | what += ct->counters[IP_CT_DIR_REPLY].packets; | ||
68 | break; | ||
69 | } | ||
70 | break; | ||
71 | case IPT_CONNBYTES_BYTES: | ||
72 | switch (sinfo->direction) { | ||
73 | case IPT_CONNBYTES_DIR_ORIGINAL: | ||
74 | what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; | ||
75 | break; | ||
76 | case IPT_CONNBYTES_DIR_REPLY: | ||
77 | what = ct->counters[IP_CT_DIR_REPLY].bytes; | ||
78 | break; | ||
79 | case IPT_CONNBYTES_DIR_BOTH: | ||
80 | what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; | ||
81 | what += ct->counters[IP_CT_DIR_REPLY].bytes; | ||
82 | break; | ||
83 | } | ||
84 | break; | ||
85 | case IPT_CONNBYTES_AVGPKT: | ||
86 | switch (sinfo->direction) { | ||
87 | case IPT_CONNBYTES_DIR_ORIGINAL: | ||
88 | what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes, | ||
89 | ct->counters[IP_CT_DIR_ORIGINAL].packets); | ||
90 | break; | ||
91 | case IPT_CONNBYTES_DIR_REPLY: | ||
92 | what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes, | ||
93 | ct->counters[IP_CT_DIR_REPLY].packets); | ||
94 | break; | ||
95 | case IPT_CONNBYTES_DIR_BOTH: | ||
96 | { | ||
97 | u_int64_t bytes; | ||
98 | u_int64_t pkts; | ||
99 | bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes + | ||
100 | ct->counters[IP_CT_DIR_REPLY].bytes; | ||
101 | pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+ | ||
102 | ct->counters[IP_CT_DIR_REPLY].packets; | ||
103 | |||
104 | /* FIXME_THEORETICAL: what to do if sum | ||
105 | * overflows ? */ | ||
106 | |||
107 | what = div64_64(bytes, pkts); | ||
108 | } | ||
109 | break; | ||
110 | } | ||
111 | break; | ||
112 | } | ||
113 | |||
114 | if (sinfo->count.to) | ||
115 | return (what <= sinfo->count.to && what >= sinfo->count.from); | ||
116 | else | ||
117 | return (what >= sinfo->count.from); | ||
118 | } | ||
119 | |||
120 | static int check(const char *tablename, | ||
121 | const struct ipt_ip *ip, | ||
122 | void *matchinfo, | ||
123 | unsigned int matchsize, | ||
124 | unsigned int hook_mask) | ||
125 | { | ||
126 | const struct ipt_connbytes_info *sinfo = matchinfo; | ||
127 | |||
128 | if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info))) | ||
129 | return 0; | ||
130 | |||
131 | if (sinfo->what != IPT_CONNBYTES_PKTS && | ||
132 | sinfo->what != IPT_CONNBYTES_BYTES && | ||
133 | sinfo->what != IPT_CONNBYTES_AVGPKT) | ||
134 | return 0; | ||
135 | |||
136 | if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL && | ||
137 | sinfo->direction != IPT_CONNBYTES_DIR_REPLY && | ||
138 | sinfo->direction != IPT_CONNBYTES_DIR_BOTH) | ||
139 | return 0; | ||
140 | |||
141 | return 1; | ||
142 | } | ||
143 | |||
144 | static struct ipt_match state_match = { | ||
145 | .name = "connbytes", | ||
146 | .match = &match, | ||
147 | .checkentry = &check, | ||
148 | .me = THIS_MODULE | ||
149 | }; | ||
150 | |||
151 | static int __init init(void) | ||
152 | { | ||
153 | return ipt_register_match(&state_match); | ||
154 | } | ||
155 | |||
156 | static void __exit fini(void) | ||
157 | { | ||
158 | ipt_unregister_match(&state_match); | ||
159 | } | ||
160 | |||
161 | module_init(init); | ||
162 | module_exit(fini); | ||
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c index 2706f96cea55..bf8de47ce004 100644 --- a/net/ipv4/netfilter/ipt_connmark.c +++ b/net/ipv4/netfilter/ipt_connmark.c | |||
@@ -54,9 +54,16 @@ checkentry(const char *tablename, | |||
54 | unsigned int matchsize, | 54 | unsigned int matchsize, |
55 | unsigned int hook_mask) | 55 | unsigned int hook_mask) |
56 | { | 56 | { |
57 | struct ipt_connmark_info *cm = | ||
58 | (struct ipt_connmark_info *)matchinfo; | ||
57 | if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) | 59 | if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) |
58 | return 0; | 60 | return 0; |
59 | 61 | ||
62 | if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { | ||
63 | printk(KERN_WARNING "connmark: only support 32bit mark\n"); | ||
64 | return 0; | ||
65 | } | ||
66 | |||
60 | return 1; | 67 | return 1; |
61 | } | 68 | } |
62 | 69 | ||
diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c new file mode 100644 index 000000000000..ad3278bba6c1 --- /dev/null +++ b/net/ipv4/netfilter/ipt_dccp.c | |||
@@ -0,0 +1,176 @@ | |||
1 | /* | ||
2 | * iptables module for DCCP protocol header matching | ||
3 | * | ||
4 | * (C) 2005 by Harald Welte <laforge@netfilter.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/module.h> | ||
12 | #include <linux/skbuff.h> | ||
13 | #include <linux/spinlock.h> | ||
14 | #include <net/ip.h> | ||
15 | #include <linux/dccp.h> | ||
16 | |||
17 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
18 | #include <linux/netfilter_ipv4/ipt_dccp.h> | ||
19 | |||
20 | #define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ | ||
21 | || (!!((invflag) & (option)) ^ (cond))) | ||
22 | |||
23 | static unsigned char *dccp_optbuf; | ||
24 | static DEFINE_SPINLOCK(dccp_buflock); | ||
25 | |||
26 | static inline int | ||
27 | dccp_find_option(u_int8_t option, | ||
28 | const struct sk_buff *skb, | ||
29 | const struct dccp_hdr *dh, | ||
30 | int *hotdrop) | ||
31 | { | ||
32 | /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ | ||
33 | unsigned char *op; | ||
34 | unsigned int optoff = __dccp_hdr_len(dh); | ||
35 | unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh); | ||
36 | unsigned int i; | ||
37 | |||
38 | if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) { | ||
39 | *hotdrop = 1; | ||
40 | return 0; | ||
41 | } | ||
42 | |||
43 | if (!optlen) | ||
44 | return 0; | ||
45 | |||
46 | spin_lock_bh(&dccp_buflock); | ||
47 | op = skb_header_pointer(skb, | ||
48 | skb->nh.iph->ihl*4 + optoff, | ||
49 | optlen, dccp_optbuf); | ||
50 | if (op == NULL) { | ||
51 | /* If we don't have the whole header, drop packet. */ | ||
52 | spin_unlock_bh(&dccp_buflock); | ||
53 | *hotdrop = 1; | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | for (i = 0; i < optlen; ) { | ||
58 | if (op[i] == option) { | ||
59 | spin_unlock_bh(&dccp_buflock); | ||
60 | return 1; | ||
61 | } | ||
62 | |||
63 | if (op[i] < 2) | ||
64 | i++; | ||
65 | else | ||
66 | i += op[i+1]?:1; | ||
67 | } | ||
68 | |||
69 | spin_unlock_bh(&dccp_buflock); | ||
70 | return 0; | ||
71 | } | ||
72 | |||
73 | |||
74 | static inline int | ||
75 | match_types(const struct dccp_hdr *dh, u_int16_t typemask) | ||
76 | { | ||
77 | return (typemask & (1 << dh->dccph_type)); | ||
78 | } | ||
79 | |||
80 | static inline int | ||
81 | match_option(u_int8_t option, const struct sk_buff *skb, | ||
82 | const struct dccp_hdr *dh, int *hotdrop) | ||
83 | { | ||
84 | return dccp_find_option(option, skb, dh, hotdrop); | ||
85 | } | ||
86 | |||
87 | static int | ||
88 | match(const struct sk_buff *skb, | ||
89 | const struct net_device *in, | ||
90 | const struct net_device *out, | ||
91 | const void *matchinfo, | ||
92 | int offset, | ||
93 | int *hotdrop) | ||
94 | { | ||
95 | const struct ipt_dccp_info *info = | ||
96 | (const struct ipt_dccp_info *)matchinfo; | ||
97 | struct dccp_hdr _dh, *dh; | ||
98 | |||
99 | if (offset) | ||
100 | return 0; | ||
101 | |||
102 | dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh); | ||
103 | if (dh == NULL) { | ||
104 | *hotdrop = 1; | ||
105 | return 0; | ||
106 | } | ||
107 | |||
108 | return DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0]) | ||
109 | && (ntohs(dh->dccph_sport) <= info->spts[1])), | ||
110 | IPT_DCCP_SRC_PORTS, info->flags, info->invflags) | ||
111 | && DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0]) | ||
112 | && (ntohs(dh->dccph_dport) <= info->dpts[1])), | ||
113 | IPT_DCCP_DEST_PORTS, info->flags, info->invflags) | ||
114 | && DCCHECK(match_types(dh, info->typemask), | ||
115 | IPT_DCCP_TYPE, info->flags, info->invflags) | ||
116 | && DCCHECK(match_option(info->option, skb, dh, hotdrop), | ||
117 | IPT_DCCP_OPTION, info->flags, info->invflags); | ||
118 | } | ||
119 | |||
120 | static int | ||
121 | checkentry(const char *tablename, | ||
122 | const struct ipt_ip *ip, | ||
123 | void *matchinfo, | ||
124 | unsigned int matchsize, | ||
125 | unsigned int hook_mask) | ||
126 | { | ||
127 | const struct ipt_dccp_info *info; | ||
128 | |||
129 | info = (const struct ipt_dccp_info *)matchinfo; | ||
130 | |||
131 | return ip->proto == IPPROTO_DCCP | ||
132 | && !(ip->invflags & IPT_INV_PROTO) | ||
133 | && matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info)) | ||
134 | && !(info->flags & ~IPT_DCCP_VALID_FLAGS) | ||
135 | && !(info->invflags & ~IPT_DCCP_VALID_FLAGS) | ||
136 | && !(info->invflags & ~info->flags); | ||
137 | } | ||
138 | |||
139 | static struct ipt_match dccp_match = | ||
140 | { | ||
141 | .name = "dccp", | ||
142 | .match = &match, | ||
143 | .checkentry = &checkentry, | ||
144 | .me = THIS_MODULE, | ||
145 | }; | ||
146 | |||
147 | static int __init init(void) | ||
148 | { | ||
149 | int ret; | ||
150 | |||
151 | /* doff is 8 bits, so the maximum option size is (4*256). Don't put | ||
152 | * this in BSS since DaveM is worried about locked TLB's for kernel | ||
153 | * BSS. */ | ||
154 | dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL); | ||
155 | if (!dccp_optbuf) | ||
156 | return -ENOMEM; | ||
157 | ret = ipt_register_match(&dccp_match); | ||
158 | if (ret) | ||
159 | kfree(dccp_optbuf); | ||
160 | |||
161 | return ret; | ||
162 | } | ||
163 | |||
164 | static void __exit fini(void) | ||
165 | { | ||
166 | ipt_unregister_match(&dccp_match); | ||
167 | kfree(dccp_optbuf); | ||
168 | } | ||
169 | |||
170 | module_init(init); | ||
171 | module_exit(fini); | ||
172 | |||
173 | MODULE_LICENSE("GPL"); | ||
174 | MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | ||
175 | MODULE_DESCRIPTION("Match for DCCP protocol packets"); | ||
176 | |||
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 564b49bfebcf..2dd1cccbdab9 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c | |||
@@ -94,7 +94,7 @@ struct ipt_hashlimit_htable { | |||
94 | static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ | 94 | static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ |
95 | static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ | 95 | static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ |
96 | static HLIST_HEAD(hashlimit_htables); | 96 | static HLIST_HEAD(hashlimit_htables); |
97 | static kmem_cache_t *hashlimit_cachep; | 97 | static kmem_cache_t *hashlimit_cachep __read_mostly; |
98 | 98 | ||
99 | static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) | 99 | static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) |
100 | { | 100 | { |
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c index 8955728127b9..00bef6cdd3f8 100644 --- a/net/ipv4/netfilter/ipt_mark.c +++ b/net/ipv4/netfilter/ipt_mark.c | |||
@@ -37,9 +37,16 @@ checkentry(const char *tablename, | |||
37 | unsigned int matchsize, | 37 | unsigned int matchsize, |
38 | unsigned int hook_mask) | 38 | unsigned int hook_mask) |
39 | { | 39 | { |
40 | struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo; | ||
41 | |||
40 | if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) | 42 | if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) |
41 | return 0; | 43 | return 0; |
42 | 44 | ||
45 | if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { | ||
46 | printk(KERN_WARNING "mark: only supports 32bit mark\n"); | ||
47 | return 0; | ||
48 | } | ||
49 | |||
43 | return 1; | 50 | return 1; |
44 | } | 51 | } |
45 | 52 | ||
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c index 3b9065e06381..c1889f88262b 100644 --- a/net/ipv4/netfilter/ipt_owner.c +++ b/net/ipv4/netfilter/ipt_owner.c | |||
@@ -21,106 +21,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); | |||
21 | MODULE_DESCRIPTION("iptables owner match"); | 21 | MODULE_DESCRIPTION("iptables owner match"); |
22 | 22 | ||
23 | static int | 23 | static int |
24 | match_comm(const struct sk_buff *skb, const char *comm) | ||
25 | { | ||
26 | struct task_struct *g, *p; | ||
27 | struct files_struct *files; | ||
28 | int i; | ||
29 | |||
30 | read_lock(&tasklist_lock); | ||
31 | do_each_thread(g, p) { | ||
32 | if(strncmp(p->comm, comm, sizeof(p->comm))) | ||
33 | continue; | ||
34 | |||
35 | task_lock(p); | ||
36 | files = p->files; | ||
37 | if(files) { | ||
38 | spin_lock(&files->file_lock); | ||
39 | for (i=0; i < files->max_fds; i++) { | ||
40 | if (fcheck_files(files, i) == | ||
41 | skb->sk->sk_socket->file) { | ||
42 | spin_unlock(&files->file_lock); | ||
43 | task_unlock(p); | ||
44 | read_unlock(&tasklist_lock); | ||
45 | return 1; | ||
46 | } | ||
47 | } | ||
48 | spin_unlock(&files->file_lock); | ||
49 | } | ||
50 | task_unlock(p); | ||
51 | } while_each_thread(g, p); | ||
52 | read_unlock(&tasklist_lock); | ||
53 | return 0; | ||
54 | } | ||
55 | |||
56 | static int | ||
57 | match_pid(const struct sk_buff *skb, pid_t pid) | ||
58 | { | ||
59 | struct task_struct *p; | ||
60 | struct files_struct *files; | ||
61 | int i; | ||
62 | |||
63 | read_lock(&tasklist_lock); | ||
64 | p = find_task_by_pid(pid); | ||
65 | if (!p) | ||
66 | goto out; | ||
67 | task_lock(p); | ||
68 | files = p->files; | ||
69 | if(files) { | ||
70 | spin_lock(&files->file_lock); | ||
71 | for (i=0; i < files->max_fds; i++) { | ||
72 | if (fcheck_files(files, i) == | ||
73 | skb->sk->sk_socket->file) { | ||
74 | spin_unlock(&files->file_lock); | ||
75 | task_unlock(p); | ||
76 | read_unlock(&tasklist_lock); | ||
77 | return 1; | ||
78 | } | ||
79 | } | ||
80 | spin_unlock(&files->file_lock); | ||
81 | } | ||
82 | task_unlock(p); | ||
83 | out: | ||
84 | read_unlock(&tasklist_lock); | ||
85 | return 0; | ||
86 | } | ||
87 | |||
88 | static int | ||
89 | match_sid(const struct sk_buff *skb, pid_t sid) | ||
90 | { | ||
91 | struct task_struct *g, *p; | ||
92 | struct file *file = skb->sk->sk_socket->file; | ||
93 | int i, found=0; | ||
94 | |||
95 | read_lock(&tasklist_lock); | ||
96 | do_each_thread(g, p) { | ||
97 | struct files_struct *files; | ||
98 | if (p->signal->session != sid) | ||
99 | continue; | ||
100 | |||
101 | task_lock(p); | ||
102 | files = p->files; | ||
103 | if (files) { | ||
104 | spin_lock(&files->file_lock); | ||
105 | for (i=0; i < files->max_fds; i++) { | ||
106 | if (fcheck_files(files, i) == file) { | ||
107 | found = 1; | ||
108 | break; | ||
109 | } | ||
110 | } | ||
111 | spin_unlock(&files->file_lock); | ||
112 | } | ||
113 | task_unlock(p); | ||
114 | if (found) | ||
115 | goto out; | ||
116 | } while_each_thread(g, p); | ||
117 | out: | ||
118 | read_unlock(&tasklist_lock); | ||
119 | |||
120 | return found; | ||
121 | } | ||
122 | |||
123 | static int | ||
124 | match(const struct sk_buff *skb, | 24 | match(const struct sk_buff *skb, |
125 | const struct net_device *in, | 25 | const struct net_device *in, |
126 | const struct net_device *out, | 26 | const struct net_device *out, |
@@ -145,24 +45,6 @@ match(const struct sk_buff *skb, | |||
145 | return 0; | 45 | return 0; |
146 | } | 46 | } |
147 | 47 | ||
148 | if(info->match & IPT_OWNER_PID) { | ||
149 | if (!match_pid(skb, info->pid) ^ | ||
150 | !!(info->invert & IPT_OWNER_PID)) | ||
151 | return 0; | ||
152 | } | ||
153 | |||
154 | if(info->match & IPT_OWNER_SID) { | ||
155 | if (!match_sid(skb, info->sid) ^ | ||
156 | !!(info->invert & IPT_OWNER_SID)) | ||
157 | return 0; | ||
158 | } | ||
159 | |||
160 | if(info->match & IPT_OWNER_COMM) { | ||
161 | if (!match_comm(skb, info->comm) ^ | ||
162 | !!(info->invert & IPT_OWNER_COMM)) | ||
163 | return 0; | ||
164 | } | ||
165 | |||
166 | return 1; | 48 | return 1; |
167 | } | 49 | } |
168 | 50 | ||
@@ -173,6 +55,8 @@ checkentry(const char *tablename, | |||
173 | unsigned int matchsize, | 55 | unsigned int matchsize, |
174 | unsigned int hook_mask) | 56 | unsigned int hook_mask) |
175 | { | 57 | { |
58 | const struct ipt_owner_info *info = matchinfo; | ||
59 | |||
176 | if (hook_mask | 60 | if (hook_mask |
177 | & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { | 61 | & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { |
178 | printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); | 62 | printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); |
@@ -184,15 +68,13 @@ checkentry(const char *tablename, | |||
184 | IPT_ALIGN(sizeof(struct ipt_owner_info))); | 68 | IPT_ALIGN(sizeof(struct ipt_owner_info))); |
185 | return 0; | 69 | return 0; |
186 | } | 70 | } |
187 | #ifdef CONFIG_SMP | 71 | |
188 | /* files->file_lock can not be used in a BH */ | 72 | if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { |
189 | if (((struct ipt_owner_info *)matchinfo)->match | 73 | printk("ipt_owner: pid, sid and command matching " |
190 | & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { | 74 | "not supported anymore\n"); |
191 | printk("ipt_owner: pid, sid and command matching is broken " | ||
192 | "on SMP.\n"); | ||
193 | return 0; | 75 | return 0; |
194 | } | 76 | } |
195 | #endif | 77 | |
196 | return 1; | 78 | return 1; |
197 | } | 79 | } |
198 | 80 | ||
diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c new file mode 100644 index 000000000000..b5def204d798 --- /dev/null +++ b/net/ipv4/netfilter/ipt_string.c | |||
@@ -0,0 +1,91 @@ | |||
1 | /* String matching match for iptables | ||
2 | * | ||
3 | * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License version 2 as | ||
7 | * published by the Free Software Foundation. | ||
8 | */ | ||
9 | |||
10 | #include <linux/init.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/skbuff.h> | ||
14 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
15 | #include <linux/netfilter_ipv4/ipt_string.h> | ||
16 | #include <linux/textsearch.h> | ||
17 | |||
18 | MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>"); | ||
19 | MODULE_DESCRIPTION("IP tables string match module"); | ||
20 | MODULE_LICENSE("GPL"); | ||
21 | |||
22 | static int match(const struct sk_buff *skb, | ||
23 | const struct net_device *in, | ||
24 | const struct net_device *out, | ||
25 | const void *matchinfo, | ||
26 | int offset, | ||
27 | int *hotdrop) | ||
28 | { | ||
29 | struct ts_state state; | ||
30 | struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo; | ||
31 | |||
32 | memset(&state, 0, sizeof(struct ts_state)); | ||
33 | |||
34 | return (skb_find_text((struct sk_buff *)skb, conf->from_offset, | ||
35 | conf->to_offset, conf->config, &state) | ||
36 | != UINT_MAX) && !conf->invert; | ||
37 | } | ||
38 | |||
39 | #define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m) | ||
40 | |||
41 | static int checkentry(const char *tablename, | ||
42 | const struct ipt_ip *ip, | ||
43 | void *matchinfo, | ||
44 | unsigned int matchsize, | ||
45 | unsigned int hook_mask) | ||
46 | { | ||
47 | struct ipt_string_info *conf = matchinfo; | ||
48 | struct ts_config *ts_conf; | ||
49 | |||
50 | if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info))) | ||
51 | return 0; | ||
52 | |||
53 | /* Damn, can't handle this case properly with iptables... */ | ||
54 | if (conf->from_offset > conf->to_offset) | ||
55 | return 0; | ||
56 | |||
57 | ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen, | ||
58 | GFP_KERNEL, TS_AUTOLOAD); | ||
59 | if (IS_ERR(ts_conf)) | ||
60 | return 0; | ||
61 | |||
62 | conf->config = ts_conf; | ||
63 | |||
64 | return 1; | ||
65 | } | ||
66 | |||
67 | static void destroy(void *matchinfo, unsigned int matchsize) | ||
68 | { | ||
69 | textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config); | ||
70 | } | ||
71 | |||
72 | static struct ipt_match string_match = { | ||
73 | .name = "string", | ||
74 | .match = match, | ||
75 | .checkentry = checkentry, | ||
76 | .destroy = destroy, | ||
77 | .me = THIS_MODULE | ||
78 | }; | ||
79 | |||
80 | static int __init init(void) | ||
81 | { | ||
82 | return ipt_register_match(&string_match); | ||
83 | } | ||
84 | |||
85 | static void __exit fini(void) | ||
86 | { | ||
87 | ipt_unregister_match(&string_match); | ||
88 | } | ||
89 | |||
90 | module_init(init); | ||
91 | module_exit(fini); | ||
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 912bbcc7f415..f7943ba1f43c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
@@ -59,13 +59,10 @@ static int fold_prot_inuse(struct proto *proto) | |||
59 | */ | 59 | */ |
60 | static int sockstat_seq_show(struct seq_file *seq, void *v) | 60 | static int sockstat_seq_show(struct seq_file *seq, void *v) |
61 | { | 61 | { |
62 | /* From net/socket.c */ | ||
63 | extern void socket_seq_show(struct seq_file *seq); | ||
64 | |||
65 | socket_seq_show(seq); | 62 | socket_seq_show(seq); |
66 | seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", | 63 | seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", |
67 | fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), | 64 | fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), |
68 | tcp_tw_count, atomic_read(&tcp_sockets_allocated), | 65 | tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), |
69 | atomic_read(&tcp_memory_allocated)); | 66 | atomic_read(&tcp_memory_allocated)); |
70 | seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); | 67 | seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); |
71 | seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); | 68 | seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); |
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 0db405a869f2..291831e792af 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c | |||
@@ -40,7 +40,6 @@ | |||
40 | #include <linux/timer.h> | 40 | #include <linux/timer.h> |
41 | #include <net/ip.h> | 41 | #include <net/ip.h> |
42 | #include <net/protocol.h> | 42 | #include <net/protocol.h> |
43 | #include <net/tcp.h> | ||
44 | #include <linux/skbuff.h> | 43 | #include <linux/skbuff.h> |
45 | #include <net/sock.h> | 44 | #include <net/sock.h> |
46 | #include <net/icmp.h> | 45 | #include <net/icmp.h> |
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index d1835b1bc8c4..304bb0a1d4f0 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
@@ -59,7 +59,6 @@ | |||
59 | #include <linux/netdevice.h> | 59 | #include <linux/netdevice.h> |
60 | #include <linux/in_route.h> | 60 | #include <linux/in_route.h> |
61 | #include <linux/route.h> | 61 | #include <linux/route.h> |
62 | #include <linux/tcp.h> | ||
63 | #include <linux/skbuff.h> | 62 | #include <linux/skbuff.h> |
64 | #include <net/dst.h> | 63 | #include <net/dst.h> |
65 | #include <net/sock.h> | 64 | #include <net/sock.h> |
@@ -71,6 +70,7 @@ | |||
71 | #include <net/udp.h> | 70 | #include <net/udp.h> |
72 | #include <net/raw.h> | 71 | #include <net/raw.h> |
73 | #include <net/snmp.h> | 72 | #include <net/snmp.h> |
73 | #include <net/tcp_states.h> | ||
74 | #include <net/inet_common.h> | 74 | #include <net/inet_common.h> |
75 | #include <net/checksum.h> | 75 | #include <net/checksum.h> |
76 | #include <net/xfrm.h> | 76 | #include <net/xfrm.h> |
@@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) | |||
150 | * RFC 1122: SHOULD pass TOS value up to the transport layer. | 150 | * RFC 1122: SHOULD pass TOS value up to the transport layer. |
151 | * -> It does. And not only TOS, but all IP header. | 151 | * -> It does. And not only TOS, but all IP header. |
152 | */ | 152 | */ |
153 | void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) | 153 | int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) |
154 | { | 154 | { |
155 | struct sock *sk; | 155 | struct sock *sk; |
156 | struct hlist_head *head; | 156 | struct hlist_head *head; |
157 | int delivered = 0; | ||
157 | 158 | ||
158 | read_lock(&raw_v4_lock); | 159 | read_lock(&raw_v4_lock); |
159 | head = &raw_v4_htable[hash]; | 160 | head = &raw_v4_htable[hash]; |
@@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) | |||
164 | skb->dev->ifindex); | 165 | skb->dev->ifindex); |
165 | 166 | ||
166 | while (sk) { | 167 | while (sk) { |
168 | delivered = 1; | ||
167 | if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { | 169 | if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { |
168 | struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); | 170 | struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); |
169 | 171 | ||
@@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) | |||
177 | } | 179 | } |
178 | out: | 180 | out: |
179 | read_unlock(&raw_v4_lock); | 181 | read_unlock(&raw_v4_lock); |
182 | return delivered; | ||
180 | } | 183 | } |
181 | 184 | ||
182 | void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) | 185 | void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d675ff80b04d..8c0b14e3beec 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -240,7 +240,9 @@ static unsigned rt_hash_mask; | |||
240 | static int rt_hash_log; | 240 | static int rt_hash_log; |
241 | static unsigned int rt_hash_rnd; | 241 | static unsigned int rt_hash_rnd; |
242 | 242 | ||
243 | struct rt_cache_stat *rt_cache_stat; | 243 | static struct rt_cache_stat *rt_cache_stat; |
244 | #define RT_CACHE_STAT_INC(field) \ | ||
245 | (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++) | ||
244 | 246 | ||
245 | static int rt_intern_hash(unsigned hash, struct rtable *rth, | 247 | static int rt_intern_hash(unsigned hash, struct rtable *rth, |
246 | struct rtable **res); | 248 | struct rtable **res); |
@@ -2600,6 +2602,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) | |||
2600 | return ip_route_output_slow(rp, flp); | 2602 | return ip_route_output_slow(rp, flp); |
2601 | } | 2603 | } |
2602 | 2604 | ||
2605 | EXPORT_SYMBOL_GPL(__ip_route_output_key); | ||
2606 | |||
2603 | int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) | 2607 | int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) |
2604 | { | 2608 | { |
2605 | int err; | 2609 | int err; |
@@ -2618,6 +2622,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, | |||
2618 | return 0; | 2622 | return 0; |
2619 | } | 2623 | } |
2620 | 2624 | ||
2625 | EXPORT_SYMBOL_GPL(ip_route_output_flow); | ||
2626 | |||
2621 | int ip_route_output_key(struct rtable **rp, struct flowi *flp) | 2627 | int ip_route_output_key(struct rtable **rp, struct flowi *flp) |
2622 | { | 2628 | { |
2623 | return ip_route_output_flow(rp, flp, NULL, 0); | 2629 | return ip_route_output_flow(rp, flp, NULL, 0); |
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 72d014442185..a34e60ea48a1 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c | |||
@@ -169,8 +169,6 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) | |||
169 | return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; | 169 | return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; |
170 | } | 170 | } |
171 | 171 | ||
172 | extern struct request_sock_ops tcp_request_sock_ops; | ||
173 | |||
174 | static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, | 172 | static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, |
175 | struct request_sock *req, | 173 | struct request_sock *req, |
176 | struct dst_entry *dst) | 174 | struct dst_entry *dst) |
@@ -180,7 +178,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, | |||
180 | 178 | ||
181 | child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); | 179 | child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); |
182 | if (child) | 180 | if (child) |
183 | tcp_acceptq_queue(sk, req, child); | 181 | inet_csk_reqsk_queue_add(sk, req, child); |
184 | else | 182 | else |
185 | reqsk_free(req); | 183 | reqsk_free(req); |
186 | 184 | ||
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e32894532416..652685623519 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -11,7 +11,9 @@ | |||
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/sysctl.h> | 12 | #include <linux/sysctl.h> |
13 | #include <linux/config.h> | 13 | #include <linux/config.h> |
14 | #include <linux/igmp.h> | ||
14 | #include <net/snmp.h> | 15 | #include <net/snmp.h> |
16 | #include <net/icmp.h> | ||
15 | #include <net/ip.h> | 17 | #include <net/ip.h> |
16 | #include <net/route.h> | 18 | #include <net/route.h> |
17 | #include <net/tcp.h> | 19 | #include <net/tcp.h> |
@@ -19,36 +21,6 @@ | |||
19 | /* From af_inet.c */ | 21 | /* From af_inet.c */ |
20 | extern int sysctl_ip_nonlocal_bind; | 22 | extern int sysctl_ip_nonlocal_bind; |
21 | 23 | ||
22 | /* From icmp.c */ | ||
23 | extern int sysctl_icmp_echo_ignore_all; | ||
24 | extern int sysctl_icmp_echo_ignore_broadcasts; | ||
25 | extern int sysctl_icmp_ignore_bogus_error_responses; | ||
26 | extern int sysctl_icmp_errors_use_inbound_ifaddr; | ||
27 | |||
28 | /* From ip_fragment.c */ | ||
29 | extern int sysctl_ipfrag_low_thresh; | ||
30 | extern int sysctl_ipfrag_high_thresh; | ||
31 | extern int sysctl_ipfrag_time; | ||
32 | extern int sysctl_ipfrag_secret_interval; | ||
33 | |||
34 | /* From ip_output.c */ | ||
35 | extern int sysctl_ip_dynaddr; | ||
36 | |||
37 | /* From icmp.c */ | ||
38 | extern int sysctl_icmp_ratelimit; | ||
39 | extern int sysctl_icmp_ratemask; | ||
40 | |||
41 | /* From igmp.c */ | ||
42 | extern int sysctl_igmp_max_memberships; | ||
43 | extern int sysctl_igmp_max_msf; | ||
44 | |||
45 | /* From inetpeer.c */ | ||
46 | extern int inet_peer_threshold; | ||
47 | extern int inet_peer_minttl; | ||
48 | extern int inet_peer_maxttl; | ||
49 | extern int inet_peer_gc_mintime; | ||
50 | extern int inet_peer_gc_maxtime; | ||
51 | |||
52 | #ifdef CONFIG_SYSCTL | 24 | #ifdef CONFIG_SYSCTL |
53 | static int tcp_retr1_max = 255; | 25 | static int tcp_retr1_max = 255; |
54 | static int ip_local_port_range_min[] = { 1, 1 }; | 26 | static int ip_local_port_range_min[] = { 1, 1 }; |
@@ -57,8 +29,6 @@ static int ip_local_port_range_max[] = { 65535, 65535 }; | |||
57 | 29 | ||
58 | struct ipv4_config ipv4_config; | 30 | struct ipv4_config ipv4_config; |
59 | 31 | ||
60 | extern ctl_table ipv4_route_table[]; | ||
61 | |||
62 | #ifdef CONFIG_SYSCTL | 32 | #ifdef CONFIG_SYSCTL |
63 | 33 | ||
64 | static | 34 | static |
@@ -136,10 +106,11 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * | |||
136 | return ret; | 106 | return ret; |
137 | } | 107 | } |
138 | 108 | ||
139 | int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, | 109 | static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, |
140 | void __user *oldval, size_t __user *oldlenp, | 110 | int nlen, void __user *oldval, |
141 | void __user *newval, size_t newlen, | 111 | size_t __user *oldlenp, |
142 | void **context) | 112 | void __user *newval, size_t newlen, |
113 | void **context) | ||
143 | { | 114 | { |
144 | char val[TCP_CA_NAME_MAX]; | 115 | char val[TCP_CA_NAME_MAX]; |
145 | ctl_table tbl = { | 116 | ctl_table tbl = { |
@@ -259,7 +230,7 @@ ctl_table ipv4_table[] = { | |||
259 | { | 230 | { |
260 | .ctl_name = NET_TCP_MAX_TW_BUCKETS, | 231 | .ctl_name = NET_TCP_MAX_TW_BUCKETS, |
261 | .procname = "tcp_max_tw_buckets", | 232 | .procname = "tcp_max_tw_buckets", |
262 | .data = &sysctl_tcp_max_tw_buckets, | 233 | .data = &tcp_death_row.sysctl_max_tw_buckets, |
263 | .maxlen = sizeof(int), | 234 | .maxlen = sizeof(int), |
264 | .mode = 0644, | 235 | .mode = 0644, |
265 | .proc_handler = &proc_dointvec | 236 | .proc_handler = &proc_dointvec |
@@ -363,7 +334,7 @@ ctl_table ipv4_table[] = { | |||
363 | { | 334 | { |
364 | .ctl_name = NET_TCP_TW_RECYCLE, | 335 | .ctl_name = NET_TCP_TW_RECYCLE, |
365 | .procname = "tcp_tw_recycle", | 336 | .procname = "tcp_tw_recycle", |
366 | .data = &sysctl_tcp_tw_recycle, | 337 | .data = &tcp_death_row.sysctl_tw_recycle, |
367 | .maxlen = sizeof(int), | 338 | .maxlen = sizeof(int), |
368 | .mode = 0644, | 339 | .mode = 0644, |
369 | .proc_handler = &proc_dointvec | 340 | .proc_handler = &proc_dointvec |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 69b1fcf70077..02fdda68718d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -269,13 +269,12 @@ | |||
269 | 269 | ||
270 | int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; | 270 | int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; |
271 | 271 | ||
272 | DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); | 272 | DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly; |
273 | |||
274 | kmem_cache_t *tcp_bucket_cachep; | ||
275 | kmem_cache_t *tcp_timewait_cachep; | ||
276 | 273 | ||
277 | atomic_t tcp_orphan_count = ATOMIC_INIT(0); | 274 | atomic_t tcp_orphan_count = ATOMIC_INIT(0); |
278 | 275 | ||
276 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | ||
277 | |||
279 | int sysctl_tcp_mem[3]; | 278 | int sysctl_tcp_mem[3]; |
280 | int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; | 279 | int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; |
281 | int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; | 280 | int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; |
@@ -311,15 +310,6 @@ void tcp_enter_memory_pressure(void) | |||
311 | EXPORT_SYMBOL(tcp_enter_memory_pressure); | 310 | EXPORT_SYMBOL(tcp_enter_memory_pressure); |
312 | 311 | ||
313 | /* | 312 | /* |
314 | * LISTEN is a special case for poll.. | ||
315 | */ | ||
316 | static __inline__ unsigned int tcp_listen_poll(struct sock *sk, | ||
317 | poll_table *wait) | ||
318 | { | ||
319 | return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0; | ||
320 | } | ||
321 | |||
322 | /* | ||
323 | * Wait for a TCP event. | 313 | * Wait for a TCP event. |
324 | * | 314 | * |
325 | * Note that we don't need to lock the socket, as the upper poll layers | 315 | * Note that we don't need to lock the socket, as the upper poll layers |
@@ -334,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
334 | 324 | ||
335 | poll_wait(file, sk->sk_sleep, wait); | 325 | poll_wait(file, sk->sk_sleep, wait); |
336 | if (sk->sk_state == TCP_LISTEN) | 326 | if (sk->sk_state == TCP_LISTEN) |
337 | return tcp_listen_poll(sk, wait); | 327 | return inet_csk_listen_poll(sk); |
338 | 328 | ||
339 | /* Socket is not locked. We are protected from async events | 329 | /* Socket is not locked. We are protected from async events |
340 | by poll logic and correct handling of state changes | 330 | by poll logic and correct handling of state changes |
@@ -457,109 +447,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) | |||
457 | return put_user(answ, (int __user *)arg); | 447 | return put_user(answ, (int __user *)arg); |
458 | } | 448 | } |
459 | 449 | ||
460 | |||
461 | int tcp_listen_start(struct sock *sk) | ||
462 | { | ||
463 | struct inet_sock *inet = inet_sk(sk); | ||
464 | struct tcp_sock *tp = tcp_sk(sk); | ||
465 | int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE); | ||
466 | |||
467 | if (rc != 0) | ||
468 | return rc; | ||
469 | |||
470 | sk->sk_max_ack_backlog = 0; | ||
471 | sk->sk_ack_backlog = 0; | ||
472 | tcp_delack_init(tp); | ||
473 | |||
474 | /* There is race window here: we announce ourselves listening, | ||
475 | * but this transition is still not validated by get_port(). | ||
476 | * It is OK, because this socket enters to hash table only | ||
477 | * after validation is complete. | ||
478 | */ | ||
479 | sk->sk_state = TCP_LISTEN; | ||
480 | if (!sk->sk_prot->get_port(sk, inet->num)) { | ||
481 | inet->sport = htons(inet->num); | ||
482 | |||
483 | sk_dst_reset(sk); | ||
484 | sk->sk_prot->hash(sk); | ||
485 | |||
486 | return 0; | ||
487 | } | ||
488 | |||
489 | sk->sk_state = TCP_CLOSE; | ||
490 | reqsk_queue_destroy(&tp->accept_queue); | ||
491 | return -EADDRINUSE; | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * This routine closes sockets which have been at least partially | ||
496 | * opened, but not yet accepted. | ||
497 | */ | ||
498 | |||
499 | static void tcp_listen_stop (struct sock *sk) | ||
500 | { | ||
501 | struct tcp_sock *tp = tcp_sk(sk); | ||
502 | struct listen_sock *lopt; | ||
503 | struct request_sock *acc_req; | ||
504 | struct request_sock *req; | ||
505 | int i; | ||
506 | |||
507 | tcp_delete_keepalive_timer(sk); | ||
508 | |||
509 | /* make all the listen_opt local to us */ | ||
510 | lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue); | ||
511 | acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue); | ||
512 | |||
513 | if (lopt->qlen) { | ||
514 | for (i = 0; i < TCP_SYNQ_HSIZE; i++) { | ||
515 | while ((req = lopt->syn_table[i]) != NULL) { | ||
516 | lopt->syn_table[i] = req->dl_next; | ||
517 | lopt->qlen--; | ||
518 | reqsk_free(req); | ||
519 | |||
520 | /* Following specs, it would be better either to send FIN | ||
521 | * (and enter FIN-WAIT-1, it is normal close) | ||
522 | * or to send active reset (abort). | ||
523 | * Certainly, it is pretty dangerous while synflood, but it is | ||
524 | * bad justification for our negligence 8) | ||
525 | * To be honest, we are not able to make either | ||
526 | * of the variants now. --ANK | ||
527 | */ | ||
528 | } | ||
529 | } | ||
530 | } | ||
531 | BUG_TRAP(!lopt->qlen); | ||
532 | |||
533 | kfree(lopt); | ||
534 | |||
535 | while ((req = acc_req) != NULL) { | ||
536 | struct sock *child = req->sk; | ||
537 | |||
538 | acc_req = req->dl_next; | ||
539 | |||
540 | local_bh_disable(); | ||
541 | bh_lock_sock(child); | ||
542 | BUG_TRAP(!sock_owned_by_user(child)); | ||
543 | sock_hold(child); | ||
544 | |||
545 | tcp_disconnect(child, O_NONBLOCK); | ||
546 | |||
547 | sock_orphan(child); | ||
548 | |||
549 | atomic_inc(&tcp_orphan_count); | ||
550 | |||
551 | tcp_destroy_sock(child); | ||
552 | |||
553 | bh_unlock_sock(child); | ||
554 | local_bh_enable(); | ||
555 | sock_put(child); | ||
556 | |||
557 | sk_acceptq_removed(sk); | ||
558 | __reqsk_free(req); | ||
559 | } | ||
560 | BUG_TRAP(!sk->sk_ack_backlog); | ||
561 | } | ||
562 | |||
563 | static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) | 450 | static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) |
564 | { | 451 | { |
565 | TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; | 452 | TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; |
@@ -975,7 +862,7 @@ do_fault: | |||
975 | if (!skb->len) { | 862 | if (!skb->len) { |
976 | if (sk->sk_send_head == skb) | 863 | if (sk->sk_send_head == skb) |
977 | sk->sk_send_head = NULL; | 864 | sk->sk_send_head = NULL; |
978 | __skb_unlink(skb, skb->list); | 865 | __skb_unlink(skb, &sk->sk_write_queue); |
979 | sk_stream_free_skb(sk, skb); | 866 | sk_stream_free_skb(sk, skb); |
980 | } | 867 | } |
981 | 868 | ||
@@ -1057,20 +944,21 @@ static void cleanup_rbuf(struct sock *sk, int copied) | |||
1057 | BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); | 944 | BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); |
1058 | #endif | 945 | #endif |
1059 | 946 | ||
1060 | if (tcp_ack_scheduled(tp)) { | 947 | if (inet_csk_ack_scheduled(sk)) { |
948 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1061 | /* Delayed ACKs frequently hit locked sockets during bulk | 949 | /* Delayed ACKs frequently hit locked sockets during bulk |
1062 | * receive. */ | 950 | * receive. */ |
1063 | if (tp->ack.blocked || | 951 | if (icsk->icsk_ack.blocked || |
1064 | /* Once-per-two-segments ACK was not sent by tcp_input.c */ | 952 | /* Once-per-two-segments ACK was not sent by tcp_input.c */ |
1065 | tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss || | 953 | tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || |
1066 | /* | 954 | /* |
1067 | * If this read emptied read buffer, we send ACK, if | 955 | * If this read emptied read buffer, we send ACK, if |
1068 | * connection is not bidirectional, user drained | 956 | * connection is not bidirectional, user drained |
1069 | * receive buffer and there was a small segment | 957 | * receive buffer and there was a small segment |
1070 | * in queue. | 958 | * in queue. |
1071 | */ | 959 | */ |
1072 | (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) && | 960 | (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && |
1073 | !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) | 961 | !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) |
1074 | time_to_ack = 1; | 962 | time_to_ack = 1; |
1075 | } | 963 | } |
1076 | 964 | ||
@@ -1572,40 +1460,6 @@ void tcp_shutdown(struct sock *sk, int how) | |||
1572 | } | 1460 | } |
1573 | } | 1461 | } |
1574 | 1462 | ||
1575 | /* | ||
1576 | * At this point, there should be no process reference to this | ||
1577 | * socket, and thus no user references at all. Therefore we | ||
1578 | * can assume the socket waitqueue is inactive and nobody will | ||
1579 | * try to jump onto it. | ||
1580 | */ | ||
1581 | void tcp_destroy_sock(struct sock *sk) | ||
1582 | { | ||
1583 | BUG_TRAP(sk->sk_state == TCP_CLOSE); | ||
1584 | BUG_TRAP(sock_flag(sk, SOCK_DEAD)); | ||
1585 | |||
1586 | /* It cannot be in hash table! */ | ||
1587 | BUG_TRAP(sk_unhashed(sk)); | ||
1588 | |||
1589 | /* If it has not 0 inet_sk(sk)->num, it must be bound */ | ||
1590 | BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash); | ||
1591 | |||
1592 | sk->sk_prot->destroy(sk); | ||
1593 | |||
1594 | sk_stream_kill_queues(sk); | ||
1595 | |||
1596 | xfrm_sk_free_policy(sk); | ||
1597 | |||
1598 | #ifdef INET_REFCNT_DEBUG | ||
1599 | if (atomic_read(&sk->sk_refcnt) != 1) { | ||
1600 | printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", | ||
1601 | sk, atomic_read(&sk->sk_refcnt)); | ||
1602 | } | ||
1603 | #endif | ||
1604 | |||
1605 | atomic_dec(&tcp_orphan_count); | ||
1606 | sock_put(sk); | ||
1607 | } | ||
1608 | |||
1609 | void tcp_close(struct sock *sk, long timeout) | 1463 | void tcp_close(struct sock *sk, long timeout) |
1610 | { | 1464 | { |
1611 | struct sk_buff *skb; | 1465 | struct sk_buff *skb; |
@@ -1618,7 +1472,7 @@ void tcp_close(struct sock *sk, long timeout) | |||
1618 | tcp_set_state(sk, TCP_CLOSE); | 1472 | tcp_set_state(sk, TCP_CLOSE); |
1619 | 1473 | ||
1620 | /* Special case. */ | 1474 | /* Special case. */ |
1621 | tcp_listen_stop(sk); | 1475 | inet_csk_listen_stop(sk); |
1622 | 1476 | ||
1623 | goto adjudge_to_death; | 1477 | goto adjudge_to_death; |
1624 | } | 1478 | } |
@@ -1721,12 +1575,12 @@ adjudge_to_death: | |||
1721 | tcp_send_active_reset(sk, GFP_ATOMIC); | 1575 | tcp_send_active_reset(sk, GFP_ATOMIC); |
1722 | NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); | 1576 | NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); |
1723 | } else { | 1577 | } else { |
1724 | int tmo = tcp_fin_time(tp); | 1578 | const int tmo = tcp_fin_time(sk); |
1725 | 1579 | ||
1726 | if (tmo > TCP_TIMEWAIT_LEN) { | 1580 | if (tmo > TCP_TIMEWAIT_LEN) { |
1727 | tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); | 1581 | inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); |
1728 | } else { | 1582 | } else { |
1729 | atomic_inc(&tcp_orphan_count); | 1583 | atomic_inc(sk->sk_prot->orphan_count); |
1730 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); | 1584 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); |
1731 | goto out; | 1585 | goto out; |
1732 | } | 1586 | } |
@@ -1734,7 +1588,7 @@ adjudge_to_death: | |||
1734 | } | 1588 | } |
1735 | if (sk->sk_state != TCP_CLOSE) { | 1589 | if (sk->sk_state != TCP_CLOSE) { |
1736 | sk_stream_mem_reclaim(sk); | 1590 | sk_stream_mem_reclaim(sk); |
1737 | if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || | 1591 | if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || |
1738 | (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && | 1592 | (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && |
1739 | atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { | 1593 | atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { |
1740 | if (net_ratelimit()) | 1594 | if (net_ratelimit()) |
@@ -1745,10 +1599,10 @@ adjudge_to_death: | |||
1745 | NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); | 1599 | NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); |
1746 | } | 1600 | } |
1747 | } | 1601 | } |
1748 | atomic_inc(&tcp_orphan_count); | 1602 | atomic_inc(sk->sk_prot->orphan_count); |
1749 | 1603 | ||
1750 | if (sk->sk_state == TCP_CLOSE) | 1604 | if (sk->sk_state == TCP_CLOSE) |
1751 | tcp_destroy_sock(sk); | 1605 | inet_csk_destroy_sock(sk); |
1752 | /* Otherwise, socket is reprieved until protocol close. */ | 1606 | /* Otherwise, socket is reprieved until protocol close. */ |
1753 | 1607 | ||
1754 | out: | 1608 | out: |
@@ -1769,6 +1623,7 @@ static inline int tcp_need_reset(int state) | |||
1769 | int tcp_disconnect(struct sock *sk, int flags) | 1623 | int tcp_disconnect(struct sock *sk, int flags) |
1770 | { | 1624 | { |
1771 | struct inet_sock *inet = inet_sk(sk); | 1625 | struct inet_sock *inet = inet_sk(sk); |
1626 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1772 | struct tcp_sock *tp = tcp_sk(sk); | 1627 | struct tcp_sock *tp = tcp_sk(sk); |
1773 | int err = 0; | 1628 | int err = 0; |
1774 | int old_state = sk->sk_state; | 1629 | int old_state = sk->sk_state; |
@@ -1778,7 +1633,7 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
1778 | 1633 | ||
1779 | /* ABORT function of RFC793 */ | 1634 | /* ABORT function of RFC793 */ |
1780 | if (old_state == TCP_LISTEN) { | 1635 | if (old_state == TCP_LISTEN) { |
1781 | tcp_listen_stop(sk); | 1636 | inet_csk_listen_stop(sk); |
1782 | } else if (tcp_need_reset(old_state) || | 1637 | } else if (tcp_need_reset(old_state) || |
1783 | (tp->snd_nxt != tp->write_seq && | 1638 | (tp->snd_nxt != tp->write_seq && |
1784 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { | 1639 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { |
@@ -1805,125 +1660,34 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
1805 | tp->srtt = 0; | 1660 | tp->srtt = 0; |
1806 | if ((tp->write_seq += tp->max_window + 2) == 0) | 1661 | if ((tp->write_seq += tp->max_window + 2) == 0) |
1807 | tp->write_seq = 1; | 1662 | tp->write_seq = 1; |
1808 | tp->backoff = 0; | 1663 | icsk->icsk_backoff = 0; |
1809 | tp->snd_cwnd = 2; | 1664 | tp->snd_cwnd = 2; |
1810 | tp->probes_out = 0; | 1665 | icsk->icsk_probes_out = 0; |
1811 | tp->packets_out = 0; | 1666 | tp->packets_out = 0; |
1812 | tp->snd_ssthresh = 0x7fffffff; | 1667 | tp->snd_ssthresh = 0x7fffffff; |
1813 | tp->snd_cwnd_cnt = 0; | 1668 | tp->snd_cwnd_cnt = 0; |
1814 | tcp_set_ca_state(tp, TCP_CA_Open); | 1669 | tcp_set_ca_state(sk, TCP_CA_Open); |
1815 | tcp_clear_retrans(tp); | 1670 | tcp_clear_retrans(tp); |
1816 | tcp_delack_init(tp); | 1671 | inet_csk_delack_init(sk); |
1817 | sk->sk_send_head = NULL; | 1672 | sk->sk_send_head = NULL; |
1818 | tp->rx_opt.saw_tstamp = 0; | 1673 | tp->rx_opt.saw_tstamp = 0; |
1819 | tcp_sack_reset(&tp->rx_opt); | 1674 | tcp_sack_reset(&tp->rx_opt); |
1820 | __sk_dst_reset(sk); | 1675 | __sk_dst_reset(sk); |
1821 | 1676 | ||
1822 | BUG_TRAP(!inet->num || tp->bind_hash); | 1677 | BUG_TRAP(!inet->num || icsk->icsk_bind_hash); |
1823 | 1678 | ||
1824 | sk->sk_error_report(sk); | 1679 | sk->sk_error_report(sk); |
1825 | return err; | 1680 | return err; |
1826 | } | 1681 | } |
1827 | 1682 | ||
1828 | /* | 1683 | /* |
1829 | * Wait for an incoming connection, avoid race | ||
1830 | * conditions. This must be called with the socket locked. | ||
1831 | */ | ||
1832 | static int wait_for_connect(struct sock *sk, long timeo) | ||
1833 | { | ||
1834 | struct tcp_sock *tp = tcp_sk(sk); | ||
1835 | DEFINE_WAIT(wait); | ||
1836 | int err; | ||
1837 | |||
1838 | /* | ||
1839 | * True wake-one mechanism for incoming connections: only | ||
1840 | * one process gets woken up, not the 'whole herd'. | ||
1841 | * Since we do not 'race & poll' for established sockets | ||
1842 | * anymore, the common case will execute the loop only once. | ||
1843 | * | ||
1844 | * Subtle issue: "add_wait_queue_exclusive()" will be added | ||
1845 | * after any current non-exclusive waiters, and we know that | ||
1846 | * it will always _stay_ after any new non-exclusive waiters | ||
1847 | * because all non-exclusive waiters are added at the | ||
1848 | * beginning of the wait-queue. As such, it's ok to "drop" | ||
1849 | * our exclusiveness temporarily when we get woken up without | ||
1850 | * having to remove and re-insert us on the wait queue. | ||
1851 | */ | ||
1852 | for (;;) { | ||
1853 | prepare_to_wait_exclusive(sk->sk_sleep, &wait, | ||
1854 | TASK_INTERRUPTIBLE); | ||
1855 | release_sock(sk); | ||
1856 | if (reqsk_queue_empty(&tp->accept_queue)) | ||
1857 | timeo = schedule_timeout(timeo); | ||
1858 | lock_sock(sk); | ||
1859 | err = 0; | ||
1860 | if (!reqsk_queue_empty(&tp->accept_queue)) | ||
1861 | break; | ||
1862 | err = -EINVAL; | ||
1863 | if (sk->sk_state != TCP_LISTEN) | ||
1864 | break; | ||
1865 | err = sock_intr_errno(timeo); | ||
1866 | if (signal_pending(current)) | ||
1867 | break; | ||
1868 | err = -EAGAIN; | ||
1869 | if (!timeo) | ||
1870 | break; | ||
1871 | } | ||
1872 | finish_wait(sk->sk_sleep, &wait); | ||
1873 | return err; | ||
1874 | } | ||
1875 | |||
1876 | /* | ||
1877 | * This will accept the next outstanding connection. | ||
1878 | */ | ||
1879 | |||
1880 | struct sock *tcp_accept(struct sock *sk, int flags, int *err) | ||
1881 | { | ||
1882 | struct tcp_sock *tp = tcp_sk(sk); | ||
1883 | struct sock *newsk; | ||
1884 | int error; | ||
1885 | |||
1886 | lock_sock(sk); | ||
1887 | |||
1888 | /* We need to make sure that this socket is listening, | ||
1889 | * and that it has something pending. | ||
1890 | */ | ||
1891 | error = -EINVAL; | ||
1892 | if (sk->sk_state != TCP_LISTEN) | ||
1893 | goto out_err; | ||
1894 | |||
1895 | /* Find already established connection */ | ||
1896 | if (reqsk_queue_empty(&tp->accept_queue)) { | ||
1897 | long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); | ||
1898 | |||
1899 | /* If this is a non blocking socket don't sleep */ | ||
1900 | error = -EAGAIN; | ||
1901 | if (!timeo) | ||
1902 | goto out_err; | ||
1903 | |||
1904 | error = wait_for_connect(sk, timeo); | ||
1905 | if (error) | ||
1906 | goto out_err; | ||
1907 | } | ||
1908 | |||
1909 | newsk = reqsk_queue_get_child(&tp->accept_queue, sk); | ||
1910 | BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); | ||
1911 | out: | ||
1912 | release_sock(sk); | ||
1913 | return newsk; | ||
1914 | out_err: | ||
1915 | newsk = NULL; | ||
1916 | *err = error; | ||
1917 | goto out; | ||
1918 | } | ||
1919 | |||
1920 | /* | ||
1921 | * Socket option code for TCP. | 1684 | * Socket option code for TCP. |
1922 | */ | 1685 | */ |
1923 | int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | 1686 | int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, |
1924 | int optlen) | 1687 | int optlen) |
1925 | { | 1688 | { |
1926 | struct tcp_sock *tp = tcp_sk(sk); | 1689 | struct tcp_sock *tp = tcp_sk(sk); |
1690 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1927 | int val; | 1691 | int val; |
1928 | int err = 0; | 1692 | int err = 0; |
1929 | 1693 | ||
@@ -1945,7 +1709,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
1945 | name[val] = 0; | 1709 | name[val] = 0; |
1946 | 1710 | ||
1947 | lock_sock(sk); | 1711 | lock_sock(sk); |
1948 | err = tcp_set_congestion_control(tp, name); | 1712 | err = tcp_set_congestion_control(sk, name); |
1949 | release_sock(sk); | 1713 | release_sock(sk); |
1950 | return err; | 1714 | return err; |
1951 | } | 1715 | } |
@@ -2022,7 +1786,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2022 | elapsed = tp->keepalive_time - elapsed; | 1786 | elapsed = tp->keepalive_time - elapsed; |
2023 | else | 1787 | else |
2024 | elapsed = 0; | 1788 | elapsed = 0; |
2025 | tcp_reset_keepalive_timer(sk, elapsed); | 1789 | inet_csk_reset_keepalive_timer(sk, elapsed); |
2026 | } | 1790 | } |
2027 | } | 1791 | } |
2028 | break; | 1792 | break; |
@@ -2042,7 +1806,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2042 | if (val < 1 || val > MAX_TCP_SYNCNT) | 1806 | if (val < 1 || val > MAX_TCP_SYNCNT) |
2043 | err = -EINVAL; | 1807 | err = -EINVAL; |
2044 | else | 1808 | else |
2045 | tp->syn_retries = val; | 1809 | icsk->icsk_syn_retries = val; |
2046 | break; | 1810 | break; |
2047 | 1811 | ||
2048 | case TCP_LINGER2: | 1812 | case TCP_LINGER2: |
@@ -2055,15 +1819,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2055 | break; | 1819 | break; |
2056 | 1820 | ||
2057 | case TCP_DEFER_ACCEPT: | 1821 | case TCP_DEFER_ACCEPT: |
2058 | tp->defer_accept = 0; | 1822 | icsk->icsk_accept_queue.rskq_defer_accept = 0; |
2059 | if (val > 0) { | 1823 | if (val > 0) { |
2060 | /* Translate value in seconds to number of | 1824 | /* Translate value in seconds to number of |
2061 | * retransmits */ | 1825 | * retransmits */ |
2062 | while (tp->defer_accept < 32 && | 1826 | while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && |
2063 | val > ((TCP_TIMEOUT_INIT / HZ) << | 1827 | val > ((TCP_TIMEOUT_INIT / HZ) << |
2064 | tp->defer_accept)) | 1828 | icsk->icsk_accept_queue.rskq_defer_accept)) |
2065 | tp->defer_accept++; | 1829 | icsk->icsk_accept_queue.rskq_defer_accept++; |
2066 | tp->defer_accept++; | 1830 | icsk->icsk_accept_queue.rskq_defer_accept++; |
2067 | } | 1831 | } |
2068 | break; | 1832 | break; |
2069 | 1833 | ||
@@ -2081,16 +1845,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2081 | 1845 | ||
2082 | case TCP_QUICKACK: | 1846 | case TCP_QUICKACK: |
2083 | if (!val) { | 1847 | if (!val) { |
2084 | tp->ack.pingpong = 1; | 1848 | icsk->icsk_ack.pingpong = 1; |
2085 | } else { | 1849 | } else { |
2086 | tp->ack.pingpong = 0; | 1850 | icsk->icsk_ack.pingpong = 0; |
2087 | if ((1 << sk->sk_state) & | 1851 | if ((1 << sk->sk_state) & |
2088 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && | 1852 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && |
2089 | tcp_ack_scheduled(tp)) { | 1853 | inet_csk_ack_scheduled(sk)) { |
2090 | tp->ack.pending |= TCP_ACK_PUSHED; | 1854 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; |
2091 | cleanup_rbuf(sk, 1); | 1855 | cleanup_rbuf(sk, 1); |
2092 | if (!(val & 1)) | 1856 | if (!(val & 1)) |
2093 | tp->ack.pingpong = 1; | 1857 | icsk->icsk_ack.pingpong = 1; |
2094 | } | 1858 | } |
2095 | } | 1859 | } |
2096 | break; | 1860 | break; |
@@ -2107,15 +1871,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2107 | void tcp_get_info(struct sock *sk, struct tcp_info *info) | 1871 | void tcp_get_info(struct sock *sk, struct tcp_info *info) |
2108 | { | 1872 | { |
2109 | struct tcp_sock *tp = tcp_sk(sk); | 1873 | struct tcp_sock *tp = tcp_sk(sk); |
1874 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
2110 | u32 now = tcp_time_stamp; | 1875 | u32 now = tcp_time_stamp; |
2111 | 1876 | ||
2112 | memset(info, 0, sizeof(*info)); | 1877 | memset(info, 0, sizeof(*info)); |
2113 | 1878 | ||
2114 | info->tcpi_state = sk->sk_state; | 1879 | info->tcpi_state = sk->sk_state; |
2115 | info->tcpi_ca_state = tp->ca_state; | 1880 | info->tcpi_ca_state = icsk->icsk_ca_state; |
2116 | info->tcpi_retransmits = tp->retransmits; | 1881 | info->tcpi_retransmits = icsk->icsk_retransmits; |
2117 | info->tcpi_probes = tp->probes_out; | 1882 | info->tcpi_probes = icsk->icsk_probes_out; |
2118 | info->tcpi_backoff = tp->backoff; | 1883 | info->tcpi_backoff = icsk->icsk_backoff; |
2119 | 1884 | ||
2120 | if (tp->rx_opt.tstamp_ok) | 1885 | if (tp->rx_opt.tstamp_ok) |
2121 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; | 1886 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; |
@@ -2130,10 +1895,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) | |||
2130 | if (tp->ecn_flags&TCP_ECN_OK) | 1895 | if (tp->ecn_flags&TCP_ECN_OK) |
2131 | info->tcpi_options |= TCPI_OPT_ECN; | 1896 | info->tcpi_options |= TCPI_OPT_ECN; |
2132 | 1897 | ||
2133 | info->tcpi_rto = jiffies_to_usecs(tp->rto); | 1898 | info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); |
2134 | info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); | 1899 | info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); |
2135 | info->tcpi_snd_mss = tp->mss_cache; | 1900 | info->tcpi_snd_mss = tp->mss_cache; |
2136 | info->tcpi_rcv_mss = tp->ack.rcv_mss; | 1901 | info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; |
2137 | 1902 | ||
2138 | info->tcpi_unacked = tp->packets_out; | 1903 | info->tcpi_unacked = tp->packets_out; |
2139 | info->tcpi_sacked = tp->sacked_out; | 1904 | info->tcpi_sacked = tp->sacked_out; |
@@ -2142,7 +1907,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) | |||
2142 | info->tcpi_fackets = tp->fackets_out; | 1907 | info->tcpi_fackets = tp->fackets_out; |
2143 | 1908 | ||
2144 | info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); | 1909 | info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); |
2145 | info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime); | 1910 | info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); |
2146 | info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); | 1911 | info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); |
2147 | 1912 | ||
2148 | info->tcpi_pmtu = tp->pmtu_cookie; | 1913 | info->tcpi_pmtu = tp->pmtu_cookie; |
@@ -2165,6 +1930,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info); | |||
2165 | int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | 1930 | int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, |
2166 | int __user *optlen) | 1931 | int __user *optlen) |
2167 | { | 1932 | { |
1933 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
2168 | struct tcp_sock *tp = tcp_sk(sk); | 1934 | struct tcp_sock *tp = tcp_sk(sk); |
2169 | int val, len; | 1935 | int val, len; |
2170 | 1936 | ||
@@ -2202,7 +1968,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2202 | val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; | 1968 | val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; |
2203 | break; | 1969 | break; |
2204 | case TCP_SYNCNT: | 1970 | case TCP_SYNCNT: |
2205 | val = tp->syn_retries ? : sysctl_tcp_syn_retries; | 1971 | val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
2206 | break; | 1972 | break; |
2207 | case TCP_LINGER2: | 1973 | case TCP_LINGER2: |
2208 | val = tp->linger2; | 1974 | val = tp->linger2; |
@@ -2210,8 +1976,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2210 | val = (val ? : sysctl_tcp_fin_timeout) / HZ; | 1976 | val = (val ? : sysctl_tcp_fin_timeout) / HZ; |
2211 | break; | 1977 | break; |
2212 | case TCP_DEFER_ACCEPT: | 1978 | case TCP_DEFER_ACCEPT: |
2213 | val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) << | 1979 | val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : |
2214 | (tp->defer_accept - 1)); | 1980 | ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); |
2215 | break; | 1981 | break; |
2216 | case TCP_WINDOW_CLAMP: | 1982 | case TCP_WINDOW_CLAMP: |
2217 | val = tp->window_clamp; | 1983 | val = tp->window_clamp; |
@@ -2232,7 +1998,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2232 | return 0; | 1998 | return 0; |
2233 | } | 1999 | } |
2234 | case TCP_QUICKACK: | 2000 | case TCP_QUICKACK: |
2235 | val = !tp->ack.pingpong; | 2001 | val = !icsk->icsk_ack.pingpong; |
2236 | break; | 2002 | break; |
2237 | 2003 | ||
2238 | case TCP_CONGESTION: | 2004 | case TCP_CONGESTION: |
@@ -2241,7 +2007,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2241 | len = min_t(unsigned int, len, TCP_CA_NAME_MAX); | 2007 | len = min_t(unsigned int, len, TCP_CA_NAME_MAX); |
2242 | if (put_user(len, optlen)) | 2008 | if (put_user(len, optlen)) |
2243 | return -EFAULT; | 2009 | return -EFAULT; |
2244 | if (copy_to_user(optval, tp->ca_ops->name, len)) | 2010 | if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) |
2245 | return -EFAULT; | 2011 | return -EFAULT; |
2246 | return 0; | 2012 | return 0; |
2247 | default: | 2013 | default: |
@@ -2278,79 +2044,72 @@ void __init tcp_init(void) | |||
2278 | __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), | 2044 | __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), |
2279 | sizeof(skb->cb)); | 2045 | sizeof(skb->cb)); |
2280 | 2046 | ||
2281 | tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", | 2047 | tcp_hashinfo.bind_bucket_cachep = |
2282 | sizeof(struct tcp_bind_bucket), | 2048 | kmem_cache_create("tcp_bind_bucket", |
2283 | 0, SLAB_HWCACHE_ALIGN, | 2049 | sizeof(struct inet_bind_bucket), 0, |
2284 | NULL, NULL); | 2050 | SLAB_HWCACHE_ALIGN, NULL, NULL); |
2285 | if (!tcp_bucket_cachep) | 2051 | if (!tcp_hashinfo.bind_bucket_cachep) |
2286 | panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); | 2052 | panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); |
2287 | 2053 | ||
2288 | tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", | ||
2289 | sizeof(struct tcp_tw_bucket), | ||
2290 | 0, SLAB_HWCACHE_ALIGN, | ||
2291 | NULL, NULL); | ||
2292 | if (!tcp_timewait_cachep) | ||
2293 | panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); | ||
2294 | |||
2295 | /* Size and allocate the main established and bind bucket | 2054 | /* Size and allocate the main established and bind bucket |
2296 | * hash tables. | 2055 | * hash tables. |
2297 | * | 2056 | * |
2298 | * The methodology is similar to that of the buffer cache. | 2057 | * The methodology is similar to that of the buffer cache. |
2299 | */ | 2058 | */ |
2300 | tcp_ehash = (struct tcp_ehash_bucket *) | 2059 | tcp_hashinfo.ehash = |
2301 | alloc_large_system_hash("TCP established", | 2060 | alloc_large_system_hash("TCP established", |
2302 | sizeof(struct tcp_ehash_bucket), | 2061 | sizeof(struct inet_ehash_bucket), |
2303 | thash_entries, | 2062 | thash_entries, |
2304 | (num_physpages >= 128 * 1024) ? | 2063 | (num_physpages >= 128 * 1024) ? |
2305 | (25 - PAGE_SHIFT) : | 2064 | (25 - PAGE_SHIFT) : |
2306 | (27 - PAGE_SHIFT), | 2065 | (27 - PAGE_SHIFT), |
2307 | HASH_HIGHMEM, | 2066 | HASH_HIGHMEM, |
2308 | &tcp_ehash_size, | 2067 | &tcp_hashinfo.ehash_size, |
2309 | NULL, | 2068 | NULL, |
2310 | 0); | 2069 | 0); |
2311 | tcp_ehash_size = (1 << tcp_ehash_size) >> 1; | 2070 | tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1; |
2312 | for (i = 0; i < (tcp_ehash_size << 1); i++) { | 2071 | for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) { |
2313 | rwlock_init(&tcp_ehash[i].lock); | 2072 | rwlock_init(&tcp_hashinfo.ehash[i].lock); |
2314 | INIT_HLIST_HEAD(&tcp_ehash[i].chain); | 2073 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); |
2315 | } | 2074 | } |
2316 | 2075 | ||
2317 | tcp_bhash = (struct tcp_bind_hashbucket *) | 2076 | tcp_hashinfo.bhash = |
2318 | alloc_large_system_hash("TCP bind", | 2077 | alloc_large_system_hash("TCP bind", |
2319 | sizeof(struct tcp_bind_hashbucket), | 2078 | sizeof(struct inet_bind_hashbucket), |
2320 | tcp_ehash_size, | 2079 | tcp_hashinfo.ehash_size, |
2321 | (num_physpages >= 128 * 1024) ? | 2080 | (num_physpages >= 128 * 1024) ? |
2322 | (25 - PAGE_SHIFT) : | 2081 | (25 - PAGE_SHIFT) : |
2323 | (27 - PAGE_SHIFT), | 2082 | (27 - PAGE_SHIFT), |
2324 | HASH_HIGHMEM, | 2083 | HASH_HIGHMEM, |
2325 | &tcp_bhash_size, | 2084 | &tcp_hashinfo.bhash_size, |
2326 | NULL, | 2085 | NULL, |
2327 | 64 * 1024); | 2086 | 64 * 1024); |
2328 | tcp_bhash_size = 1 << tcp_bhash_size; | 2087 | tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; |
2329 | for (i = 0; i < tcp_bhash_size; i++) { | 2088 | for (i = 0; i < tcp_hashinfo.bhash_size; i++) { |
2330 | spin_lock_init(&tcp_bhash[i].lock); | 2089 | spin_lock_init(&tcp_hashinfo.bhash[i].lock); |
2331 | INIT_HLIST_HEAD(&tcp_bhash[i].chain); | 2090 | INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); |
2332 | } | 2091 | } |
2333 | 2092 | ||
2334 | /* Try to be a bit smarter and adjust defaults depending | 2093 | /* Try to be a bit smarter and adjust defaults depending |
2335 | * on available memory. | 2094 | * on available memory. |
2336 | */ | 2095 | */ |
2337 | for (order = 0; ((1 << order) << PAGE_SHIFT) < | 2096 | for (order = 0; ((1 << order) << PAGE_SHIFT) < |
2338 | (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); | 2097 | (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); |
2339 | order++) | 2098 | order++) |
2340 | ; | 2099 | ; |
2341 | if (order >= 4) { | 2100 | if (order >= 4) { |
2342 | sysctl_local_port_range[0] = 32768; | 2101 | sysctl_local_port_range[0] = 32768; |
2343 | sysctl_local_port_range[1] = 61000; | 2102 | sysctl_local_port_range[1] = 61000; |
2344 | sysctl_tcp_max_tw_buckets = 180000; | 2103 | tcp_death_row.sysctl_max_tw_buckets = 180000; |
2345 | sysctl_tcp_max_orphans = 4096 << (order - 4); | 2104 | sysctl_tcp_max_orphans = 4096 << (order - 4); |
2346 | sysctl_max_syn_backlog = 1024; | 2105 | sysctl_max_syn_backlog = 1024; |
2347 | } else if (order < 3) { | 2106 | } else if (order < 3) { |
2348 | sysctl_local_port_range[0] = 1024 * (3 - order); | 2107 | sysctl_local_port_range[0] = 1024 * (3 - order); |
2349 | sysctl_tcp_max_tw_buckets >>= (3 - order); | 2108 | tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); |
2350 | sysctl_tcp_max_orphans >>= (3 - order); | 2109 | sysctl_tcp_max_orphans >>= (3 - order); |
2351 | sysctl_max_syn_backlog = 128; | 2110 | sysctl_max_syn_backlog = 128; |
2352 | } | 2111 | } |
2353 | tcp_port_rover = sysctl_local_port_range[0] - 1; | 2112 | tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; |
2354 | 2113 | ||
2355 | sysctl_tcp_mem[0] = 768 << order; | 2114 | sysctl_tcp_mem[0] = 768 << order; |
2356 | sysctl_tcp_mem[1] = 1024 << order; | 2115 | sysctl_tcp_mem[1] = 1024 << order; |
@@ -2365,14 +2124,12 @@ void __init tcp_init(void) | |||
2365 | 2124 | ||
2366 | printk(KERN_INFO "TCP: Hash tables configured " | 2125 | printk(KERN_INFO "TCP: Hash tables configured " |
2367 | "(established %d bind %d)\n", | 2126 | "(established %d bind %d)\n", |
2368 | tcp_ehash_size << 1, tcp_bhash_size); | 2127 | tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size); |
2369 | 2128 | ||
2370 | tcp_register_congestion_control(&tcp_reno); | 2129 | tcp_register_congestion_control(&tcp_reno); |
2371 | } | 2130 | } |
2372 | 2131 | ||
2373 | EXPORT_SYMBOL(tcp_accept); | ||
2374 | EXPORT_SYMBOL(tcp_close); | 2132 | EXPORT_SYMBOL(tcp_close); |
2375 | EXPORT_SYMBOL(tcp_destroy_sock); | ||
2376 | EXPORT_SYMBOL(tcp_disconnect); | 2133 | EXPORT_SYMBOL(tcp_disconnect); |
2377 | EXPORT_SYMBOL(tcp_getsockopt); | 2134 | EXPORT_SYMBOL(tcp_getsockopt); |
2378 | EXPORT_SYMBOL(tcp_ioctl); | 2135 | EXPORT_SYMBOL(tcp_ioctl); |
@@ -2384,4 +2141,3 @@ EXPORT_SYMBOL(tcp_sendpage); | |||
2384 | EXPORT_SYMBOL(tcp_setsockopt); | 2141 | EXPORT_SYMBOL(tcp_setsockopt); |
2385 | EXPORT_SYMBOL(tcp_shutdown); | 2142 | EXPORT_SYMBOL(tcp_shutdown); |
2386 | EXPORT_SYMBOL(tcp_statistics); | 2143 | EXPORT_SYMBOL(tcp_statistics); |
2387 | EXPORT_SYMBOL(tcp_timewait_cachep); | ||
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index ec38d45d6649..b940346de4e7 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c | |||
@@ -86,11 +86,11 @@ static inline void bictcp_reset(struct bictcp *ca) | |||
86 | ca->delayed_ack = 2 << ACK_RATIO_SHIFT; | 86 | ca->delayed_ack = 2 << ACK_RATIO_SHIFT; |
87 | } | 87 | } |
88 | 88 | ||
89 | static void bictcp_init(struct tcp_sock *tp) | 89 | static void bictcp_init(struct sock *sk) |
90 | { | 90 | { |
91 | bictcp_reset(tcp_ca(tp)); | 91 | bictcp_reset(inet_csk_ca(sk)); |
92 | if (initial_ssthresh) | 92 | if (initial_ssthresh) |
93 | tp->snd_ssthresh = initial_ssthresh; | 93 | tcp_sk(sk)->snd_ssthresh = initial_ssthresh; |
94 | } | 94 | } |
95 | 95 | ||
96 | /* | 96 | /* |
@@ -156,9 +156,10 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
156 | 156 | ||
157 | 157 | ||
158 | /* Detect low utilization in congestion avoidance */ | 158 | /* Detect low utilization in congestion avoidance */ |
159 | static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) | 159 | static inline void bictcp_low_utilization(struct sock *sk, int flag) |
160 | { | 160 | { |
161 | struct bictcp *ca = tcp_ca(tp); | 161 | const struct tcp_sock *tp = tcp_sk(sk); |
162 | struct bictcp *ca = inet_csk_ca(sk); | ||
162 | u32 dist, delay; | 163 | u32 dist, delay; |
163 | 164 | ||
164 | /* No time stamp */ | 165 | /* No time stamp */ |
@@ -208,12 +209,13 @@ static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) | |||
208 | 209 | ||
209 | } | 210 | } |
210 | 211 | ||
211 | static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, | 212 | static void bictcp_cong_avoid(struct sock *sk, u32 ack, |
212 | u32 seq_rtt, u32 in_flight, int data_acked) | 213 | u32 seq_rtt, u32 in_flight, int data_acked) |
213 | { | 214 | { |
214 | struct bictcp *ca = tcp_ca(tp); | 215 | struct tcp_sock *tp = tcp_sk(sk); |
216 | struct bictcp *ca = inet_csk_ca(sk); | ||
215 | 217 | ||
216 | bictcp_low_utilization(tp, data_acked); | 218 | bictcp_low_utilization(sk, data_acked); |
217 | 219 | ||
218 | if (in_flight < tp->snd_cwnd) | 220 | if (in_flight < tp->snd_cwnd) |
219 | return; | 221 | return; |
@@ -242,9 +244,10 @@ static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, | |||
242 | * behave like Reno until low_window is reached, | 244 | * behave like Reno until low_window is reached, |
243 | * then increase congestion window slowly | 245 | * then increase congestion window slowly |
244 | */ | 246 | */ |
245 | static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) | 247 | static u32 bictcp_recalc_ssthresh(struct sock *sk) |
246 | { | 248 | { |
247 | struct bictcp *ca = tcp_ca(tp); | 249 | const struct tcp_sock *tp = tcp_sk(sk); |
250 | struct bictcp *ca = inet_csk_ca(sk); | ||
248 | 251 | ||
249 | ca->epoch_start = 0; /* end of epoch */ | 252 | ca->epoch_start = 0; /* end of epoch */ |
250 | 253 | ||
@@ -269,31 +272,34 @@ static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) | |||
269 | return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); | 272 | return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); |
270 | } | 273 | } |
271 | 274 | ||
272 | static u32 bictcp_undo_cwnd(struct tcp_sock *tp) | 275 | static u32 bictcp_undo_cwnd(struct sock *sk) |
273 | { | 276 | { |
274 | struct bictcp *ca = tcp_ca(tp); | 277 | const struct tcp_sock *tp = tcp_sk(sk); |
275 | 278 | const struct bictcp *ca = inet_csk_ca(sk); | |
276 | return max(tp->snd_cwnd, ca->last_max_cwnd); | 279 | return max(tp->snd_cwnd, ca->last_max_cwnd); |
277 | } | 280 | } |
278 | 281 | ||
279 | static u32 bictcp_min_cwnd(struct tcp_sock *tp) | 282 | static u32 bictcp_min_cwnd(struct sock *sk) |
280 | { | 283 | { |
284 | const struct tcp_sock *tp = tcp_sk(sk); | ||
281 | return tp->snd_ssthresh; | 285 | return tp->snd_ssthresh; |
282 | } | 286 | } |
283 | 287 | ||
284 | static void bictcp_state(struct tcp_sock *tp, u8 new_state) | 288 | static void bictcp_state(struct sock *sk, u8 new_state) |
285 | { | 289 | { |
286 | if (new_state == TCP_CA_Loss) | 290 | if (new_state == TCP_CA_Loss) |
287 | bictcp_reset(tcp_ca(tp)); | 291 | bictcp_reset(inet_csk_ca(sk)); |
288 | } | 292 | } |
289 | 293 | ||
290 | /* Track delayed acknowledgement ratio using sliding window | 294 | /* Track delayed acknowledgement ratio using sliding window |
291 | * ratio = (15*ratio + sample) / 16 | 295 | * ratio = (15*ratio + sample) / 16 |
292 | */ | 296 | */ |
293 | static void bictcp_acked(struct tcp_sock *tp, u32 cnt) | 297 | static void bictcp_acked(struct sock *sk, u32 cnt) |
294 | { | 298 | { |
295 | if (cnt > 0 && tp->ca_state == TCP_CA_Open) { | 299 | const struct inet_connection_sock *icsk = inet_csk(sk); |
296 | struct bictcp *ca = tcp_ca(tp); | 300 | |
301 | if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { | ||
302 | struct bictcp *ca = inet_csk_ca(sk); | ||
297 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; | 303 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; |
298 | ca->delayed_ack += cnt; | 304 | ca->delayed_ack += cnt; |
299 | } | 305 | } |
@@ -314,7 +320,7 @@ static struct tcp_congestion_ops bictcp = { | |||
314 | 320 | ||
315 | static int __init bictcp_register(void) | 321 | static int __init bictcp_register(void) |
316 | { | 322 | { |
317 | BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); | 323 | BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); |
318 | return tcp_register_congestion_control(&bictcp); | 324 | return tcp_register_congestion_control(&bictcp); |
319 | } | 325 | } |
320 | 326 | ||
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 4970d10a7785..bbf2d6624e89 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c | |||
@@ -73,33 +73,36 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) | |||
73 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); | 73 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); |
74 | 74 | ||
75 | /* Assign choice of congestion control. */ | 75 | /* Assign choice of congestion control. */ |
76 | void tcp_init_congestion_control(struct tcp_sock *tp) | 76 | void tcp_init_congestion_control(struct sock *sk) |
77 | { | 77 | { |
78 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
78 | struct tcp_congestion_ops *ca; | 79 | struct tcp_congestion_ops *ca; |
79 | 80 | ||
80 | if (tp->ca_ops != &tcp_init_congestion_ops) | 81 | if (icsk->icsk_ca_ops != &tcp_init_congestion_ops) |
81 | return; | 82 | return; |
82 | 83 | ||
83 | rcu_read_lock(); | 84 | rcu_read_lock(); |
84 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { | 85 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { |
85 | if (try_module_get(ca->owner)) { | 86 | if (try_module_get(ca->owner)) { |
86 | tp->ca_ops = ca; | 87 | icsk->icsk_ca_ops = ca; |
87 | break; | 88 | break; |
88 | } | 89 | } |
89 | 90 | ||
90 | } | 91 | } |
91 | rcu_read_unlock(); | 92 | rcu_read_unlock(); |
92 | 93 | ||
93 | if (tp->ca_ops->init) | 94 | if (icsk->icsk_ca_ops->init) |
94 | tp->ca_ops->init(tp); | 95 | icsk->icsk_ca_ops->init(sk); |
95 | } | 96 | } |
96 | 97 | ||
97 | /* Manage refcounts on socket close. */ | 98 | /* Manage refcounts on socket close. */ |
98 | void tcp_cleanup_congestion_control(struct tcp_sock *tp) | 99 | void tcp_cleanup_congestion_control(struct sock *sk) |
99 | { | 100 | { |
100 | if (tp->ca_ops->release) | 101 | struct inet_connection_sock *icsk = inet_csk(sk); |
101 | tp->ca_ops->release(tp); | 102 | |
102 | module_put(tp->ca_ops->owner); | 103 | if (icsk->icsk_ca_ops->release) |
104 | icsk->icsk_ca_ops->release(sk); | ||
105 | module_put(icsk->icsk_ca_ops->owner); | ||
103 | } | 106 | } |
104 | 107 | ||
105 | /* Used by sysctl to change default congestion control */ | 108 | /* Used by sysctl to change default congestion control */ |
@@ -143,14 +146,15 @@ void tcp_get_default_congestion_control(char *name) | |||
143 | } | 146 | } |
144 | 147 | ||
145 | /* Change congestion control for socket */ | 148 | /* Change congestion control for socket */ |
146 | int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) | 149 | int tcp_set_congestion_control(struct sock *sk, const char *name) |
147 | { | 150 | { |
151 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
148 | struct tcp_congestion_ops *ca; | 152 | struct tcp_congestion_ops *ca; |
149 | int err = 0; | 153 | int err = 0; |
150 | 154 | ||
151 | rcu_read_lock(); | 155 | rcu_read_lock(); |
152 | ca = tcp_ca_find(name); | 156 | ca = tcp_ca_find(name); |
153 | if (ca == tp->ca_ops) | 157 | if (ca == icsk->icsk_ca_ops) |
154 | goto out; | 158 | goto out; |
155 | 159 | ||
156 | if (!ca) | 160 | if (!ca) |
@@ -160,10 +164,10 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) | |||
160 | err = -EBUSY; | 164 | err = -EBUSY; |
161 | 165 | ||
162 | else { | 166 | else { |
163 | tcp_cleanup_congestion_control(tp); | 167 | tcp_cleanup_congestion_control(sk); |
164 | tp->ca_ops = ca; | 168 | icsk->icsk_ca_ops = ca; |
165 | if (tp->ca_ops->init) | 169 | if (icsk->icsk_ca_ops->init) |
166 | tp->ca_ops->init(tp); | 170 | icsk->icsk_ca_ops->init(sk); |
167 | } | 171 | } |
168 | out: | 172 | out: |
169 | rcu_read_unlock(); | 173 | rcu_read_unlock(); |
@@ -177,9 +181,11 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) | |||
177 | /* This is Jacobson's slow start and congestion avoidance. | 181 | /* This is Jacobson's slow start and congestion avoidance. |
178 | * SIGCOMM '88, p. 328. | 182 | * SIGCOMM '88, p. 328. |
179 | */ | 183 | */ |
180 | void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, | 184 | void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, |
181 | int flag) | 185 | int flag) |
182 | { | 186 | { |
187 | struct tcp_sock *tp = tcp_sk(sk); | ||
188 | |||
183 | if (in_flight < tp->snd_cwnd) | 189 | if (in_flight < tp->snd_cwnd) |
184 | return; | 190 | return; |
185 | 191 | ||
@@ -202,15 +208,17 @@ void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, | |||
202 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); | 208 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); |
203 | 209 | ||
204 | /* Slow start threshold is half the congestion window (min 2) */ | 210 | /* Slow start threshold is half the congestion window (min 2) */ |
205 | u32 tcp_reno_ssthresh(struct tcp_sock *tp) | 211 | u32 tcp_reno_ssthresh(struct sock *sk) |
206 | { | 212 | { |
213 | const struct tcp_sock *tp = tcp_sk(sk); | ||
207 | return max(tp->snd_cwnd >> 1U, 2U); | 214 | return max(tp->snd_cwnd >> 1U, 2U); |
208 | } | 215 | } |
209 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); | 216 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); |
210 | 217 | ||
211 | /* Lower bound on congestion window. */ | 218 | /* Lower bound on congestion window. */ |
212 | u32 tcp_reno_min_cwnd(struct tcp_sock *tp) | 219 | u32 tcp_reno_min_cwnd(struct sock *sk) |
213 | { | 220 | { |
221 | const struct tcp_sock *tp = tcp_sk(sk); | ||
214 | return tp->snd_ssthresh/2; | 222 | return tp->snd_ssthresh/2; |
215 | } | 223 | } |
216 | EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); | 224 | EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); |
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index f66945cb158f..c148c1081880 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * tcp_diag.c Module for monitoring TCP sockets. | 2 | * tcp_diag.c Module for monitoring TCP transport protocols sockets. |
3 | * | 3 | * |
4 | * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ | 4 | * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ |
5 | * | 5 | * |
@@ -12,779 +12,43 @@ | |||
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/config.h> | 14 | #include <linux/config.h> |
15 | #include <linux/module.h> | ||
16 | #include <linux/types.h> | ||
17 | #include <linux/fcntl.h> | ||
18 | #include <linux/random.h> | ||
19 | #include <linux/cache.h> | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/time.h> | ||
22 | |||
23 | #include <net/icmp.h> | ||
24 | #include <net/tcp.h> | ||
25 | #include <net/ipv6.h> | ||
26 | #include <net/inet_common.h> | ||
27 | |||
28 | #include <linux/inet.h> | ||
29 | #include <linux/stddef.h> | ||
30 | |||
31 | #include <linux/tcp_diag.h> | ||
32 | 15 | ||
33 | struct tcpdiag_entry | 16 | #include <linux/module.h> |
34 | { | 17 | #include <linux/inet_diag.h> |
35 | u32 *saddr; | ||
36 | u32 *daddr; | ||
37 | u16 sport; | ||
38 | u16 dport; | ||
39 | u16 family; | ||
40 | u16 userlocks; | ||
41 | }; | ||
42 | 18 | ||
43 | static struct sock *tcpnl; | 19 | #include <linux/tcp.h> |
44 | 20 | ||
45 | #define TCPDIAG_PUT(skb, attrtype, attrlen) \ | 21 | #include <net/tcp.h> |
46 | RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) | ||
47 | 22 | ||
48 | static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | 23 | static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, |
49 | int ext, u32 pid, u32 seq, u16 nlmsg_flags) | 24 | void *_info) |
50 | { | 25 | { |
51 | struct inet_sock *inet = inet_sk(sk); | 26 | const struct tcp_sock *tp = tcp_sk(sk); |
52 | struct tcp_sock *tp = tcp_sk(sk); | 27 | struct tcp_info *info = _info; |
53 | struct tcpdiagmsg *r; | ||
54 | struct nlmsghdr *nlh; | ||
55 | struct tcp_info *info = NULL; | ||
56 | struct tcpdiag_meminfo *minfo = NULL; | ||
57 | unsigned char *b = skb->tail; | ||
58 | |||
59 | nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); | ||
60 | nlh->nlmsg_flags = nlmsg_flags; | ||
61 | r = NLMSG_DATA(nlh); | ||
62 | if (sk->sk_state != TCP_TIME_WAIT) { | ||
63 | if (ext & (1<<(TCPDIAG_MEMINFO-1))) | ||
64 | minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo)); | ||
65 | if (ext & (1<<(TCPDIAG_INFO-1))) | ||
66 | info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); | ||
67 | |||
68 | if (ext & (1<<(TCPDIAG_CONG-1))) { | ||
69 | size_t len = strlen(tp->ca_ops->name); | ||
70 | strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), | ||
71 | tp->ca_ops->name); | ||
72 | } | ||
73 | } | ||
74 | r->tcpdiag_family = sk->sk_family; | ||
75 | r->tcpdiag_state = sk->sk_state; | ||
76 | r->tcpdiag_timer = 0; | ||
77 | r->tcpdiag_retrans = 0; | ||
78 | |||
79 | r->id.tcpdiag_if = sk->sk_bound_dev_if; | ||
80 | r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk; | ||
81 | r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); | ||
82 | |||
83 | if (r->tcpdiag_state == TCP_TIME_WAIT) { | ||
84 | struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk; | ||
85 | long tmo = tw->tw_ttd - jiffies; | ||
86 | if (tmo < 0) | ||
87 | tmo = 0; | ||
88 | |||
89 | r->id.tcpdiag_sport = tw->tw_sport; | ||
90 | r->id.tcpdiag_dport = tw->tw_dport; | ||
91 | r->id.tcpdiag_src[0] = tw->tw_rcv_saddr; | ||
92 | r->id.tcpdiag_dst[0] = tw->tw_daddr; | ||
93 | r->tcpdiag_state = tw->tw_substate; | ||
94 | r->tcpdiag_timer = 3; | ||
95 | r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ; | ||
96 | r->tcpdiag_rqueue = 0; | ||
97 | r->tcpdiag_wqueue = 0; | ||
98 | r->tcpdiag_uid = 0; | ||
99 | r->tcpdiag_inode = 0; | ||
100 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
101 | if (r->tcpdiag_family == AF_INET6) { | ||
102 | ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, | ||
103 | &tw->tw_v6_rcv_saddr); | ||
104 | ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, | ||
105 | &tw->tw_v6_daddr); | ||
106 | } | ||
107 | #endif | ||
108 | nlh->nlmsg_len = skb->tail - b; | ||
109 | return skb->len; | ||
110 | } | ||
111 | |||
112 | r->id.tcpdiag_sport = inet->sport; | ||
113 | r->id.tcpdiag_dport = inet->dport; | ||
114 | r->id.tcpdiag_src[0] = inet->rcv_saddr; | ||
115 | r->id.tcpdiag_dst[0] = inet->daddr; | ||
116 | |||
117 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
118 | if (r->tcpdiag_family == AF_INET6) { | ||
119 | struct ipv6_pinfo *np = inet6_sk(sk); | ||
120 | |||
121 | ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, | ||
122 | &np->rcv_saddr); | ||
123 | ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, | ||
124 | &np->daddr); | ||
125 | } | ||
126 | #endif | ||
127 | |||
128 | #define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ | ||
129 | |||
130 | if (tp->pending == TCP_TIME_RETRANS) { | ||
131 | r->tcpdiag_timer = 1; | ||
132 | r->tcpdiag_retrans = tp->retransmits; | ||
133 | r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); | ||
134 | } else if (tp->pending == TCP_TIME_PROBE0) { | ||
135 | r->tcpdiag_timer = 4; | ||
136 | r->tcpdiag_retrans = tp->probes_out; | ||
137 | r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); | ||
138 | } else if (timer_pending(&sk->sk_timer)) { | ||
139 | r->tcpdiag_timer = 2; | ||
140 | r->tcpdiag_retrans = tp->probes_out; | ||
141 | r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); | ||
142 | } else { | ||
143 | r->tcpdiag_timer = 0; | ||
144 | r->tcpdiag_expires = 0; | ||
145 | } | ||
146 | #undef EXPIRES_IN_MS | ||
147 | 28 | ||
148 | r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; | 29 | r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; |
149 | r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; | 30 | r->idiag_wqueue = tp->write_seq - tp->snd_una; |
150 | r->tcpdiag_uid = sock_i_uid(sk); | 31 | if (info != NULL) |
151 | r->tcpdiag_inode = sock_i_ino(sk); | ||
152 | |||
153 | if (minfo) { | ||
154 | minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc); | ||
155 | minfo->tcpdiag_wmem = sk->sk_wmem_queued; | ||
156 | minfo->tcpdiag_fmem = sk->sk_forward_alloc; | ||
157 | minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc); | ||
158 | } | ||
159 | |||
160 | if (info) | ||
161 | tcp_get_info(sk, info); | 32 | tcp_get_info(sk, info); |
162 | |||
163 | if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) | ||
164 | tp->ca_ops->get_info(tp, ext, skb); | ||
165 | |||
166 | nlh->nlmsg_len = skb->tail - b; | ||
167 | return skb->len; | ||
168 | |||
169 | rtattr_failure: | ||
170 | nlmsg_failure: | ||
171 | skb_trim(skb, b - skb->data); | ||
172 | return -1; | ||
173 | } | ||
174 | |||
175 | extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, | ||
176 | int dif); | ||
177 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
178 | extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, | ||
179 | struct in6_addr *daddr, u16 dport, | ||
180 | int dif); | ||
181 | #else | ||
182 | static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, | ||
183 | struct in6_addr *daddr, u16 dport, | ||
184 | int dif) | ||
185 | { | ||
186 | return NULL; | ||
187 | } | ||
188 | #endif | ||
189 | |||
190 | static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) | ||
191 | { | ||
192 | int err; | ||
193 | struct sock *sk; | ||
194 | struct tcpdiagreq *req = NLMSG_DATA(nlh); | ||
195 | struct sk_buff *rep; | ||
196 | |||
197 | if (req->tcpdiag_family == AF_INET) { | ||
198 | sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, | ||
199 | req->id.tcpdiag_src[0], req->id.tcpdiag_sport, | ||
200 | req->id.tcpdiag_if); | ||
201 | } | ||
202 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
203 | else if (req->tcpdiag_family == AF_INET6) { | ||
204 | sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport, | ||
205 | (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport, | ||
206 | req->id.tcpdiag_if); | ||
207 | } | ||
208 | #endif | ||
209 | else { | ||
210 | return -EINVAL; | ||
211 | } | ||
212 | |||
213 | if (sk == NULL) | ||
214 | return -ENOENT; | ||
215 | |||
216 | err = -ESTALE; | ||
217 | if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE || | ||
218 | req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) && | ||
219 | ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] || | ||
220 | (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1])) | ||
221 | goto out; | ||
222 | |||
223 | err = -ENOMEM; | ||
224 | rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+ | ||
225 | sizeof(struct tcpdiag_meminfo)+ | ||
226 | sizeof(struct tcp_info)+64), GFP_KERNEL); | ||
227 | if (!rep) | ||
228 | goto out; | ||
229 | |||
230 | if (tcpdiag_fill(rep, sk, req->tcpdiag_ext, | ||
231 | NETLINK_CB(in_skb).pid, | ||
232 | nlh->nlmsg_seq, 0) <= 0) | ||
233 | BUG(); | ||
234 | |||
235 | err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); | ||
236 | if (err > 0) | ||
237 | err = 0; | ||
238 | |||
239 | out: | ||
240 | if (sk) { | ||
241 | if (sk->sk_state == TCP_TIME_WAIT) | ||
242 | tcp_tw_put((struct tcp_tw_bucket*)sk); | ||
243 | else | ||
244 | sock_put(sk); | ||
245 | } | ||
246 | return err; | ||
247 | } | ||
248 | |||
249 | static int bitstring_match(const u32 *a1, const u32 *a2, int bits) | ||
250 | { | ||
251 | int words = bits >> 5; | ||
252 | |||
253 | bits &= 0x1f; | ||
254 | |||
255 | if (words) { | ||
256 | if (memcmp(a1, a2, words << 2)) | ||
257 | return 0; | ||
258 | } | ||
259 | if (bits) { | ||
260 | __u32 w1, w2; | ||
261 | __u32 mask; | ||
262 | |||
263 | w1 = a1[words]; | ||
264 | w2 = a2[words]; | ||
265 | |||
266 | mask = htonl((0xffffffff) << (32 - bits)); | ||
267 | |||
268 | if ((w1 ^ w2) & mask) | ||
269 | return 0; | ||
270 | } | ||
271 | |||
272 | return 1; | ||
273 | } | ||
274 | |||
275 | |||
276 | static int tcpdiag_bc_run(const void *bc, int len, | ||
277 | const struct tcpdiag_entry *entry) | ||
278 | { | ||
279 | while (len > 0) { | ||
280 | int yes = 1; | ||
281 | const struct tcpdiag_bc_op *op = bc; | ||
282 | |||
283 | switch (op->code) { | ||
284 | case TCPDIAG_BC_NOP: | ||
285 | break; | ||
286 | case TCPDIAG_BC_JMP: | ||
287 | yes = 0; | ||
288 | break; | ||
289 | case TCPDIAG_BC_S_GE: | ||
290 | yes = entry->sport >= op[1].no; | ||
291 | break; | ||
292 | case TCPDIAG_BC_S_LE: | ||
293 | yes = entry->dport <= op[1].no; | ||
294 | break; | ||
295 | case TCPDIAG_BC_D_GE: | ||
296 | yes = entry->dport >= op[1].no; | ||
297 | break; | ||
298 | case TCPDIAG_BC_D_LE: | ||
299 | yes = entry->dport <= op[1].no; | ||
300 | break; | ||
301 | case TCPDIAG_BC_AUTO: | ||
302 | yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); | ||
303 | break; | ||
304 | case TCPDIAG_BC_S_COND: | ||
305 | case TCPDIAG_BC_D_COND: | ||
306 | { | ||
307 | struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1); | ||
308 | u32 *addr; | ||
309 | |||
310 | if (cond->port != -1 && | ||
311 | cond->port != (op->code == TCPDIAG_BC_S_COND ? | ||
312 | entry->sport : entry->dport)) { | ||
313 | yes = 0; | ||
314 | break; | ||
315 | } | ||
316 | |||
317 | if (cond->prefix_len == 0) | ||
318 | break; | ||
319 | |||
320 | if (op->code == TCPDIAG_BC_S_COND) | ||
321 | addr = entry->saddr; | ||
322 | else | ||
323 | addr = entry->daddr; | ||
324 | |||
325 | if (bitstring_match(addr, cond->addr, cond->prefix_len)) | ||
326 | break; | ||
327 | if (entry->family == AF_INET6 && | ||
328 | cond->family == AF_INET) { | ||
329 | if (addr[0] == 0 && addr[1] == 0 && | ||
330 | addr[2] == htonl(0xffff) && | ||
331 | bitstring_match(addr+3, cond->addr, cond->prefix_len)) | ||
332 | break; | ||
333 | } | ||
334 | yes = 0; | ||
335 | break; | ||
336 | } | ||
337 | } | ||
338 | |||
339 | if (yes) { | ||
340 | len -= op->yes; | ||
341 | bc += op->yes; | ||
342 | } else { | ||
343 | len -= op->no; | ||
344 | bc += op->no; | ||
345 | } | ||
346 | } | ||
347 | return (len == 0); | ||
348 | } | ||
349 | |||
350 | static int valid_cc(const void *bc, int len, int cc) | ||
351 | { | ||
352 | while (len >= 0) { | ||
353 | const struct tcpdiag_bc_op *op = bc; | ||
354 | |||
355 | if (cc > len) | ||
356 | return 0; | ||
357 | if (cc == len) | ||
358 | return 1; | ||
359 | if (op->yes < 4) | ||
360 | return 0; | ||
361 | len -= op->yes; | ||
362 | bc += op->yes; | ||
363 | } | ||
364 | return 0; | ||
365 | } | ||
366 | |||
367 | static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len) | ||
368 | { | ||
369 | const unsigned char *bc = bytecode; | ||
370 | int len = bytecode_len; | ||
371 | |||
372 | while (len > 0) { | ||
373 | struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc; | ||
374 | |||
375 | //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); | ||
376 | switch (op->code) { | ||
377 | case TCPDIAG_BC_AUTO: | ||
378 | case TCPDIAG_BC_S_COND: | ||
379 | case TCPDIAG_BC_D_COND: | ||
380 | case TCPDIAG_BC_S_GE: | ||
381 | case TCPDIAG_BC_S_LE: | ||
382 | case TCPDIAG_BC_D_GE: | ||
383 | case TCPDIAG_BC_D_LE: | ||
384 | if (op->yes < 4 || op->yes > len+4) | ||
385 | return -EINVAL; | ||
386 | case TCPDIAG_BC_JMP: | ||
387 | if (op->no < 4 || op->no > len+4) | ||
388 | return -EINVAL; | ||
389 | if (op->no < len && | ||
390 | !valid_cc(bytecode, bytecode_len, len-op->no)) | ||
391 | return -EINVAL; | ||
392 | break; | ||
393 | case TCPDIAG_BC_NOP: | ||
394 | if (op->yes < 4 || op->yes > len+4) | ||
395 | return -EINVAL; | ||
396 | break; | ||
397 | default: | ||
398 | return -EINVAL; | ||
399 | } | ||
400 | bc += op->yes; | ||
401 | len -= op->yes; | ||
402 | } | ||
403 | return len == 0 ? 0 : -EINVAL; | ||
404 | } | ||
405 | |||
406 | static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, | ||
407 | struct netlink_callback *cb) | ||
408 | { | ||
409 | struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); | ||
410 | |||
411 | if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { | ||
412 | struct tcpdiag_entry entry; | ||
413 | struct rtattr *bc = (struct rtattr *)(r + 1); | ||
414 | struct inet_sock *inet = inet_sk(sk); | ||
415 | |||
416 | entry.family = sk->sk_family; | ||
417 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
418 | if (entry.family == AF_INET6) { | ||
419 | struct ipv6_pinfo *np = inet6_sk(sk); | ||
420 | |||
421 | entry.saddr = np->rcv_saddr.s6_addr32; | ||
422 | entry.daddr = np->daddr.s6_addr32; | ||
423 | } else | ||
424 | #endif | ||
425 | { | ||
426 | entry.saddr = &inet->rcv_saddr; | ||
427 | entry.daddr = &inet->daddr; | ||
428 | } | ||
429 | entry.sport = inet->num; | ||
430 | entry.dport = ntohs(inet->dport); | ||
431 | entry.userlocks = sk->sk_userlocks; | ||
432 | |||
433 | if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) | ||
434 | return 0; | ||
435 | } | ||
436 | |||
437 | return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid, | ||
438 | cb->nlh->nlmsg_seq, NLM_F_MULTI); | ||
439 | } | 33 | } |
440 | 34 | ||
441 | static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, | 35 | static struct inet_diag_handler tcp_diag_handler = { |
442 | struct request_sock *req, | 36 | .idiag_hashinfo = &tcp_hashinfo, |
443 | u32 pid, u32 seq) | 37 | .idiag_get_info = tcp_diag_get_info, |
444 | { | 38 | .idiag_type = TCPDIAG_GETSOCK, |
445 | const struct inet_request_sock *ireq = inet_rsk(req); | 39 | .idiag_info_size = sizeof(struct tcp_info), |
446 | struct inet_sock *inet = inet_sk(sk); | 40 | }; |
447 | unsigned char *b = skb->tail; | ||
448 | struct tcpdiagmsg *r; | ||
449 | struct nlmsghdr *nlh; | ||
450 | long tmo; | ||
451 | |||
452 | nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); | ||
453 | nlh->nlmsg_flags = NLM_F_MULTI; | ||
454 | r = NLMSG_DATA(nlh); | ||
455 | |||
456 | r->tcpdiag_family = sk->sk_family; | ||
457 | r->tcpdiag_state = TCP_SYN_RECV; | ||
458 | r->tcpdiag_timer = 1; | ||
459 | r->tcpdiag_retrans = req->retrans; | ||
460 | |||
461 | r->id.tcpdiag_if = sk->sk_bound_dev_if; | ||
462 | r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req; | ||
463 | r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); | ||
464 | |||
465 | tmo = req->expires - jiffies; | ||
466 | if (tmo < 0) | ||
467 | tmo = 0; | ||
468 | |||
469 | r->id.tcpdiag_sport = inet->sport; | ||
470 | r->id.tcpdiag_dport = ireq->rmt_port; | ||
471 | r->id.tcpdiag_src[0] = ireq->loc_addr; | ||
472 | r->id.tcpdiag_dst[0] = ireq->rmt_addr; | ||
473 | r->tcpdiag_expires = jiffies_to_msecs(tmo), | ||
474 | r->tcpdiag_rqueue = 0; | ||
475 | r->tcpdiag_wqueue = 0; | ||
476 | r->tcpdiag_uid = sock_i_uid(sk); | ||
477 | r->tcpdiag_inode = 0; | ||
478 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
479 | if (r->tcpdiag_family == AF_INET6) { | ||
480 | ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, | ||
481 | &tcp6_rsk(req)->loc_addr); | ||
482 | ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, | ||
483 | &tcp6_rsk(req)->rmt_addr); | ||
484 | } | ||
485 | #endif | ||
486 | nlh->nlmsg_len = skb->tail - b; | ||
487 | |||
488 | return skb->len; | ||
489 | |||
490 | nlmsg_failure: | ||
491 | skb_trim(skb, b - skb->data); | ||
492 | return -1; | ||
493 | } | ||
494 | |||
495 | static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, | ||
496 | struct netlink_callback *cb) | ||
497 | { | ||
498 | struct tcpdiag_entry entry; | ||
499 | struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); | ||
500 | struct tcp_sock *tp = tcp_sk(sk); | ||
501 | struct listen_sock *lopt; | ||
502 | struct rtattr *bc = NULL; | ||
503 | struct inet_sock *inet = inet_sk(sk); | ||
504 | int j, s_j; | ||
505 | int reqnum, s_reqnum; | ||
506 | int err = 0; | ||
507 | |||
508 | s_j = cb->args[3]; | ||
509 | s_reqnum = cb->args[4]; | ||
510 | |||
511 | if (s_j > 0) | ||
512 | s_j--; | ||
513 | |||
514 | entry.family = sk->sk_family; | ||
515 | |||
516 | read_lock_bh(&tp->accept_queue.syn_wait_lock); | ||
517 | |||
518 | lopt = tp->accept_queue.listen_opt; | ||
519 | if (!lopt || !lopt->qlen) | ||
520 | goto out; | ||
521 | |||
522 | if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { | ||
523 | bc = (struct rtattr *)(r + 1); | ||
524 | entry.sport = inet->num; | ||
525 | entry.userlocks = sk->sk_userlocks; | ||
526 | } | ||
527 | |||
528 | for (j = s_j; j < TCP_SYNQ_HSIZE; j++) { | ||
529 | struct request_sock *req, *head = lopt->syn_table[j]; | ||
530 | |||
531 | reqnum = 0; | ||
532 | for (req = head; req; reqnum++, req = req->dl_next) { | ||
533 | struct inet_request_sock *ireq = inet_rsk(req); | ||
534 | |||
535 | if (reqnum < s_reqnum) | ||
536 | continue; | ||
537 | if (r->id.tcpdiag_dport != ireq->rmt_port && | ||
538 | r->id.tcpdiag_dport) | ||
539 | continue; | ||
540 | |||
541 | if (bc) { | ||
542 | entry.saddr = | ||
543 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
544 | (entry.family == AF_INET6) ? | ||
545 | tcp6_rsk(req)->loc_addr.s6_addr32 : | ||
546 | #endif | ||
547 | &ireq->loc_addr; | ||
548 | entry.daddr = | ||
549 | #ifdef CONFIG_IP_TCPDIAG_IPV6 | ||
550 | (entry.family == AF_INET6) ? | ||
551 | tcp6_rsk(req)->rmt_addr.s6_addr32 : | ||
552 | #endif | ||
553 | &ireq->rmt_addr; | ||
554 | entry.dport = ntohs(ireq->rmt_port); | ||
555 | |||
556 | if (!tcpdiag_bc_run(RTA_DATA(bc), | ||
557 | RTA_PAYLOAD(bc), &entry)) | ||
558 | continue; | ||
559 | } | ||
560 | |||
561 | err = tcpdiag_fill_req(skb, sk, req, | ||
562 | NETLINK_CB(cb->skb).pid, | ||
563 | cb->nlh->nlmsg_seq); | ||
564 | if (err < 0) { | ||
565 | cb->args[3] = j + 1; | ||
566 | cb->args[4] = reqnum; | ||
567 | goto out; | ||
568 | } | ||
569 | } | ||
570 | |||
571 | s_reqnum = 0; | ||
572 | } | ||
573 | |||
574 | out: | ||
575 | read_unlock_bh(&tp->accept_queue.syn_wait_lock); | ||
576 | |||
577 | return err; | ||
578 | } | ||
579 | |||
580 | static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) | ||
581 | { | ||
582 | int i, num; | ||
583 | int s_i, s_num; | ||
584 | struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); | ||
585 | |||
586 | s_i = cb->args[1]; | ||
587 | s_num = num = cb->args[2]; | ||
588 | |||
589 | if (cb->args[0] == 0) { | ||
590 | if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) | ||
591 | goto skip_listen_ht; | ||
592 | tcp_listen_lock(); | ||
593 | for (i = s_i; i < TCP_LHTABLE_SIZE; i++) { | ||
594 | struct sock *sk; | ||
595 | struct hlist_node *node; | ||
596 | |||
597 | num = 0; | ||
598 | sk_for_each(sk, node, &tcp_listening_hash[i]) { | ||
599 | struct inet_sock *inet = inet_sk(sk); | ||
600 | |||
601 | if (num < s_num) { | ||
602 | num++; | ||
603 | continue; | ||
604 | } | ||
605 | |||
606 | if (r->id.tcpdiag_sport != inet->sport && | ||
607 | r->id.tcpdiag_sport) | ||
608 | goto next_listen; | ||
609 | |||
610 | if (!(r->tcpdiag_states&TCPF_LISTEN) || | ||
611 | r->id.tcpdiag_dport || | ||
612 | cb->args[3] > 0) | ||
613 | goto syn_recv; | ||
614 | |||
615 | if (tcpdiag_dump_sock(skb, sk, cb) < 0) { | ||
616 | tcp_listen_unlock(); | ||
617 | goto done; | ||
618 | } | ||
619 | |||
620 | syn_recv: | ||
621 | if (!(r->tcpdiag_states&TCPF_SYN_RECV)) | ||
622 | goto next_listen; | ||
623 | |||
624 | if (tcpdiag_dump_reqs(skb, sk, cb) < 0) { | ||
625 | tcp_listen_unlock(); | ||
626 | goto done; | ||
627 | } | ||
628 | |||
629 | next_listen: | ||
630 | cb->args[3] = 0; | ||
631 | cb->args[4] = 0; | ||
632 | ++num; | ||
633 | } | ||
634 | |||
635 | s_num = 0; | ||
636 | cb->args[3] = 0; | ||
637 | cb->args[4] = 0; | ||
638 | } | ||
639 | tcp_listen_unlock(); | ||
640 | skip_listen_ht: | ||
641 | cb->args[0] = 1; | ||
642 | s_i = num = s_num = 0; | ||
643 | } | ||
644 | |||
645 | if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) | ||
646 | return skb->len; | ||
647 | |||
648 | for (i = s_i; i < tcp_ehash_size; i++) { | ||
649 | struct tcp_ehash_bucket *head = &tcp_ehash[i]; | ||
650 | struct sock *sk; | ||
651 | struct hlist_node *node; | ||
652 | |||
653 | if (i > s_i) | ||
654 | s_num = 0; | ||
655 | |||
656 | read_lock_bh(&head->lock); | ||
657 | |||
658 | num = 0; | ||
659 | sk_for_each(sk, node, &head->chain) { | ||
660 | struct inet_sock *inet = inet_sk(sk); | ||
661 | |||
662 | if (num < s_num) | ||
663 | goto next_normal; | ||
664 | if (!(r->tcpdiag_states & (1 << sk->sk_state))) | ||
665 | goto next_normal; | ||
666 | if (r->id.tcpdiag_sport != inet->sport && | ||
667 | r->id.tcpdiag_sport) | ||
668 | goto next_normal; | ||
669 | if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport) | ||
670 | goto next_normal; | ||
671 | if (tcpdiag_dump_sock(skb, sk, cb) < 0) { | ||
672 | read_unlock_bh(&head->lock); | ||
673 | goto done; | ||
674 | } | ||
675 | next_normal: | ||
676 | ++num; | ||
677 | } | ||
678 | |||
679 | if (r->tcpdiag_states&TCPF_TIME_WAIT) { | ||
680 | sk_for_each(sk, node, | ||
681 | &tcp_ehash[i + tcp_ehash_size].chain) { | ||
682 | struct inet_sock *inet = inet_sk(sk); | ||
683 | |||
684 | if (num < s_num) | ||
685 | goto next_dying; | ||
686 | if (r->id.tcpdiag_sport != inet->sport && | ||
687 | r->id.tcpdiag_sport) | ||
688 | goto next_dying; | ||
689 | if (r->id.tcpdiag_dport != inet->dport && | ||
690 | r->id.tcpdiag_dport) | ||
691 | goto next_dying; | ||
692 | if (tcpdiag_dump_sock(skb, sk, cb) < 0) { | ||
693 | read_unlock_bh(&head->lock); | ||
694 | goto done; | ||
695 | } | ||
696 | next_dying: | ||
697 | ++num; | ||
698 | } | ||
699 | } | ||
700 | read_unlock_bh(&head->lock); | ||
701 | } | ||
702 | |||
703 | done: | ||
704 | cb->args[1] = i; | ||
705 | cb->args[2] = num; | ||
706 | return skb->len; | ||
707 | } | ||
708 | |||
709 | static int tcpdiag_dump_done(struct netlink_callback *cb) | ||
710 | { | ||
711 | return 0; | ||
712 | } | ||
713 | |||
714 | |||
715 | static __inline__ int | ||
716 | tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | ||
717 | { | ||
718 | if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) | ||
719 | return 0; | ||
720 | |||
721 | if (nlh->nlmsg_type != TCPDIAG_GETSOCK) | ||
722 | goto err_inval; | ||
723 | |||
724 | if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) | ||
725 | goto err_inval; | ||
726 | |||
727 | if (nlh->nlmsg_flags&NLM_F_DUMP) { | ||
728 | if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) { | ||
729 | struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq)); | ||
730 | if (rta->rta_type != TCPDIAG_REQ_BYTECODE || | ||
731 | rta->rta_len < 8 || | ||
732 | rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq))) | ||
733 | goto err_inval; | ||
734 | if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) | ||
735 | goto err_inval; | ||
736 | } | ||
737 | return netlink_dump_start(tcpnl, skb, nlh, | ||
738 | tcpdiag_dump, | ||
739 | tcpdiag_dump_done); | ||
740 | } else { | ||
741 | return tcpdiag_get_exact(skb, nlh); | ||
742 | } | ||
743 | |||
744 | err_inval: | ||
745 | return -EINVAL; | ||
746 | } | ||
747 | |||
748 | |||
749 | static inline void tcpdiag_rcv_skb(struct sk_buff *skb) | ||
750 | { | ||
751 | int err; | ||
752 | struct nlmsghdr * nlh; | ||
753 | |||
754 | if (skb->len >= NLMSG_SPACE(0)) { | ||
755 | nlh = (struct nlmsghdr *)skb->data; | ||
756 | if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) | ||
757 | return; | ||
758 | err = tcpdiag_rcv_msg(skb, nlh); | ||
759 | if (err || nlh->nlmsg_flags & NLM_F_ACK) | ||
760 | netlink_ack(skb, nlh, err); | ||
761 | } | ||
762 | } | ||
763 | |||
764 | static void tcpdiag_rcv(struct sock *sk, int len) | ||
765 | { | ||
766 | struct sk_buff *skb; | ||
767 | unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); | ||
768 | |||
769 | while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { | ||
770 | tcpdiag_rcv_skb(skb); | ||
771 | kfree_skb(skb); | ||
772 | } | ||
773 | } | ||
774 | 41 | ||
775 | static int __init tcpdiag_init(void) | 42 | static int __init tcp_diag_init(void) |
776 | { | 43 | { |
777 | tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv); | 44 | return inet_diag_register(&tcp_diag_handler); |
778 | if (tcpnl == NULL) | ||
779 | return -ENOMEM; | ||
780 | return 0; | ||
781 | } | 45 | } |
782 | 46 | ||
783 | static void __exit tcpdiag_exit(void) | 47 | static void __exit tcp_diag_exit(void) |
784 | { | 48 | { |
785 | sock_release(tcpnl->sk_socket); | 49 | inet_diag_unregister(&tcp_diag_handler); |
786 | } | 50 | } |
787 | 51 | ||
788 | module_init(tcpdiag_init); | 52 | module_init(tcp_diag_init); |
789 | module_exit(tcpdiag_exit); | 53 | module_exit(tcp_diag_exit); |
790 | MODULE_LICENSE("GPL"); | 54 | MODULE_LICENSE("GPL"); |
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 36c51f8136bf..6acc04bde080 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c | |||
@@ -98,9 +98,10 @@ struct hstcp { | |||
98 | u32 ai; | 98 | u32 ai; |
99 | }; | 99 | }; |
100 | 100 | ||
101 | static void hstcp_init(struct tcp_sock *tp) | 101 | static void hstcp_init(struct sock *sk) |
102 | { | 102 | { |
103 | struct hstcp *ca = tcp_ca(tp); | 103 | struct tcp_sock *tp = tcp_sk(sk); |
104 | struct hstcp *ca = inet_csk_ca(sk); | ||
104 | 105 | ||
105 | ca->ai = 0; | 106 | ca->ai = 0; |
106 | 107 | ||
@@ -109,10 +110,11 @@ static void hstcp_init(struct tcp_sock *tp) | |||
109 | tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); | 110 | tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); |
110 | } | 111 | } |
111 | 112 | ||
112 | static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, | 113 | static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, |
113 | u32 in_flight, int good) | 114 | u32 in_flight, int good) |
114 | { | 115 | { |
115 | struct hstcp *ca = tcp_ca(tp); | 116 | struct tcp_sock *tp = tcp_sk(sk); |
117 | struct hstcp *ca = inet_csk_ca(sk); | ||
116 | 118 | ||
117 | if (in_flight < tp->snd_cwnd) | 119 | if (in_flight < tp->snd_cwnd) |
118 | return; | 120 | return; |
@@ -143,9 +145,10 @@ static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, | |||
143 | } | 145 | } |
144 | } | 146 | } |
145 | 147 | ||
146 | static u32 hstcp_ssthresh(struct tcp_sock *tp) | 148 | static u32 hstcp_ssthresh(struct sock *sk) |
147 | { | 149 | { |
148 | struct hstcp *ca = tcp_ca(tp); | 150 | const struct tcp_sock *tp = tcp_sk(sk); |
151 | const struct hstcp *ca = inet_csk_ca(sk); | ||
149 | 152 | ||
150 | /* Do multiplicative decrease */ | 153 | /* Do multiplicative decrease */ |
151 | return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); | 154 | return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); |
@@ -164,7 +167,7 @@ static struct tcp_congestion_ops tcp_highspeed = { | |||
164 | 167 | ||
165 | static int __init hstcp_register(void) | 168 | static int __init hstcp_register(void) |
166 | { | 169 | { |
167 | BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); | 170 | BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE); |
168 | return tcp_register_congestion_control(&tcp_highspeed); | 171 | return tcp_register_congestion_control(&tcp_highspeed); |
169 | } | 172 | } |
170 | 173 | ||
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 40168275acf9..e47b37984e95 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c | |||
@@ -55,18 +55,21 @@ static inline void htcp_reset(struct htcp *ca) | |||
55 | ca->snd_cwnd_cnt2 = 0; | 55 | ca->snd_cwnd_cnt2 = 0; |
56 | } | 56 | } |
57 | 57 | ||
58 | static u32 htcp_cwnd_undo(struct tcp_sock *tp) | 58 | static u32 htcp_cwnd_undo(struct sock *sk) |
59 | { | 59 | { |
60 | struct htcp *ca = tcp_ca(tp); | 60 | const struct tcp_sock *tp = tcp_sk(sk); |
61 | struct htcp *ca = inet_csk_ca(sk); | ||
61 | ca->ccount = ca->undo_ccount; | 62 | ca->ccount = ca->undo_ccount; |
62 | ca->maxRTT = ca->undo_maxRTT; | 63 | ca->maxRTT = ca->undo_maxRTT; |
63 | ca->old_maxB = ca->undo_old_maxB; | 64 | ca->old_maxB = ca->undo_old_maxB; |
64 | return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); | 65 | return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); |
65 | } | 66 | } |
66 | 67 | ||
67 | static inline void measure_rtt(struct tcp_sock *tp) | 68 | static inline void measure_rtt(struct sock *sk) |
68 | { | 69 | { |
69 | struct htcp *ca = tcp_ca(tp); | 70 | const struct inet_connection_sock *icsk = inet_csk(sk); |
71 | const struct tcp_sock *tp = tcp_sk(sk); | ||
72 | struct htcp *ca = inet_csk_ca(sk); | ||
70 | u32 srtt = tp->srtt>>3; | 73 | u32 srtt = tp->srtt>>3; |
71 | 74 | ||
72 | /* keep track of minimum RTT seen so far, minRTT is zero at first */ | 75 | /* keep track of minimum RTT seen so far, minRTT is zero at first */ |
@@ -74,7 +77,7 @@ static inline void measure_rtt(struct tcp_sock *tp) | |||
74 | ca->minRTT = srtt; | 77 | ca->minRTT = srtt; |
75 | 78 | ||
76 | /* max RTT */ | 79 | /* max RTT */ |
77 | if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { | 80 | if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { |
78 | if (ca->maxRTT < ca->minRTT) | 81 | if (ca->maxRTT < ca->minRTT) |
79 | ca->maxRTT = ca->minRTT; | 82 | ca->maxRTT = ca->minRTT; |
80 | if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) | 83 | if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) |
@@ -82,13 +85,16 @@ static inline void measure_rtt(struct tcp_sock *tp) | |||
82 | } | 85 | } |
83 | } | 86 | } |
84 | 87 | ||
85 | static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) | 88 | static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked) |
86 | { | 89 | { |
87 | struct htcp *ca = tcp_ca(tp); | 90 | const struct inet_connection_sock *icsk = inet_csk(sk); |
91 | const struct tcp_sock *tp = tcp_sk(sk); | ||
92 | struct htcp *ca = inet_csk_ca(sk); | ||
88 | u32 now = tcp_time_stamp; | 93 | u32 now = tcp_time_stamp; |
89 | 94 | ||
90 | /* achieved throughput calculations */ | 95 | /* achieved throughput calculations */ |
91 | if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { | 96 | if (icsk->icsk_ca_state != TCP_CA_Open && |
97 | icsk->icsk_ca_state != TCP_CA_Disorder) { | ||
92 | ca->packetcount = 0; | 98 | ca->packetcount = 0; |
93 | ca->lasttime = now; | 99 | ca->lasttime = now; |
94 | return; | 100 | return; |
@@ -173,9 +179,9 @@ static inline void htcp_alpha_update(struct htcp *ca) | |||
173 | * that point do we really have a real sense of maxRTT (the queues en route | 179 | * that point do we really have a real sense of maxRTT (the queues en route |
174 | * were getting just too full now). | 180 | * were getting just too full now). |
175 | */ | 181 | */ |
176 | static void htcp_param_update(struct tcp_sock *tp) | 182 | static void htcp_param_update(struct sock *sk) |
177 | { | 183 | { |
178 | struct htcp *ca = tcp_ca(tp); | 184 | struct htcp *ca = inet_csk_ca(sk); |
179 | u32 minRTT = ca->minRTT; | 185 | u32 minRTT = ca->minRTT; |
180 | u32 maxRTT = ca->maxRTT; | 186 | u32 maxRTT = ca->maxRTT; |
181 | 187 | ||
@@ -187,17 +193,19 @@ static void htcp_param_update(struct tcp_sock *tp) | |||
187 | ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; | 193 | ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; |
188 | } | 194 | } |
189 | 195 | ||
190 | static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) | 196 | static u32 htcp_recalc_ssthresh(struct sock *sk) |
191 | { | 197 | { |
192 | struct htcp *ca = tcp_ca(tp); | 198 | const struct tcp_sock *tp = tcp_sk(sk); |
193 | htcp_param_update(tp); | 199 | const struct htcp *ca = inet_csk_ca(sk); |
200 | htcp_param_update(sk); | ||
194 | return max((tp->snd_cwnd * ca->beta) >> 7, 2U); | 201 | return max((tp->snd_cwnd * ca->beta) >> 7, 2U); |
195 | } | 202 | } |
196 | 203 | ||
197 | static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | 204 | static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, |
198 | u32 in_flight, int data_acked) | 205 | u32 in_flight, int data_acked) |
199 | { | 206 | { |
200 | struct htcp *ca = tcp_ca(tp); | 207 | struct tcp_sock *tp = tcp_sk(sk); |
208 | struct htcp *ca = inet_csk_ca(sk); | ||
201 | 209 | ||
202 | if (in_flight < tp->snd_cwnd) | 210 | if (in_flight < tp->snd_cwnd) |
203 | return; | 211 | return; |
@@ -207,7 +215,7 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | |||
207 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | 215 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) |
208 | tp->snd_cwnd++; | 216 | tp->snd_cwnd++; |
209 | } else { | 217 | } else { |
210 | measure_rtt(tp); | 218 | measure_rtt(sk); |
211 | 219 | ||
212 | /* keep track of number of round-trip times since last backoff event */ | 220 | /* keep track of number of round-trip times since last backoff event */ |
213 | if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { | 221 | if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { |
@@ -229,28 +237,29 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | |||
229 | } | 237 | } |
230 | 238 | ||
231 | /* Lower bound on congestion window. */ | 239 | /* Lower bound on congestion window. */ |
232 | static u32 htcp_min_cwnd(struct tcp_sock *tp) | 240 | static u32 htcp_min_cwnd(struct sock *sk) |
233 | { | 241 | { |
242 | const struct tcp_sock *tp = tcp_sk(sk); | ||
234 | return tp->snd_ssthresh; | 243 | return tp->snd_ssthresh; |
235 | } | 244 | } |
236 | 245 | ||
237 | 246 | ||
238 | static void htcp_init(struct tcp_sock *tp) | 247 | static void htcp_init(struct sock *sk) |
239 | { | 248 | { |
240 | struct htcp *ca = tcp_ca(tp); | 249 | struct htcp *ca = inet_csk_ca(sk); |
241 | 250 | ||
242 | memset(ca, 0, sizeof(struct htcp)); | 251 | memset(ca, 0, sizeof(struct htcp)); |
243 | ca->alpha = ALPHA_BASE; | 252 | ca->alpha = ALPHA_BASE; |
244 | ca->beta = BETA_MIN; | 253 | ca->beta = BETA_MIN; |
245 | } | 254 | } |
246 | 255 | ||
247 | static void htcp_state(struct tcp_sock *tp, u8 new_state) | 256 | static void htcp_state(struct sock *sk, u8 new_state) |
248 | { | 257 | { |
249 | switch (new_state) { | 258 | switch (new_state) { |
250 | case TCP_CA_CWR: | 259 | case TCP_CA_CWR: |
251 | case TCP_CA_Recovery: | 260 | case TCP_CA_Recovery: |
252 | case TCP_CA_Loss: | 261 | case TCP_CA_Loss: |
253 | htcp_reset(tcp_ca(tp)); | 262 | htcp_reset(inet_csk_ca(sk)); |
254 | break; | 263 | break; |
255 | } | 264 | } |
256 | } | 265 | } |
@@ -269,7 +278,7 @@ static struct tcp_congestion_ops htcp = { | |||
269 | 278 | ||
270 | static int __init htcp_register(void) | 279 | static int __init htcp_register(void) |
271 | { | 280 | { |
272 | BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); | 281 | BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE); |
273 | BUILD_BUG_ON(BETA_MIN >= BETA_MAX); | 282 | BUILD_BUG_ON(BETA_MIN >= BETA_MAX); |
274 | if (!use_bandwidth_switch) | 283 | if (!use_bandwidth_switch) |
275 | htcp.pkts_acked = NULL; | 284 | htcp.pkts_acked = NULL; |
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 13a66342c304..77add63623df 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c | |||
@@ -33,19 +33,20 @@ MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); | |||
33 | 33 | ||
34 | 34 | ||
35 | /* This is called to refresh values for hybla parameters */ | 35 | /* This is called to refresh values for hybla parameters */ |
36 | static inline void hybla_recalc_param (struct tcp_sock *tp) | 36 | static inline void hybla_recalc_param (struct sock *sk) |
37 | { | 37 | { |
38 | struct hybla *ca = tcp_ca(tp); | 38 | struct hybla *ca = inet_csk_ca(sk); |
39 | 39 | ||
40 | ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); | 40 | ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); |
41 | ca->rho = ca->rho_3ls >> 3; | 41 | ca->rho = ca->rho_3ls >> 3; |
42 | ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; | 42 | ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; |
43 | ca->rho2 = ca->rho2_7ls >>7; | 43 | ca->rho2 = ca->rho2_7ls >>7; |
44 | } | 44 | } |
45 | 45 | ||
46 | static void hybla_init(struct tcp_sock *tp) | 46 | static void hybla_init(struct sock *sk) |
47 | { | 47 | { |
48 | struct hybla *ca = tcp_ca(tp); | 48 | struct tcp_sock *tp = tcp_sk(sk); |
49 | struct hybla *ca = inet_csk_ca(sk); | ||
49 | 50 | ||
50 | ca->rho = 0; | 51 | ca->rho = 0; |
51 | ca->rho2 = 0; | 52 | ca->rho2 = 0; |
@@ -57,17 +58,16 @@ static void hybla_init(struct tcp_sock *tp) | |||
57 | tp->snd_cwnd_clamp = 65535; | 58 | tp->snd_cwnd_clamp = 65535; |
58 | 59 | ||
59 | /* 1st Rho measurement based on initial srtt */ | 60 | /* 1st Rho measurement based on initial srtt */ |
60 | hybla_recalc_param(tp); | 61 | hybla_recalc_param(sk); |
61 | 62 | ||
62 | /* set minimum rtt as this is the 1st ever seen */ | 63 | /* set minimum rtt as this is the 1st ever seen */ |
63 | ca->minrtt = tp->srtt; | 64 | ca->minrtt = tp->srtt; |
64 | tp->snd_cwnd = ca->rho; | 65 | tp->snd_cwnd = ca->rho; |
65 | } | 66 | } |
66 | 67 | ||
67 | static void hybla_state(struct tcp_sock *tp, u8 ca_state) | 68 | static void hybla_state(struct sock *sk, u8 ca_state) |
68 | { | 69 | { |
69 | struct hybla *ca = tcp_ca(tp); | 70 | struct hybla *ca = inet_csk_ca(sk); |
70 | |||
71 | ca->hybla_en = (ca_state == TCP_CA_Open); | 71 | ca->hybla_en = (ca_state == TCP_CA_Open); |
72 | } | 72 | } |
73 | 73 | ||
@@ -86,27 +86,28 @@ static inline u32 hybla_fraction(u32 odds) | |||
86 | * o Give cwnd a new value based on the model proposed | 86 | * o Give cwnd a new value based on the model proposed |
87 | * o remember increments <1 | 87 | * o remember increments <1 |
88 | */ | 88 | */ |
89 | static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | 89 | static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, |
90 | u32 in_flight, int flag) | 90 | u32 in_flight, int flag) |
91 | { | 91 | { |
92 | struct hybla *ca = tcp_ca(tp); | 92 | struct tcp_sock *tp = tcp_sk(sk); |
93 | struct hybla *ca = inet_csk_ca(sk); | ||
93 | u32 increment, odd, rho_fractions; | 94 | u32 increment, odd, rho_fractions; |
94 | int is_slowstart = 0; | 95 | int is_slowstart = 0; |
95 | 96 | ||
96 | /* Recalculate rho only if this srtt is the lowest */ | 97 | /* Recalculate rho only if this srtt is the lowest */ |
97 | if (tp->srtt < ca->minrtt){ | 98 | if (tp->srtt < ca->minrtt){ |
98 | hybla_recalc_param(tp); | 99 | hybla_recalc_param(sk); |
99 | ca->minrtt = tp->srtt; | 100 | ca->minrtt = tp->srtt; |
100 | } | 101 | } |
101 | 102 | ||
102 | if (!ca->hybla_en) | 103 | if (!ca->hybla_en) |
103 | return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag); | 104 | return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); |
104 | 105 | ||
105 | if (in_flight < tp->snd_cwnd) | 106 | if (in_flight < tp->snd_cwnd) |
106 | return; | 107 | return; |
107 | 108 | ||
108 | if (ca->rho == 0) | 109 | if (ca->rho == 0) |
109 | hybla_recalc_param(tp); | 110 | hybla_recalc_param(sk); |
110 | 111 | ||
111 | rho_fractions = ca->rho_3ls - (ca->rho << 3); | 112 | rho_fractions = ca->rho_3ls - (ca->rho << 3); |
112 | 113 | ||
@@ -170,7 +171,7 @@ static struct tcp_congestion_ops tcp_hybla = { | |||
170 | 171 | ||
171 | static int __init hybla_register(void) | 172 | static int __init hybla_register(void) |
172 | { | 173 | { |
173 | BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE); | 174 | BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE); |
174 | return tcp_register_congestion_control(&tcp_hybla); | 175 | return tcp_register_congestion_control(&tcp_hybla); |
175 | } | 176 | } |
176 | 177 | ||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53a8a5399f1e..1afb080bdf0c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -114,20 +114,21 @@ int sysctl_tcp_moderate_rcvbuf = 1; | |||
114 | /* Adapt the MSS value used to make delayed ack decision to the | 114 | /* Adapt the MSS value used to make delayed ack decision to the |
115 | * real world. | 115 | * real world. |
116 | */ | 116 | */ |
117 | static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, | 117 | static inline void tcp_measure_rcv_mss(struct sock *sk, |
118 | struct sk_buff *skb) | 118 | const struct sk_buff *skb) |
119 | { | 119 | { |
120 | unsigned int len, lss; | 120 | struct inet_connection_sock *icsk = inet_csk(sk); |
121 | const unsigned int lss = icsk->icsk_ack.last_seg_size; | ||
122 | unsigned int len; | ||
121 | 123 | ||
122 | lss = tp->ack.last_seg_size; | 124 | icsk->icsk_ack.last_seg_size = 0; |
123 | tp->ack.last_seg_size = 0; | ||
124 | 125 | ||
125 | /* skb->len may jitter because of SACKs, even if peer | 126 | /* skb->len may jitter because of SACKs, even if peer |
126 | * sends good full-sized frames. | 127 | * sends good full-sized frames. |
127 | */ | 128 | */ |
128 | len = skb->len; | 129 | len = skb->len; |
129 | if (len >= tp->ack.rcv_mss) { | 130 | if (len >= icsk->icsk_ack.rcv_mss) { |
130 | tp->ack.rcv_mss = len; | 131 | icsk->icsk_ack.rcv_mss = len; |
131 | } else { | 132 | } else { |
132 | /* Otherwise, we make more careful check taking into account, | 133 | /* Otherwise, we make more careful check taking into account, |
133 | * that SACKs block is variable. | 134 | * that SACKs block is variable. |
@@ -147,41 +148,44 @@ static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, | |||
147 | * tcp header plus fixed timestamp option length. | 148 | * tcp header plus fixed timestamp option length. |
148 | * Resulting "len" is MSS free of SACK jitter. | 149 | * Resulting "len" is MSS free of SACK jitter. |
149 | */ | 150 | */ |
150 | len -= tp->tcp_header_len; | 151 | len -= tcp_sk(sk)->tcp_header_len; |
151 | tp->ack.last_seg_size = len; | 152 | icsk->icsk_ack.last_seg_size = len; |
152 | if (len == lss) { | 153 | if (len == lss) { |
153 | tp->ack.rcv_mss = len; | 154 | icsk->icsk_ack.rcv_mss = len; |
154 | return; | 155 | return; |
155 | } | 156 | } |
156 | } | 157 | } |
157 | tp->ack.pending |= TCP_ACK_PUSHED; | 158 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; |
158 | } | 159 | } |
159 | } | 160 | } |
160 | 161 | ||
161 | static void tcp_incr_quickack(struct tcp_sock *tp) | 162 | static void tcp_incr_quickack(struct sock *sk) |
162 | { | 163 | { |
163 | unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss); | 164 | struct inet_connection_sock *icsk = inet_csk(sk); |
165 | unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); | ||
164 | 166 | ||
165 | if (quickacks==0) | 167 | if (quickacks==0) |
166 | quickacks=2; | 168 | quickacks=2; |
167 | if (quickacks > tp->ack.quick) | 169 | if (quickacks > icsk->icsk_ack.quick) |
168 | tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS); | 170 | icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); |
169 | } | 171 | } |
170 | 172 | ||
171 | void tcp_enter_quickack_mode(struct tcp_sock *tp) | 173 | void tcp_enter_quickack_mode(struct sock *sk) |
172 | { | 174 | { |
173 | tcp_incr_quickack(tp); | 175 | struct inet_connection_sock *icsk = inet_csk(sk); |
174 | tp->ack.pingpong = 0; | 176 | tcp_incr_quickack(sk); |
175 | tp->ack.ato = TCP_ATO_MIN; | 177 | icsk->icsk_ack.pingpong = 0; |
178 | icsk->icsk_ack.ato = TCP_ATO_MIN; | ||
176 | } | 179 | } |
177 | 180 | ||
178 | /* Send ACKs quickly, if "quick" count is not exhausted | 181 | /* Send ACKs quickly, if "quick" count is not exhausted |
179 | * and the session is not interactive. | 182 | * and the session is not interactive. |
180 | */ | 183 | */ |
181 | 184 | ||
182 | static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp) | 185 | static inline int tcp_in_quickack_mode(const struct sock *sk) |
183 | { | 186 | { |
184 | return (tp->ack.quick && !tp->ack.pingpong); | 187 | const struct inet_connection_sock *icsk = inet_csk(sk); |
188 | return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; | ||
185 | } | 189 | } |
186 | 190 | ||
187 | /* Buffer size and advertised window tuning. | 191 | /* Buffer size and advertised window tuning. |
@@ -224,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock *sk) | |||
224 | */ | 228 | */ |
225 | 229 | ||
226 | /* Slow part of check#2. */ | 230 | /* Slow part of check#2. */ |
227 | static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, | 231 | static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, |
228 | struct sk_buff *skb) | 232 | const struct sk_buff *skb) |
229 | { | 233 | { |
230 | /* Optimize this! */ | 234 | /* Optimize this! */ |
231 | int truesize = tcp_win_from_space(skb->truesize)/2; | 235 | int truesize = tcp_win_from_space(skb->truesize)/2; |
@@ -233,7 +237,7 @@ static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, | |||
233 | 237 | ||
234 | while (tp->rcv_ssthresh <= window) { | 238 | while (tp->rcv_ssthresh <= window) { |
235 | if (truesize <= skb->len) | 239 | if (truesize <= skb->len) |
236 | return 2*tp->ack.rcv_mss; | 240 | return 2 * inet_csk(sk)->icsk_ack.rcv_mss; |
237 | 241 | ||
238 | truesize >>= 1; | 242 | truesize >>= 1; |
239 | window >>= 1; | 243 | window >>= 1; |
@@ -260,7 +264,7 @@ static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, | |||
260 | 264 | ||
261 | if (incr) { | 265 | if (incr) { |
262 | tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); | 266 | tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); |
263 | tp->ack.quick |= 1; | 267 | inet_csk(sk)->icsk_ack.quick |= 1; |
264 | } | 268 | } |
265 | } | 269 | } |
266 | } | 270 | } |
@@ -321,11 +325,12 @@ static void tcp_init_buffer_space(struct sock *sk) | |||
321 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ | 325 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ |
322 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) | 326 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) |
323 | { | 327 | { |
328 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
324 | struct sk_buff *skb; | 329 | struct sk_buff *skb; |
325 | unsigned int app_win = tp->rcv_nxt - tp->copied_seq; | 330 | unsigned int app_win = tp->rcv_nxt - tp->copied_seq; |
326 | int ofo_win = 0; | 331 | int ofo_win = 0; |
327 | 332 | ||
328 | tp->ack.quick = 0; | 333 | icsk->icsk_ack.quick = 0; |
329 | 334 | ||
330 | skb_queue_walk(&tp->out_of_order_queue, skb) { | 335 | skb_queue_walk(&tp->out_of_order_queue, skb) { |
331 | ofo_win += skb->len; | 336 | ofo_win += skb->len; |
@@ -346,8 +351,8 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) | |||
346 | app_win += ofo_win; | 351 | app_win += ofo_win; |
347 | if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) | 352 | if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) |
348 | app_win >>= 1; | 353 | app_win >>= 1; |
349 | if (app_win > tp->ack.rcv_mss) | 354 | if (app_win > icsk->icsk_ack.rcv_mss) |
350 | app_win -= tp->ack.rcv_mss; | 355 | app_win -= icsk->icsk_ack.rcv_mss; |
351 | app_win = max(app_win, 2U*tp->advmss); | 356 | app_win = max(app_win, 2U*tp->advmss); |
352 | 357 | ||
353 | if (!ofo_win) | 358 | if (!ofo_win) |
@@ -415,11 +420,12 @@ new_measure: | |||
415 | tp->rcv_rtt_est.time = tcp_time_stamp; | 420 | tp->rcv_rtt_est.time = tcp_time_stamp; |
416 | } | 421 | } |
417 | 422 | ||
418 | static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb) | 423 | static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) |
419 | { | 424 | { |
425 | struct tcp_sock *tp = tcp_sk(sk); | ||
420 | if (tp->rx_opt.rcv_tsecr && | 426 | if (tp->rx_opt.rcv_tsecr && |
421 | (TCP_SKB_CB(skb)->end_seq - | 427 | (TCP_SKB_CB(skb)->end_seq - |
422 | TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss)) | 428 | TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) |
423 | tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); | 429 | tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); |
424 | } | 430 | } |
425 | 431 | ||
@@ -492,41 +498,42 @@ new_measure: | |||
492 | */ | 498 | */ |
493 | static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) | 499 | static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) |
494 | { | 500 | { |
501 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
495 | u32 now; | 502 | u32 now; |
496 | 503 | ||
497 | tcp_schedule_ack(tp); | 504 | inet_csk_schedule_ack(sk); |
498 | 505 | ||
499 | tcp_measure_rcv_mss(tp, skb); | 506 | tcp_measure_rcv_mss(sk, skb); |
500 | 507 | ||
501 | tcp_rcv_rtt_measure(tp); | 508 | tcp_rcv_rtt_measure(tp); |
502 | 509 | ||
503 | now = tcp_time_stamp; | 510 | now = tcp_time_stamp; |
504 | 511 | ||
505 | if (!tp->ack.ato) { | 512 | if (!icsk->icsk_ack.ato) { |
506 | /* The _first_ data packet received, initialize | 513 | /* The _first_ data packet received, initialize |
507 | * delayed ACK engine. | 514 | * delayed ACK engine. |
508 | */ | 515 | */ |
509 | tcp_incr_quickack(tp); | 516 | tcp_incr_quickack(sk); |
510 | tp->ack.ato = TCP_ATO_MIN; | 517 | icsk->icsk_ack.ato = TCP_ATO_MIN; |
511 | } else { | 518 | } else { |
512 | int m = now - tp->ack.lrcvtime; | 519 | int m = now - icsk->icsk_ack.lrcvtime; |
513 | 520 | ||
514 | if (m <= TCP_ATO_MIN/2) { | 521 | if (m <= TCP_ATO_MIN/2) { |
515 | /* The fastest case is the first. */ | 522 | /* The fastest case is the first. */ |
516 | tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; | 523 | icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; |
517 | } else if (m < tp->ack.ato) { | 524 | } else if (m < icsk->icsk_ack.ato) { |
518 | tp->ack.ato = (tp->ack.ato>>1) + m; | 525 | icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; |
519 | if (tp->ack.ato > tp->rto) | 526 | if (icsk->icsk_ack.ato > icsk->icsk_rto) |
520 | tp->ack.ato = tp->rto; | 527 | icsk->icsk_ack.ato = icsk->icsk_rto; |
521 | } else if (m > tp->rto) { | 528 | } else if (m > icsk->icsk_rto) { |
522 | /* Too long gap. Apparently sender falled to | 529 | /* Too long gap. Apparently sender falled to |
523 | * restart window, so that we send ACKs quickly. | 530 | * restart window, so that we send ACKs quickly. |
524 | */ | 531 | */ |
525 | tcp_incr_quickack(tp); | 532 | tcp_incr_quickack(sk); |
526 | sk_stream_mem_reclaim(sk); | 533 | sk_stream_mem_reclaim(sk); |
527 | } | 534 | } |
528 | } | 535 | } |
529 | tp->ack.lrcvtime = now; | 536 | icsk->icsk_ack.lrcvtime = now; |
530 | 537 | ||
531 | TCP_ECN_check_ce(tp, skb); | 538 | TCP_ECN_check_ce(tp, skb); |
532 | 539 | ||
@@ -543,8 +550,10 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ | |||
543 | * To save cycles in the RFC 1323 implementation it was better to break | 550 | * To save cycles in the RFC 1323 implementation it was better to break |
544 | * it up into three procedures. -- erics | 551 | * it up into three procedures. -- erics |
545 | */ | 552 | */ |
546 | static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) | 553 | static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) |
547 | { | 554 | { |
555 | struct tcp_sock *tp = tcp_sk(sk); | ||
556 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
548 | long m = mrtt; /* RTT */ | 557 | long m = mrtt; /* RTT */ |
549 | 558 | ||
550 | /* The following amusing code comes from Jacobson's | 559 | /* The following amusing code comes from Jacobson's |
@@ -604,15 +613,16 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) | |||
604 | tp->rtt_seq = tp->snd_nxt; | 613 | tp->rtt_seq = tp->snd_nxt; |
605 | } | 614 | } |
606 | 615 | ||
607 | if (tp->ca_ops->rtt_sample) | 616 | if (icsk->icsk_ca_ops->rtt_sample) |
608 | tp->ca_ops->rtt_sample(tp, *usrtt); | 617 | icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); |
609 | } | 618 | } |
610 | 619 | ||
611 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 620 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
612 | * routine referred to above. | 621 | * routine referred to above. |
613 | */ | 622 | */ |
614 | static inline void tcp_set_rto(struct tcp_sock *tp) | 623 | static inline void tcp_set_rto(struct sock *sk) |
615 | { | 624 | { |
625 | const struct tcp_sock *tp = tcp_sk(sk); | ||
616 | /* Old crap is replaced with new one. 8) | 626 | /* Old crap is replaced with new one. 8) |
617 | * | 627 | * |
618 | * More seriously: | 628 | * More seriously: |
@@ -623,7 +633,7 @@ static inline void tcp_set_rto(struct tcp_sock *tp) | |||
623 | * is invisible. Actually, Linux-2.4 also generates erratic | 633 | * is invisible. Actually, Linux-2.4 also generates erratic |
624 | * ACKs in some curcumstances. | 634 | * ACKs in some curcumstances. |
625 | */ | 635 | */ |
626 | tp->rto = (tp->srtt >> 3) + tp->rttvar; | 636 | inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; |
627 | 637 | ||
628 | /* 2. Fixups made earlier cannot be right. | 638 | /* 2. Fixups made earlier cannot be right. |
629 | * If we do not estimate RTO correctly without them, | 639 | * If we do not estimate RTO correctly without them, |
@@ -635,10 +645,10 @@ static inline void tcp_set_rto(struct tcp_sock *tp) | |||
635 | /* NOTE: clamping at TCP_RTO_MIN is not required, current algo | 645 | /* NOTE: clamping at TCP_RTO_MIN is not required, current algo |
636 | * guarantees that rto is higher. | 646 | * guarantees that rto is higher. |
637 | */ | 647 | */ |
638 | static inline void tcp_bound_rto(struct tcp_sock *tp) | 648 | static inline void tcp_bound_rto(struct sock *sk) |
639 | { | 649 | { |
640 | if (tp->rto > TCP_RTO_MAX) | 650 | if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) |
641 | tp->rto = TCP_RTO_MAX; | 651 | inet_csk(sk)->icsk_rto = TCP_RTO_MAX; |
642 | } | 652 | } |
643 | 653 | ||
644 | /* Save metrics learned by this TCP session. | 654 | /* Save metrics learned by this TCP session. |
@@ -656,9 +666,10 @@ void tcp_update_metrics(struct sock *sk) | |||
656 | dst_confirm(dst); | 666 | dst_confirm(dst); |
657 | 667 | ||
658 | if (dst && (dst->flags&DST_HOST)) { | 668 | if (dst && (dst->flags&DST_HOST)) { |
669 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
659 | int m; | 670 | int m; |
660 | 671 | ||
661 | if (tp->backoff || !tp->srtt) { | 672 | if (icsk->icsk_backoff || !tp->srtt) { |
662 | /* This session failed to estimate rtt. Why? | 673 | /* This session failed to estimate rtt. Why? |
663 | * Probably, no packets returned in time. | 674 | * Probably, no packets returned in time. |
664 | * Reset our results. | 675 | * Reset our results. |
@@ -707,7 +718,7 @@ void tcp_update_metrics(struct sock *sk) | |||
707 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) | 718 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) |
708 | dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; | 719 | dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; |
709 | } else if (tp->snd_cwnd > tp->snd_ssthresh && | 720 | } else if (tp->snd_cwnd > tp->snd_ssthresh && |
710 | tp->ca_state == TCP_CA_Open) { | 721 | icsk->icsk_ca_state == TCP_CA_Open) { |
711 | /* Cong. avoidance phase, cwnd is reliable. */ | 722 | /* Cong. avoidance phase, cwnd is reliable. */ |
712 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) | 723 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) |
713 | dst->metrics[RTAX_SSTHRESH-1] = | 724 | dst->metrics[RTAX_SSTHRESH-1] = |
@@ -801,9 +812,9 @@ static void tcp_init_metrics(struct sock *sk) | |||
801 | tp->mdev = dst_metric(dst, RTAX_RTTVAR); | 812 | tp->mdev = dst_metric(dst, RTAX_RTTVAR); |
802 | tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); | 813 | tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); |
803 | } | 814 | } |
804 | tcp_set_rto(tp); | 815 | tcp_set_rto(sk); |
805 | tcp_bound_rto(tp); | 816 | tcp_bound_rto(sk); |
806 | if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) | 817 | if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) |
807 | goto reset; | 818 | goto reset; |
808 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); | 819 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); |
809 | tp->snd_cwnd_stamp = tcp_time_stamp; | 820 | tp->snd_cwnd_stamp = tcp_time_stamp; |
@@ -817,12 +828,14 @@ reset: | |||
817 | if (!tp->rx_opt.saw_tstamp && tp->srtt) { | 828 | if (!tp->rx_opt.saw_tstamp && tp->srtt) { |
818 | tp->srtt = 0; | 829 | tp->srtt = 0; |
819 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; | 830 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; |
820 | tp->rto = TCP_TIMEOUT_INIT; | 831 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; |
821 | } | 832 | } |
822 | } | 833 | } |
823 | 834 | ||
824 | static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) | 835 | static void tcp_update_reordering(struct sock *sk, const int metric, |
836 | const int ts) | ||
825 | { | 837 | { |
838 | struct tcp_sock *tp = tcp_sk(sk); | ||
826 | if (metric > tp->reordering) { | 839 | if (metric > tp->reordering) { |
827 | tp->reordering = min(TCP_MAX_REORDERING, metric); | 840 | tp->reordering = min(TCP_MAX_REORDERING, metric); |
828 | 841 | ||
@@ -837,7 +850,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) | |||
837 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); | 850 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); |
838 | #if FASTRETRANS_DEBUG > 1 | 851 | #if FASTRETRANS_DEBUG > 1 |
839 | printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", | 852 | printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", |
840 | tp->rx_opt.sack_ok, tp->ca_state, | 853 | tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, |
841 | tp->reordering, | 854 | tp->reordering, |
842 | tp->fackets_out, | 855 | tp->fackets_out, |
843 | tp->sacked_out, | 856 | tp->sacked_out, |
@@ -899,6 +912,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) | |||
899 | static int | 912 | static int |
900 | tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) | 913 | tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) |
901 | { | 914 | { |
915 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
902 | struct tcp_sock *tp = tcp_sk(sk); | 916 | struct tcp_sock *tp = tcp_sk(sk); |
903 | unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; | 917 | unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; |
904 | struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); | 918 | struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); |
@@ -1064,7 +1078,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
1064 | * we have to account for reordering! Ugly, | 1078 | * we have to account for reordering! Ugly, |
1065 | * but should help. | 1079 | * but should help. |
1066 | */ | 1080 | */ |
1067 | if (lost_retrans && tp->ca_state == TCP_CA_Recovery) { | 1081 | if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { |
1068 | struct sk_buff *skb; | 1082 | struct sk_buff *skb; |
1069 | 1083 | ||
1070 | sk_stream_for_retrans_queue(skb, sk) { | 1084 | sk_stream_for_retrans_queue(skb, sk) { |
@@ -1093,8 +1107,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
1093 | 1107 | ||
1094 | tp->left_out = tp->sacked_out + tp->lost_out; | 1108 | tp->left_out = tp->sacked_out + tp->lost_out; |
1095 | 1109 | ||
1096 | if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss) | 1110 | if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss) |
1097 | tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0); | 1111 | tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); |
1098 | 1112 | ||
1099 | #if FASTRETRANS_DEBUG > 0 | 1113 | #if FASTRETRANS_DEBUG > 0 |
1100 | BUG_TRAP((int)tp->sacked_out >= 0); | 1114 | BUG_TRAP((int)tp->sacked_out >= 0); |
@@ -1111,17 +1125,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
1111 | */ | 1125 | */ |
1112 | void tcp_enter_frto(struct sock *sk) | 1126 | void tcp_enter_frto(struct sock *sk) |
1113 | { | 1127 | { |
1128 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1114 | struct tcp_sock *tp = tcp_sk(sk); | 1129 | struct tcp_sock *tp = tcp_sk(sk); |
1115 | struct sk_buff *skb; | 1130 | struct sk_buff *skb; |
1116 | 1131 | ||
1117 | tp->frto_counter = 1; | 1132 | tp->frto_counter = 1; |
1118 | 1133 | ||
1119 | if (tp->ca_state <= TCP_CA_Disorder || | 1134 | if (icsk->icsk_ca_state <= TCP_CA_Disorder || |
1120 | tp->snd_una == tp->high_seq || | 1135 | tp->snd_una == tp->high_seq || |
1121 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1136 | (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { |
1122 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1137 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
1123 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); | 1138 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); |
1124 | tcp_ca_event(tp, CA_EVENT_FRTO); | 1139 | tcp_ca_event(sk, CA_EVENT_FRTO); |
1125 | } | 1140 | } |
1126 | 1141 | ||
1127 | /* Have to clear retransmission markers here to keep the bookkeeping | 1142 | /* Have to clear retransmission markers here to keep the bookkeeping |
@@ -1138,7 +1153,7 @@ void tcp_enter_frto(struct sock *sk) | |||
1138 | } | 1153 | } |
1139 | tcp_sync_left_out(tp); | 1154 | tcp_sync_left_out(tp); |
1140 | 1155 | ||
1141 | tcp_set_ca_state(tp, TCP_CA_Open); | 1156 | tcp_set_ca_state(sk, TCP_CA_Open); |
1142 | tp->frto_highmark = tp->snd_nxt; | 1157 | tp->frto_highmark = tp->snd_nxt; |
1143 | } | 1158 | } |
1144 | 1159 | ||
@@ -1184,7 +1199,7 @@ static void tcp_enter_frto_loss(struct sock *sk) | |||
1184 | 1199 | ||
1185 | tp->reordering = min_t(unsigned int, tp->reordering, | 1200 | tp->reordering = min_t(unsigned int, tp->reordering, |
1186 | sysctl_tcp_reordering); | 1201 | sysctl_tcp_reordering); |
1187 | tcp_set_ca_state(tp, TCP_CA_Loss); | 1202 | tcp_set_ca_state(sk, TCP_CA_Loss); |
1188 | tp->high_seq = tp->frto_highmark; | 1203 | tp->high_seq = tp->frto_highmark; |
1189 | TCP_ECN_queue_cwr(tp); | 1204 | TCP_ECN_queue_cwr(tp); |
1190 | } | 1205 | } |
@@ -1208,16 +1223,17 @@ void tcp_clear_retrans(struct tcp_sock *tp) | |||
1208 | */ | 1223 | */ |
1209 | void tcp_enter_loss(struct sock *sk, int how) | 1224 | void tcp_enter_loss(struct sock *sk, int how) |
1210 | { | 1225 | { |
1226 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1211 | struct tcp_sock *tp = tcp_sk(sk); | 1227 | struct tcp_sock *tp = tcp_sk(sk); |
1212 | struct sk_buff *skb; | 1228 | struct sk_buff *skb; |
1213 | int cnt = 0; | 1229 | int cnt = 0; |
1214 | 1230 | ||
1215 | /* Reduce ssthresh if it has not yet been made inside this window. */ | 1231 | /* Reduce ssthresh if it has not yet been made inside this window. */ |
1216 | if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || | 1232 | if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || |
1217 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1233 | (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { |
1218 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1234 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
1219 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); | 1235 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); |
1220 | tcp_ca_event(tp, CA_EVENT_LOSS); | 1236 | tcp_ca_event(sk, CA_EVENT_LOSS); |
1221 | } | 1237 | } |
1222 | tp->snd_cwnd = 1; | 1238 | tp->snd_cwnd = 1; |
1223 | tp->snd_cwnd_cnt = 0; | 1239 | tp->snd_cwnd_cnt = 0; |
@@ -1248,12 +1264,12 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
1248 | 1264 | ||
1249 | tp->reordering = min_t(unsigned int, tp->reordering, | 1265 | tp->reordering = min_t(unsigned int, tp->reordering, |
1250 | sysctl_tcp_reordering); | 1266 | sysctl_tcp_reordering); |
1251 | tcp_set_ca_state(tp, TCP_CA_Loss); | 1267 | tcp_set_ca_state(sk, TCP_CA_Loss); |
1252 | tp->high_seq = tp->snd_nxt; | 1268 | tp->high_seq = tp->snd_nxt; |
1253 | TCP_ECN_queue_cwr(tp); | 1269 | TCP_ECN_queue_cwr(tp); |
1254 | } | 1270 | } |
1255 | 1271 | ||
1256 | static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) | 1272 | static int tcp_check_sack_reneging(struct sock *sk) |
1257 | { | 1273 | { |
1258 | struct sk_buff *skb; | 1274 | struct sk_buff *skb; |
1259 | 1275 | ||
@@ -1265,12 +1281,14 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) | |||
1265 | */ | 1281 | */ |
1266 | if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && | 1282 | if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && |
1267 | (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { | 1283 | (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { |
1284 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1268 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); | 1285 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); |
1269 | 1286 | ||
1270 | tcp_enter_loss(sk, 1); | 1287 | tcp_enter_loss(sk, 1); |
1271 | tp->retransmits++; | 1288 | icsk->icsk_retransmits++; |
1272 | tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); | 1289 | tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); |
1273 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); | 1290 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
1291 | icsk->icsk_rto, TCP_RTO_MAX); | ||
1274 | return 1; | 1292 | return 1; |
1275 | } | 1293 | } |
1276 | return 0; | 1294 | return 0; |
@@ -1281,15 +1299,15 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) | |||
1281 | return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; | 1299 | return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; |
1282 | } | 1300 | } |
1283 | 1301 | ||
1284 | static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb) | 1302 | static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) |
1285 | { | 1303 | { |
1286 | return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto); | 1304 | return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); |
1287 | } | 1305 | } |
1288 | 1306 | ||
1289 | static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) | 1307 | static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) |
1290 | { | 1308 | { |
1291 | return tp->packets_out && | 1309 | return tp->packets_out && |
1292 | tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue)); | 1310 | tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue)); |
1293 | } | 1311 | } |
1294 | 1312 | ||
1295 | /* Linux NewReno/SACK/FACK/ECN state machine. | 1313 | /* Linux NewReno/SACK/FACK/ECN state machine. |
@@ -1423,8 +1441,9 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) | |||
1423 | * in assumption of absent reordering, interpret this as reordering. | 1441 | * in assumption of absent reordering, interpret this as reordering. |
1424 | * The only another reason could be bug in receiver TCP. | 1442 | * The only another reason could be bug in receiver TCP. |
1425 | */ | 1443 | */ |
1426 | static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) | 1444 | static void tcp_check_reno_reordering(struct sock *sk, const int addend) |
1427 | { | 1445 | { |
1446 | struct tcp_sock *tp = tcp_sk(sk); | ||
1428 | u32 holes; | 1447 | u32 holes; |
1429 | 1448 | ||
1430 | holes = max(tp->lost_out, 1U); | 1449 | holes = max(tp->lost_out, 1U); |
@@ -1432,16 +1451,17 @@ static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) | |||
1432 | 1451 | ||
1433 | if ((tp->sacked_out + holes) > tp->packets_out) { | 1452 | if ((tp->sacked_out + holes) > tp->packets_out) { |
1434 | tp->sacked_out = tp->packets_out - holes; | 1453 | tp->sacked_out = tp->packets_out - holes; |
1435 | tcp_update_reordering(tp, tp->packets_out+addend, 0); | 1454 | tcp_update_reordering(sk, tp->packets_out + addend, 0); |
1436 | } | 1455 | } |
1437 | } | 1456 | } |
1438 | 1457 | ||
1439 | /* Emulate SACKs for SACKless connection: account for a new dupack. */ | 1458 | /* Emulate SACKs for SACKless connection: account for a new dupack. */ |
1440 | 1459 | ||
1441 | static void tcp_add_reno_sack(struct tcp_sock *tp) | 1460 | static void tcp_add_reno_sack(struct sock *sk) |
1442 | { | 1461 | { |
1462 | struct tcp_sock *tp = tcp_sk(sk); | ||
1443 | tp->sacked_out++; | 1463 | tp->sacked_out++; |
1444 | tcp_check_reno_reordering(tp, 0); | 1464 | tcp_check_reno_reordering(sk, 0); |
1445 | tcp_sync_left_out(tp); | 1465 | tcp_sync_left_out(tp); |
1446 | } | 1466 | } |
1447 | 1467 | ||
@@ -1456,7 +1476,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acke | |||
1456 | else | 1476 | else |
1457 | tp->sacked_out -= acked-1; | 1477 | tp->sacked_out -= acked-1; |
1458 | } | 1478 | } |
1459 | tcp_check_reno_reordering(tp, acked); | 1479 | tcp_check_reno_reordering(sk, acked); |
1460 | tcp_sync_left_out(tp); | 1480 | tcp_sync_left_out(tp); |
1461 | } | 1481 | } |
1462 | 1482 | ||
@@ -1509,7 +1529,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) | |||
1509 | struct sk_buff *skb; | 1529 | struct sk_buff *skb; |
1510 | 1530 | ||
1511 | sk_stream_for_retrans_queue(skb, sk) { | 1531 | sk_stream_for_retrans_queue(skb, sk) { |
1512 | if (tcp_skb_timedout(tp, skb) && | 1532 | if (tcp_skb_timedout(sk, skb) && |
1513 | !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { | 1533 | !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { |
1514 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | 1534 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
1515 | tp->lost_out += tcp_skb_pcount(skb); | 1535 | tp->lost_out += tcp_skb_pcount(skb); |
@@ -1530,14 +1550,16 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) | |||
1530 | } | 1550 | } |
1531 | 1551 | ||
1532 | /* Decrease cwnd each second ack. */ | 1552 | /* Decrease cwnd each second ack. */ |
1533 | static void tcp_cwnd_down(struct tcp_sock *tp) | 1553 | static void tcp_cwnd_down(struct sock *sk) |
1534 | { | 1554 | { |
1555 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1556 | struct tcp_sock *tp = tcp_sk(sk); | ||
1535 | int decr = tp->snd_cwnd_cnt + 1; | 1557 | int decr = tp->snd_cwnd_cnt + 1; |
1536 | 1558 | ||
1537 | tp->snd_cwnd_cnt = decr&1; | 1559 | tp->snd_cwnd_cnt = decr&1; |
1538 | decr >>= 1; | 1560 | decr >>= 1; |
1539 | 1561 | ||
1540 | if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) | 1562 | if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) |
1541 | tp->snd_cwnd -= decr; | 1563 | tp->snd_cwnd -= decr; |
1542 | 1564 | ||
1543 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); | 1565 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); |
@@ -1571,11 +1593,15 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) | |||
1571 | #define DBGUNDO(x...) do { } while (0) | 1593 | #define DBGUNDO(x...) do { } while (0) |
1572 | #endif | 1594 | #endif |
1573 | 1595 | ||
1574 | static void tcp_undo_cwr(struct tcp_sock *tp, int undo) | 1596 | static void tcp_undo_cwr(struct sock *sk, const int undo) |
1575 | { | 1597 | { |
1598 | struct tcp_sock *tp = tcp_sk(sk); | ||
1599 | |||
1576 | if (tp->prior_ssthresh) { | 1600 | if (tp->prior_ssthresh) { |
1577 | if (tp->ca_ops->undo_cwnd) | 1601 | const struct inet_connection_sock *icsk = inet_csk(sk); |
1578 | tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); | 1602 | |
1603 | if (icsk->icsk_ca_ops->undo_cwnd) | ||
1604 | tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); | ||
1579 | else | 1605 | else |
1580 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); | 1606 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); |
1581 | 1607 | ||
@@ -1603,9 +1629,9 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) | |||
1603 | /* Happy end! We did not retransmit anything | 1629 | /* Happy end! We did not retransmit anything |
1604 | * or our original transmission succeeded. | 1630 | * or our original transmission succeeded. |
1605 | */ | 1631 | */ |
1606 | DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans"); | 1632 | DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); |
1607 | tcp_undo_cwr(tp, 1); | 1633 | tcp_undo_cwr(sk, 1); |
1608 | if (tp->ca_state == TCP_CA_Loss) | 1634 | if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) |
1609 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); | 1635 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); |
1610 | else | 1636 | else |
1611 | NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); | 1637 | NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); |
@@ -1618,7 +1644,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) | |||
1618 | tcp_moderate_cwnd(tp); | 1644 | tcp_moderate_cwnd(tp); |
1619 | return 1; | 1645 | return 1; |
1620 | } | 1646 | } |
1621 | tcp_set_ca_state(tp, TCP_CA_Open); | 1647 | tcp_set_ca_state(sk, TCP_CA_Open); |
1622 | return 0; | 1648 | return 0; |
1623 | } | 1649 | } |
1624 | 1650 | ||
@@ -1627,7 +1653,7 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) | |||
1627 | { | 1653 | { |
1628 | if (tp->undo_marker && !tp->undo_retrans) { | 1654 | if (tp->undo_marker && !tp->undo_retrans) { |
1629 | DBGUNDO(sk, tp, "D-SACK"); | 1655 | DBGUNDO(sk, tp, "D-SACK"); |
1630 | tcp_undo_cwr(tp, 1); | 1656 | tcp_undo_cwr(sk, 1); |
1631 | tp->undo_marker = 0; | 1657 | tp->undo_marker = 0; |
1632 | NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); | 1658 | NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); |
1633 | } | 1659 | } |
@@ -1648,10 +1674,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, | |||
1648 | if (tp->retrans_out == 0) | 1674 | if (tp->retrans_out == 0) |
1649 | tp->retrans_stamp = 0; | 1675 | tp->retrans_stamp = 0; |
1650 | 1676 | ||
1651 | tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); | 1677 | tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); |
1652 | 1678 | ||
1653 | DBGUNDO(sk, tp, "Hoe"); | 1679 | DBGUNDO(sk, tp, "Hoe"); |
1654 | tcp_undo_cwr(tp, 0); | 1680 | tcp_undo_cwr(sk, 0); |
1655 | NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); | 1681 | NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); |
1656 | 1682 | ||
1657 | /* So... Do not make Hoe's retransmit yet. | 1683 | /* So... Do not make Hoe's retransmit yet. |
@@ -1674,22 +1700,23 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) | |||
1674 | DBGUNDO(sk, tp, "partial loss"); | 1700 | DBGUNDO(sk, tp, "partial loss"); |
1675 | tp->lost_out = 0; | 1701 | tp->lost_out = 0; |
1676 | tp->left_out = tp->sacked_out; | 1702 | tp->left_out = tp->sacked_out; |
1677 | tcp_undo_cwr(tp, 1); | 1703 | tcp_undo_cwr(sk, 1); |
1678 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); | 1704 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); |
1679 | tp->retransmits = 0; | 1705 | inet_csk(sk)->icsk_retransmits = 0; |
1680 | tp->undo_marker = 0; | 1706 | tp->undo_marker = 0; |
1681 | if (!IsReno(tp)) | 1707 | if (!IsReno(tp)) |
1682 | tcp_set_ca_state(tp, TCP_CA_Open); | 1708 | tcp_set_ca_state(sk, TCP_CA_Open); |
1683 | return 1; | 1709 | return 1; |
1684 | } | 1710 | } |
1685 | return 0; | 1711 | return 0; |
1686 | } | 1712 | } |
1687 | 1713 | ||
1688 | static inline void tcp_complete_cwr(struct tcp_sock *tp) | 1714 | static inline void tcp_complete_cwr(struct sock *sk) |
1689 | { | 1715 | { |
1716 | struct tcp_sock *tp = tcp_sk(sk); | ||
1690 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); | 1717 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); |
1691 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1718 | tp->snd_cwnd_stamp = tcp_time_stamp; |
1692 | tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); | 1719 | tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); |
1693 | } | 1720 | } |
1694 | 1721 | ||
1695 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) | 1722 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) |
@@ -1700,21 +1727,21 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) | |||
1700 | tp->retrans_stamp = 0; | 1727 | tp->retrans_stamp = 0; |
1701 | 1728 | ||
1702 | if (flag&FLAG_ECE) | 1729 | if (flag&FLAG_ECE) |
1703 | tcp_enter_cwr(tp); | 1730 | tcp_enter_cwr(sk); |
1704 | 1731 | ||
1705 | if (tp->ca_state != TCP_CA_CWR) { | 1732 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { |
1706 | int state = TCP_CA_Open; | 1733 | int state = TCP_CA_Open; |
1707 | 1734 | ||
1708 | if (tp->left_out || tp->retrans_out || tp->undo_marker) | 1735 | if (tp->left_out || tp->retrans_out || tp->undo_marker) |
1709 | state = TCP_CA_Disorder; | 1736 | state = TCP_CA_Disorder; |
1710 | 1737 | ||
1711 | if (tp->ca_state != state) { | 1738 | if (inet_csk(sk)->icsk_ca_state != state) { |
1712 | tcp_set_ca_state(tp, state); | 1739 | tcp_set_ca_state(sk, state); |
1713 | tp->high_seq = tp->snd_nxt; | 1740 | tp->high_seq = tp->snd_nxt; |
1714 | } | 1741 | } |
1715 | tcp_moderate_cwnd(tp); | 1742 | tcp_moderate_cwnd(tp); |
1716 | } else { | 1743 | } else { |
1717 | tcp_cwnd_down(tp); | 1744 | tcp_cwnd_down(sk); |
1718 | } | 1745 | } |
1719 | } | 1746 | } |
1720 | 1747 | ||
@@ -1733,6 +1760,7 @@ static void | |||
1733 | tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | 1760 | tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, |
1734 | int prior_packets, int flag) | 1761 | int prior_packets, int flag) |
1735 | { | 1762 | { |
1763 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1736 | struct tcp_sock *tp = tcp_sk(sk); | 1764 | struct tcp_sock *tp = tcp_sk(sk); |
1737 | int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); | 1765 | int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); |
1738 | 1766 | ||
@@ -1750,13 +1778,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1750 | tp->prior_ssthresh = 0; | 1778 | tp->prior_ssthresh = 0; |
1751 | 1779 | ||
1752 | /* B. In all the states check for reneging SACKs. */ | 1780 | /* B. In all the states check for reneging SACKs. */ |
1753 | if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) | 1781 | if (tp->sacked_out && tcp_check_sack_reneging(sk)) |
1754 | return; | 1782 | return; |
1755 | 1783 | ||
1756 | /* C. Process data loss notification, provided it is valid. */ | 1784 | /* C. Process data loss notification, provided it is valid. */ |
1757 | if ((flag&FLAG_DATA_LOST) && | 1785 | if ((flag&FLAG_DATA_LOST) && |
1758 | before(tp->snd_una, tp->high_seq) && | 1786 | before(tp->snd_una, tp->high_seq) && |
1759 | tp->ca_state != TCP_CA_Open && | 1787 | icsk->icsk_ca_state != TCP_CA_Open && |
1760 | tp->fackets_out > tp->reordering) { | 1788 | tp->fackets_out > tp->reordering) { |
1761 | tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); | 1789 | tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); |
1762 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); | 1790 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); |
@@ -1767,14 +1795,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1767 | 1795 | ||
1768 | /* E. Check state exit conditions. State can be terminated | 1796 | /* E. Check state exit conditions. State can be terminated |
1769 | * when high_seq is ACKed. */ | 1797 | * when high_seq is ACKed. */ |
1770 | if (tp->ca_state == TCP_CA_Open) { | 1798 | if (icsk->icsk_ca_state == TCP_CA_Open) { |
1771 | if (!sysctl_tcp_frto) | 1799 | if (!sysctl_tcp_frto) |
1772 | BUG_TRAP(tp->retrans_out == 0); | 1800 | BUG_TRAP(tp->retrans_out == 0); |
1773 | tp->retrans_stamp = 0; | 1801 | tp->retrans_stamp = 0; |
1774 | } else if (!before(tp->snd_una, tp->high_seq)) { | 1802 | } else if (!before(tp->snd_una, tp->high_seq)) { |
1775 | switch (tp->ca_state) { | 1803 | switch (icsk->icsk_ca_state) { |
1776 | case TCP_CA_Loss: | 1804 | case TCP_CA_Loss: |
1777 | tp->retransmits = 0; | 1805 | icsk->icsk_retransmits = 0; |
1778 | if (tcp_try_undo_recovery(sk, tp)) | 1806 | if (tcp_try_undo_recovery(sk, tp)) |
1779 | return; | 1807 | return; |
1780 | break; | 1808 | break; |
@@ -1783,8 +1811,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1783 | /* CWR is to be held something *above* high_seq | 1811 | /* CWR is to be held something *above* high_seq |
1784 | * is ACKed for CWR bit to reach receiver. */ | 1812 | * is ACKed for CWR bit to reach receiver. */ |
1785 | if (tp->snd_una != tp->high_seq) { | 1813 | if (tp->snd_una != tp->high_seq) { |
1786 | tcp_complete_cwr(tp); | 1814 | tcp_complete_cwr(sk); |
1787 | tcp_set_ca_state(tp, TCP_CA_Open); | 1815 | tcp_set_ca_state(sk, TCP_CA_Open); |
1788 | } | 1816 | } |
1789 | break; | 1817 | break; |
1790 | 1818 | ||
@@ -1795,7 +1823,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1795 | * catching for all duplicate ACKs. */ | 1823 | * catching for all duplicate ACKs. */ |
1796 | IsReno(tp) || tp->snd_una != tp->high_seq) { | 1824 | IsReno(tp) || tp->snd_una != tp->high_seq) { |
1797 | tp->undo_marker = 0; | 1825 | tp->undo_marker = 0; |
1798 | tcp_set_ca_state(tp, TCP_CA_Open); | 1826 | tcp_set_ca_state(sk, TCP_CA_Open); |
1799 | } | 1827 | } |
1800 | break; | 1828 | break; |
1801 | 1829 | ||
@@ -1804,17 +1832,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1804 | tcp_reset_reno_sack(tp); | 1832 | tcp_reset_reno_sack(tp); |
1805 | if (tcp_try_undo_recovery(sk, tp)) | 1833 | if (tcp_try_undo_recovery(sk, tp)) |
1806 | return; | 1834 | return; |
1807 | tcp_complete_cwr(tp); | 1835 | tcp_complete_cwr(sk); |
1808 | break; | 1836 | break; |
1809 | } | 1837 | } |
1810 | } | 1838 | } |
1811 | 1839 | ||
1812 | /* F. Process state. */ | 1840 | /* F. Process state. */ |
1813 | switch (tp->ca_state) { | 1841 | switch (icsk->icsk_ca_state) { |
1814 | case TCP_CA_Recovery: | 1842 | case TCP_CA_Recovery: |
1815 | if (prior_snd_una == tp->snd_una) { | 1843 | if (prior_snd_una == tp->snd_una) { |
1816 | if (IsReno(tp) && is_dupack) | 1844 | if (IsReno(tp) && is_dupack) |
1817 | tcp_add_reno_sack(tp); | 1845 | tcp_add_reno_sack(sk); |
1818 | } else { | 1846 | } else { |
1819 | int acked = prior_packets - tp->packets_out; | 1847 | int acked = prior_packets - tp->packets_out; |
1820 | if (IsReno(tp)) | 1848 | if (IsReno(tp)) |
@@ -1824,13 +1852,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1824 | break; | 1852 | break; |
1825 | case TCP_CA_Loss: | 1853 | case TCP_CA_Loss: |
1826 | if (flag&FLAG_DATA_ACKED) | 1854 | if (flag&FLAG_DATA_ACKED) |
1827 | tp->retransmits = 0; | 1855 | icsk->icsk_retransmits = 0; |
1828 | if (!tcp_try_undo_loss(sk, tp)) { | 1856 | if (!tcp_try_undo_loss(sk, tp)) { |
1829 | tcp_moderate_cwnd(tp); | 1857 | tcp_moderate_cwnd(tp); |
1830 | tcp_xmit_retransmit_queue(sk); | 1858 | tcp_xmit_retransmit_queue(sk); |
1831 | return; | 1859 | return; |
1832 | } | 1860 | } |
1833 | if (tp->ca_state != TCP_CA_Open) | 1861 | if (icsk->icsk_ca_state != TCP_CA_Open) |
1834 | return; | 1862 | return; |
1835 | /* Loss is undone; fall through to processing in Open state. */ | 1863 | /* Loss is undone; fall through to processing in Open state. */ |
1836 | default: | 1864 | default: |
@@ -1838,10 +1866,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1838 | if (tp->snd_una != prior_snd_una) | 1866 | if (tp->snd_una != prior_snd_una) |
1839 | tcp_reset_reno_sack(tp); | 1867 | tcp_reset_reno_sack(tp); |
1840 | if (is_dupack) | 1868 | if (is_dupack) |
1841 | tcp_add_reno_sack(tp); | 1869 | tcp_add_reno_sack(sk); |
1842 | } | 1870 | } |
1843 | 1871 | ||
1844 | if (tp->ca_state == TCP_CA_Disorder) | 1872 | if (icsk->icsk_ca_state == TCP_CA_Disorder) |
1845 | tcp_try_undo_dsack(sk, tp); | 1873 | tcp_try_undo_dsack(sk, tp); |
1846 | 1874 | ||
1847 | if (!tcp_time_to_recover(sk, tp)) { | 1875 | if (!tcp_time_to_recover(sk, tp)) { |
@@ -1861,30 +1889,28 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1861 | tp->undo_marker = tp->snd_una; | 1889 | tp->undo_marker = tp->snd_una; |
1862 | tp->undo_retrans = tp->retrans_out; | 1890 | tp->undo_retrans = tp->retrans_out; |
1863 | 1891 | ||
1864 | if (tp->ca_state < TCP_CA_CWR) { | 1892 | if (icsk->icsk_ca_state < TCP_CA_CWR) { |
1865 | if (!(flag&FLAG_ECE)) | 1893 | if (!(flag&FLAG_ECE)) |
1866 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1894 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
1867 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); | 1895 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); |
1868 | TCP_ECN_queue_cwr(tp); | 1896 | TCP_ECN_queue_cwr(tp); |
1869 | } | 1897 | } |
1870 | 1898 | ||
1871 | tp->snd_cwnd_cnt = 0; | 1899 | tp->snd_cwnd_cnt = 0; |
1872 | tcp_set_ca_state(tp, TCP_CA_Recovery); | 1900 | tcp_set_ca_state(sk, TCP_CA_Recovery); |
1873 | } | 1901 | } |
1874 | 1902 | ||
1875 | if (is_dupack || tcp_head_timedout(sk, tp)) | 1903 | if (is_dupack || tcp_head_timedout(sk, tp)) |
1876 | tcp_update_scoreboard(sk, tp); | 1904 | tcp_update_scoreboard(sk, tp); |
1877 | tcp_cwnd_down(tp); | 1905 | tcp_cwnd_down(sk); |
1878 | tcp_xmit_retransmit_queue(sk); | 1906 | tcp_xmit_retransmit_queue(sk); |
1879 | } | 1907 | } |
1880 | 1908 | ||
1881 | /* Read draft-ietf-tcplw-high-performance before mucking | 1909 | /* Read draft-ietf-tcplw-high-performance before mucking |
1882 | * with this code. (Superceeds RFC1323) | 1910 | * with this code. (Superceeds RFC1323) |
1883 | */ | 1911 | */ |
1884 | static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) | 1912 | static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) |
1885 | { | 1913 | { |
1886 | __u32 seq_rtt; | ||
1887 | |||
1888 | /* RTTM Rule: A TSecr value received in a segment is used to | 1914 | /* RTTM Rule: A TSecr value received in a segment is used to |
1889 | * update the averaged RTT measurement only if the segment | 1915 | * update the averaged RTT measurement only if the segment |
1890 | * acknowledges some new data, i.e., only if it advances the | 1916 | * acknowledges some new data, i.e., only if it advances the |
@@ -1900,14 +1926,15 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) | |||
1900 | * answer arrives rto becomes 120 seconds! If at least one of segments | 1926 | * answer arrives rto becomes 120 seconds! If at least one of segments |
1901 | * in window is lost... Voila. --ANK (010210) | 1927 | * in window is lost... Voila. --ANK (010210) |
1902 | */ | 1928 | */ |
1903 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | 1929 | struct tcp_sock *tp = tcp_sk(sk); |
1904 | tcp_rtt_estimator(tp, seq_rtt, usrtt); | 1930 | const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; |
1905 | tcp_set_rto(tp); | 1931 | tcp_rtt_estimator(sk, seq_rtt, usrtt); |
1906 | tp->backoff = 0; | 1932 | tcp_set_rto(sk); |
1907 | tcp_bound_rto(tp); | 1933 | inet_csk(sk)->icsk_backoff = 0; |
1934 | tcp_bound_rto(sk); | ||
1908 | } | 1935 | } |
1909 | 1936 | ||
1910 | static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) | 1937 | static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) |
1911 | { | 1938 | { |
1912 | /* We don't have a timestamp. Can only use | 1939 | /* We don't have a timestamp. Can only use |
1913 | * packets that are not retransmitted to determine | 1940 | * packets that are not retransmitted to determine |
@@ -1921,27 +1948,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int | |||
1921 | if (flag & FLAG_RETRANS_DATA_ACKED) | 1948 | if (flag & FLAG_RETRANS_DATA_ACKED) |
1922 | return; | 1949 | return; |
1923 | 1950 | ||
1924 | tcp_rtt_estimator(tp, seq_rtt, usrtt); | 1951 | tcp_rtt_estimator(sk, seq_rtt, usrtt); |
1925 | tcp_set_rto(tp); | 1952 | tcp_set_rto(sk); |
1926 | tp->backoff = 0; | 1953 | inet_csk(sk)->icsk_backoff = 0; |
1927 | tcp_bound_rto(tp); | 1954 | tcp_bound_rto(sk); |
1928 | } | 1955 | } |
1929 | 1956 | ||
1930 | static inline void tcp_ack_update_rtt(struct tcp_sock *tp, | 1957 | static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, |
1931 | int flag, s32 seq_rtt, u32 *usrtt) | 1958 | const s32 seq_rtt, u32 *usrtt) |
1932 | { | 1959 | { |
1960 | const struct tcp_sock *tp = tcp_sk(sk); | ||
1933 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ | 1961 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ |
1934 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) | 1962 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
1935 | tcp_ack_saw_tstamp(tp, usrtt, flag); | 1963 | tcp_ack_saw_tstamp(sk, usrtt, flag); |
1936 | else if (seq_rtt >= 0) | 1964 | else if (seq_rtt >= 0) |
1937 | tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); | 1965 | tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); |
1938 | } | 1966 | } |
1939 | 1967 | ||
1940 | static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | 1968 | static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, |
1941 | u32 in_flight, int good) | 1969 | u32 in_flight, int good) |
1942 | { | 1970 | { |
1943 | tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); | 1971 | const struct inet_connection_sock *icsk = inet_csk(sk); |
1944 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1972 | icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); |
1973 | tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; | ||
1945 | } | 1974 | } |
1946 | 1975 | ||
1947 | /* Restart timer after forward progress on connection. | 1976 | /* Restart timer after forward progress on connection. |
@@ -1951,9 +1980,9 @@ static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | |||
1951 | static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) | 1980 | static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) |
1952 | { | 1981 | { |
1953 | if (!tp->packets_out) { | 1982 | if (!tp->packets_out) { |
1954 | tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); | 1983 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); |
1955 | } else { | 1984 | } else { |
1956 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); | 1985 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); |
1957 | } | 1986 | } |
1958 | } | 1987 | } |
1959 | 1988 | ||
@@ -2068,9 +2097,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
2068 | seq_rtt = -1; | 2097 | seq_rtt = -1; |
2069 | } else if (seq_rtt < 0) | 2098 | } else if (seq_rtt < 0) |
2070 | seq_rtt = now - scb->when; | 2099 | seq_rtt = now - scb->when; |
2071 | if (seq_usrtt) | 2100 | if (seq_usrtt) { |
2072 | *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 | 2101 | struct timeval tv; |
2073 | + (usnow.tv_usec - skb->stamp.tv_usec); | 2102 | |
2103 | skb_get_timestamp(skb, &tv); | ||
2104 | *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 | ||
2105 | + (usnow.tv_usec - tv.tv_usec); | ||
2106 | } | ||
2074 | 2107 | ||
2075 | if (sacked & TCPCB_SACKED_ACKED) | 2108 | if (sacked & TCPCB_SACKED_ACKED) |
2076 | tp->sacked_out -= tcp_skb_pcount(skb); | 2109 | tp->sacked_out -= tcp_skb_pcount(skb); |
@@ -2085,16 +2118,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
2085 | seq_rtt = now - scb->when; | 2118 | seq_rtt = now - scb->when; |
2086 | tcp_dec_pcount_approx(&tp->fackets_out, skb); | 2119 | tcp_dec_pcount_approx(&tp->fackets_out, skb); |
2087 | tcp_packets_out_dec(tp, skb); | 2120 | tcp_packets_out_dec(tp, skb); |
2088 | __skb_unlink(skb, skb->list); | 2121 | __skb_unlink(skb, &sk->sk_write_queue); |
2089 | sk_stream_free_skb(sk, skb); | 2122 | sk_stream_free_skb(sk, skb); |
2090 | } | 2123 | } |
2091 | 2124 | ||
2092 | if (acked&FLAG_ACKED) { | 2125 | if (acked&FLAG_ACKED) { |
2093 | tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); | 2126 | const struct inet_connection_sock *icsk = inet_csk(sk); |
2127 | tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); | ||
2094 | tcp_ack_packets_out(sk, tp); | 2128 | tcp_ack_packets_out(sk, tp); |
2095 | 2129 | ||
2096 | if (tp->ca_ops->pkts_acked) | 2130 | if (icsk->icsk_ca_ops->pkts_acked) |
2097 | tp->ca_ops->pkts_acked(tp, pkts_acked); | 2131 | icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked); |
2098 | } | 2132 | } |
2099 | 2133 | ||
2100 | #if FASTRETRANS_DEBUG > 0 | 2134 | #if FASTRETRANS_DEBUG > 0 |
@@ -2102,19 +2136,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
2102 | BUG_TRAP((int)tp->lost_out >= 0); | 2136 | BUG_TRAP((int)tp->lost_out >= 0); |
2103 | BUG_TRAP((int)tp->retrans_out >= 0); | 2137 | BUG_TRAP((int)tp->retrans_out >= 0); |
2104 | if (!tp->packets_out && tp->rx_opt.sack_ok) { | 2138 | if (!tp->packets_out && tp->rx_opt.sack_ok) { |
2139 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
2105 | if (tp->lost_out) { | 2140 | if (tp->lost_out) { |
2106 | printk(KERN_DEBUG "Leak l=%u %d\n", | 2141 | printk(KERN_DEBUG "Leak l=%u %d\n", |
2107 | tp->lost_out, tp->ca_state); | 2142 | tp->lost_out, icsk->icsk_ca_state); |
2108 | tp->lost_out = 0; | 2143 | tp->lost_out = 0; |
2109 | } | 2144 | } |
2110 | if (tp->sacked_out) { | 2145 | if (tp->sacked_out) { |
2111 | printk(KERN_DEBUG "Leak s=%u %d\n", | 2146 | printk(KERN_DEBUG "Leak s=%u %d\n", |
2112 | tp->sacked_out, tp->ca_state); | 2147 | tp->sacked_out, icsk->icsk_ca_state); |
2113 | tp->sacked_out = 0; | 2148 | tp->sacked_out = 0; |
2114 | } | 2149 | } |
2115 | if (tp->retrans_out) { | 2150 | if (tp->retrans_out) { |
2116 | printk(KERN_DEBUG "Leak r=%u %d\n", | 2151 | printk(KERN_DEBUG "Leak r=%u %d\n", |
2117 | tp->retrans_out, tp->ca_state); | 2152 | tp->retrans_out, icsk->icsk_ca_state); |
2118 | tp->retrans_out = 0; | 2153 | tp->retrans_out = 0; |
2119 | } | 2154 | } |
2120 | } | 2155 | } |
@@ -2125,40 +2160,43 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
2125 | 2160 | ||
2126 | static void tcp_ack_probe(struct sock *sk) | 2161 | static void tcp_ack_probe(struct sock *sk) |
2127 | { | 2162 | { |
2128 | struct tcp_sock *tp = tcp_sk(sk); | 2163 | const struct tcp_sock *tp = tcp_sk(sk); |
2164 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
2129 | 2165 | ||
2130 | /* Was it a usable window open? */ | 2166 | /* Was it a usable window open? */ |
2131 | 2167 | ||
2132 | if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, | 2168 | if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, |
2133 | tp->snd_una + tp->snd_wnd)) { | 2169 | tp->snd_una + tp->snd_wnd)) { |
2134 | tp->backoff = 0; | 2170 | icsk->icsk_backoff = 0; |
2135 | tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); | 2171 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); |
2136 | /* Socket must be waked up by subsequent tcp_data_snd_check(). | 2172 | /* Socket must be waked up by subsequent tcp_data_snd_check(). |
2137 | * This function is not for random using! | 2173 | * This function is not for random using! |
2138 | */ | 2174 | */ |
2139 | } else { | 2175 | } else { |
2140 | tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, | 2176 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
2141 | min(tp->rto << tp->backoff, TCP_RTO_MAX)); | 2177 | min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), |
2178 | TCP_RTO_MAX); | ||
2142 | } | 2179 | } |
2143 | } | 2180 | } |
2144 | 2181 | ||
2145 | static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag) | 2182 | static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) |
2146 | { | 2183 | { |
2147 | return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || | 2184 | return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || |
2148 | tp->ca_state != TCP_CA_Open); | 2185 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open); |
2149 | } | 2186 | } |
2150 | 2187 | ||
2151 | static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag) | 2188 | static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) |
2152 | { | 2189 | { |
2190 | const struct tcp_sock *tp = tcp_sk(sk); | ||
2153 | return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && | 2191 | return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && |
2154 | !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR)); | 2192 | !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); |
2155 | } | 2193 | } |
2156 | 2194 | ||
2157 | /* Check that window update is acceptable. | 2195 | /* Check that window update is acceptable. |
2158 | * The function assumes that snd_una<=ack<=snd_next. | 2196 | * The function assumes that snd_una<=ack<=snd_next. |
2159 | */ | 2197 | */ |
2160 | static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack, | 2198 | static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, |
2161 | u32 ack_seq, u32 nwin) | 2199 | const u32 ack_seq, const u32 nwin) |
2162 | { | 2200 | { |
2163 | return (after(ack, tp->snd_una) || | 2201 | return (after(ack, tp->snd_una) || |
2164 | after(ack_seq, tp->snd_wl1) || | 2202 | after(ack_seq, tp->snd_wl1) || |
@@ -2241,6 +2279,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) | |||
2241 | /* This routine deals with incoming acks, but not outgoing ones. */ | 2279 | /* This routine deals with incoming acks, but not outgoing ones. */ |
2242 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | 2280 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) |
2243 | { | 2281 | { |
2282 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
2244 | struct tcp_sock *tp = tcp_sk(sk); | 2283 | struct tcp_sock *tp = tcp_sk(sk); |
2245 | u32 prior_snd_una = tp->snd_una; | 2284 | u32 prior_snd_una = tp->snd_una; |
2246 | u32 ack_seq = TCP_SKB_CB(skb)->seq; | 2285 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
@@ -2268,7 +2307,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2268 | tp->snd_una = ack; | 2307 | tp->snd_una = ack; |
2269 | flag |= FLAG_WIN_UPDATE; | 2308 | flag |= FLAG_WIN_UPDATE; |
2270 | 2309 | ||
2271 | tcp_ca_event(tp, CA_EVENT_FAST_ACK); | 2310 | tcp_ca_event(sk, CA_EVENT_FAST_ACK); |
2272 | 2311 | ||
2273 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); | 2312 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); |
2274 | } else { | 2313 | } else { |
@@ -2285,7 +2324,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2285 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) | 2324 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) |
2286 | flag |= FLAG_ECE; | 2325 | flag |= FLAG_ECE; |
2287 | 2326 | ||
2288 | tcp_ca_event(tp, CA_EVENT_SLOW_ACK); | 2327 | tcp_ca_event(sk, CA_EVENT_SLOW_ACK); |
2289 | } | 2328 | } |
2290 | 2329 | ||
2291 | /* We passed data and got it acked, remove any soft error | 2330 | /* We passed data and got it acked, remove any soft error |
@@ -2301,19 +2340,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2301 | 2340 | ||
2302 | /* See if we can take anything off of the retransmit queue. */ | 2341 | /* See if we can take anything off of the retransmit queue. */ |
2303 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt, | 2342 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt, |
2304 | tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); | 2343 | icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); |
2305 | 2344 | ||
2306 | if (tp->frto_counter) | 2345 | if (tp->frto_counter) |
2307 | tcp_process_frto(sk, prior_snd_una); | 2346 | tcp_process_frto(sk, prior_snd_una); |
2308 | 2347 | ||
2309 | if (tcp_ack_is_dubious(tp, flag)) { | 2348 | if (tcp_ack_is_dubious(sk, flag)) { |
2310 | /* Advanve CWND, if state allows this. */ | 2349 | /* Advanve CWND, if state allows this. */ |
2311 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) | 2350 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) |
2312 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); | 2351 | tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); |
2313 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); | 2352 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); |
2314 | } else { | 2353 | } else { |
2315 | if ((flag & FLAG_DATA_ACKED)) | 2354 | if ((flag & FLAG_DATA_ACKED)) |
2316 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); | 2355 | tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); |
2317 | } | 2356 | } |
2318 | 2357 | ||
2319 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) | 2358 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) |
@@ -2322,7 +2361,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2322 | return 1; | 2361 | return 1; |
2323 | 2362 | ||
2324 | no_queue: | 2363 | no_queue: |
2325 | tp->probes_out = 0; | 2364 | icsk->icsk_probes_out = 0; |
2326 | 2365 | ||
2327 | /* If this ack opens up a zero window, clear backoff. It was | 2366 | /* If this ack opens up a zero window, clear backoff. It was |
2328 | * being used to time the probes, and is probably far higher than | 2367 | * being used to time the probes, and is probably far higher than |
@@ -2500,8 +2539,9 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) | |||
2500 | * up to bandwidth of 18Gigabit/sec. 8) ] | 2539 | * up to bandwidth of 18Gigabit/sec. 8) ] |
2501 | */ | 2540 | */ |
2502 | 2541 | ||
2503 | static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) | 2542 | static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) |
2504 | { | 2543 | { |
2544 | struct tcp_sock *tp = tcp_sk(sk); | ||
2505 | struct tcphdr *th = skb->h.th; | 2545 | struct tcphdr *th = skb->h.th; |
2506 | u32 seq = TCP_SKB_CB(skb)->seq; | 2546 | u32 seq = TCP_SKB_CB(skb)->seq; |
2507 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 2547 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
@@ -2516,14 +2556,15 @@ static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) | |||
2516 | !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && | 2556 | !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && |
2517 | 2557 | ||
2518 | /* 4. ... and sits in replay window. */ | 2558 | /* 4. ... and sits in replay window. */ |
2519 | (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ); | 2559 | (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); |
2520 | } | 2560 | } |
2521 | 2561 | ||
2522 | static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb) | 2562 | static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) |
2523 | { | 2563 | { |
2564 | const struct tcp_sock *tp = tcp_sk(sk); | ||
2524 | return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && | 2565 | return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && |
2525 | xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && | 2566 | xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && |
2526 | !tcp_disordered_ack(tp, skb)); | 2567 | !tcp_disordered_ack(sk, skb)); |
2527 | } | 2568 | } |
2528 | 2569 | ||
2529 | /* Check segment sequence number for validity. | 2570 | /* Check segment sequence number for validity. |
@@ -2586,7 +2627,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) | |||
2586 | { | 2627 | { |
2587 | struct tcp_sock *tp = tcp_sk(sk); | 2628 | struct tcp_sock *tp = tcp_sk(sk); |
2588 | 2629 | ||
2589 | tcp_schedule_ack(tp); | 2630 | inet_csk_schedule_ack(sk); |
2590 | 2631 | ||
2591 | sk->sk_shutdown |= RCV_SHUTDOWN; | 2632 | sk->sk_shutdown |= RCV_SHUTDOWN; |
2592 | sock_set_flag(sk, SOCK_DONE); | 2633 | sock_set_flag(sk, SOCK_DONE); |
@@ -2596,7 +2637,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) | |||
2596 | case TCP_ESTABLISHED: | 2637 | case TCP_ESTABLISHED: |
2597 | /* Move to CLOSE_WAIT */ | 2638 | /* Move to CLOSE_WAIT */ |
2598 | tcp_set_state(sk, TCP_CLOSE_WAIT); | 2639 | tcp_set_state(sk, TCP_CLOSE_WAIT); |
2599 | tp->ack.pingpong = 1; | 2640 | inet_csk(sk)->icsk_ack.pingpong = 1; |
2600 | break; | 2641 | break; |
2601 | 2642 | ||
2602 | case TCP_CLOSE_WAIT: | 2643 | case TCP_CLOSE_WAIT: |
@@ -2694,7 +2735,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) | |||
2694 | if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && | 2735 | if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && |
2695 | before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { | 2736 | before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
2696 | NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); | 2737 | NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); |
2697 | tcp_enter_quickack_mode(tp); | 2738 | tcp_enter_quickack_mode(sk); |
2698 | 2739 | ||
2699 | if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { | 2740 | if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { |
2700 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; | 2741 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
@@ -2853,7 +2894,7 @@ static void tcp_ofo_queue(struct sock *sk) | |||
2853 | 2894 | ||
2854 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { | 2895 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { |
2855 | SOCK_DEBUG(sk, "ofo packet was already received \n"); | 2896 | SOCK_DEBUG(sk, "ofo packet was already received \n"); |
2856 | __skb_unlink(skb, skb->list); | 2897 | __skb_unlink(skb, &tp->out_of_order_queue); |
2857 | __kfree_skb(skb); | 2898 | __kfree_skb(skb); |
2858 | continue; | 2899 | continue; |
2859 | } | 2900 | } |
@@ -2861,7 +2902,7 @@ static void tcp_ofo_queue(struct sock *sk) | |||
2861 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, | 2902 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, |
2862 | TCP_SKB_CB(skb)->end_seq); | 2903 | TCP_SKB_CB(skb)->end_seq); |
2863 | 2904 | ||
2864 | __skb_unlink(skb, skb->list); | 2905 | __skb_unlink(skb, &tp->out_of_order_queue); |
2865 | __skb_queue_tail(&sk->sk_receive_queue, skb); | 2906 | __skb_queue_tail(&sk->sk_receive_queue, skb); |
2866 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 2907 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
2867 | if(skb->h.th->fin) | 2908 | if(skb->h.th->fin) |
@@ -2942,7 +2983,7 @@ queue_and_out: | |||
2942 | * gap in queue is filled. | 2983 | * gap in queue is filled. |
2943 | */ | 2984 | */ |
2944 | if (skb_queue_empty(&tp->out_of_order_queue)) | 2985 | if (skb_queue_empty(&tp->out_of_order_queue)) |
2945 | tp->ack.pingpong = 0; | 2986 | inet_csk(sk)->icsk_ack.pingpong = 0; |
2946 | } | 2987 | } |
2947 | 2988 | ||
2948 | if (tp->rx_opt.num_sacks) | 2989 | if (tp->rx_opt.num_sacks) |
@@ -2963,8 +3004,8 @@ queue_and_out: | |||
2963 | tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); | 3004 | tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); |
2964 | 3005 | ||
2965 | out_of_window: | 3006 | out_of_window: |
2966 | tcp_enter_quickack_mode(tp); | 3007 | tcp_enter_quickack_mode(sk); |
2967 | tcp_schedule_ack(tp); | 3008 | inet_csk_schedule_ack(sk); |
2968 | drop: | 3009 | drop: |
2969 | __kfree_skb(skb); | 3010 | __kfree_skb(skb); |
2970 | return; | 3011 | return; |
@@ -2974,7 +3015,7 @@ drop: | |||
2974 | if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) | 3015 | if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) |
2975 | goto out_of_window; | 3016 | goto out_of_window; |
2976 | 3017 | ||
2977 | tcp_enter_quickack_mode(tp); | 3018 | tcp_enter_quickack_mode(sk); |
2978 | 3019 | ||
2979 | if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { | 3020 | if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
2980 | /* Partial packet, seq < rcv_next < end_seq */ | 3021 | /* Partial packet, seq < rcv_next < end_seq */ |
@@ -3003,7 +3044,7 @@ drop: | |||
3003 | 3044 | ||
3004 | /* Disable header prediction. */ | 3045 | /* Disable header prediction. */ |
3005 | tp->pred_flags = 0; | 3046 | tp->pred_flags = 0; |
3006 | tcp_schedule_ack(tp); | 3047 | inet_csk_schedule_ack(sk); |
3007 | 3048 | ||
3008 | SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", | 3049 | SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", |
3009 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); | 3050 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); |
@@ -3027,7 +3068,7 @@ drop: | |||
3027 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; | 3068 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
3028 | 3069 | ||
3029 | if (seq == TCP_SKB_CB(skb1)->end_seq) { | 3070 | if (seq == TCP_SKB_CB(skb1)->end_seq) { |
3030 | __skb_append(skb1, skb); | 3071 | __skb_append(skb1, skb, &tp->out_of_order_queue); |
3031 | 3072 | ||
3032 | if (!tp->rx_opt.num_sacks || | 3073 | if (!tp->rx_opt.num_sacks || |
3033 | tp->selective_acks[0].end_seq != seq) | 3074 | tp->selective_acks[0].end_seq != seq) |
@@ -3071,7 +3112,7 @@ drop: | |||
3071 | tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); | 3112 | tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); |
3072 | break; | 3113 | break; |
3073 | } | 3114 | } |
3074 | __skb_unlink(skb1, skb1->list); | 3115 | __skb_unlink(skb1, &tp->out_of_order_queue); |
3075 | tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); | 3116 | tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); |
3076 | __kfree_skb(skb1); | 3117 | __kfree_skb(skb1); |
3077 | } | 3118 | } |
@@ -3088,8 +3129,9 @@ add_sack: | |||
3088 | * simplifies code) | 3129 | * simplifies code) |
3089 | */ | 3130 | */ |
3090 | static void | 3131 | static void |
3091 | tcp_collapse(struct sock *sk, struct sk_buff *head, | 3132 | tcp_collapse(struct sock *sk, struct sk_buff_head *list, |
3092 | struct sk_buff *tail, u32 start, u32 end) | 3133 | struct sk_buff *head, struct sk_buff *tail, |
3134 | u32 start, u32 end) | ||
3093 | { | 3135 | { |
3094 | struct sk_buff *skb; | 3136 | struct sk_buff *skb; |
3095 | 3137 | ||
@@ -3099,7 +3141,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, | |||
3099 | /* No new bits? It is possible on ofo queue. */ | 3141 | /* No new bits? It is possible on ofo queue. */ |
3100 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { | 3142 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { |
3101 | struct sk_buff *next = skb->next; | 3143 | struct sk_buff *next = skb->next; |
3102 | __skb_unlink(skb, skb->list); | 3144 | __skb_unlink(skb, list); |
3103 | __kfree_skb(skb); | 3145 | __kfree_skb(skb); |
3104 | NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); | 3146 | NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); |
3105 | skb = next; | 3147 | skb = next; |
@@ -3145,7 +3187,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, | |||
3145 | nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); | 3187 | nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); |
3146 | memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); | 3188 | memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); |
3147 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; | 3189 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; |
3148 | __skb_insert(nskb, skb->prev, skb, skb->list); | 3190 | __skb_insert(nskb, skb->prev, skb, list); |
3149 | sk_stream_set_owner_r(nskb, sk); | 3191 | sk_stream_set_owner_r(nskb, sk); |
3150 | 3192 | ||
3151 | /* Copy data, releasing collapsed skbs. */ | 3193 | /* Copy data, releasing collapsed skbs. */ |
@@ -3164,7 +3206,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, | |||
3164 | } | 3206 | } |
3165 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { | 3207 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { |
3166 | struct sk_buff *next = skb->next; | 3208 | struct sk_buff *next = skb->next; |
3167 | __skb_unlink(skb, skb->list); | 3209 | __skb_unlink(skb, list); |
3168 | __kfree_skb(skb); | 3210 | __kfree_skb(skb); |
3169 | NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); | 3211 | NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); |
3170 | skb = next; | 3212 | skb = next; |
@@ -3200,7 +3242,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk) | |||
3200 | if (skb == (struct sk_buff *)&tp->out_of_order_queue || | 3242 | if (skb == (struct sk_buff *)&tp->out_of_order_queue || |
3201 | after(TCP_SKB_CB(skb)->seq, end) || | 3243 | after(TCP_SKB_CB(skb)->seq, end) || |
3202 | before(TCP_SKB_CB(skb)->end_seq, start)) { | 3244 | before(TCP_SKB_CB(skb)->end_seq, start)) { |
3203 | tcp_collapse(sk, head, skb, start, end); | 3245 | tcp_collapse(sk, &tp->out_of_order_queue, |
3246 | head, skb, start, end); | ||
3204 | head = skb; | 3247 | head = skb; |
3205 | if (skb == (struct sk_buff *)&tp->out_of_order_queue) | 3248 | if (skb == (struct sk_buff *)&tp->out_of_order_queue) |
3206 | break; | 3249 | break; |
@@ -3237,7 +3280,8 @@ static int tcp_prune_queue(struct sock *sk) | |||
3237 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); | 3280 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); |
3238 | 3281 | ||
3239 | tcp_collapse_ofo_queue(sk); | 3282 | tcp_collapse_ofo_queue(sk); |
3240 | tcp_collapse(sk, sk->sk_receive_queue.next, | 3283 | tcp_collapse(sk, &sk->sk_receive_queue, |
3284 | sk->sk_receive_queue.next, | ||
3241 | (struct sk_buff*)&sk->sk_receive_queue, | 3285 | (struct sk_buff*)&sk->sk_receive_queue, |
3242 | tp->copied_seq, tp->rcv_nxt); | 3286 | tp->copied_seq, tp->rcv_nxt); |
3243 | sk_stream_mem_reclaim(sk); | 3287 | sk_stream_mem_reclaim(sk); |
@@ -3286,12 +3330,12 @@ void tcp_cwnd_application_limited(struct sock *sk) | |||
3286 | { | 3330 | { |
3287 | struct tcp_sock *tp = tcp_sk(sk); | 3331 | struct tcp_sock *tp = tcp_sk(sk); |
3288 | 3332 | ||
3289 | if (tp->ca_state == TCP_CA_Open && | 3333 | if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && |
3290 | sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { | 3334 | sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { |
3291 | /* Limited by application or receiver window. */ | 3335 | /* Limited by application or receiver window. */ |
3292 | u32 win_used = max(tp->snd_cwnd_used, 2U); | 3336 | u32 win_used = max(tp->snd_cwnd_used, 2U); |
3293 | if (win_used < tp->snd_cwnd) { | 3337 | if (win_used < tp->snd_cwnd) { |
3294 | tp->snd_ssthresh = tcp_current_ssthresh(tp); | 3338 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
3295 | tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; | 3339 | tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; |
3296 | } | 3340 | } |
3297 | tp->snd_cwnd_used = 0; | 3341 | tp->snd_cwnd_used = 0; |
@@ -3370,13 +3414,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) | |||
3370 | struct tcp_sock *tp = tcp_sk(sk); | 3414 | struct tcp_sock *tp = tcp_sk(sk); |
3371 | 3415 | ||
3372 | /* More than one full frame received... */ | 3416 | /* More than one full frame received... */ |
3373 | if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss | 3417 | if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss |
3374 | /* ... and right edge of window advances far enough. | 3418 | /* ... and right edge of window advances far enough. |
3375 | * (tcp_recvmsg() will send ACK otherwise). Or... | 3419 | * (tcp_recvmsg() will send ACK otherwise). Or... |
3376 | */ | 3420 | */ |
3377 | && __tcp_select_window(sk) >= tp->rcv_wnd) || | 3421 | && __tcp_select_window(sk) >= tp->rcv_wnd) || |
3378 | /* We ACK each frame or... */ | 3422 | /* We ACK each frame or... */ |
3379 | tcp_in_quickack_mode(tp) || | 3423 | tcp_in_quickack_mode(sk) || |
3380 | /* We have out of order data. */ | 3424 | /* We have out of order data. */ |
3381 | (ofo_possible && | 3425 | (ofo_possible && |
3382 | skb_peek(&tp->out_of_order_queue))) { | 3426 | skb_peek(&tp->out_of_order_queue))) { |
@@ -3390,8 +3434,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) | |||
3390 | 3434 | ||
3391 | static __inline__ void tcp_ack_snd_check(struct sock *sk) | 3435 | static __inline__ void tcp_ack_snd_check(struct sock *sk) |
3392 | { | 3436 | { |
3393 | struct tcp_sock *tp = tcp_sk(sk); | 3437 | if (!inet_csk_ack_scheduled(sk)) { |
3394 | if (!tcp_ack_scheduled(tp)) { | ||
3395 | /* We sent a data segment already. */ | 3438 | /* We sent a data segment already. */ |
3396 | return; | 3439 | return; |
3397 | } | 3440 | } |
@@ -3462,7 +3505,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) | |||
3462 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); | 3505 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); |
3463 | tp->copied_seq++; | 3506 | tp->copied_seq++; |
3464 | if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { | 3507 | if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { |
3465 | __skb_unlink(skb, skb->list); | 3508 | __skb_unlink(skb, &sk->sk_receive_queue); |
3466 | __kfree_skb(skb); | 3509 | __kfree_skb(skb); |
3467 | } | 3510 | } |
3468 | } | 3511 | } |
@@ -3645,7 +3688,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3645 | tp->rcv_nxt == tp->rcv_wup) | 3688 | tp->rcv_nxt == tp->rcv_wup) |
3646 | tcp_store_ts_recent(tp); | 3689 | tcp_store_ts_recent(tp); |
3647 | 3690 | ||
3648 | tcp_rcv_rtt_measure_ts(tp, skb); | 3691 | tcp_rcv_rtt_measure_ts(sk, skb); |
3649 | 3692 | ||
3650 | /* We know that such packets are checksummed | 3693 | /* We know that such packets are checksummed |
3651 | * on entry. | 3694 | * on entry. |
@@ -3678,7 +3721,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3678 | tp->rcv_nxt == tp->rcv_wup) | 3721 | tp->rcv_nxt == tp->rcv_wup) |
3679 | tcp_store_ts_recent(tp); | 3722 | tcp_store_ts_recent(tp); |
3680 | 3723 | ||
3681 | tcp_rcv_rtt_measure_ts(tp, skb); | 3724 | tcp_rcv_rtt_measure_ts(sk, skb); |
3682 | 3725 | ||
3683 | __skb_pull(skb, tcp_header_len); | 3726 | __skb_pull(skb, tcp_header_len); |
3684 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 3727 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
@@ -3699,7 +3742,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3699 | tp->rcv_nxt == tp->rcv_wup) | 3742 | tp->rcv_nxt == tp->rcv_wup) |
3700 | tcp_store_ts_recent(tp); | 3743 | tcp_store_ts_recent(tp); |
3701 | 3744 | ||
3702 | tcp_rcv_rtt_measure_ts(tp, skb); | 3745 | tcp_rcv_rtt_measure_ts(sk, skb); |
3703 | 3746 | ||
3704 | if ((int)skb->truesize > sk->sk_forward_alloc) | 3747 | if ((int)skb->truesize > sk->sk_forward_alloc) |
3705 | goto step5; | 3748 | goto step5; |
@@ -3719,7 +3762,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3719 | /* Well, only one small jumplet in fast path... */ | 3762 | /* Well, only one small jumplet in fast path... */ |
3720 | tcp_ack(sk, skb, FLAG_DATA); | 3763 | tcp_ack(sk, skb, FLAG_DATA); |
3721 | tcp_data_snd_check(sk, tp); | 3764 | tcp_data_snd_check(sk, tp); |
3722 | if (!tcp_ack_scheduled(tp)) | 3765 | if (!inet_csk_ack_scheduled(sk)) |
3723 | goto no_ack; | 3766 | goto no_ack; |
3724 | } | 3767 | } |
3725 | 3768 | ||
@@ -3741,7 +3784,7 @@ slow_path: | |||
3741 | * RFC1323: H1. Apply PAWS check first. | 3784 | * RFC1323: H1. Apply PAWS check first. |
3742 | */ | 3785 | */ |
3743 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && | 3786 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && |
3744 | tcp_paws_discard(tp, skb)) { | 3787 | tcp_paws_discard(sk, skb)) { |
3745 | if (!th->rst) { | 3788 | if (!th->rst) { |
3746 | NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); | 3789 | NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); |
3747 | tcp_send_dupack(sk, skb); | 3790 | tcp_send_dupack(sk, skb); |
@@ -3788,7 +3831,7 @@ step5: | |||
3788 | if(th->ack) | 3831 | if(th->ack) |
3789 | tcp_ack(sk, skb, FLAG_SLOWPATH); | 3832 | tcp_ack(sk, skb, FLAG_SLOWPATH); |
3790 | 3833 | ||
3791 | tcp_rcv_rtt_measure_ts(tp, skb); | 3834 | tcp_rcv_rtt_measure_ts(sk, skb); |
3792 | 3835 | ||
3793 | /* Process urgent data. */ | 3836 | /* Process urgent data. */ |
3794 | tcp_urg(sk, skb, th); | 3837 | tcp_urg(sk, skb, th); |
@@ -3817,6 +3860,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
3817 | tcp_parse_options(skb, &tp->rx_opt, 0); | 3860 | tcp_parse_options(skb, &tp->rx_opt, 0); |
3818 | 3861 | ||
3819 | if (th->ack) { | 3862 | if (th->ack) { |
3863 | struct inet_connection_sock *icsk; | ||
3820 | /* rfc793: | 3864 | /* rfc793: |
3821 | * "If the state is SYN-SENT then | 3865 | * "If the state is SYN-SENT then |
3822 | * first check the ACK bit | 3866 | * first check the ACK bit |
@@ -3920,7 +3964,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
3920 | 3964 | ||
3921 | tcp_init_metrics(sk); | 3965 | tcp_init_metrics(sk); |
3922 | 3966 | ||
3923 | tcp_init_congestion_control(tp); | 3967 | tcp_init_congestion_control(sk); |
3924 | 3968 | ||
3925 | /* Prevent spurious tcp_cwnd_restart() on first data | 3969 | /* Prevent spurious tcp_cwnd_restart() on first data |
3926 | * packet. | 3970 | * packet. |
@@ -3930,7 +3974,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
3930 | tcp_init_buffer_space(sk); | 3974 | tcp_init_buffer_space(sk); |
3931 | 3975 | ||
3932 | if (sock_flag(sk, SOCK_KEEPOPEN)) | 3976 | if (sock_flag(sk, SOCK_KEEPOPEN)) |
3933 | tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); | 3977 | inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); |
3934 | 3978 | ||
3935 | if (!tp->rx_opt.snd_wscale) | 3979 | if (!tp->rx_opt.snd_wscale) |
3936 | __tcp_fast_path_on(tp, tp->snd_wnd); | 3980 | __tcp_fast_path_on(tp, tp->snd_wnd); |
@@ -3942,7 +3986,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
3942 | sk_wake_async(sk, 0, POLL_OUT); | 3986 | sk_wake_async(sk, 0, POLL_OUT); |
3943 | } | 3987 | } |
3944 | 3988 | ||
3945 | if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) { | 3989 | icsk = inet_csk(sk); |
3990 | |||
3991 | if (sk->sk_write_pending || | ||
3992 | icsk->icsk_accept_queue.rskq_defer_accept || | ||
3993 | icsk->icsk_ack.pingpong) { | ||
3946 | /* Save one ACK. Data will be ready after | 3994 | /* Save one ACK. Data will be ready after |
3947 | * several ticks, if write_pending is set. | 3995 | * several ticks, if write_pending is set. |
3948 | * | 3996 | * |
@@ -3950,12 +3998,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
3950 | * look so _wonderfully_ clever, that I was not able | 3998 | * look so _wonderfully_ clever, that I was not able |
3951 | * to stand against the temptation 8) --ANK | 3999 | * to stand against the temptation 8) --ANK |
3952 | */ | 4000 | */ |
3953 | tcp_schedule_ack(tp); | 4001 | inet_csk_schedule_ack(sk); |
3954 | tp->ack.lrcvtime = tcp_time_stamp; | 4002 | icsk->icsk_ack.lrcvtime = tcp_time_stamp; |
3955 | tp->ack.ato = TCP_ATO_MIN; | 4003 | icsk->icsk_ack.ato = TCP_ATO_MIN; |
3956 | tcp_incr_quickack(tp); | 4004 | tcp_incr_quickack(sk); |
3957 | tcp_enter_quickack_mode(tp); | 4005 | tcp_enter_quickack_mode(sk); |
3958 | tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); | 4006 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, |
4007 | TCP_DELACK_MAX, TCP_RTO_MAX); | ||
3959 | 4008 | ||
3960 | discard: | 4009 | discard: |
3961 | __kfree_skb(skb); | 4010 | __kfree_skb(skb); |
@@ -4111,7 +4160,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4111 | } | 4160 | } |
4112 | 4161 | ||
4113 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && | 4162 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && |
4114 | tcp_paws_discard(tp, skb)) { | 4163 | tcp_paws_discard(sk, skb)) { |
4115 | if (!th->rst) { | 4164 | if (!th->rst) { |
4116 | NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); | 4165 | NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); |
4117 | tcp_send_dupack(sk, skb); | 4166 | tcp_send_dupack(sk, skb); |
@@ -4180,7 +4229,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4180 | */ | 4229 | */ |
4181 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 4230 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
4182 | !tp->srtt) | 4231 | !tp->srtt) |
4183 | tcp_ack_saw_tstamp(tp, 0, 0); | 4232 | tcp_ack_saw_tstamp(sk, NULL, 0); |
4184 | 4233 | ||
4185 | if (tp->rx_opt.tstamp_ok) | 4234 | if (tp->rx_opt.tstamp_ok) |
4186 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 4235 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
@@ -4192,7 +4241,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4192 | 4241 | ||
4193 | tcp_init_metrics(sk); | 4242 | tcp_init_metrics(sk); |
4194 | 4243 | ||
4195 | tcp_init_congestion_control(tp); | 4244 | tcp_init_congestion_control(sk); |
4196 | 4245 | ||
4197 | /* Prevent spurious tcp_cwnd_restart() on | 4246 | /* Prevent spurious tcp_cwnd_restart() on |
4198 | * first data packet. | 4247 | * first data packet. |
@@ -4227,9 +4276,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4227 | return 1; | 4276 | return 1; |
4228 | } | 4277 | } |
4229 | 4278 | ||
4230 | tmo = tcp_fin_time(tp); | 4279 | tmo = tcp_fin_time(sk); |
4231 | if (tmo > TCP_TIMEWAIT_LEN) { | 4280 | if (tmo > TCP_TIMEWAIT_LEN) { |
4232 | tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); | 4281 | inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); |
4233 | } else if (th->fin || sock_owned_by_user(sk)) { | 4282 | } else if (th->fin || sock_owned_by_user(sk)) { |
4234 | /* Bad case. We could lose such FIN otherwise. | 4283 | /* Bad case. We could lose such FIN otherwise. |
4235 | * It is not a big problem, but it looks confusing | 4284 | * It is not a big problem, but it looks confusing |
@@ -4237,7 +4286,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4237 | * if it spins in bh_lock_sock(), but it is really | 4286 | * if it spins in bh_lock_sock(), but it is really |
4238 | * marginal case. | 4287 | * marginal case. |
4239 | */ | 4288 | */ |
4240 | tcp_reset_keepalive_timer(sk, tmo); | 4289 | inet_csk_reset_keepalive_timer(sk, tmo); |
4241 | } else { | 4290 | } else { |
4242 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); | 4291 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); |
4243 | goto discard; | 4292 | goto discard; |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 67c670886c1f..13dfb391cdf1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -64,7 +64,9 @@ | |||
64 | #include <linux/times.h> | 64 | #include <linux/times.h> |
65 | 65 | ||
66 | #include <net/icmp.h> | 66 | #include <net/icmp.h> |
67 | #include <net/inet_hashtables.h> | ||
67 | #include <net/tcp.h> | 68 | #include <net/tcp.h> |
69 | #include <net/transp_v6.h> | ||
68 | #include <net/ipv6.h> | 70 | #include <net/ipv6.h> |
69 | #include <net/inet_common.h> | 71 | #include <net/inet_common.h> |
70 | #include <net/xfrm.h> | 72 | #include <net/xfrm.h> |
@@ -75,7 +77,6 @@ | |||
75 | #include <linux/proc_fs.h> | 77 | #include <linux/proc_fs.h> |
76 | #include <linux/seq_file.h> | 78 | #include <linux/seq_file.h> |
77 | 79 | ||
78 | extern int sysctl_ip_dynaddr; | ||
79 | int sysctl_tcp_tw_reuse; | 80 | int sysctl_tcp_tw_reuse; |
80 | int sysctl_tcp_low_latency; | 81 | int sysctl_tcp_low_latency; |
81 | 82 | ||
@@ -88,463 +89,29 @@ static struct socket *tcp_socket; | |||
88 | void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, | 89 | void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, |
89 | struct sk_buff *skb); | 90 | struct sk_buff *skb); |
90 | 91 | ||
91 | struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { | 92 | struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { |
92 | .__tcp_lhash_lock = RW_LOCK_UNLOCKED, | 93 | .lhash_lock = RW_LOCK_UNLOCKED, |
93 | .__tcp_lhash_users = ATOMIC_INIT(0), | 94 | .lhash_users = ATOMIC_INIT(0), |
94 | .__tcp_lhash_wait | 95 | .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), |
95 | = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait), | 96 | .portalloc_lock = SPIN_LOCK_UNLOCKED, |
96 | .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED | 97 | .port_rover = 1024 - 1, |
97 | }; | 98 | }; |
98 | 99 | ||
99 | /* | ||
100 | * This array holds the first and last local port number. | ||
101 | * For high-usage systems, use sysctl to change this to | ||
102 | * 32768-61000 | ||
103 | */ | ||
104 | int sysctl_local_port_range[2] = { 1024, 4999 }; | ||
105 | int tcp_port_rover = 1024 - 1; | ||
106 | |||
107 | static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, | ||
108 | __u32 faddr, __u16 fport) | ||
109 | { | ||
110 | int h = (laddr ^ lport) ^ (faddr ^ fport); | ||
111 | h ^= h >> 16; | ||
112 | h ^= h >> 8; | ||
113 | return h & (tcp_ehash_size - 1); | ||
114 | } | ||
115 | |||
116 | static __inline__ int tcp_sk_hashfn(struct sock *sk) | ||
117 | { | ||
118 | struct inet_sock *inet = inet_sk(sk); | ||
119 | __u32 laddr = inet->rcv_saddr; | ||
120 | __u16 lport = inet->num; | ||
121 | __u32 faddr = inet->daddr; | ||
122 | __u16 fport = inet->dport; | ||
123 | |||
124 | return tcp_hashfn(laddr, lport, faddr, fport); | ||
125 | } | ||
126 | |||
127 | /* Allocate and initialize a new TCP local port bind bucket. | ||
128 | * The bindhash mutex for snum's hash chain must be held here. | ||
129 | */ | ||
130 | struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, | ||
131 | unsigned short snum) | ||
132 | { | ||
133 | struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep, | ||
134 | SLAB_ATOMIC); | ||
135 | if (tb) { | ||
136 | tb->port = snum; | ||
137 | tb->fastreuse = 0; | ||
138 | INIT_HLIST_HEAD(&tb->owners); | ||
139 | hlist_add_head(&tb->node, &head->chain); | ||
140 | } | ||
141 | return tb; | ||
142 | } | ||
143 | |||
144 | /* Caller must hold hashbucket lock for this tb with local BH disabled */ | ||
145 | void tcp_bucket_destroy(struct tcp_bind_bucket *tb) | ||
146 | { | ||
147 | if (hlist_empty(&tb->owners)) { | ||
148 | __hlist_del(&tb->node); | ||
149 | kmem_cache_free(tcp_bucket_cachep, tb); | ||
150 | } | ||
151 | } | ||
152 | |||
153 | /* Caller must disable local BH processing. */ | ||
154 | static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) | ||
155 | { | ||
156 | struct tcp_bind_hashbucket *head = | ||
157 | &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)]; | ||
158 | struct tcp_bind_bucket *tb; | ||
159 | |||
160 | spin_lock(&head->lock); | ||
161 | tb = tcp_sk(sk)->bind_hash; | ||
162 | sk_add_bind_node(child, &tb->owners); | ||
163 | tcp_sk(child)->bind_hash = tb; | ||
164 | spin_unlock(&head->lock); | ||
165 | } | ||
166 | |||
167 | inline void tcp_inherit_port(struct sock *sk, struct sock *child) | ||
168 | { | ||
169 | local_bh_disable(); | ||
170 | __tcp_inherit_port(sk, child); | ||
171 | local_bh_enable(); | ||
172 | } | ||
173 | |||
174 | void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, | ||
175 | unsigned short snum) | ||
176 | { | ||
177 | inet_sk(sk)->num = snum; | ||
178 | sk_add_bind_node(sk, &tb->owners); | ||
179 | tcp_sk(sk)->bind_hash = tb; | ||
180 | } | ||
181 | |||
182 | static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) | ||
183 | { | ||
184 | const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); | ||
185 | struct sock *sk2; | ||
186 | struct hlist_node *node; | ||
187 | int reuse = sk->sk_reuse; | ||
188 | |||
189 | sk_for_each_bound(sk2, node, &tb->owners) { | ||
190 | if (sk != sk2 && | ||
191 | !tcp_v6_ipv6only(sk2) && | ||
192 | (!sk->sk_bound_dev_if || | ||
193 | !sk2->sk_bound_dev_if || | ||
194 | sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { | ||
195 | if (!reuse || !sk2->sk_reuse || | ||
196 | sk2->sk_state == TCP_LISTEN) { | ||
197 | const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); | ||
198 | if (!sk2_rcv_saddr || !sk_rcv_saddr || | ||
199 | sk2_rcv_saddr == sk_rcv_saddr) | ||
200 | break; | ||
201 | } | ||
202 | } | ||
203 | } | ||
204 | return node != NULL; | ||
205 | } | ||
206 | |||
207 | /* Obtain a reference to a local port for the given sock, | ||
208 | * if snum is zero it means select any available local port. | ||
209 | */ | ||
210 | static int tcp_v4_get_port(struct sock *sk, unsigned short snum) | 100 | static int tcp_v4_get_port(struct sock *sk, unsigned short snum) |
211 | { | 101 | { |
212 | struct tcp_bind_hashbucket *head; | 102 | return inet_csk_get_port(&tcp_hashinfo, sk, snum); |
213 | struct hlist_node *node; | ||
214 | struct tcp_bind_bucket *tb; | ||
215 | int ret; | ||
216 | |||
217 | local_bh_disable(); | ||
218 | if (!snum) { | ||
219 | int low = sysctl_local_port_range[0]; | ||
220 | int high = sysctl_local_port_range[1]; | ||
221 | int remaining = (high - low) + 1; | ||
222 | int rover; | ||
223 | |||
224 | spin_lock(&tcp_portalloc_lock); | ||
225 | if (tcp_port_rover < low) | ||
226 | rover = low; | ||
227 | else | ||
228 | rover = tcp_port_rover; | ||
229 | do { | ||
230 | rover++; | ||
231 | if (rover > high) | ||
232 | rover = low; | ||
233 | head = &tcp_bhash[tcp_bhashfn(rover)]; | ||
234 | spin_lock(&head->lock); | ||
235 | tb_for_each(tb, node, &head->chain) | ||
236 | if (tb->port == rover) | ||
237 | goto next; | ||
238 | break; | ||
239 | next: | ||
240 | spin_unlock(&head->lock); | ||
241 | } while (--remaining > 0); | ||
242 | tcp_port_rover = rover; | ||
243 | spin_unlock(&tcp_portalloc_lock); | ||
244 | |||
245 | /* Exhausted local port range during search? It is not | ||
246 | * possible for us to be holding one of the bind hash | ||
247 | * locks if this test triggers, because if 'remaining' | ||
248 | * drops to zero, we broke out of the do/while loop at | ||
249 | * the top level, not from the 'break;' statement. | ||
250 | */ | ||
251 | ret = 1; | ||
252 | if (unlikely(remaining <= 0)) | ||
253 | goto fail; | ||
254 | |||
255 | /* OK, here is the one we will use. HEAD is | ||
256 | * non-NULL and we hold it's mutex. | ||
257 | */ | ||
258 | snum = rover; | ||
259 | } else { | ||
260 | head = &tcp_bhash[tcp_bhashfn(snum)]; | ||
261 | spin_lock(&head->lock); | ||
262 | tb_for_each(tb, node, &head->chain) | ||
263 | if (tb->port == snum) | ||
264 | goto tb_found; | ||
265 | } | ||
266 | tb = NULL; | ||
267 | goto tb_not_found; | ||
268 | tb_found: | ||
269 | if (!hlist_empty(&tb->owners)) { | ||
270 | if (sk->sk_reuse > 1) | ||
271 | goto success; | ||
272 | if (tb->fastreuse > 0 && | ||
273 | sk->sk_reuse && sk->sk_state != TCP_LISTEN) { | ||
274 | goto success; | ||
275 | } else { | ||
276 | ret = 1; | ||
277 | if (tcp_bind_conflict(sk, tb)) | ||
278 | goto fail_unlock; | ||
279 | } | ||
280 | } | ||
281 | tb_not_found: | ||
282 | ret = 1; | ||
283 | if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) | ||
284 | goto fail_unlock; | ||
285 | if (hlist_empty(&tb->owners)) { | ||
286 | if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) | ||
287 | tb->fastreuse = 1; | ||
288 | else | ||
289 | tb->fastreuse = 0; | ||
290 | } else if (tb->fastreuse && | ||
291 | (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) | ||
292 | tb->fastreuse = 0; | ||
293 | success: | ||
294 | if (!tcp_sk(sk)->bind_hash) | ||
295 | tcp_bind_hash(sk, tb, snum); | ||
296 | BUG_TRAP(tcp_sk(sk)->bind_hash == tb); | ||
297 | ret = 0; | ||
298 | |||
299 | fail_unlock: | ||
300 | spin_unlock(&head->lock); | ||
301 | fail: | ||
302 | local_bh_enable(); | ||
303 | return ret; | ||
304 | } | ||
305 | |||
306 | /* Get rid of any references to a local port held by the | ||
307 | * given sock. | ||
308 | */ | ||
309 | static void __tcp_put_port(struct sock *sk) | ||
310 | { | ||
311 | struct inet_sock *inet = inet_sk(sk); | ||
312 | struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)]; | ||
313 | struct tcp_bind_bucket *tb; | ||
314 | |||
315 | spin_lock(&head->lock); | ||
316 | tb = tcp_sk(sk)->bind_hash; | ||
317 | __sk_del_bind_node(sk); | ||
318 | tcp_sk(sk)->bind_hash = NULL; | ||
319 | inet->num = 0; | ||
320 | tcp_bucket_destroy(tb); | ||
321 | spin_unlock(&head->lock); | ||
322 | } | ||
323 | |||
324 | void tcp_put_port(struct sock *sk) | ||
325 | { | ||
326 | local_bh_disable(); | ||
327 | __tcp_put_port(sk); | ||
328 | local_bh_enable(); | ||
329 | } | ||
330 | |||
331 | /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. | ||
332 | * Look, when several writers sleep and reader wakes them up, all but one | ||
333 | * immediately hit write lock and grab all the cpus. Exclusive sleep solves | ||
334 | * this, _but_ remember, it adds useless work on UP machines (wake up each | ||
335 | * exclusive lock release). It should be ifdefed really. | ||
336 | */ | ||
337 | |||
338 | void tcp_listen_wlock(void) | ||
339 | { | ||
340 | write_lock(&tcp_lhash_lock); | ||
341 | |||
342 | if (atomic_read(&tcp_lhash_users)) { | ||
343 | DEFINE_WAIT(wait); | ||
344 | |||
345 | for (;;) { | ||
346 | prepare_to_wait_exclusive(&tcp_lhash_wait, | ||
347 | &wait, TASK_UNINTERRUPTIBLE); | ||
348 | if (!atomic_read(&tcp_lhash_users)) | ||
349 | break; | ||
350 | write_unlock_bh(&tcp_lhash_lock); | ||
351 | schedule(); | ||
352 | write_lock_bh(&tcp_lhash_lock); | ||
353 | } | ||
354 | |||
355 | finish_wait(&tcp_lhash_wait, &wait); | ||
356 | } | ||
357 | } | ||
358 | |||
359 | static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) | ||
360 | { | ||
361 | struct hlist_head *list; | ||
362 | rwlock_t *lock; | ||
363 | |||
364 | BUG_TRAP(sk_unhashed(sk)); | ||
365 | if (listen_possible && sk->sk_state == TCP_LISTEN) { | ||
366 | list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; | ||
367 | lock = &tcp_lhash_lock; | ||
368 | tcp_listen_wlock(); | ||
369 | } else { | ||
370 | list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain; | ||
371 | lock = &tcp_ehash[sk->sk_hashent].lock; | ||
372 | write_lock(lock); | ||
373 | } | ||
374 | __sk_add_node(sk, list); | ||
375 | sock_prot_inc_use(sk->sk_prot); | ||
376 | write_unlock(lock); | ||
377 | if (listen_possible && sk->sk_state == TCP_LISTEN) | ||
378 | wake_up(&tcp_lhash_wait); | ||
379 | } | 103 | } |
380 | 104 | ||
381 | static void tcp_v4_hash(struct sock *sk) | 105 | static void tcp_v4_hash(struct sock *sk) |
382 | { | 106 | { |
383 | if (sk->sk_state != TCP_CLOSE) { | 107 | inet_hash(&tcp_hashinfo, sk); |
384 | local_bh_disable(); | ||
385 | __tcp_v4_hash(sk, 1); | ||
386 | local_bh_enable(); | ||
387 | } | ||
388 | } | 108 | } |
389 | 109 | ||
390 | void tcp_unhash(struct sock *sk) | 110 | void tcp_unhash(struct sock *sk) |
391 | { | 111 | { |
392 | rwlock_t *lock; | 112 | inet_unhash(&tcp_hashinfo, sk); |
393 | |||
394 | if (sk_unhashed(sk)) | ||
395 | goto ende; | ||
396 | |||
397 | if (sk->sk_state == TCP_LISTEN) { | ||
398 | local_bh_disable(); | ||
399 | tcp_listen_wlock(); | ||
400 | lock = &tcp_lhash_lock; | ||
401 | } else { | ||
402 | struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent]; | ||
403 | lock = &head->lock; | ||
404 | write_lock_bh(&head->lock); | ||
405 | } | ||
406 | |||
407 | if (__sk_del_node_init(sk)) | ||
408 | sock_prot_dec_use(sk->sk_prot); | ||
409 | write_unlock_bh(lock); | ||
410 | |||
411 | ende: | ||
412 | if (sk->sk_state == TCP_LISTEN) | ||
413 | wake_up(&tcp_lhash_wait); | ||
414 | } | ||
415 | |||
416 | /* Don't inline this cruft. Here are some nice properties to | ||
417 | * exploit here. The BSD API does not allow a listening TCP | ||
418 | * to specify the remote port nor the remote address for the | ||
419 | * connection. So always assume those are both wildcarded | ||
420 | * during the search since they can never be otherwise. | ||
421 | */ | ||
422 | static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr, | ||
423 | unsigned short hnum, int dif) | ||
424 | { | ||
425 | struct sock *result = NULL, *sk; | ||
426 | struct hlist_node *node; | ||
427 | int score, hiscore; | ||
428 | |||
429 | hiscore=-1; | ||
430 | sk_for_each(sk, node, head) { | ||
431 | struct inet_sock *inet = inet_sk(sk); | ||
432 | |||
433 | if (inet->num == hnum && !ipv6_only_sock(sk)) { | ||
434 | __u32 rcv_saddr = inet->rcv_saddr; | ||
435 | |||
436 | score = (sk->sk_family == PF_INET ? 1 : 0); | ||
437 | if (rcv_saddr) { | ||
438 | if (rcv_saddr != daddr) | ||
439 | continue; | ||
440 | score+=2; | ||
441 | } | ||
442 | if (sk->sk_bound_dev_if) { | ||
443 | if (sk->sk_bound_dev_if != dif) | ||
444 | continue; | ||
445 | score+=2; | ||
446 | } | ||
447 | if (score == 5) | ||
448 | return sk; | ||
449 | if (score > hiscore) { | ||
450 | hiscore = score; | ||
451 | result = sk; | ||
452 | } | ||
453 | } | ||
454 | } | ||
455 | return result; | ||
456 | } | ||
457 | |||
458 | /* Optimize the common listener case. */ | ||
459 | static inline struct sock *tcp_v4_lookup_listener(u32 daddr, | ||
460 | unsigned short hnum, int dif) | ||
461 | { | ||
462 | struct sock *sk = NULL; | ||
463 | struct hlist_head *head; | ||
464 | |||
465 | read_lock(&tcp_lhash_lock); | ||
466 | head = &tcp_listening_hash[tcp_lhashfn(hnum)]; | ||
467 | if (!hlist_empty(head)) { | ||
468 | struct inet_sock *inet = inet_sk((sk = __sk_head(head))); | ||
469 | |||
470 | if (inet->num == hnum && !sk->sk_node.next && | ||
471 | (!inet->rcv_saddr || inet->rcv_saddr == daddr) && | ||
472 | (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && | ||
473 | !sk->sk_bound_dev_if) | ||
474 | goto sherry_cache; | ||
475 | sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif); | ||
476 | } | ||
477 | if (sk) { | ||
478 | sherry_cache: | ||
479 | sock_hold(sk); | ||
480 | } | ||
481 | read_unlock(&tcp_lhash_lock); | ||
482 | return sk; | ||
483 | } | 113 | } |
484 | 114 | ||
485 | /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so | ||
486 | * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM | ||
487 | * | ||
488 | * Local BH must be disabled here. | ||
489 | */ | ||
490 | |||
491 | static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, | ||
492 | u32 daddr, u16 hnum, | ||
493 | int dif) | ||
494 | { | ||
495 | struct tcp_ehash_bucket *head; | ||
496 | TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) | ||
497 | __u32 ports = TCP_COMBINED_PORTS(sport, hnum); | ||
498 | struct sock *sk; | ||
499 | struct hlist_node *node; | ||
500 | /* Optimize here for direct hit, only listening connections can | ||
501 | * have wildcards anyways. | ||
502 | */ | ||
503 | int hash = tcp_hashfn(daddr, hnum, saddr, sport); | ||
504 | head = &tcp_ehash[hash]; | ||
505 | read_lock(&head->lock); | ||
506 | sk_for_each(sk, node, &head->chain) { | ||
507 | if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) | ||
508 | goto hit; /* You sunk my battleship! */ | ||
509 | } | ||
510 | |||
511 | /* Must check for a TIME_WAIT'er before going to listener hash. */ | ||
512 | sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { | ||
513 | if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) | ||
514 | goto hit; | ||
515 | } | ||
516 | sk = NULL; | ||
517 | out: | ||
518 | read_unlock(&head->lock); | ||
519 | return sk; | ||
520 | hit: | ||
521 | sock_hold(sk); | ||
522 | goto out; | ||
523 | } | ||
524 | |||
525 | static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, | ||
526 | u32 daddr, u16 hnum, int dif) | ||
527 | { | ||
528 | struct sock *sk = __tcp_v4_lookup_established(saddr, sport, | ||
529 | daddr, hnum, dif); | ||
530 | |||
531 | return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif); | ||
532 | } | ||
533 | |||
534 | inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, | ||
535 | u16 dport, int dif) | ||
536 | { | ||
537 | struct sock *sk; | ||
538 | |||
539 | local_bh_disable(); | ||
540 | sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif); | ||
541 | local_bh_enable(); | ||
542 | |||
543 | return sk; | ||
544 | } | ||
545 | |||
546 | EXPORT_SYMBOL_GPL(tcp_v4_lookup); | ||
547 | |||
548 | static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) | 115 | static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) |
549 | { | 116 | { |
550 | return secure_tcp_sequence_number(skb->nh.iph->daddr, | 117 | return secure_tcp_sequence_number(skb->nh.iph->daddr, |
@@ -555,27 +122,28 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) | |||
555 | 122 | ||
556 | /* called with local bh disabled */ | 123 | /* called with local bh disabled */ |
557 | static int __tcp_v4_check_established(struct sock *sk, __u16 lport, | 124 | static int __tcp_v4_check_established(struct sock *sk, __u16 lport, |
558 | struct tcp_tw_bucket **twp) | 125 | struct inet_timewait_sock **twp) |
559 | { | 126 | { |
560 | struct inet_sock *inet = inet_sk(sk); | 127 | struct inet_sock *inet = inet_sk(sk); |
561 | u32 daddr = inet->rcv_saddr; | 128 | u32 daddr = inet->rcv_saddr; |
562 | u32 saddr = inet->daddr; | 129 | u32 saddr = inet->daddr; |
563 | int dif = sk->sk_bound_dev_if; | 130 | int dif = sk->sk_bound_dev_if; |
564 | TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) | 131 | INET_ADDR_COOKIE(acookie, saddr, daddr) |
565 | __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); | 132 | const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); |
566 | int hash = tcp_hashfn(daddr, lport, saddr, inet->dport); | 133 | const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size); |
567 | struct tcp_ehash_bucket *head = &tcp_ehash[hash]; | 134 | struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; |
568 | struct sock *sk2; | 135 | struct sock *sk2; |
569 | struct hlist_node *node; | 136 | const struct hlist_node *node; |
570 | struct tcp_tw_bucket *tw; | 137 | struct inet_timewait_sock *tw; |
571 | 138 | ||
572 | write_lock(&head->lock); | 139 | write_lock(&head->lock); |
573 | 140 | ||
574 | /* Check TIME-WAIT sockets first. */ | 141 | /* Check TIME-WAIT sockets first. */ |
575 | sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { | 142 | sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { |
576 | tw = (struct tcp_tw_bucket *)sk2; | 143 | tw = inet_twsk(sk2); |
577 | 144 | ||
578 | if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { | 145 | if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { |
146 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); | ||
579 | struct tcp_sock *tp = tcp_sk(sk); | 147 | struct tcp_sock *tp = tcp_sk(sk); |
580 | 148 | ||
581 | /* With PAWS, it is safe from the viewpoint | 149 | /* With PAWS, it is safe from the viewpoint |
@@ -592,15 +160,15 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, | |||
592 | fall back to VJ's scheme and use initial | 160 | fall back to VJ's scheme and use initial |
593 | timestamp retrieved from peer table. | 161 | timestamp retrieved from peer table. |
594 | */ | 162 | */ |
595 | if (tw->tw_ts_recent_stamp && | 163 | if (tcptw->tw_ts_recent_stamp && |
596 | (!twp || (sysctl_tcp_tw_reuse && | 164 | (!twp || (sysctl_tcp_tw_reuse && |
597 | xtime.tv_sec - | 165 | xtime.tv_sec - |
598 | tw->tw_ts_recent_stamp > 1))) { | 166 | tcptw->tw_ts_recent_stamp > 1))) { |
599 | if ((tp->write_seq = | 167 | tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; |
600 | tw->tw_snd_nxt + 65535 + 2) == 0) | 168 | if (tp->write_seq == 0) |
601 | tp->write_seq = 1; | 169 | tp->write_seq = 1; |
602 | tp->rx_opt.ts_recent = tw->tw_ts_recent; | 170 | tp->rx_opt.ts_recent = tcptw->tw_ts_recent; |
603 | tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; | 171 | tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; |
604 | sock_hold(sk2); | 172 | sock_hold(sk2); |
605 | goto unique; | 173 | goto unique; |
606 | } else | 174 | } else |
@@ -611,7 +179,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, | |||
611 | 179 | ||
612 | /* And established part... */ | 180 | /* And established part... */ |
613 | sk_for_each(sk2, node, &head->chain) { | 181 | sk_for_each(sk2, node, &head->chain) { |
614 | if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) | 182 | if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif)) |
615 | goto not_unique; | 183 | goto not_unique; |
616 | } | 184 | } |
617 | 185 | ||
@@ -631,10 +199,10 @@ unique: | |||
631 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); | 199 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); |
632 | } else if (tw) { | 200 | } else if (tw) { |
633 | /* Silly. Should hash-dance instead... */ | 201 | /* Silly. Should hash-dance instead... */ |
634 | tcp_tw_deschedule(tw); | 202 | inet_twsk_deschedule(tw, &tcp_death_row); |
635 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); | 203 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); |
636 | 204 | ||
637 | tcp_tw_put(tw); | 205 | inet_twsk_put(tw); |
638 | } | 206 | } |
639 | 207 | ||
640 | return 0; | 208 | return 0; |
@@ -657,9 +225,9 @@ static inline u32 connect_port_offset(const struct sock *sk) | |||
657 | */ | 225 | */ |
658 | static inline int tcp_v4_hash_connect(struct sock *sk) | 226 | static inline int tcp_v4_hash_connect(struct sock *sk) |
659 | { | 227 | { |
660 | unsigned short snum = inet_sk(sk)->num; | 228 | const unsigned short snum = inet_sk(sk)->num; |
661 | struct tcp_bind_hashbucket *head; | 229 | struct inet_bind_hashbucket *head; |
662 | struct tcp_bind_bucket *tb; | 230 | struct inet_bind_bucket *tb; |
663 | int ret; | 231 | int ret; |
664 | 232 | ||
665 | if (!snum) { | 233 | if (!snum) { |
@@ -671,19 +239,19 @@ static inline int tcp_v4_hash_connect(struct sock *sk) | |||
671 | static u32 hint; | 239 | static u32 hint; |
672 | u32 offset = hint + connect_port_offset(sk); | 240 | u32 offset = hint + connect_port_offset(sk); |
673 | struct hlist_node *node; | 241 | struct hlist_node *node; |
674 | struct tcp_tw_bucket *tw = NULL; | 242 | struct inet_timewait_sock *tw = NULL; |
675 | 243 | ||
676 | local_bh_disable(); | 244 | local_bh_disable(); |
677 | for (i = 1; i <= range; i++) { | 245 | for (i = 1; i <= range; i++) { |
678 | port = low + (i + offset) % range; | 246 | port = low + (i + offset) % range; |
679 | head = &tcp_bhash[tcp_bhashfn(port)]; | 247 | head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; |
680 | spin_lock(&head->lock); | 248 | spin_lock(&head->lock); |
681 | 249 | ||
682 | /* Does not bother with rcv_saddr checks, | 250 | /* Does not bother with rcv_saddr checks, |
683 | * because the established check is already | 251 | * because the established check is already |
684 | * unique enough. | 252 | * unique enough. |
685 | */ | 253 | */ |
686 | tb_for_each(tb, node, &head->chain) { | 254 | inet_bind_bucket_for_each(tb, node, &head->chain) { |
687 | if (tb->port == port) { | 255 | if (tb->port == port) { |
688 | BUG_TRAP(!hlist_empty(&tb->owners)); | 256 | BUG_TRAP(!hlist_empty(&tb->owners)); |
689 | if (tb->fastreuse >= 0) | 257 | if (tb->fastreuse >= 0) |
@@ -696,7 +264,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk) | |||
696 | } | 264 | } |
697 | } | 265 | } |
698 | 266 | ||
699 | tb = tcp_bucket_create(head, port); | 267 | tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); |
700 | if (!tb) { | 268 | if (!tb) { |
701 | spin_unlock(&head->lock); | 269 | spin_unlock(&head->lock); |
702 | break; | 270 | break; |
@@ -715,27 +283,27 @@ ok: | |||
715 | hint += i; | 283 | hint += i; |
716 | 284 | ||
717 | /* Head lock still held and bh's disabled */ | 285 | /* Head lock still held and bh's disabled */ |
718 | tcp_bind_hash(sk, tb, port); | 286 | inet_bind_hash(sk, tb, port); |
719 | if (sk_unhashed(sk)) { | 287 | if (sk_unhashed(sk)) { |
720 | inet_sk(sk)->sport = htons(port); | 288 | inet_sk(sk)->sport = htons(port); |
721 | __tcp_v4_hash(sk, 0); | 289 | __inet_hash(&tcp_hashinfo, sk, 0); |
722 | } | 290 | } |
723 | spin_unlock(&head->lock); | 291 | spin_unlock(&head->lock); |
724 | 292 | ||
725 | if (tw) { | 293 | if (tw) { |
726 | tcp_tw_deschedule(tw); | 294 | inet_twsk_deschedule(tw, &tcp_death_row);; |
727 | tcp_tw_put(tw); | 295 | inet_twsk_put(tw); |
728 | } | 296 | } |
729 | 297 | ||
730 | ret = 0; | 298 | ret = 0; |
731 | goto out; | 299 | goto out; |
732 | } | 300 | } |
733 | 301 | ||
734 | head = &tcp_bhash[tcp_bhashfn(snum)]; | 302 | head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; |
735 | tb = tcp_sk(sk)->bind_hash; | 303 | tb = inet_csk(sk)->icsk_bind_hash; |
736 | spin_lock_bh(&head->lock); | 304 | spin_lock_bh(&head->lock); |
737 | if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { | 305 | if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { |
738 | __tcp_v4_hash(sk, 0); | 306 | __inet_hash(&tcp_hashinfo, sk, 0); |
739 | spin_unlock_bh(&head->lock); | 307 | spin_unlock_bh(&head->lock); |
740 | return 0; | 308 | return 0; |
741 | } else { | 309 | } else { |
@@ -798,7 +366,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
798 | tp->write_seq = 0; | 366 | tp->write_seq = 0; |
799 | } | 367 | } |
800 | 368 | ||
801 | if (sysctl_tcp_tw_recycle && | 369 | if (tcp_death_row.sysctl_tw_recycle && |
802 | !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { | 370 | !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { |
803 | struct inet_peer *peer = rt_get_peer(rt); | 371 | struct inet_peer *peer = rt_get_peer(rt); |
804 | 372 | ||
@@ -837,8 +405,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
837 | goto failure; | 405 | goto failure; |
838 | 406 | ||
839 | /* OK, now commit destination to socket. */ | 407 | /* OK, now commit destination to socket. */ |
840 | __sk_dst_set(sk, &rt->u.dst); | 408 | sk_setup_caps(sk, &rt->u.dst); |
841 | tcp_v4_setup_caps(sk, &rt->u.dst); | ||
842 | 409 | ||
843 | if (!tp->write_seq) | 410 | if (!tp->write_seq) |
844 | tp->write_seq = secure_tcp_sequence_number(inet->saddr, | 411 | tp->write_seq = secure_tcp_sequence_number(inet->saddr, |
@@ -864,53 +431,6 @@ failure: | |||
864 | return err; | 431 | return err; |
865 | } | 432 | } |
866 | 433 | ||
867 | static __inline__ int tcp_v4_iif(struct sk_buff *skb) | ||
868 | { | ||
869 | return ((struct rtable *)skb->dst)->rt_iif; | ||
870 | } | ||
871 | |||
872 | static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd) | ||
873 | { | ||
874 | return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); | ||
875 | } | ||
876 | |||
877 | static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp, | ||
878 | struct request_sock ***prevp, | ||
879 | __u16 rport, | ||
880 | __u32 raddr, __u32 laddr) | ||
881 | { | ||
882 | struct listen_sock *lopt = tp->accept_queue.listen_opt; | ||
883 | struct request_sock *req, **prev; | ||
884 | |||
885 | for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; | ||
886 | (req = *prev) != NULL; | ||
887 | prev = &req->dl_next) { | ||
888 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
889 | |||
890 | if (ireq->rmt_port == rport && | ||
891 | ireq->rmt_addr == raddr && | ||
892 | ireq->loc_addr == laddr && | ||
893 | TCP_INET_FAMILY(req->rsk_ops->family)) { | ||
894 | BUG_TRAP(!req->sk); | ||
895 | *prevp = prev; | ||
896 | break; | ||
897 | } | ||
898 | } | ||
899 | |||
900 | return req; | ||
901 | } | ||
902 | |||
903 | static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req) | ||
904 | { | ||
905 | struct tcp_sock *tp = tcp_sk(sk); | ||
906 | struct listen_sock *lopt = tp->accept_queue.listen_opt; | ||
907 | u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); | ||
908 | |||
909 | reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT); | ||
910 | tcp_synq_added(sk); | ||
911 | } | ||
912 | |||
913 | |||
914 | /* | 434 | /* |
915 | * This routine does path mtu discovery as defined in RFC1191. | 435 | * This routine does path mtu discovery as defined in RFC1191. |
916 | */ | 436 | */ |
@@ -993,14 +513,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) | |||
993 | return; | 513 | return; |
994 | } | 514 | } |
995 | 515 | ||
996 | sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, | 516 | sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr, |
997 | th->source, tcp_v4_iif(skb)); | 517 | th->source, inet_iif(skb)); |
998 | if (!sk) { | 518 | if (!sk) { |
999 | ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); | 519 | ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); |
1000 | return; | 520 | return; |
1001 | } | 521 | } |
1002 | if (sk->sk_state == TCP_TIME_WAIT) { | 522 | if (sk->sk_state == TCP_TIME_WAIT) { |
1003 | tcp_tw_put((struct tcp_tw_bucket *)sk); | 523 | inet_twsk_put((struct inet_timewait_sock *)sk); |
1004 | return; | 524 | return; |
1005 | } | 525 | } |
1006 | 526 | ||
@@ -1054,8 +574,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) | |||
1054 | if (sock_owned_by_user(sk)) | 574 | if (sock_owned_by_user(sk)) |
1055 | goto out; | 575 | goto out; |
1056 | 576 | ||
1057 | req = tcp_v4_search_req(tp, &prev, th->dest, | 577 | req = inet_csk_search_req(sk, &prev, th->dest, |
1058 | iph->daddr, iph->saddr); | 578 | iph->daddr, iph->saddr); |
1059 | if (!req) | 579 | if (!req) |
1060 | goto out; | 580 | goto out; |
1061 | 581 | ||
@@ -1075,7 +595,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) | |||
1075 | * created socket, and POSIX does not want network | 595 | * created socket, and POSIX does not want network |
1076 | * errors returned from accept(). | 596 | * errors returned from accept(). |
1077 | */ | 597 | */ |
1078 | tcp_synq_drop(sk, req, prev); | 598 | inet_csk_reqsk_queue_drop(sk, req, prev); |
1079 | goto out; | 599 | goto out; |
1080 | 600 | ||
1081 | case TCP_SYN_SENT: | 601 | case TCP_SYN_SENT: |
@@ -1245,12 +765,13 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, | |||
1245 | 765 | ||
1246 | static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) | 766 | static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) |
1247 | { | 767 | { |
1248 | struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; | 768 | struct inet_timewait_sock *tw = inet_twsk(sk); |
769 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); | ||
1249 | 770 | ||
1250 | tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, | 771 | tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, |
1251 | tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); | 772 | tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent); |
1252 | 773 | ||
1253 | tcp_tw_put(tw); | 774 | inet_twsk_put(tw); |
1254 | } | 775 | } |
1255 | 776 | ||
1256 | static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) | 777 | static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) |
@@ -1259,36 +780,6 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) | |||
1259 | req->ts_recent); | 780 | req->ts_recent); |
1260 | } | 781 | } |
1261 | 782 | ||
1262 | static struct dst_entry* tcp_v4_route_req(struct sock *sk, | ||
1263 | struct request_sock *req) | ||
1264 | { | ||
1265 | struct rtable *rt; | ||
1266 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
1267 | struct ip_options *opt = inet_rsk(req)->opt; | ||
1268 | struct flowi fl = { .oif = sk->sk_bound_dev_if, | ||
1269 | .nl_u = { .ip4_u = | ||
1270 | { .daddr = ((opt && opt->srr) ? | ||
1271 | opt->faddr : | ||
1272 | ireq->rmt_addr), | ||
1273 | .saddr = ireq->loc_addr, | ||
1274 | .tos = RT_CONN_FLAGS(sk) } }, | ||
1275 | .proto = IPPROTO_TCP, | ||
1276 | .uli_u = { .ports = | ||
1277 | { .sport = inet_sk(sk)->sport, | ||
1278 | .dport = ireq->rmt_port } } }; | ||
1279 | |||
1280 | if (ip_route_output_flow(&rt, &fl, sk, 0)) { | ||
1281 | IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); | ||
1282 | return NULL; | ||
1283 | } | ||
1284 | if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { | ||
1285 | ip_rt_put(rt); | ||
1286 | IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); | ||
1287 | return NULL; | ||
1288 | } | ||
1289 | return &rt->u.dst; | ||
1290 | } | ||
1291 | |||
1292 | /* | 783 | /* |
1293 | * Send a SYN-ACK after having received an ACK. | 784 | * Send a SYN-ACK after having received an ACK. |
1294 | * This still operates on a request_sock only, not on a big | 785 | * This still operates on a request_sock only, not on a big |
@@ -1302,7 +793,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, | |||
1302 | struct sk_buff * skb; | 793 | struct sk_buff * skb; |
1303 | 794 | ||
1304 | /* First, grab a route. */ | 795 | /* First, grab a route. */ |
1305 | if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) | 796 | if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) |
1306 | goto out; | 797 | goto out; |
1307 | 798 | ||
1308 | skb = tcp_make_synack(sk, dst, req); | 799 | skb = tcp_make_synack(sk, dst, req); |
@@ -1404,7 +895,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1404 | * limitations, they conserve resources and peer is | 895 | * limitations, they conserve resources and peer is |
1405 | * evidently real one. | 896 | * evidently real one. |
1406 | */ | 897 | */ |
1407 | if (tcp_synq_is_full(sk) && !isn) { | 898 | if (inet_csk_reqsk_queue_is_full(sk) && !isn) { |
1408 | #ifdef CONFIG_SYN_COOKIES | 899 | #ifdef CONFIG_SYN_COOKIES |
1409 | if (sysctl_tcp_syncookies) { | 900 | if (sysctl_tcp_syncookies) { |
1410 | want_cookie = 1; | 901 | want_cookie = 1; |
@@ -1418,7 +909,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1418 | * clogging syn queue with openreqs with exponentially increasing | 909 | * clogging syn queue with openreqs with exponentially increasing |
1419 | * timeout. | 910 | * timeout. |
1420 | */ | 911 | */ |
1421 | if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) | 912 | if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) |
1422 | goto drop; | 913 | goto drop; |
1423 | 914 | ||
1424 | req = reqsk_alloc(&tcp_request_sock_ops); | 915 | req = reqsk_alloc(&tcp_request_sock_ops); |
@@ -1474,8 +965,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1474 | * are made in the function processing timewait state. | 965 | * are made in the function processing timewait state. |
1475 | */ | 966 | */ |
1476 | if (tmp_opt.saw_tstamp && | 967 | if (tmp_opt.saw_tstamp && |
1477 | sysctl_tcp_tw_recycle && | 968 | tcp_death_row.sysctl_tw_recycle && |
1478 | (dst = tcp_v4_route_req(sk, req)) != NULL && | 969 | (dst = inet_csk_route_req(sk, req)) != NULL && |
1479 | (peer = rt_get_peer((struct rtable *)dst)) != NULL && | 970 | (peer = rt_get_peer((struct rtable *)dst)) != NULL && |
1480 | peer->v4daddr == saddr) { | 971 | peer->v4daddr == saddr) { |
1481 | if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && | 972 | if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && |
@@ -1488,7 +979,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1488 | } | 979 | } |
1489 | /* Kill the following clause, if you dislike this way. */ | 980 | /* Kill the following clause, if you dislike this way. */ |
1490 | else if (!sysctl_tcp_syncookies && | 981 | else if (!sysctl_tcp_syncookies && |
1491 | (sysctl_max_syn_backlog - tcp_synq_len(sk) < | 982 | (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
1492 | (sysctl_max_syn_backlog >> 2)) && | 983 | (sysctl_max_syn_backlog >> 2)) && |
1493 | (!peer || !peer->tcp_ts_stamp) && | 984 | (!peer || !peer->tcp_ts_stamp) && |
1494 | (!dst || !dst_metric(dst, RTAX_RTT))) { | 985 | (!dst || !dst_metric(dst, RTAX_RTT))) { |
@@ -1499,11 +990,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1499 | * to destinations, already remembered | 990 | * to destinations, already remembered |
1500 | * to the moment of synflood. | 991 | * to the moment of synflood. |
1501 | */ | 992 | */ |
1502 | LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open " | 993 | LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open " |
1503 | "request from %u.%u." | 994 | "request from %u.%u.%u.%u/%u\n", |
1504 | "%u.%u/%u\n", | 995 | NIPQUAD(saddr), |
1505 | NIPQUAD(saddr), | 996 | ntohs(skb->h.th->source)); |
1506 | ntohs(skb->h.th->source))); | ||
1507 | dst_release(dst); | 997 | dst_release(dst); |
1508 | goto drop_and_free; | 998 | goto drop_and_free; |
1509 | } | 999 | } |
@@ -1518,7 +1008,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1518 | if (want_cookie) { | 1008 | if (want_cookie) { |
1519 | reqsk_free(req); | 1009 | reqsk_free(req); |
1520 | } else { | 1010 | } else { |
1521 | tcp_v4_synq_add(sk, req); | 1011 | inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); |
1522 | } | 1012 | } |
1523 | return 0; | 1013 | return 0; |
1524 | 1014 | ||
@@ -1546,15 +1036,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1546 | if (sk_acceptq_is_full(sk)) | 1036 | if (sk_acceptq_is_full(sk)) |
1547 | goto exit_overflow; | 1037 | goto exit_overflow; |
1548 | 1038 | ||
1549 | if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) | 1039 | if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) |
1550 | goto exit; | 1040 | goto exit; |
1551 | 1041 | ||
1552 | newsk = tcp_create_openreq_child(sk, req, skb); | 1042 | newsk = tcp_create_openreq_child(sk, req, skb); |
1553 | if (!newsk) | 1043 | if (!newsk) |
1554 | goto exit; | 1044 | goto exit; |
1555 | 1045 | ||
1556 | newsk->sk_dst_cache = dst; | 1046 | sk_setup_caps(newsk, dst); |
1557 | tcp_v4_setup_caps(newsk, dst); | ||
1558 | 1047 | ||
1559 | newtp = tcp_sk(newsk); | 1048 | newtp = tcp_sk(newsk); |
1560 | newinet = inet_sk(newsk); | 1049 | newinet = inet_sk(newsk); |
@@ -1564,7 +1053,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1564 | newinet->saddr = ireq->loc_addr; | 1053 | newinet->saddr = ireq->loc_addr; |
1565 | newinet->opt = ireq->opt; | 1054 | newinet->opt = ireq->opt; |
1566 | ireq->opt = NULL; | 1055 | ireq->opt = NULL; |
1567 | newinet->mc_index = tcp_v4_iif(skb); | 1056 | newinet->mc_index = inet_iif(skb); |
1568 | newinet->mc_ttl = skb->nh.iph->ttl; | 1057 | newinet->mc_ttl = skb->nh.iph->ttl; |
1569 | newtp->ext_header_len = 0; | 1058 | newtp->ext_header_len = 0; |
1570 | if (newinet->opt) | 1059 | if (newinet->opt) |
@@ -1575,8 +1064,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1575 | newtp->advmss = dst_metric(dst, RTAX_ADVMSS); | 1064 | newtp->advmss = dst_metric(dst, RTAX_ADVMSS); |
1576 | tcp_initialize_rcv_mss(newsk); | 1065 | tcp_initialize_rcv_mss(newsk); |
1577 | 1066 | ||
1578 | __tcp_v4_hash(newsk, 0); | 1067 | __inet_hash(&tcp_hashinfo, newsk, 0); |
1579 | __tcp_inherit_port(sk, newsk); | 1068 | __inet_inherit_port(&tcp_hashinfo, sk, newsk); |
1580 | 1069 | ||
1581 | return newsk; | 1070 | return newsk; |
1582 | 1071 | ||
@@ -1592,27 +1081,24 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) | |||
1592 | { | 1081 | { |
1593 | struct tcphdr *th = skb->h.th; | 1082 | struct tcphdr *th = skb->h.th; |
1594 | struct iphdr *iph = skb->nh.iph; | 1083 | struct iphdr *iph = skb->nh.iph; |
1595 | struct tcp_sock *tp = tcp_sk(sk); | ||
1596 | struct sock *nsk; | 1084 | struct sock *nsk; |
1597 | struct request_sock **prev; | 1085 | struct request_sock **prev; |
1598 | /* Find possible connection requests. */ | 1086 | /* Find possible connection requests. */ |
1599 | struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source, | 1087 | struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, |
1600 | iph->saddr, iph->daddr); | 1088 | iph->saddr, iph->daddr); |
1601 | if (req) | 1089 | if (req) |
1602 | return tcp_check_req(sk, skb, req, prev); | 1090 | return tcp_check_req(sk, skb, req, prev); |
1603 | 1091 | ||
1604 | nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, | 1092 | nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr, |
1605 | th->source, | 1093 | th->source, skb->nh.iph->daddr, |
1606 | skb->nh.iph->daddr, | 1094 | ntohs(th->dest), inet_iif(skb)); |
1607 | ntohs(th->dest), | ||
1608 | tcp_v4_iif(skb)); | ||
1609 | 1095 | ||
1610 | if (nsk) { | 1096 | if (nsk) { |
1611 | if (nsk->sk_state != TCP_TIME_WAIT) { | 1097 | if (nsk->sk_state != TCP_TIME_WAIT) { |
1612 | bh_lock_sock(nsk); | 1098 | bh_lock_sock(nsk); |
1613 | return nsk; | 1099 | return nsk; |
1614 | } | 1100 | } |
1615 | tcp_tw_put((struct tcp_tw_bucket *)nsk); | 1101 | inet_twsk_put((struct inet_timewait_sock *)nsk); |
1616 | return NULL; | 1102 | return NULL; |
1617 | } | 1103 | } |
1618 | 1104 | ||
@@ -1631,7 +1117,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb) | |||
1631 | skb->nh.iph->daddr, skb->csum)) | 1117 | skb->nh.iph->daddr, skb->csum)) |
1632 | return 0; | 1118 | return 0; |
1633 | 1119 | ||
1634 | LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n")); | 1120 | LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); |
1635 | skb->ip_summed = CHECKSUM_NONE; | 1121 | skb->ip_summed = CHECKSUM_NONE; |
1636 | } | 1122 | } |
1637 | if (skb->len <= 76) { | 1123 | if (skb->len <= 76) { |
@@ -1747,9 +1233,9 @@ int tcp_v4_rcv(struct sk_buff *skb) | |||
1747 | TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; | 1233 | TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; |
1748 | TCP_SKB_CB(skb)->sacked = 0; | 1234 | TCP_SKB_CB(skb)->sacked = 0; |
1749 | 1235 | ||
1750 | sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, | 1236 | sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, |
1751 | skb->nh.iph->daddr, ntohs(th->dest), | 1237 | skb->nh.iph->daddr, ntohs(th->dest), |
1752 | tcp_v4_iif(skb)); | 1238 | inet_iif(skb)); |
1753 | 1239 | ||
1754 | if (!sk) | 1240 | if (!sk) |
1755 | goto no_tcp_socket; | 1241 | goto no_tcp_socket; |
@@ -1801,24 +1287,26 @@ discard_and_relse: | |||
1801 | 1287 | ||
1802 | do_time_wait: | 1288 | do_time_wait: |
1803 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { | 1289 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { |
1804 | tcp_tw_put((struct tcp_tw_bucket *) sk); | 1290 | inet_twsk_put((struct inet_timewait_sock *) sk); |
1805 | goto discard_it; | 1291 | goto discard_it; |
1806 | } | 1292 | } |
1807 | 1293 | ||
1808 | if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { | 1294 | if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { |
1809 | TCP_INC_STATS_BH(TCP_MIB_INERRS); | 1295 | TCP_INC_STATS_BH(TCP_MIB_INERRS); |
1810 | tcp_tw_put((struct tcp_tw_bucket *) sk); | 1296 | inet_twsk_put((struct inet_timewait_sock *) sk); |
1811 | goto discard_it; | 1297 | goto discard_it; |
1812 | } | 1298 | } |
1813 | switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, | 1299 | switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk, |
1814 | skb, th, skb->len)) { | 1300 | skb, th)) { |
1815 | case TCP_TW_SYN: { | 1301 | case TCP_TW_SYN: { |
1816 | struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, | 1302 | struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, |
1817 | ntohs(th->dest), | 1303 | skb->nh.iph->daddr, |
1818 | tcp_v4_iif(skb)); | 1304 | ntohs(th->dest), |
1305 | inet_iif(skb)); | ||
1819 | if (sk2) { | 1306 | if (sk2) { |
1820 | tcp_tw_deschedule((struct tcp_tw_bucket *)sk); | 1307 | inet_twsk_deschedule((struct inet_timewait_sock *)sk, |
1821 | tcp_tw_put((struct tcp_tw_bucket *)sk); | 1308 | &tcp_death_row); |
1309 | inet_twsk_put((struct inet_timewait_sock *)sk); | ||
1822 | sk = sk2; | 1310 | sk = sk2; |
1823 | goto process; | 1311 | goto process; |
1824 | } | 1312 | } |
@@ -1834,112 +1322,6 @@ do_time_wait: | |||
1834 | goto discard_it; | 1322 | goto discard_it; |
1835 | } | 1323 | } |
1836 | 1324 | ||
1837 | /* With per-bucket locks this operation is not-atomic, so that | ||
1838 | * this version is not worse. | ||
1839 | */ | ||
1840 | static void __tcp_v4_rehash(struct sock *sk) | ||
1841 | { | ||
1842 | sk->sk_prot->unhash(sk); | ||
1843 | sk->sk_prot->hash(sk); | ||
1844 | } | ||
1845 | |||
1846 | static int tcp_v4_reselect_saddr(struct sock *sk) | ||
1847 | { | ||
1848 | struct inet_sock *inet = inet_sk(sk); | ||
1849 | int err; | ||
1850 | struct rtable *rt; | ||
1851 | __u32 old_saddr = inet->saddr; | ||
1852 | __u32 new_saddr; | ||
1853 | __u32 daddr = inet->daddr; | ||
1854 | |||
1855 | if (inet->opt && inet->opt->srr) | ||
1856 | daddr = inet->opt->faddr; | ||
1857 | |||
1858 | /* Query new route. */ | ||
1859 | err = ip_route_connect(&rt, daddr, 0, | ||
1860 | RT_CONN_FLAGS(sk), | ||
1861 | sk->sk_bound_dev_if, | ||
1862 | IPPROTO_TCP, | ||
1863 | inet->sport, inet->dport, sk); | ||
1864 | if (err) | ||
1865 | return err; | ||
1866 | |||
1867 | __sk_dst_set(sk, &rt->u.dst); | ||
1868 | tcp_v4_setup_caps(sk, &rt->u.dst); | ||
1869 | |||
1870 | new_saddr = rt->rt_src; | ||
1871 | |||
1872 | if (new_saddr == old_saddr) | ||
1873 | return 0; | ||
1874 | |||
1875 | if (sysctl_ip_dynaddr > 1) { | ||
1876 | printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->" | ||
1877 | "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", | ||
1878 | NIPQUAD(old_saddr), | ||
1879 | NIPQUAD(new_saddr)); | ||
1880 | } | ||
1881 | |||
1882 | inet->saddr = new_saddr; | ||
1883 | inet->rcv_saddr = new_saddr; | ||
1884 | |||
1885 | /* XXX The only one ugly spot where we need to | ||
1886 | * XXX really change the sockets identity after | ||
1887 | * XXX it has entered the hashes. -DaveM | ||
1888 | * | ||
1889 | * Besides that, it does not check for connection | ||
1890 | * uniqueness. Wait for troubles. | ||
1891 | */ | ||
1892 | __tcp_v4_rehash(sk); | ||
1893 | return 0; | ||
1894 | } | ||
1895 | |||
1896 | int tcp_v4_rebuild_header(struct sock *sk) | ||
1897 | { | ||
1898 | struct inet_sock *inet = inet_sk(sk); | ||
1899 | struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); | ||
1900 | u32 daddr; | ||
1901 | int err; | ||
1902 | |||
1903 | /* Route is OK, nothing to do. */ | ||
1904 | if (rt) | ||
1905 | return 0; | ||
1906 | |||
1907 | /* Reroute. */ | ||
1908 | daddr = inet->daddr; | ||
1909 | if (inet->opt && inet->opt->srr) | ||
1910 | daddr = inet->opt->faddr; | ||
1911 | |||
1912 | { | ||
1913 | struct flowi fl = { .oif = sk->sk_bound_dev_if, | ||
1914 | .nl_u = { .ip4_u = | ||
1915 | { .daddr = daddr, | ||
1916 | .saddr = inet->saddr, | ||
1917 | .tos = RT_CONN_FLAGS(sk) } }, | ||
1918 | .proto = IPPROTO_TCP, | ||
1919 | .uli_u = { .ports = | ||
1920 | { .sport = inet->sport, | ||
1921 | .dport = inet->dport } } }; | ||
1922 | |||
1923 | err = ip_route_output_flow(&rt, &fl, sk, 0); | ||
1924 | } | ||
1925 | if (!err) { | ||
1926 | __sk_dst_set(sk, &rt->u.dst); | ||
1927 | tcp_v4_setup_caps(sk, &rt->u.dst); | ||
1928 | return 0; | ||
1929 | } | ||
1930 | |||
1931 | /* Routing failed... */ | ||
1932 | sk->sk_route_caps = 0; | ||
1933 | |||
1934 | if (!sysctl_ip_dynaddr || | ||
1935 | sk->sk_state != TCP_SYN_SENT || | ||
1936 | (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || | ||
1937 | (err = tcp_v4_reselect_saddr(sk)) != 0) | ||
1938 | sk->sk_err_soft = -err; | ||
1939 | |||
1940 | return err; | ||
1941 | } | ||
1942 | |||
1943 | static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) | 1325 | static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) |
1944 | { | 1326 | { |
1945 | struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; | 1327 | struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; |
@@ -1988,18 +1370,18 @@ int tcp_v4_remember_stamp(struct sock *sk) | |||
1988 | return 0; | 1370 | return 0; |
1989 | } | 1371 | } |
1990 | 1372 | ||
1991 | int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) | 1373 | int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) |
1992 | { | 1374 | { |
1993 | struct inet_peer *peer = NULL; | 1375 | struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); |
1994 | |||
1995 | peer = inet_getpeer(tw->tw_daddr, 1); | ||
1996 | 1376 | ||
1997 | if (peer) { | 1377 | if (peer) { |
1998 | if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 || | 1378 | const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
1379 | |||
1380 | if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || | ||
1999 | (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && | 1381 | (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && |
2000 | peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) { | 1382 | peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { |
2001 | peer->tcp_ts_stamp = tw->tw_ts_recent_stamp; | 1383 | peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; |
2002 | peer->tcp_ts = tw->tw_ts_recent; | 1384 | peer->tcp_ts = tcptw->tw_ts_recent; |
2003 | } | 1385 | } |
2004 | inet_putpeer(peer); | 1386 | inet_putpeer(peer); |
2005 | return 1; | 1387 | return 1; |
@@ -2011,7 +1393,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) | |||
2011 | struct tcp_func ipv4_specific = { | 1393 | struct tcp_func ipv4_specific = { |
2012 | .queue_xmit = ip_queue_xmit, | 1394 | .queue_xmit = ip_queue_xmit, |
2013 | .send_check = tcp_v4_send_check, | 1395 | .send_check = tcp_v4_send_check, |
2014 | .rebuild_header = tcp_v4_rebuild_header, | 1396 | .rebuild_header = inet_sk_rebuild_header, |
2015 | .conn_request = tcp_v4_conn_request, | 1397 | .conn_request = tcp_v4_conn_request, |
2016 | .syn_recv_sock = tcp_v4_syn_recv_sock, | 1398 | .syn_recv_sock = tcp_v4_syn_recv_sock, |
2017 | .remember_stamp = tcp_v4_remember_stamp, | 1399 | .remember_stamp = tcp_v4_remember_stamp, |
@@ -2027,13 +1409,14 @@ struct tcp_func ipv4_specific = { | |||
2027 | */ | 1409 | */ |
2028 | static int tcp_v4_init_sock(struct sock *sk) | 1410 | static int tcp_v4_init_sock(struct sock *sk) |
2029 | { | 1411 | { |
1412 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
2030 | struct tcp_sock *tp = tcp_sk(sk); | 1413 | struct tcp_sock *tp = tcp_sk(sk); |
2031 | 1414 | ||
2032 | skb_queue_head_init(&tp->out_of_order_queue); | 1415 | skb_queue_head_init(&tp->out_of_order_queue); |
2033 | tcp_init_xmit_timers(sk); | 1416 | tcp_init_xmit_timers(sk); |
2034 | tcp_prequeue_init(tp); | 1417 | tcp_prequeue_init(tp); |
2035 | 1418 | ||
2036 | tp->rto = TCP_TIMEOUT_INIT; | 1419 | icsk->icsk_rto = TCP_TIMEOUT_INIT; |
2037 | tp->mdev = TCP_TIMEOUT_INIT; | 1420 | tp->mdev = TCP_TIMEOUT_INIT; |
2038 | 1421 | ||
2039 | /* So many TCP implementations out there (incorrectly) count the | 1422 | /* So many TCP implementations out there (incorrectly) count the |
@@ -2051,7 +1434,7 @@ static int tcp_v4_init_sock(struct sock *sk) | |||
2051 | tp->mss_cache = 536; | 1434 | tp->mss_cache = 536; |
2052 | 1435 | ||
2053 | tp->reordering = sysctl_tcp_reordering; | 1436 | tp->reordering = sysctl_tcp_reordering; |
2054 | tp->ca_ops = &tcp_init_congestion_ops; | 1437 | icsk->icsk_ca_ops = &tcp_init_congestion_ops; |
2055 | 1438 | ||
2056 | sk->sk_state = TCP_CLOSE; | 1439 | sk->sk_state = TCP_CLOSE; |
2057 | 1440 | ||
@@ -2074,7 +1457,7 @@ int tcp_v4_destroy_sock(struct sock *sk) | |||
2074 | 1457 | ||
2075 | tcp_clear_xmit_timers(sk); | 1458 | tcp_clear_xmit_timers(sk); |
2076 | 1459 | ||
2077 | tcp_cleanup_congestion_control(tp); | 1460 | tcp_cleanup_congestion_control(sk); |
2078 | 1461 | ||
2079 | /* Cleanup up the write buffer. */ | 1462 | /* Cleanup up the write buffer. */ |
2080 | sk_stream_writequeue_purge(sk); | 1463 | sk_stream_writequeue_purge(sk); |
@@ -2086,8 +1469,8 @@ int tcp_v4_destroy_sock(struct sock *sk) | |||
2086 | __skb_queue_purge(&tp->ucopy.prequeue); | 1469 | __skb_queue_purge(&tp->ucopy.prequeue); |
2087 | 1470 | ||
2088 | /* Clean up a referenced TCP bind bucket. */ | 1471 | /* Clean up a referenced TCP bind bucket. */ |
2089 | if (tp->bind_hash) | 1472 | if (inet_csk(sk)->icsk_bind_hash) |
2090 | tcp_put_port(sk); | 1473 | inet_put_port(&tcp_hashinfo, sk); |
2091 | 1474 | ||
2092 | /* | 1475 | /* |
2093 | * If sendmsg cached page exists, toss it. | 1476 | * If sendmsg cached page exists, toss it. |
@@ -2107,13 +1490,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); | |||
2107 | #ifdef CONFIG_PROC_FS | 1490 | #ifdef CONFIG_PROC_FS |
2108 | /* Proc filesystem TCP sock list dumping. */ | 1491 | /* Proc filesystem TCP sock list dumping. */ |
2109 | 1492 | ||
2110 | static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head) | 1493 | static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) |
2111 | { | 1494 | { |
2112 | return hlist_empty(head) ? NULL : | 1495 | return hlist_empty(head) ? NULL : |
2113 | list_entry(head->first, struct tcp_tw_bucket, tw_node); | 1496 | list_entry(head->first, struct inet_timewait_sock, tw_node); |
2114 | } | 1497 | } |
2115 | 1498 | ||
2116 | static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) | 1499 | static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) |
2117 | { | 1500 | { |
2118 | return tw->tw_node.next ? | 1501 | return tw->tw_node.next ? |
2119 | hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; | 1502 | hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; |
@@ -2121,14 +1504,14 @@ static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) | |||
2121 | 1504 | ||
2122 | static void *listening_get_next(struct seq_file *seq, void *cur) | 1505 | static void *listening_get_next(struct seq_file *seq, void *cur) |
2123 | { | 1506 | { |
2124 | struct tcp_sock *tp; | 1507 | struct inet_connection_sock *icsk; |
2125 | struct hlist_node *node; | 1508 | struct hlist_node *node; |
2126 | struct sock *sk = cur; | 1509 | struct sock *sk = cur; |
2127 | struct tcp_iter_state* st = seq->private; | 1510 | struct tcp_iter_state* st = seq->private; |
2128 | 1511 | ||
2129 | if (!sk) { | 1512 | if (!sk) { |
2130 | st->bucket = 0; | 1513 | st->bucket = 0; |
2131 | sk = sk_head(&tcp_listening_hash[0]); | 1514 | sk = sk_head(&tcp_hashinfo.listening_hash[0]); |
2132 | goto get_sk; | 1515 | goto get_sk; |
2133 | } | 1516 | } |
2134 | 1517 | ||
@@ -2137,7 +1520,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) | |||
2137 | if (st->state == TCP_SEQ_STATE_OPENREQ) { | 1520 | if (st->state == TCP_SEQ_STATE_OPENREQ) { |
2138 | struct request_sock *req = cur; | 1521 | struct request_sock *req = cur; |
2139 | 1522 | ||
2140 | tp = tcp_sk(st->syn_wait_sk); | 1523 | icsk = inet_csk(st->syn_wait_sk); |
2141 | req = req->dl_next; | 1524 | req = req->dl_next; |
2142 | while (1) { | 1525 | while (1) { |
2143 | while (req) { | 1526 | while (req) { |
@@ -2150,17 +1533,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur) | |||
2150 | if (++st->sbucket >= TCP_SYNQ_HSIZE) | 1533 | if (++st->sbucket >= TCP_SYNQ_HSIZE) |
2151 | break; | 1534 | break; |
2152 | get_req: | 1535 | get_req: |
2153 | req = tp->accept_queue.listen_opt->syn_table[st->sbucket]; | 1536 | req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; |
2154 | } | 1537 | } |
2155 | sk = sk_next(st->syn_wait_sk); | 1538 | sk = sk_next(st->syn_wait_sk); |
2156 | st->state = TCP_SEQ_STATE_LISTENING; | 1539 | st->state = TCP_SEQ_STATE_LISTENING; |
2157 | read_unlock_bh(&tp->accept_queue.syn_wait_lock); | 1540 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
2158 | } else { | 1541 | } else { |
2159 | tp = tcp_sk(sk); | 1542 | icsk = inet_csk(sk); |
2160 | read_lock_bh(&tp->accept_queue.syn_wait_lock); | 1543 | read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
2161 | if (reqsk_queue_len(&tp->accept_queue)) | 1544 | if (reqsk_queue_len(&icsk->icsk_accept_queue)) |
2162 | goto start_req; | 1545 | goto start_req; |
2163 | read_unlock_bh(&tp->accept_queue.syn_wait_lock); | 1546 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
2164 | sk = sk_next(sk); | 1547 | sk = sk_next(sk); |
2165 | } | 1548 | } |
2166 | get_sk: | 1549 | get_sk: |
@@ -2169,9 +1552,9 @@ get_sk: | |||
2169 | cur = sk; | 1552 | cur = sk; |
2170 | goto out; | 1553 | goto out; |
2171 | } | 1554 | } |
2172 | tp = tcp_sk(sk); | 1555 | icsk = inet_csk(sk); |
2173 | read_lock_bh(&tp->accept_queue.syn_wait_lock); | 1556 | read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
2174 | if (reqsk_queue_len(&tp->accept_queue)) { | 1557 | if (reqsk_queue_len(&icsk->icsk_accept_queue)) { |
2175 | start_req: | 1558 | start_req: |
2176 | st->uid = sock_i_uid(sk); | 1559 | st->uid = sock_i_uid(sk); |
2177 | st->syn_wait_sk = sk; | 1560 | st->syn_wait_sk = sk; |
@@ -2179,10 +1562,10 @@ start_req: | |||
2179 | st->sbucket = 0; | 1562 | st->sbucket = 0; |
2180 | goto get_req; | 1563 | goto get_req; |
2181 | } | 1564 | } |
2182 | read_unlock_bh(&tp->accept_queue.syn_wait_lock); | 1565 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
2183 | } | 1566 | } |
2184 | if (++st->bucket < TCP_LHTABLE_SIZE) { | 1567 | if (++st->bucket < INET_LHTABLE_SIZE) { |
2185 | sk = sk_head(&tcp_listening_hash[st->bucket]); | 1568 | sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); |
2186 | goto get_sk; | 1569 | goto get_sk; |
2187 | } | 1570 | } |
2188 | cur = NULL; | 1571 | cur = NULL; |
@@ -2206,16 +1589,16 @@ static void *established_get_first(struct seq_file *seq) | |||
2206 | struct tcp_iter_state* st = seq->private; | 1589 | struct tcp_iter_state* st = seq->private; |
2207 | void *rc = NULL; | 1590 | void *rc = NULL; |
2208 | 1591 | ||
2209 | for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) { | 1592 | for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { |
2210 | struct sock *sk; | 1593 | struct sock *sk; |
2211 | struct hlist_node *node; | 1594 | struct hlist_node *node; |
2212 | struct tcp_tw_bucket *tw; | 1595 | struct inet_timewait_sock *tw; |
2213 | 1596 | ||
2214 | /* We can reschedule _before_ having picked the target: */ | 1597 | /* We can reschedule _before_ having picked the target: */ |
2215 | cond_resched_softirq(); | 1598 | cond_resched_softirq(); |
2216 | 1599 | ||
2217 | read_lock(&tcp_ehash[st->bucket].lock); | 1600 | read_lock(&tcp_hashinfo.ehash[st->bucket].lock); |
2218 | sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) { | 1601 | sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { |
2219 | if (sk->sk_family != st->family) { | 1602 | if (sk->sk_family != st->family) { |
2220 | continue; | 1603 | continue; |
2221 | } | 1604 | } |
@@ -2223,15 +1606,15 @@ static void *established_get_first(struct seq_file *seq) | |||
2223 | goto out; | 1606 | goto out; |
2224 | } | 1607 | } |
2225 | st->state = TCP_SEQ_STATE_TIME_WAIT; | 1608 | st->state = TCP_SEQ_STATE_TIME_WAIT; |
2226 | tw_for_each(tw, node, | 1609 | inet_twsk_for_each(tw, node, |
2227 | &tcp_ehash[st->bucket + tcp_ehash_size].chain) { | 1610 | &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { |
2228 | if (tw->tw_family != st->family) { | 1611 | if (tw->tw_family != st->family) { |
2229 | continue; | 1612 | continue; |
2230 | } | 1613 | } |
2231 | rc = tw; | 1614 | rc = tw; |
2232 | goto out; | 1615 | goto out; |
2233 | } | 1616 | } |
2234 | read_unlock(&tcp_ehash[st->bucket].lock); | 1617 | read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); |
2235 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 1618 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
2236 | } | 1619 | } |
2237 | out: | 1620 | out: |
@@ -2241,7 +1624,7 @@ out: | |||
2241 | static void *established_get_next(struct seq_file *seq, void *cur) | 1624 | static void *established_get_next(struct seq_file *seq, void *cur) |
2242 | { | 1625 | { |
2243 | struct sock *sk = cur; | 1626 | struct sock *sk = cur; |
2244 | struct tcp_tw_bucket *tw; | 1627 | struct inet_timewait_sock *tw; |
2245 | struct hlist_node *node; | 1628 | struct hlist_node *node; |
2246 | struct tcp_iter_state* st = seq->private; | 1629 | struct tcp_iter_state* st = seq->private; |
2247 | 1630 | ||
@@ -2258,15 +1641,15 @@ get_tw: | |||
2258 | cur = tw; | 1641 | cur = tw; |
2259 | goto out; | 1642 | goto out; |
2260 | } | 1643 | } |
2261 | read_unlock(&tcp_ehash[st->bucket].lock); | 1644 | read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); |
2262 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 1645 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
2263 | 1646 | ||
2264 | /* We can reschedule between buckets: */ | 1647 | /* We can reschedule between buckets: */ |
2265 | cond_resched_softirq(); | 1648 | cond_resched_softirq(); |
2266 | 1649 | ||
2267 | if (++st->bucket < tcp_ehash_size) { | 1650 | if (++st->bucket < tcp_hashinfo.ehash_size) { |
2268 | read_lock(&tcp_ehash[st->bucket].lock); | 1651 | read_lock(&tcp_hashinfo.ehash[st->bucket].lock); |
2269 | sk = sk_head(&tcp_ehash[st->bucket].chain); | 1652 | sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); |
2270 | } else { | 1653 | } else { |
2271 | cur = NULL; | 1654 | cur = NULL; |
2272 | goto out; | 1655 | goto out; |
@@ -2280,7 +1663,7 @@ get_tw: | |||
2280 | } | 1663 | } |
2281 | 1664 | ||
2282 | st->state = TCP_SEQ_STATE_TIME_WAIT; | 1665 | st->state = TCP_SEQ_STATE_TIME_WAIT; |
2283 | tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain); | 1666 | tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain); |
2284 | goto get_tw; | 1667 | goto get_tw; |
2285 | found: | 1668 | found: |
2286 | cur = sk; | 1669 | cur = sk; |
@@ -2304,12 +1687,12 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) | |||
2304 | void *rc; | 1687 | void *rc; |
2305 | struct tcp_iter_state* st = seq->private; | 1688 | struct tcp_iter_state* st = seq->private; |
2306 | 1689 | ||
2307 | tcp_listen_lock(); | 1690 | inet_listen_lock(&tcp_hashinfo); |
2308 | st->state = TCP_SEQ_STATE_LISTENING; | 1691 | st->state = TCP_SEQ_STATE_LISTENING; |
2309 | rc = listening_get_idx(seq, &pos); | 1692 | rc = listening_get_idx(seq, &pos); |
2310 | 1693 | ||
2311 | if (!rc) { | 1694 | if (!rc) { |
2312 | tcp_listen_unlock(); | 1695 | inet_listen_unlock(&tcp_hashinfo); |
2313 | local_bh_disable(); | 1696 | local_bh_disable(); |
2314 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 1697 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
2315 | rc = established_get_idx(seq, pos); | 1698 | rc = established_get_idx(seq, pos); |
@@ -2342,7 +1725,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
2342 | case TCP_SEQ_STATE_LISTENING: | 1725 | case TCP_SEQ_STATE_LISTENING: |
2343 | rc = listening_get_next(seq, v); | 1726 | rc = listening_get_next(seq, v); |
2344 | if (!rc) { | 1727 | if (!rc) { |
2345 | tcp_listen_unlock(); | 1728 | inet_listen_unlock(&tcp_hashinfo); |
2346 | local_bh_disable(); | 1729 | local_bh_disable(); |
2347 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 1730 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
2348 | rc = established_get_first(seq); | 1731 | rc = established_get_first(seq); |
@@ -2365,17 +1748,17 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) | |||
2365 | switch (st->state) { | 1748 | switch (st->state) { |
2366 | case TCP_SEQ_STATE_OPENREQ: | 1749 | case TCP_SEQ_STATE_OPENREQ: |
2367 | if (v) { | 1750 | if (v) { |
2368 | struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); | 1751 | struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); |
2369 | read_unlock_bh(&tp->accept_queue.syn_wait_lock); | 1752 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
2370 | } | 1753 | } |
2371 | case TCP_SEQ_STATE_LISTENING: | 1754 | case TCP_SEQ_STATE_LISTENING: |
2372 | if (v != SEQ_START_TOKEN) | 1755 | if (v != SEQ_START_TOKEN) |
2373 | tcp_listen_unlock(); | 1756 | inet_listen_unlock(&tcp_hashinfo); |
2374 | break; | 1757 | break; |
2375 | case TCP_SEQ_STATE_TIME_WAIT: | 1758 | case TCP_SEQ_STATE_TIME_WAIT: |
2376 | case TCP_SEQ_STATE_ESTABLISHED: | 1759 | case TCP_SEQ_STATE_ESTABLISHED: |
2377 | if (v) | 1760 | if (v) |
2378 | read_unlock(&tcp_ehash[st->bucket].lock); | 1761 | read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); |
2379 | local_bh_enable(); | 1762 | local_bh_enable(); |
2380 | break; | 1763 | break; |
2381 | } | 1764 | } |
@@ -2472,18 +1855,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) | |||
2472 | int timer_active; | 1855 | int timer_active; |
2473 | unsigned long timer_expires; | 1856 | unsigned long timer_expires; |
2474 | struct tcp_sock *tp = tcp_sk(sp); | 1857 | struct tcp_sock *tp = tcp_sk(sp); |
1858 | const struct inet_connection_sock *icsk = inet_csk(sp); | ||
2475 | struct inet_sock *inet = inet_sk(sp); | 1859 | struct inet_sock *inet = inet_sk(sp); |
2476 | unsigned int dest = inet->daddr; | 1860 | unsigned int dest = inet->daddr; |
2477 | unsigned int src = inet->rcv_saddr; | 1861 | unsigned int src = inet->rcv_saddr; |
2478 | __u16 destp = ntohs(inet->dport); | 1862 | __u16 destp = ntohs(inet->dport); |
2479 | __u16 srcp = ntohs(inet->sport); | 1863 | __u16 srcp = ntohs(inet->sport); |
2480 | 1864 | ||
2481 | if (tp->pending == TCP_TIME_RETRANS) { | 1865 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) { |
2482 | timer_active = 1; | 1866 | timer_active = 1; |
2483 | timer_expires = tp->timeout; | 1867 | timer_expires = icsk->icsk_timeout; |
2484 | } else if (tp->pending == TCP_TIME_PROBE0) { | 1868 | } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { |
2485 | timer_active = 4; | 1869 | timer_active = 4; |
2486 | timer_expires = tp->timeout; | 1870 | timer_expires = icsk->icsk_timeout; |
2487 | } else if (timer_pending(&sp->sk_timer)) { | 1871 | } else if (timer_pending(&sp->sk_timer)) { |
2488 | timer_active = 2; | 1872 | timer_active = 2; |
2489 | timer_expires = sp->sk_timer.expires; | 1873 | timer_expires = sp->sk_timer.expires; |
@@ -2498,17 +1882,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) | |||
2498 | tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq, | 1882 | tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq, |
2499 | timer_active, | 1883 | timer_active, |
2500 | jiffies_to_clock_t(timer_expires - jiffies), | 1884 | jiffies_to_clock_t(timer_expires - jiffies), |
2501 | tp->retransmits, | 1885 | icsk->icsk_retransmits, |
2502 | sock_i_uid(sp), | 1886 | sock_i_uid(sp), |
2503 | tp->probes_out, | 1887 | icsk->icsk_probes_out, |
2504 | sock_i_ino(sp), | 1888 | sock_i_ino(sp), |
2505 | atomic_read(&sp->sk_refcnt), sp, | 1889 | atomic_read(&sp->sk_refcnt), sp, |
2506 | tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong, | 1890 | icsk->icsk_rto, |
1891 | icsk->icsk_ack.ato, | ||
1892 | (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, | ||
2507 | tp->snd_cwnd, | 1893 | tp->snd_cwnd, |
2508 | tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); | 1894 | tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); |
2509 | } | 1895 | } |
2510 | 1896 | ||
2511 | static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) | 1897 | static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i) |
2512 | { | 1898 | { |
2513 | unsigned int dest, src; | 1899 | unsigned int dest, src; |
2514 | __u16 destp, srcp; | 1900 | __u16 destp, srcp; |
@@ -2588,7 +1974,7 @@ struct proto tcp_prot = { | |||
2588 | .close = tcp_close, | 1974 | .close = tcp_close, |
2589 | .connect = tcp_v4_connect, | 1975 | .connect = tcp_v4_connect, |
2590 | .disconnect = tcp_disconnect, | 1976 | .disconnect = tcp_disconnect, |
2591 | .accept = tcp_accept, | 1977 | .accept = inet_csk_accept, |
2592 | .ioctl = tcp_ioctl, | 1978 | .ioctl = tcp_ioctl, |
2593 | .init = tcp_v4_init_sock, | 1979 | .init = tcp_v4_init_sock, |
2594 | .destroy = tcp_v4_destroy_sock, | 1980 | .destroy = tcp_v4_destroy_sock, |
@@ -2603,6 +1989,7 @@ struct proto tcp_prot = { | |||
2603 | .get_port = tcp_v4_get_port, | 1989 | .get_port = tcp_v4_get_port, |
2604 | .enter_memory_pressure = tcp_enter_memory_pressure, | 1990 | .enter_memory_pressure = tcp_enter_memory_pressure, |
2605 | .sockets_allocated = &tcp_sockets_allocated, | 1991 | .sockets_allocated = &tcp_sockets_allocated, |
1992 | .orphan_count = &tcp_orphan_count, | ||
2606 | .memory_allocated = &tcp_memory_allocated, | 1993 | .memory_allocated = &tcp_memory_allocated, |
2607 | .memory_pressure = &tcp_memory_pressure, | 1994 | .memory_pressure = &tcp_memory_pressure, |
2608 | .sysctl_mem = sysctl_tcp_mem, | 1995 | .sysctl_mem = sysctl_tcp_mem, |
@@ -2610,6 +1997,7 @@ struct proto tcp_prot = { | |||
2610 | .sysctl_rmem = sysctl_tcp_rmem, | 1997 | .sysctl_rmem = sysctl_tcp_rmem, |
2611 | .max_header = MAX_TCP_HEADER, | 1998 | .max_header = MAX_TCP_HEADER, |
2612 | .obj_size = sizeof(struct tcp_sock), | 1999 | .obj_size = sizeof(struct tcp_sock), |
2000 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), | ||
2613 | .rsk_prot = &tcp_request_sock_ops, | 2001 | .rsk_prot = &tcp_request_sock_ops, |
2614 | }; | 2002 | }; |
2615 | 2003 | ||
@@ -2631,19 +2019,13 @@ void __init tcp_v4_init(struct net_proto_family *ops) | |||
2631 | } | 2019 | } |
2632 | 2020 | ||
2633 | EXPORT_SYMBOL(ipv4_specific); | 2021 | EXPORT_SYMBOL(ipv4_specific); |
2634 | EXPORT_SYMBOL(tcp_bind_hash); | 2022 | EXPORT_SYMBOL(inet_bind_bucket_create); |
2635 | EXPORT_SYMBOL(tcp_bucket_create); | ||
2636 | EXPORT_SYMBOL(tcp_hashinfo); | 2023 | EXPORT_SYMBOL(tcp_hashinfo); |
2637 | EXPORT_SYMBOL(tcp_inherit_port); | ||
2638 | EXPORT_SYMBOL(tcp_listen_wlock); | ||
2639 | EXPORT_SYMBOL(tcp_port_rover); | ||
2640 | EXPORT_SYMBOL(tcp_prot); | 2024 | EXPORT_SYMBOL(tcp_prot); |
2641 | EXPORT_SYMBOL(tcp_put_port); | ||
2642 | EXPORT_SYMBOL(tcp_unhash); | 2025 | EXPORT_SYMBOL(tcp_unhash); |
2643 | EXPORT_SYMBOL(tcp_v4_conn_request); | 2026 | EXPORT_SYMBOL(tcp_v4_conn_request); |
2644 | EXPORT_SYMBOL(tcp_v4_connect); | 2027 | EXPORT_SYMBOL(tcp_v4_connect); |
2645 | EXPORT_SYMBOL(tcp_v4_do_rcv); | 2028 | EXPORT_SYMBOL(tcp_v4_do_rcv); |
2646 | EXPORT_SYMBOL(tcp_v4_rebuild_header); | ||
2647 | EXPORT_SYMBOL(tcp_v4_remember_stamp); | 2029 | EXPORT_SYMBOL(tcp_v4_remember_stamp); |
2648 | EXPORT_SYMBOL(tcp_v4_send_check); | 2030 | EXPORT_SYMBOL(tcp_v4_send_check); |
2649 | EXPORT_SYMBOL(tcp_v4_syn_recv_sock); | 2031 | EXPORT_SYMBOL(tcp_v4_syn_recv_sock); |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f42a284164b7..a88db28b0af7 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -35,13 +35,27 @@ | |||
35 | #define SYNC_INIT 1 | 35 | #define SYNC_INIT 1 |
36 | #endif | 36 | #endif |
37 | 37 | ||
38 | int sysctl_tcp_tw_recycle; | ||
39 | int sysctl_tcp_max_tw_buckets = NR_FILE*2; | ||
40 | |||
41 | int sysctl_tcp_syncookies = SYNC_INIT; | 38 | int sysctl_tcp_syncookies = SYNC_INIT; |
42 | int sysctl_tcp_abort_on_overflow; | 39 | int sysctl_tcp_abort_on_overflow; |
43 | 40 | ||
44 | static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo); | 41 | struct inet_timewait_death_row tcp_death_row = { |
42 | .sysctl_max_tw_buckets = NR_FILE * 2, | ||
43 | .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, | ||
44 | .death_lock = SPIN_LOCK_UNLOCKED, | ||
45 | .hashinfo = &tcp_hashinfo, | ||
46 | .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, | ||
47 | (unsigned long)&tcp_death_row), | ||
48 | .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, | ||
49 | inet_twdr_twkill_work, | ||
50 | &tcp_death_row), | ||
51 | /* Short-time timewait calendar */ | ||
52 | |||
53 | .twcal_hand = -1, | ||
54 | .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, | ||
55 | (unsigned long)&tcp_death_row), | ||
56 | }; | ||
57 | |||
58 | EXPORT_SYMBOL_GPL(tcp_death_row); | ||
45 | 59 | ||
46 | static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | 60 | static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) |
47 | { | 61 | { |
@@ -52,47 +66,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | |||
52 | return (seq == e_win && seq == end_seq); | 66 | return (seq == e_win && seq == end_seq); |
53 | } | 67 | } |
54 | 68 | ||
55 | /* New-style handling of TIME_WAIT sockets. */ | ||
56 | |||
57 | int tcp_tw_count; | ||
58 | |||
59 | |||
60 | /* Must be called with locally disabled BHs. */ | ||
61 | static void tcp_timewait_kill(struct tcp_tw_bucket *tw) | ||
62 | { | ||
63 | struct tcp_ehash_bucket *ehead; | ||
64 | struct tcp_bind_hashbucket *bhead; | ||
65 | struct tcp_bind_bucket *tb; | ||
66 | |||
67 | /* Unlink from established hashes. */ | ||
68 | ehead = &tcp_ehash[tw->tw_hashent]; | ||
69 | write_lock(&ehead->lock); | ||
70 | if (hlist_unhashed(&tw->tw_node)) { | ||
71 | write_unlock(&ehead->lock); | ||
72 | return; | ||
73 | } | ||
74 | __hlist_del(&tw->tw_node); | ||
75 | sk_node_init(&tw->tw_node); | ||
76 | write_unlock(&ehead->lock); | ||
77 | |||
78 | /* Disassociate with bind bucket. */ | ||
79 | bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)]; | ||
80 | spin_lock(&bhead->lock); | ||
81 | tb = tw->tw_tb; | ||
82 | __hlist_del(&tw->tw_bind_node); | ||
83 | tw->tw_tb = NULL; | ||
84 | tcp_bucket_destroy(tb); | ||
85 | spin_unlock(&bhead->lock); | ||
86 | |||
87 | #ifdef INET_REFCNT_DEBUG | ||
88 | if (atomic_read(&tw->tw_refcnt) != 1) { | ||
89 | printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, | ||
90 | atomic_read(&tw->tw_refcnt)); | ||
91 | } | ||
92 | #endif | ||
93 | tcp_tw_put(tw); | ||
94 | } | ||
95 | |||
96 | /* | 69 | /* |
97 | * * Main purpose of TIME-WAIT state is to close connection gracefully, | 70 | * * Main purpose of TIME-WAIT state is to close connection gracefully, |
98 | * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN | 71 | * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN |
@@ -122,19 +95,20 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) | |||
122 | * to avoid misread sequence numbers, states etc. --ANK | 95 | * to avoid misread sequence numbers, states etc. --ANK |
123 | */ | 96 | */ |
124 | enum tcp_tw_status | 97 | enum tcp_tw_status |
125 | tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, | 98 | tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, |
126 | struct tcphdr *th, unsigned len) | 99 | const struct tcphdr *th) |
127 | { | 100 | { |
101 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | ||
128 | struct tcp_options_received tmp_opt; | 102 | struct tcp_options_received tmp_opt; |
129 | int paws_reject = 0; | 103 | int paws_reject = 0; |
130 | 104 | ||
131 | tmp_opt.saw_tstamp = 0; | 105 | tmp_opt.saw_tstamp = 0; |
132 | if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) { | 106 | if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { |
133 | tcp_parse_options(skb, &tmp_opt, 0); | 107 | tcp_parse_options(skb, &tmp_opt, 0); |
134 | 108 | ||
135 | if (tmp_opt.saw_tstamp) { | 109 | if (tmp_opt.saw_tstamp) { |
136 | tmp_opt.ts_recent = tw->tw_ts_recent; | 110 | tmp_opt.ts_recent = tcptw->tw_ts_recent; |
137 | tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; | 111 | tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; |
138 | paws_reject = tcp_paws_check(&tmp_opt, th->rst); | 112 | paws_reject = tcp_paws_check(&tmp_opt, th->rst); |
139 | } | 113 | } |
140 | } | 114 | } |
@@ -145,20 +119,20 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, | |||
145 | /* Out of window, send ACK */ | 119 | /* Out of window, send ACK */ |
146 | if (paws_reject || | 120 | if (paws_reject || |
147 | !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, | 121 | !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, |
148 | tw->tw_rcv_nxt, | 122 | tcptw->tw_rcv_nxt, |
149 | tw->tw_rcv_nxt + tw->tw_rcv_wnd)) | 123 | tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) |
150 | return TCP_TW_ACK; | 124 | return TCP_TW_ACK; |
151 | 125 | ||
152 | if (th->rst) | 126 | if (th->rst) |
153 | goto kill; | 127 | goto kill; |
154 | 128 | ||
155 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt)) | 129 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) |
156 | goto kill_with_rst; | 130 | goto kill_with_rst; |
157 | 131 | ||
158 | /* Dup ACK? */ | 132 | /* Dup ACK? */ |
159 | if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) || | 133 | if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || |
160 | TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { | 134 | TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { |
161 | tcp_tw_put(tw); | 135 | inet_twsk_put(tw); |
162 | return TCP_TW_SUCCESS; | 136 | return TCP_TW_SUCCESS; |
163 | } | 137 | } |
164 | 138 | ||
@@ -166,19 +140,19 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, | |||
166 | * reset. | 140 | * reset. |
167 | */ | 141 | */ |
168 | if (!th->fin || | 142 | if (!th->fin || |
169 | TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) { | 143 | TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { |
170 | kill_with_rst: | 144 | kill_with_rst: |
171 | tcp_tw_deschedule(tw); | 145 | inet_twsk_deschedule(tw, &tcp_death_row); |
172 | tcp_tw_put(tw); | 146 | inet_twsk_put(tw); |
173 | return TCP_TW_RST; | 147 | return TCP_TW_RST; |
174 | } | 148 | } |
175 | 149 | ||
176 | /* FIN arrived, enter true time-wait state. */ | 150 | /* FIN arrived, enter true time-wait state. */ |
177 | tw->tw_substate = TCP_TIME_WAIT; | 151 | tw->tw_substate = TCP_TIME_WAIT; |
178 | tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 152 | tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
179 | if (tmp_opt.saw_tstamp) { | 153 | if (tmp_opt.saw_tstamp) { |
180 | tw->tw_ts_recent_stamp = xtime.tv_sec; | 154 | tcptw->tw_ts_recent_stamp = xtime.tv_sec; |
181 | tw->tw_ts_recent = tmp_opt.rcv_tsval; | 155 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; |
182 | } | 156 | } |
183 | 157 | ||
184 | /* I am shamed, but failed to make it more elegant. | 158 | /* I am shamed, but failed to make it more elegant. |
@@ -187,11 +161,13 @@ kill_with_rst: | |||
187 | * do not undertsnad recycling in any case, it not | 161 | * do not undertsnad recycling in any case, it not |
188 | * a big problem in practice. --ANK */ | 162 | * a big problem in practice. --ANK */ |
189 | if (tw->tw_family == AF_INET && | 163 | if (tw->tw_family == AF_INET && |
190 | sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp && | 164 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && |
191 | tcp_v4_tw_remember_stamp(tw)) | 165 | tcp_v4_tw_remember_stamp(tw)) |
192 | tcp_tw_schedule(tw, tw->tw_timeout); | 166 | inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, |
167 | TCP_TIMEWAIT_LEN); | ||
193 | else | 168 | else |
194 | tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); | 169 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
170 | TCP_TIMEWAIT_LEN); | ||
195 | return TCP_TW_ACK; | 171 | return TCP_TW_ACK; |
196 | } | 172 | } |
197 | 173 | ||
@@ -213,7 +189,7 @@ kill_with_rst: | |||
213 | */ | 189 | */ |
214 | 190 | ||
215 | if (!paws_reject && | 191 | if (!paws_reject && |
216 | (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt && | 192 | (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && |
217 | (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { | 193 | (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { |
218 | /* In window segment, it may be only reset or bare ack. */ | 194 | /* In window segment, it may be only reset or bare ack. */ |
219 | 195 | ||
@@ -224,19 +200,20 @@ kill_with_rst: | |||
224 | */ | 200 | */ |
225 | if (sysctl_tcp_rfc1337 == 0) { | 201 | if (sysctl_tcp_rfc1337 == 0) { |
226 | kill: | 202 | kill: |
227 | tcp_tw_deschedule(tw); | 203 | inet_twsk_deschedule(tw, &tcp_death_row); |
228 | tcp_tw_put(tw); | 204 | inet_twsk_put(tw); |
229 | return TCP_TW_SUCCESS; | 205 | return TCP_TW_SUCCESS; |
230 | } | 206 | } |
231 | } | 207 | } |
232 | tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); | 208 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
209 | TCP_TIMEWAIT_LEN); | ||
233 | 210 | ||
234 | if (tmp_opt.saw_tstamp) { | 211 | if (tmp_opt.saw_tstamp) { |
235 | tw->tw_ts_recent = tmp_opt.rcv_tsval; | 212 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; |
236 | tw->tw_ts_recent_stamp = xtime.tv_sec; | 213 | tcptw->tw_ts_recent_stamp = xtime.tv_sec; |
237 | } | 214 | } |
238 | 215 | ||
239 | tcp_tw_put(tw); | 216 | inet_twsk_put(tw); |
240 | return TCP_TW_SUCCESS; | 217 | return TCP_TW_SUCCESS; |
241 | } | 218 | } |
242 | 219 | ||
@@ -258,9 +235,10 @@ kill: | |||
258 | */ | 235 | */ |
259 | 236 | ||
260 | if (th->syn && !th->rst && !th->ack && !paws_reject && | 237 | if (th->syn && !th->rst && !th->ack && !paws_reject && |
261 | (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) || | 238 | (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || |
262 | (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { | 239 | (tmp_opt.saw_tstamp && |
263 | u32 isn = tw->tw_snd_nxt + 65535 + 2; | 240 | (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { |
241 | u32 isn = tcptw->tw_snd_nxt + 65535 + 2; | ||
264 | if (isn == 0) | 242 | if (isn == 0) |
265 | isn++; | 243 | isn++; |
266 | TCP_SKB_CB(skb)->when = isn; | 244 | TCP_SKB_CB(skb)->when = isn; |
@@ -278,107 +256,57 @@ kill: | |||
278 | * Do not reschedule in the last case. | 256 | * Do not reschedule in the last case. |
279 | */ | 257 | */ |
280 | if (paws_reject || th->ack) | 258 | if (paws_reject || th->ack) |
281 | tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); | 259 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
260 | TCP_TIMEWAIT_LEN); | ||
282 | 261 | ||
283 | /* Send ACK. Note, we do not put the bucket, | 262 | /* Send ACK. Note, we do not put the bucket, |
284 | * it will be released by caller. | 263 | * it will be released by caller. |
285 | */ | 264 | */ |
286 | return TCP_TW_ACK; | 265 | return TCP_TW_ACK; |
287 | } | 266 | } |
288 | tcp_tw_put(tw); | 267 | inet_twsk_put(tw); |
289 | return TCP_TW_SUCCESS; | 268 | return TCP_TW_SUCCESS; |
290 | } | 269 | } |
291 | 270 | ||
292 | /* Enter the time wait state. This is called with locally disabled BH. | ||
293 | * Essentially we whip up a timewait bucket, copy the | ||
294 | * relevant info into it from the SK, and mess with hash chains | ||
295 | * and list linkage. | ||
296 | */ | ||
297 | static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) | ||
298 | { | ||
299 | struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; | ||
300 | struct tcp_bind_hashbucket *bhead; | ||
301 | |||
302 | /* Step 1: Put TW into bind hash. Original socket stays there too. | ||
303 | Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in | ||
304 | binding cache, even if it is closed. | ||
305 | */ | ||
306 | bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)]; | ||
307 | spin_lock(&bhead->lock); | ||
308 | tw->tw_tb = tcp_sk(sk)->bind_hash; | ||
309 | BUG_TRAP(tcp_sk(sk)->bind_hash); | ||
310 | tw_add_bind_node(tw, &tw->tw_tb->owners); | ||
311 | spin_unlock(&bhead->lock); | ||
312 | |||
313 | write_lock(&ehead->lock); | ||
314 | |||
315 | /* Step 2: Remove SK from established hash. */ | ||
316 | if (__sk_del_node_init(sk)) | ||
317 | sock_prot_dec_use(sk->sk_prot); | ||
318 | |||
319 | /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ | ||
320 | tw_add_node(tw, &(ehead + tcp_ehash_size)->chain); | ||
321 | atomic_inc(&tw->tw_refcnt); | ||
322 | |||
323 | write_unlock(&ehead->lock); | ||
324 | } | ||
325 | |||
326 | /* | 271 | /* |
327 | * Move a socket to time-wait or dead fin-wait-2 state. | 272 | * Move a socket to time-wait or dead fin-wait-2 state. |
328 | */ | 273 | */ |
329 | void tcp_time_wait(struct sock *sk, int state, int timeo) | 274 | void tcp_time_wait(struct sock *sk, int state, int timeo) |
330 | { | 275 | { |
331 | struct tcp_tw_bucket *tw = NULL; | 276 | struct inet_timewait_sock *tw = NULL; |
332 | struct tcp_sock *tp = tcp_sk(sk); | 277 | const struct tcp_sock *tp = tcp_sk(sk); |
333 | int recycle_ok = 0; | 278 | int recycle_ok = 0; |
334 | 279 | ||
335 | if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) | 280 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) |
336 | recycle_ok = tp->af_specific->remember_stamp(sk); | 281 | recycle_ok = tp->af_specific->remember_stamp(sk); |
337 | 282 | ||
338 | if (tcp_tw_count < sysctl_tcp_max_tw_buckets) | 283 | if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) |
339 | tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); | 284 | tw = inet_twsk_alloc(sk, state); |
340 | |||
341 | if(tw != NULL) { | ||
342 | struct inet_sock *inet = inet_sk(sk); | ||
343 | int rto = (tp->rto<<2) - (tp->rto>>1); | ||
344 | |||
345 | /* Give us an identity. */ | ||
346 | tw->tw_daddr = inet->daddr; | ||
347 | tw->tw_rcv_saddr = inet->rcv_saddr; | ||
348 | tw->tw_bound_dev_if = sk->sk_bound_dev_if; | ||
349 | tw->tw_num = inet->num; | ||
350 | tw->tw_state = TCP_TIME_WAIT; | ||
351 | tw->tw_substate = state; | ||
352 | tw->tw_sport = inet->sport; | ||
353 | tw->tw_dport = inet->dport; | ||
354 | tw->tw_family = sk->sk_family; | ||
355 | tw->tw_reuse = sk->sk_reuse; | ||
356 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; | ||
357 | atomic_set(&tw->tw_refcnt, 1); | ||
358 | 285 | ||
359 | tw->tw_hashent = sk->sk_hashent; | 286 | if (tw != NULL) { |
360 | tw->tw_rcv_nxt = tp->rcv_nxt; | 287 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
361 | tw->tw_snd_nxt = tp->snd_nxt; | 288 | const struct inet_connection_sock *icsk = inet_csk(sk); |
362 | tw->tw_rcv_wnd = tcp_receive_window(tp); | 289 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); |
363 | tw->tw_ts_recent = tp->rx_opt.ts_recent; | 290 | |
364 | tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; | 291 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; |
365 | tw_dead_node_init(tw); | 292 | tcptw->tw_rcv_nxt = tp->rcv_nxt; |
293 | tcptw->tw_snd_nxt = tp->snd_nxt; | ||
294 | tcptw->tw_rcv_wnd = tcp_receive_window(tp); | ||
295 | tcptw->tw_ts_recent = tp->rx_opt.ts_recent; | ||
296 | tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; | ||
366 | 297 | ||
367 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 298 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
368 | if (tw->tw_family == PF_INET6) { | 299 | if (tw->tw_family == PF_INET6) { |
369 | struct ipv6_pinfo *np = inet6_sk(sk); | 300 | struct ipv6_pinfo *np = inet6_sk(sk); |
301 | struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); | ||
370 | 302 | ||
371 | ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr); | 303 | ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); |
372 | ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr); | 304 | ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); |
373 | tw->tw_v6_ipv6only = np->ipv6only; | 305 | tw->tw_ipv6only = np->ipv6only; |
374 | } else { | ||
375 | memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr)); | ||
376 | memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr)); | ||
377 | tw->tw_v6_ipv6only = 0; | ||
378 | } | 306 | } |
379 | #endif | 307 | #endif |
380 | /* Linkage updates. */ | 308 | /* Linkage updates. */ |
381 | __tcp_tw_hashdance(sk, tw); | 309 | __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); |
382 | 310 | ||
383 | /* Get the TIME_WAIT timeout firing. */ | 311 | /* Get the TIME_WAIT timeout firing. */ |
384 | if (timeo < rto) | 312 | if (timeo < rto) |
@@ -392,8 +320,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
392 | timeo = TCP_TIMEWAIT_LEN; | 320 | timeo = TCP_TIMEWAIT_LEN; |
393 | } | 321 | } |
394 | 322 | ||
395 | tcp_tw_schedule(tw, timeo); | 323 | inet_twsk_schedule(tw, &tcp_death_row, timeo, |
396 | tcp_tw_put(tw); | 324 | TCP_TIMEWAIT_LEN); |
325 | inet_twsk_put(tw); | ||
397 | } else { | 326 | } else { |
398 | /* Sorry, if we're out of memory, just CLOSE this | 327 | /* Sorry, if we're out of memory, just CLOSE this |
399 | * socket up. We've got bigger problems than | 328 | * socket up. We've got bigger problems than |
@@ -407,277 +336,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
407 | tcp_done(sk); | 336 | tcp_done(sk); |
408 | } | 337 | } |
409 | 338 | ||
410 | /* Kill off TIME_WAIT sockets once their lifetime has expired. */ | ||
411 | static int tcp_tw_death_row_slot; | ||
412 | |||
413 | static void tcp_twkill(unsigned long); | ||
414 | |||
415 | /* TIME_WAIT reaping mechanism. */ | ||
416 | #define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ | ||
417 | #define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS) | ||
418 | |||
419 | #define TCP_TWKILL_QUOTA 100 | ||
420 | |||
421 | static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS]; | ||
422 | static DEFINE_SPINLOCK(tw_death_lock); | ||
423 | static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0); | ||
424 | static void twkill_work(void *); | ||
425 | static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL); | ||
426 | static u32 twkill_thread_slots; | ||
427 | |||
428 | /* Returns non-zero if quota exceeded. */ | ||
429 | static int tcp_do_twkill_work(int slot, unsigned int quota) | ||
430 | { | ||
431 | struct tcp_tw_bucket *tw; | ||
432 | struct hlist_node *node; | ||
433 | unsigned int killed; | ||
434 | int ret; | ||
435 | |||
436 | /* NOTE: compare this to previous version where lock | ||
437 | * was released after detaching chain. It was racy, | ||
438 | * because tw buckets are scheduled in not serialized context | ||
439 | * in 2.3 (with netfilter), and with softnet it is common, because | ||
440 | * soft irqs are not sequenced. | ||
441 | */ | ||
442 | killed = 0; | ||
443 | ret = 0; | ||
444 | rescan: | ||
445 | tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { | ||
446 | __tw_del_dead_node(tw); | ||
447 | spin_unlock(&tw_death_lock); | ||
448 | tcp_timewait_kill(tw); | ||
449 | tcp_tw_put(tw); | ||
450 | killed++; | ||
451 | spin_lock(&tw_death_lock); | ||
452 | if (killed > quota) { | ||
453 | ret = 1; | ||
454 | break; | ||
455 | } | ||
456 | |||
457 | /* While we dropped tw_death_lock, another cpu may have | ||
458 | * killed off the next TW bucket in the list, therefore | ||
459 | * do a fresh re-read of the hlist head node with the | ||
460 | * lock reacquired. We still use the hlist traversal | ||
461 | * macro in order to get the prefetches. | ||
462 | */ | ||
463 | goto rescan; | ||
464 | } | ||
465 | |||
466 | tcp_tw_count -= killed; | ||
467 | NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); | ||
468 | |||
469 | return ret; | ||
470 | } | ||
471 | |||
472 | static void tcp_twkill(unsigned long dummy) | ||
473 | { | ||
474 | int need_timer, ret; | ||
475 | |||
476 | spin_lock(&tw_death_lock); | ||
477 | |||
478 | if (tcp_tw_count == 0) | ||
479 | goto out; | ||
480 | |||
481 | need_timer = 0; | ||
482 | ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA); | ||
483 | if (ret) { | ||
484 | twkill_thread_slots |= (1 << tcp_tw_death_row_slot); | ||
485 | mb(); | ||
486 | schedule_work(&tcp_twkill_work); | ||
487 | need_timer = 1; | ||
488 | } else { | ||
489 | /* We purged the entire slot, anything left? */ | ||
490 | if (tcp_tw_count) | ||
491 | need_timer = 1; | ||
492 | } | ||
493 | tcp_tw_death_row_slot = | ||
494 | ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); | ||
495 | if (need_timer) | ||
496 | mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD); | ||
497 | out: | ||
498 | spin_unlock(&tw_death_lock); | ||
499 | } | ||
500 | |||
501 | extern void twkill_slots_invalid(void); | ||
502 | |||
503 | static void twkill_work(void *dummy) | ||
504 | { | ||
505 | int i; | ||
506 | |||
507 | if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8)) | ||
508 | twkill_slots_invalid(); | ||
509 | |||
510 | while (twkill_thread_slots) { | ||
511 | spin_lock_bh(&tw_death_lock); | ||
512 | for (i = 0; i < TCP_TWKILL_SLOTS; i++) { | ||
513 | if (!(twkill_thread_slots & (1 << i))) | ||
514 | continue; | ||
515 | |||
516 | while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) { | ||
517 | if (need_resched()) { | ||
518 | spin_unlock_bh(&tw_death_lock); | ||
519 | schedule(); | ||
520 | spin_lock_bh(&tw_death_lock); | ||
521 | } | ||
522 | } | ||
523 | |||
524 | twkill_thread_slots &= ~(1 << i); | ||
525 | } | ||
526 | spin_unlock_bh(&tw_death_lock); | ||
527 | } | ||
528 | } | ||
529 | |||
530 | /* These are always called from BH context. See callers in | ||
531 | * tcp_input.c to verify this. | ||
532 | */ | ||
533 | |||
534 | /* This is for handling early-kills of TIME_WAIT sockets. */ | ||
535 | void tcp_tw_deschedule(struct tcp_tw_bucket *tw) | ||
536 | { | ||
537 | spin_lock(&tw_death_lock); | ||
538 | if (tw_del_dead_node(tw)) { | ||
539 | tcp_tw_put(tw); | ||
540 | if (--tcp_tw_count == 0) | ||
541 | del_timer(&tcp_tw_timer); | ||
542 | } | ||
543 | spin_unlock(&tw_death_lock); | ||
544 | tcp_timewait_kill(tw); | ||
545 | } | ||
546 | |||
547 | /* Short-time timewait calendar */ | ||
548 | |||
549 | static int tcp_twcal_hand = -1; | ||
550 | static int tcp_twcal_jiffie; | ||
551 | static void tcp_twcal_tick(unsigned long); | ||
552 | static struct timer_list tcp_twcal_timer = | ||
553 | TIMER_INITIALIZER(tcp_twcal_tick, 0, 0); | ||
554 | static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; | ||
555 | |||
556 | static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) | ||
557 | { | ||
558 | struct hlist_head *list; | ||
559 | int slot; | ||
560 | |||
561 | /* timeout := RTO * 3.5 | ||
562 | * | ||
563 | * 3.5 = 1+2+0.5 to wait for two retransmits. | ||
564 | * | ||
565 | * RATIONALE: if FIN arrived and we entered TIME-WAIT state, | ||
566 | * our ACK acking that FIN can be lost. If N subsequent retransmitted | ||
567 | * FINs (or previous seqments) are lost (probability of such event | ||
568 | * is p^(N+1), where p is probability to lose single packet and | ||
569 | * time to detect the loss is about RTO*(2^N - 1) with exponential | ||
570 | * backoff). Normal timewait length is calculated so, that we | ||
571 | * waited at least for one retransmitted FIN (maximal RTO is 120sec). | ||
572 | * [ BTW Linux. following BSD, violates this requirement waiting | ||
573 | * only for 60sec, we should wait at least for 240 secs. | ||
574 | * Well, 240 consumes too much of resources 8) | ||
575 | * ] | ||
576 | * This interval is not reduced to catch old duplicate and | ||
577 | * responces to our wandering segments living for two MSLs. | ||
578 | * However, if we use PAWS to detect | ||
579 | * old duplicates, we can reduce the interval to bounds required | ||
580 | * by RTO, rather than MSL. So, if peer understands PAWS, we | ||
581 | * kill tw bucket after 3.5*RTO (it is important that this number | ||
582 | * is greater than TS tick!) and detect old duplicates with help | ||
583 | * of PAWS. | ||
584 | */ | ||
585 | slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK; | ||
586 | |||
587 | spin_lock(&tw_death_lock); | ||
588 | |||
589 | /* Unlink it, if it was scheduled */ | ||
590 | if (tw_del_dead_node(tw)) | ||
591 | tcp_tw_count--; | ||
592 | else | ||
593 | atomic_inc(&tw->tw_refcnt); | ||
594 | |||
595 | if (slot >= TCP_TW_RECYCLE_SLOTS) { | ||
596 | /* Schedule to slow timer */ | ||
597 | if (timeo >= TCP_TIMEWAIT_LEN) { | ||
598 | slot = TCP_TWKILL_SLOTS-1; | ||
599 | } else { | ||
600 | slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; | ||
601 | if (slot >= TCP_TWKILL_SLOTS) | ||
602 | slot = TCP_TWKILL_SLOTS-1; | ||
603 | } | ||
604 | tw->tw_ttd = jiffies + timeo; | ||
605 | slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); | ||
606 | list = &tcp_tw_death_row[slot]; | ||
607 | } else { | ||
608 | tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK); | ||
609 | |||
610 | if (tcp_twcal_hand < 0) { | ||
611 | tcp_twcal_hand = 0; | ||
612 | tcp_twcal_jiffie = jiffies; | ||
613 | tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK); | ||
614 | add_timer(&tcp_twcal_timer); | ||
615 | } else { | ||
616 | if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK))) | ||
617 | mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK)); | ||
618 | slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1); | ||
619 | } | ||
620 | list = &tcp_twcal_row[slot]; | ||
621 | } | ||
622 | |||
623 | hlist_add_head(&tw->tw_death_node, list); | ||
624 | |||
625 | if (tcp_tw_count++ == 0) | ||
626 | mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); | ||
627 | spin_unlock(&tw_death_lock); | ||
628 | } | ||
629 | |||
630 | void tcp_twcal_tick(unsigned long dummy) | ||
631 | { | ||
632 | int n, slot; | ||
633 | unsigned long j; | ||
634 | unsigned long now = jiffies; | ||
635 | int killed = 0; | ||
636 | int adv = 0; | ||
637 | |||
638 | spin_lock(&tw_death_lock); | ||
639 | if (tcp_twcal_hand < 0) | ||
640 | goto out; | ||
641 | |||
642 | slot = tcp_twcal_hand; | ||
643 | j = tcp_twcal_jiffie; | ||
644 | |||
645 | for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) { | ||
646 | if (time_before_eq(j, now)) { | ||
647 | struct hlist_node *node, *safe; | ||
648 | struct tcp_tw_bucket *tw; | ||
649 | |||
650 | tw_for_each_inmate_safe(tw, node, safe, | ||
651 | &tcp_twcal_row[slot]) { | ||
652 | __tw_del_dead_node(tw); | ||
653 | tcp_timewait_kill(tw); | ||
654 | tcp_tw_put(tw); | ||
655 | killed++; | ||
656 | } | ||
657 | } else { | ||
658 | if (!adv) { | ||
659 | adv = 1; | ||
660 | tcp_twcal_jiffie = j; | ||
661 | tcp_twcal_hand = slot; | ||
662 | } | ||
663 | |||
664 | if (!hlist_empty(&tcp_twcal_row[slot])) { | ||
665 | mod_timer(&tcp_twcal_timer, j); | ||
666 | goto out; | ||
667 | } | ||
668 | } | ||
669 | j += (1<<TCP_TW_RECYCLE_TICK); | ||
670 | slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1); | ||
671 | } | ||
672 | tcp_twcal_hand = -1; | ||
673 | |||
674 | out: | ||
675 | if ((tcp_tw_count -= killed) == 0) | ||
676 | del_timer(&tcp_tw_timer); | ||
677 | NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); | ||
678 | spin_unlock(&tw_death_lock); | ||
679 | } | ||
680 | |||
681 | /* This is not only more efficient than what we used to do, it eliminates | 339 | /* This is not only more efficient than what we used to do, it eliminates |
682 | * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM | 340 | * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM |
683 | * | 341 | * |
@@ -686,75 +344,27 @@ out: | |||
686 | */ | 344 | */ |
687 | struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) | 345 | struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) |
688 | { | 346 | { |
689 | /* allocate the newsk from the same slab of the master sock, | 347 | struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); |
690 | * if not, at sk_free time we'll try to free it from the wrong | ||
691 | * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */ | ||
692 | struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0); | ||
693 | 348 | ||
694 | if(newsk != NULL) { | 349 | if (newsk != NULL) { |
695 | struct inet_request_sock *ireq = inet_rsk(req); | 350 | const struct inet_request_sock *ireq = inet_rsk(req); |
696 | struct tcp_request_sock *treq = tcp_rsk(req); | 351 | struct tcp_request_sock *treq = tcp_rsk(req); |
352 | struct inet_connection_sock *newicsk = inet_csk(sk); | ||
697 | struct tcp_sock *newtp; | 353 | struct tcp_sock *newtp; |
698 | struct sk_filter *filter; | ||
699 | |||
700 | memcpy(newsk, sk, sizeof(struct tcp_sock)); | ||
701 | newsk->sk_state = TCP_SYN_RECV; | ||
702 | |||
703 | /* SANITY */ | ||
704 | sk_node_init(&newsk->sk_node); | ||
705 | tcp_sk(newsk)->bind_hash = NULL; | ||
706 | |||
707 | /* Clone the TCP header template */ | ||
708 | inet_sk(newsk)->dport = ireq->rmt_port; | ||
709 | |||
710 | sock_lock_init(newsk); | ||
711 | bh_lock_sock(newsk); | ||
712 | |||
713 | rwlock_init(&newsk->sk_dst_lock); | ||
714 | atomic_set(&newsk->sk_rmem_alloc, 0); | ||
715 | skb_queue_head_init(&newsk->sk_receive_queue); | ||
716 | atomic_set(&newsk->sk_wmem_alloc, 0); | ||
717 | skb_queue_head_init(&newsk->sk_write_queue); | ||
718 | atomic_set(&newsk->sk_omem_alloc, 0); | ||
719 | newsk->sk_wmem_queued = 0; | ||
720 | newsk->sk_forward_alloc = 0; | ||
721 | |||
722 | sock_reset_flag(newsk, SOCK_DONE); | ||
723 | newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; | ||
724 | newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; | ||
725 | newsk->sk_send_head = NULL; | ||
726 | rwlock_init(&newsk->sk_callback_lock); | ||
727 | skb_queue_head_init(&newsk->sk_error_queue); | ||
728 | newsk->sk_write_space = sk_stream_write_space; | ||
729 | |||
730 | if ((filter = newsk->sk_filter) != NULL) | ||
731 | sk_filter_charge(newsk, filter); | ||
732 | |||
733 | if (unlikely(xfrm_sk_clone_policy(newsk))) { | ||
734 | /* It is still raw copy of parent, so invalidate | ||
735 | * destructor and make plain sk_free() */ | ||
736 | newsk->sk_destruct = NULL; | ||
737 | sk_free(newsk); | ||
738 | return NULL; | ||
739 | } | ||
740 | 354 | ||
741 | /* Now setup tcp_sock */ | 355 | /* Now setup tcp_sock */ |
742 | newtp = tcp_sk(newsk); | 356 | newtp = tcp_sk(newsk); |
743 | newtp->pred_flags = 0; | 357 | newtp->pred_flags = 0; |
744 | newtp->rcv_nxt = treq->rcv_isn + 1; | 358 | newtp->rcv_nxt = treq->rcv_isn + 1; |
745 | newtp->snd_nxt = treq->snt_isn + 1; | 359 | newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1; |
746 | newtp->snd_una = treq->snt_isn + 1; | ||
747 | newtp->snd_sml = treq->snt_isn + 1; | ||
748 | 360 | ||
749 | tcp_prequeue_init(newtp); | 361 | tcp_prequeue_init(newtp); |
750 | 362 | ||
751 | tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); | 363 | tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); |
752 | 364 | ||
753 | newtp->retransmits = 0; | ||
754 | newtp->backoff = 0; | ||
755 | newtp->srtt = 0; | 365 | newtp->srtt = 0; |
756 | newtp->mdev = TCP_TIMEOUT_INIT; | 366 | newtp->mdev = TCP_TIMEOUT_INIT; |
757 | newtp->rto = TCP_TIMEOUT_INIT; | 367 | newicsk->icsk_rto = TCP_TIMEOUT_INIT; |
758 | 368 | ||
759 | newtp->packets_out = 0; | 369 | newtp->packets_out = 0; |
760 | newtp->left_out = 0; | 370 | newtp->left_out = 0; |
@@ -774,9 +384,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
774 | newtp->frto_counter = 0; | 384 | newtp->frto_counter = 0; |
775 | newtp->frto_highmark = 0; | 385 | newtp->frto_highmark = 0; |
776 | 386 | ||
777 | newtp->ca_ops = &tcp_reno; | 387 | newicsk->icsk_ca_ops = &tcp_reno; |
778 | 388 | ||
779 | tcp_set_ca_state(newtp, TCP_CA_Open); | 389 | tcp_set_ca_state(newsk, TCP_CA_Open); |
780 | tcp_init_xmit_timers(newsk); | 390 | tcp_init_xmit_timers(newsk); |
781 | skb_queue_head_init(&newtp->out_of_order_queue); | 391 | skb_queue_head_init(&newtp->out_of_order_queue); |
782 | newtp->rcv_wup = treq->rcv_isn + 1; | 392 | newtp->rcv_wup = treq->rcv_isn + 1; |
@@ -789,26 +399,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
789 | newtp->rx_opt.dsack = 0; | 399 | newtp->rx_opt.dsack = 0; |
790 | newtp->rx_opt.eff_sacks = 0; | 400 | newtp->rx_opt.eff_sacks = 0; |
791 | 401 | ||
792 | newtp->probes_out = 0; | ||
793 | newtp->rx_opt.num_sacks = 0; | 402 | newtp->rx_opt.num_sacks = 0; |
794 | newtp->urg_data = 0; | 403 | newtp->urg_data = 0; |
795 | /* Deinitialize accept_queue to trap illegal accesses. */ | ||
796 | memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue)); | ||
797 | |||
798 | /* Back to base struct sock members. */ | ||
799 | newsk->sk_err = 0; | ||
800 | newsk->sk_priority = 0; | ||
801 | atomic_set(&newsk->sk_refcnt, 2); | ||
802 | #ifdef INET_REFCNT_DEBUG | ||
803 | atomic_inc(&inet_sock_nr); | ||
804 | #endif | ||
805 | atomic_inc(&tcp_sockets_allocated); | ||
806 | 404 | ||
807 | if (sock_flag(newsk, SOCK_KEEPOPEN)) | 405 | if (sock_flag(newsk, SOCK_KEEPOPEN)) |
808 | tcp_reset_keepalive_timer(newsk, | 406 | inet_csk_reset_keepalive_timer(newsk, |
809 | keepalive_time_when(newtp)); | 407 | keepalive_time_when(newtp)); |
810 | newsk->sk_socket = NULL; | ||
811 | newsk->sk_sleep = NULL; | ||
812 | 408 | ||
813 | newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; | 409 | newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; |
814 | if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { | 410 | if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { |
@@ -838,7 +434,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
838 | newtp->tcp_header_len = sizeof(struct tcphdr); | 434 | newtp->tcp_header_len = sizeof(struct tcphdr); |
839 | } | 435 | } |
840 | if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) | 436 | if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) |
841 | newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len; | 437 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; |
842 | newtp->rx_opt.mss_clamp = req->mss; | 438 | newtp->rx_opt.mss_clamp = req->mss; |
843 | TCP_ECN_openreq_child(newtp, req); | 439 | TCP_ECN_openreq_child(newtp, req); |
844 | if (newtp->ecn_flags&TCP_ECN_OK) | 440 | if (newtp->ecn_flags&TCP_ECN_OK) |
@@ -934,9 +530,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
934 | does sequence test, SYN is truncated, and thus we consider | 530 | does sequence test, SYN is truncated, and thus we consider |
935 | it a bare ACK. | 531 | it a bare ACK. |
936 | 532 | ||
937 | If tp->defer_accept, we silently drop this bare ACK. Otherwise, | 533 | If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this |
938 | we create an established connection. Both ends (listening sockets) | 534 | bare ACK. Otherwise, we create an established connection. Both |
939 | accept the new incoming connection and try to talk to each other. 8-) | 535 | ends (listening sockets) accept the new incoming connection and try |
536 | to talk to each other. 8-) | ||
940 | 537 | ||
941 | Note: This case is both harmless, and rare. Possibility is about the | 538 | Note: This case is both harmless, and rare. Possibility is about the |
942 | same as us discovering intelligent life on another plant tomorrow. | 539 | same as us discovering intelligent life on another plant tomorrow. |
@@ -1003,7 +600,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
1003 | return NULL; | 600 | return NULL; |
1004 | 601 | ||
1005 | /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ | 602 | /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ |
1006 | if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | 603 | if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && |
604 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | ||
1007 | inet_rsk(req)->acked = 1; | 605 | inet_rsk(req)->acked = 1; |
1008 | return NULL; | 606 | return NULL; |
1009 | } | 607 | } |
@@ -1018,10 +616,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
1018 | if (child == NULL) | 616 | if (child == NULL) |
1019 | goto listen_overflow; | 617 | goto listen_overflow; |
1020 | 618 | ||
1021 | tcp_synq_unlink(tp, req, prev); | 619 | inet_csk_reqsk_queue_unlink(sk, req, prev); |
1022 | tcp_synq_removed(sk, req); | 620 | inet_csk_reqsk_queue_removed(sk, req); |
1023 | 621 | ||
1024 | tcp_acceptq_queue(sk, req, child); | 622 | inet_csk_reqsk_queue_add(sk, req, child); |
1025 | return child; | 623 | return child; |
1026 | 624 | ||
1027 | listen_overflow: | 625 | listen_overflow: |
@@ -1035,7 +633,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
1035 | if (!(flg & TCP_FLAG_RST)) | 633 | if (!(flg & TCP_FLAG_RST)) |
1036 | req->rsk_ops->send_reset(skb); | 634 | req->rsk_ops->send_reset(skb); |
1037 | 635 | ||
1038 | tcp_synq_drop(sk, req, prev); | 636 | inet_csk_reqsk_queue_drop(sk, req, prev); |
1039 | return NULL; | 637 | return NULL; |
1040 | } | 638 | } |
1041 | 639 | ||
@@ -1074,4 +672,3 @@ EXPORT_SYMBOL(tcp_check_req); | |||
1074 | EXPORT_SYMBOL(tcp_child_process); | 672 | EXPORT_SYMBOL(tcp_child_process); |
1075 | EXPORT_SYMBOL(tcp_create_openreq_child); | 673 | EXPORT_SYMBOL(tcp_create_openreq_child); |
1076 | EXPORT_SYMBOL(tcp_timewait_state_process); | 674 | EXPORT_SYMBOL(tcp_timewait_state_process); |
1077 | EXPORT_SYMBOL(tcp_tw_deschedule); | ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index dd30dd137b74..75b68116682a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -105,18 +105,19 @@ static __u16 tcp_advertise_mss(struct sock *sk) | |||
105 | 105 | ||
106 | /* RFC2861. Reset CWND after idle period longer RTO to "restart window". | 106 | /* RFC2861. Reset CWND after idle period longer RTO to "restart window". |
107 | * This is the first part of cwnd validation mechanism. */ | 107 | * This is the first part of cwnd validation mechanism. */ |
108 | static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) | 108 | static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) |
109 | { | 109 | { |
110 | struct tcp_sock *tp = tcp_sk(sk); | ||
110 | s32 delta = tcp_time_stamp - tp->lsndtime; | 111 | s32 delta = tcp_time_stamp - tp->lsndtime; |
111 | u32 restart_cwnd = tcp_init_cwnd(tp, dst); | 112 | u32 restart_cwnd = tcp_init_cwnd(tp, dst); |
112 | u32 cwnd = tp->snd_cwnd; | 113 | u32 cwnd = tp->snd_cwnd; |
113 | 114 | ||
114 | tcp_ca_event(tp, CA_EVENT_CWND_RESTART); | 115 | tcp_ca_event(sk, CA_EVENT_CWND_RESTART); |
115 | 116 | ||
116 | tp->snd_ssthresh = tcp_current_ssthresh(tp); | 117 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
117 | restart_cwnd = min(restart_cwnd, cwnd); | 118 | restart_cwnd = min(restart_cwnd, cwnd); |
118 | 119 | ||
119 | while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) | 120 | while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) |
120 | cwnd >>= 1; | 121 | cwnd >>= 1; |
121 | tp->snd_cwnd = max(cwnd, restart_cwnd); | 122 | tp->snd_cwnd = max(cwnd, restart_cwnd); |
122 | tp->snd_cwnd_stamp = tcp_time_stamp; | 123 | tp->snd_cwnd_stamp = tcp_time_stamp; |
@@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) | |||
126 | static inline void tcp_event_data_sent(struct tcp_sock *tp, | 127 | static inline void tcp_event_data_sent(struct tcp_sock *tp, |
127 | struct sk_buff *skb, struct sock *sk) | 128 | struct sk_buff *skb, struct sock *sk) |
128 | { | 129 | { |
129 | u32 now = tcp_time_stamp; | 130 | struct inet_connection_sock *icsk = inet_csk(sk); |
131 | const u32 now = tcp_time_stamp; | ||
130 | 132 | ||
131 | if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) | 133 | if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) |
132 | tcp_cwnd_restart(tp, __sk_dst_get(sk)); | 134 | tcp_cwnd_restart(sk, __sk_dst_get(sk)); |
133 | 135 | ||
134 | tp->lsndtime = now; | 136 | tp->lsndtime = now; |
135 | 137 | ||
136 | /* If it is a reply for ato after last received | 138 | /* If it is a reply for ato after last received |
137 | * packet, enter pingpong mode. | 139 | * packet, enter pingpong mode. |
138 | */ | 140 | */ |
139 | if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato) | 141 | if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) |
140 | tp->ack.pingpong = 1; | 142 | icsk->icsk_ack.pingpong = 1; |
141 | } | 143 | } |
142 | 144 | ||
143 | static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) | 145 | static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) |
144 | { | 146 | { |
145 | struct tcp_sock *tp = tcp_sk(sk); | 147 | tcp_dec_quickack_mode(sk, pkts); |
146 | 148 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); | |
147 | tcp_dec_quickack_mode(tp, pkts); | ||
148 | tcp_clear_xmit_timer(sk, TCP_TIME_DACK); | ||
149 | } | 149 | } |
150 | 150 | ||
151 | /* Determine a window scaling and initial window to offer. | 151 | /* Determine a window scaling and initial window to offer. |
@@ -265,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk) | |||
265 | static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | 265 | static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) |
266 | { | 266 | { |
267 | if (skb != NULL) { | 267 | if (skb != NULL) { |
268 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
268 | struct inet_sock *inet = inet_sk(sk); | 269 | struct inet_sock *inet = inet_sk(sk); |
269 | struct tcp_sock *tp = tcp_sk(sk); | 270 | struct tcp_sock *tp = tcp_sk(sk); |
270 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); | 271 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
@@ -280,8 +281,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | |||
280 | #define SYSCTL_FLAG_SACK 0x4 | 281 | #define SYSCTL_FLAG_SACK 0x4 |
281 | 282 | ||
282 | /* If congestion control is doing timestamping */ | 283 | /* If congestion control is doing timestamping */ |
283 | if (tp->ca_ops->rtt_sample) | 284 | if (icsk->icsk_ca_ops->rtt_sample) |
284 | do_gettimeofday(&skb->stamp); | 285 | __net_timestamp(skb); |
285 | 286 | ||
286 | sysctl_flags = 0; | 287 | sysctl_flags = 0; |
287 | if (tcb->flags & TCPCB_FLAG_SYN) { | 288 | if (tcb->flags & TCPCB_FLAG_SYN) { |
@@ -308,7 +309,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | |||
308 | } | 309 | } |
309 | 310 | ||
310 | if (tcp_packets_in_flight(tp) == 0) | 311 | if (tcp_packets_in_flight(tp) == 0) |
311 | tcp_ca_event(tp, CA_EVENT_TX_START); | 312 | tcp_ca_event(sk, CA_EVENT_TX_START); |
312 | 313 | ||
313 | th = (struct tcphdr *) skb_push(skb, tcp_header_size); | 314 | th = (struct tcphdr *) skb_push(skb, tcp_header_size); |
314 | skb->h.th = th; | 315 | skb->h.th = th; |
@@ -366,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | |||
366 | if (err <= 0) | 367 | if (err <= 0) |
367 | return err; | 368 | return err; |
368 | 369 | ||
369 | tcp_enter_cwr(tp); | 370 | tcp_enter_cwr(sk); |
370 | 371 | ||
371 | /* NET_XMIT_CN is special. It does not guarantee, | 372 | /* NET_XMIT_CN is special. It does not guarantee, |
372 | * that this packet is lost. It tells that device | 373 | * that this packet is lost. It tells that device |
@@ -482,7 +483,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned | |||
482 | * skbs, which it never sent before. --ANK | 483 | * skbs, which it never sent before. --ANK |
483 | */ | 484 | */ |
484 | TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; | 485 | TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; |
485 | buff->stamp = skb->stamp; | 486 | buff->tstamp = skb->tstamp; |
486 | 487 | ||
487 | if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { | 488 | if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { |
488 | tp->lost_out -= tcp_skb_pcount(skb); | 489 | tp->lost_out -= tcp_skb_pcount(skb); |
@@ -505,7 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned | |||
505 | 506 | ||
506 | /* Link BUFF into the send queue. */ | 507 | /* Link BUFF into the send queue. */ |
507 | skb_header_release(buff); | 508 | skb_header_release(buff); |
508 | __skb_append(skb, buff); | 509 | __skb_append(skb, buff, &sk->sk_write_queue); |
509 | 510 | ||
510 | return 0; | 511 | return 0; |
511 | } | 512 | } |
@@ -696,7 +697,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) | |||
696 | if (tp->packets_out > tp->snd_cwnd_used) | 697 | if (tp->packets_out > tp->snd_cwnd_used) |
697 | tp->snd_cwnd_used = tp->packets_out; | 698 | tp->snd_cwnd_used = tp->packets_out; |
698 | 699 | ||
699 | if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) | 700 | if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) |
700 | tcp_cwnd_application_limited(sk); | 701 | tcp_cwnd_application_limited(sk); |
701 | } | 702 | } |
702 | } | 703 | } |
@@ -893,7 +894,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, | |||
893 | 894 | ||
894 | /* Link BUFF into the send queue. */ | 895 | /* Link BUFF into the send queue. */ |
895 | skb_header_release(buff); | 896 | skb_header_release(buff); |
896 | __skb_append(skb, buff); | 897 | __skb_append(skb, buff, &sk->sk_write_queue); |
897 | 898 | ||
898 | return 0; | 899 | return 0; |
899 | } | 900 | } |
@@ -905,12 +906,13 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, | |||
905 | */ | 906 | */ |
906 | static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) | 907 | static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) |
907 | { | 908 | { |
909 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
908 | u32 send_win, cong_win, limit, in_flight; | 910 | u32 send_win, cong_win, limit, in_flight; |
909 | 911 | ||
910 | if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) | 912 | if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) |
911 | return 0; | 913 | return 0; |
912 | 914 | ||
913 | if (tp->ca_state != TCP_CA_Open) | 915 | if (icsk->icsk_ca_state != TCP_CA_Open) |
914 | return 0; | 916 | return 0; |
915 | 917 | ||
916 | in_flight = tcp_packets_in_flight(tp); | 918 | in_flight = tcp_packets_in_flight(tp); |
@@ -1147,6 +1149,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) | |||
1147 | */ | 1149 | */ |
1148 | u32 __tcp_select_window(struct sock *sk) | 1150 | u32 __tcp_select_window(struct sock *sk) |
1149 | { | 1151 | { |
1152 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1150 | struct tcp_sock *tp = tcp_sk(sk); | 1153 | struct tcp_sock *tp = tcp_sk(sk); |
1151 | /* MSS for the peer's data. Previous verions used mss_clamp | 1154 | /* MSS for the peer's data. Previous verions used mss_clamp |
1152 | * here. I don't know if the value based on our guesses | 1155 | * here. I don't know if the value based on our guesses |
@@ -1154,7 +1157,7 @@ u32 __tcp_select_window(struct sock *sk) | |||
1154 | * but may be worse for the performance because of rcv_mss | 1157 | * but may be worse for the performance because of rcv_mss |
1155 | * fluctuations. --SAW 1998/11/1 | 1158 | * fluctuations. --SAW 1998/11/1 |
1156 | */ | 1159 | */ |
1157 | int mss = tp->ack.rcv_mss; | 1160 | int mss = icsk->icsk_ack.rcv_mss; |
1158 | int free_space = tcp_space(sk); | 1161 | int free_space = tcp_space(sk); |
1159 | int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); | 1162 | int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); |
1160 | int window; | 1163 | int window; |
@@ -1163,7 +1166,7 @@ u32 __tcp_select_window(struct sock *sk) | |||
1163 | mss = full_space; | 1166 | mss = full_space; |
1164 | 1167 | ||
1165 | if (free_space < full_space/2) { | 1168 | if (free_space < full_space/2) { |
1166 | tp->ack.quick = 0; | 1169 | icsk->icsk_ack.quick = 0; |
1167 | 1170 | ||
1168 | if (tcp_memory_pressure) | 1171 | if (tcp_memory_pressure) |
1169 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); | 1172 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); |
@@ -1238,7 +1241,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m | |||
1238 | tcp_skb_pcount(next_skb) != 1); | 1241 | tcp_skb_pcount(next_skb) != 1); |
1239 | 1242 | ||
1240 | /* Ok. We will be able to collapse the packet. */ | 1243 | /* Ok. We will be able to collapse the packet. */ |
1241 | __skb_unlink(next_skb, next_skb->list); | 1244 | __skb_unlink(next_skb, &sk->sk_write_queue); |
1242 | 1245 | ||
1243 | memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); | 1246 | memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); |
1244 | 1247 | ||
@@ -1286,6 +1289,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m | |||
1286 | */ | 1289 | */ |
1287 | void tcp_simple_retransmit(struct sock *sk) | 1290 | void tcp_simple_retransmit(struct sock *sk) |
1288 | { | 1291 | { |
1292 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1289 | struct tcp_sock *tp = tcp_sk(sk); | 1293 | struct tcp_sock *tp = tcp_sk(sk); |
1290 | struct sk_buff *skb; | 1294 | struct sk_buff *skb; |
1291 | unsigned int mss = tcp_current_mss(sk, 0); | 1295 | unsigned int mss = tcp_current_mss(sk, 0); |
@@ -1316,12 +1320,12 @@ void tcp_simple_retransmit(struct sock *sk) | |||
1316 | * in network, but units changed and effective | 1320 | * in network, but units changed and effective |
1317 | * cwnd/ssthresh really reduced now. | 1321 | * cwnd/ssthresh really reduced now. |
1318 | */ | 1322 | */ |
1319 | if (tp->ca_state != TCP_CA_Loss) { | 1323 | if (icsk->icsk_ca_state != TCP_CA_Loss) { |
1320 | tp->high_seq = tp->snd_nxt; | 1324 | tp->high_seq = tp->snd_nxt; |
1321 | tp->snd_ssthresh = tcp_current_ssthresh(tp); | 1325 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
1322 | tp->prior_ssthresh = 0; | 1326 | tp->prior_ssthresh = 0; |
1323 | tp->undo_marker = 0; | 1327 | tp->undo_marker = 0; |
1324 | tcp_set_ca_state(tp, TCP_CA_Loss); | 1328 | tcp_set_ca_state(sk, TCP_CA_Loss); |
1325 | } | 1329 | } |
1326 | tcp_xmit_retransmit_queue(sk); | 1330 | tcp_xmit_retransmit_queue(sk); |
1327 | } | 1331 | } |
@@ -1461,6 +1465,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
1461 | */ | 1465 | */ |
1462 | void tcp_xmit_retransmit_queue(struct sock *sk) | 1466 | void tcp_xmit_retransmit_queue(struct sock *sk) |
1463 | { | 1467 | { |
1468 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1464 | struct tcp_sock *tp = tcp_sk(sk); | 1469 | struct tcp_sock *tp = tcp_sk(sk); |
1465 | struct sk_buff *skb; | 1470 | struct sk_buff *skb; |
1466 | int packet_cnt = tp->lost_out; | 1471 | int packet_cnt = tp->lost_out; |
@@ -1484,14 +1489,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1484 | if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { | 1489 | if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { |
1485 | if (tcp_retransmit_skb(sk, skb)) | 1490 | if (tcp_retransmit_skb(sk, skb)) |
1486 | return; | 1491 | return; |
1487 | if (tp->ca_state != TCP_CA_Loss) | 1492 | if (icsk->icsk_ca_state != TCP_CA_Loss) |
1488 | NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); | 1493 | NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); |
1489 | else | 1494 | else |
1490 | NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); | 1495 | NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); |
1491 | 1496 | ||
1492 | if (skb == | 1497 | if (skb == |
1493 | skb_peek(&sk->sk_write_queue)) | 1498 | skb_peek(&sk->sk_write_queue)) |
1494 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); | 1499 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
1500 | inet_csk(sk)->icsk_rto, | ||
1501 | TCP_RTO_MAX); | ||
1495 | } | 1502 | } |
1496 | 1503 | ||
1497 | packet_cnt -= tcp_skb_pcount(skb); | 1504 | packet_cnt -= tcp_skb_pcount(skb); |
@@ -1504,7 +1511,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1504 | /* OK, demanded retransmission is finished. */ | 1511 | /* OK, demanded retransmission is finished. */ |
1505 | 1512 | ||
1506 | /* Forward retransmissions are possible only during Recovery. */ | 1513 | /* Forward retransmissions are possible only during Recovery. */ |
1507 | if (tp->ca_state != TCP_CA_Recovery) | 1514 | if (icsk->icsk_ca_state != TCP_CA_Recovery) |
1508 | return; | 1515 | return; |
1509 | 1516 | ||
1510 | /* No forward retransmissions in Reno are possible. */ | 1517 | /* No forward retransmissions in Reno are possible. */ |
@@ -1544,7 +1551,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1544 | break; | 1551 | break; |
1545 | 1552 | ||
1546 | if (skb == skb_peek(&sk->sk_write_queue)) | 1553 | if (skb == skb_peek(&sk->sk_write_queue)) |
1547 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); | 1554 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
1555 | inet_csk(sk)->icsk_rto, | ||
1556 | TCP_RTO_MAX); | ||
1548 | 1557 | ||
1549 | NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); | 1558 | NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); |
1550 | } | 1559 | } |
@@ -1573,7 +1582,7 @@ void tcp_send_fin(struct sock *sk) | |||
1573 | } else { | 1582 | } else { |
1574 | /* Socket is locked, keep trying until memory is available. */ | 1583 | /* Socket is locked, keep trying until memory is available. */ |
1575 | for (;;) { | 1584 | for (;;) { |
1576 | skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL); | 1585 | skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL); |
1577 | if (skb) | 1586 | if (skb) |
1578 | break; | 1587 | break; |
1579 | yield(); | 1588 | yield(); |
@@ -1780,8 +1789,8 @@ static inline void tcp_connect_init(struct sock *sk) | |||
1780 | tp->rcv_wup = 0; | 1789 | tp->rcv_wup = 0; |
1781 | tp->copied_seq = 0; | 1790 | tp->copied_seq = 0; |
1782 | 1791 | ||
1783 | tp->rto = TCP_TIMEOUT_INIT; | 1792 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; |
1784 | tp->retransmits = 0; | 1793 | inet_csk(sk)->icsk_retransmits = 0; |
1785 | tcp_clear_retrans(tp); | 1794 | tcp_clear_retrans(tp); |
1786 | } | 1795 | } |
1787 | 1796 | ||
@@ -1795,7 +1804,7 @@ int tcp_connect(struct sock *sk) | |||
1795 | 1804 | ||
1796 | tcp_connect_init(sk); | 1805 | tcp_connect_init(sk); |
1797 | 1806 | ||
1798 | buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation); | 1807 | buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); |
1799 | if (unlikely(buff == NULL)) | 1808 | if (unlikely(buff == NULL)) |
1800 | return -ENOBUFS; | 1809 | return -ENOBUFS; |
1801 | 1810 | ||
@@ -1824,7 +1833,8 @@ int tcp_connect(struct sock *sk) | |||
1824 | TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); | 1833 | TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); |
1825 | 1834 | ||
1826 | /* Timer for repeating the SYN until an answer. */ | 1835 | /* Timer for repeating the SYN until an answer. */ |
1827 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); | 1836 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
1837 | inet_csk(sk)->icsk_rto, TCP_RTO_MAX); | ||
1828 | return 0; | 1838 | return 0; |
1829 | } | 1839 | } |
1830 | 1840 | ||
@@ -1834,20 +1844,21 @@ int tcp_connect(struct sock *sk) | |||
1834 | */ | 1844 | */ |
1835 | void tcp_send_delayed_ack(struct sock *sk) | 1845 | void tcp_send_delayed_ack(struct sock *sk) |
1836 | { | 1846 | { |
1837 | struct tcp_sock *tp = tcp_sk(sk); | 1847 | struct inet_connection_sock *icsk = inet_csk(sk); |
1838 | int ato = tp->ack.ato; | 1848 | int ato = icsk->icsk_ack.ato; |
1839 | unsigned long timeout; | 1849 | unsigned long timeout; |
1840 | 1850 | ||
1841 | if (ato > TCP_DELACK_MIN) { | 1851 | if (ato > TCP_DELACK_MIN) { |
1852 | const struct tcp_sock *tp = tcp_sk(sk); | ||
1842 | int max_ato = HZ/2; | 1853 | int max_ato = HZ/2; |
1843 | 1854 | ||
1844 | if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED)) | 1855 | if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) |
1845 | max_ato = TCP_DELACK_MAX; | 1856 | max_ato = TCP_DELACK_MAX; |
1846 | 1857 | ||
1847 | /* Slow path, intersegment interval is "high". */ | 1858 | /* Slow path, intersegment interval is "high". */ |
1848 | 1859 | ||
1849 | /* If some rtt estimate is known, use it to bound delayed ack. | 1860 | /* If some rtt estimate is known, use it to bound delayed ack. |
1850 | * Do not use tp->rto here, use results of rtt measurements | 1861 | * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements |
1851 | * directly. | 1862 | * directly. |
1852 | */ | 1863 | */ |
1853 | if (tp->srtt) { | 1864 | if (tp->srtt) { |
@@ -1864,21 +1875,22 @@ void tcp_send_delayed_ack(struct sock *sk) | |||
1864 | timeout = jiffies + ato; | 1875 | timeout = jiffies + ato; |
1865 | 1876 | ||
1866 | /* Use new timeout only if there wasn't a older one earlier. */ | 1877 | /* Use new timeout only if there wasn't a older one earlier. */ |
1867 | if (tp->ack.pending&TCP_ACK_TIMER) { | 1878 | if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { |
1868 | /* If delack timer was blocked or is about to expire, | 1879 | /* If delack timer was blocked or is about to expire, |
1869 | * send ACK now. | 1880 | * send ACK now. |
1870 | */ | 1881 | */ |
1871 | if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) { | 1882 | if (icsk->icsk_ack.blocked || |
1883 | time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { | ||
1872 | tcp_send_ack(sk); | 1884 | tcp_send_ack(sk); |
1873 | return; | 1885 | return; |
1874 | } | 1886 | } |
1875 | 1887 | ||
1876 | if (!time_before(timeout, tp->ack.timeout)) | 1888 | if (!time_before(timeout, icsk->icsk_ack.timeout)) |
1877 | timeout = tp->ack.timeout; | 1889 | timeout = icsk->icsk_ack.timeout; |
1878 | } | 1890 | } |
1879 | tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER; | 1891 | icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; |
1880 | tp->ack.timeout = timeout; | 1892 | icsk->icsk_ack.timeout = timeout; |
1881 | sk_reset_timer(sk, &tp->delack_timer, timeout); | 1893 | sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); |
1882 | } | 1894 | } |
1883 | 1895 | ||
1884 | /* This routine sends an ack and also updates the window. */ | 1896 | /* This routine sends an ack and also updates the window. */ |
@@ -1895,9 +1907,10 @@ void tcp_send_ack(struct sock *sk) | |||
1895 | */ | 1907 | */ |
1896 | buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); | 1908 | buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); |
1897 | if (buff == NULL) { | 1909 | if (buff == NULL) { |
1898 | tcp_schedule_ack(tp); | 1910 | inet_csk_schedule_ack(sk); |
1899 | tp->ack.ato = TCP_ATO_MIN; | 1911 | inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; |
1900 | tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); | 1912 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, |
1913 | TCP_DELACK_MAX, TCP_RTO_MAX); | ||
1901 | return; | 1914 | return; |
1902 | } | 1915 | } |
1903 | 1916 | ||
@@ -2011,6 +2024,7 @@ int tcp_write_wakeup(struct sock *sk) | |||
2011 | */ | 2024 | */ |
2012 | void tcp_send_probe0(struct sock *sk) | 2025 | void tcp_send_probe0(struct sock *sk) |
2013 | { | 2026 | { |
2027 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
2014 | struct tcp_sock *tp = tcp_sk(sk); | 2028 | struct tcp_sock *tp = tcp_sk(sk); |
2015 | int err; | 2029 | int err; |
2016 | 2030 | ||
@@ -2018,28 +2032,31 @@ void tcp_send_probe0(struct sock *sk) | |||
2018 | 2032 | ||
2019 | if (tp->packets_out || !sk->sk_send_head) { | 2033 | if (tp->packets_out || !sk->sk_send_head) { |
2020 | /* Cancel probe timer, if it is not required. */ | 2034 | /* Cancel probe timer, if it is not required. */ |
2021 | tp->probes_out = 0; | 2035 | icsk->icsk_probes_out = 0; |
2022 | tp->backoff = 0; | 2036 | icsk->icsk_backoff = 0; |
2023 | return; | 2037 | return; |
2024 | } | 2038 | } |
2025 | 2039 | ||
2026 | if (err <= 0) { | 2040 | if (err <= 0) { |
2027 | if (tp->backoff < sysctl_tcp_retries2) | 2041 | if (icsk->icsk_backoff < sysctl_tcp_retries2) |
2028 | tp->backoff++; | 2042 | icsk->icsk_backoff++; |
2029 | tp->probes_out++; | 2043 | icsk->icsk_probes_out++; |
2030 | tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, | 2044 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
2031 | min(tp->rto << tp->backoff, TCP_RTO_MAX)); | 2045 | min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), |
2046 | TCP_RTO_MAX); | ||
2032 | } else { | 2047 | } else { |
2033 | /* If packet was not sent due to local congestion, | 2048 | /* If packet was not sent due to local congestion, |
2034 | * do not backoff and do not remember probes_out. | 2049 | * do not backoff and do not remember icsk_probes_out. |
2035 | * Let local senders to fight for local resources. | 2050 | * Let local senders to fight for local resources. |
2036 | * | 2051 | * |
2037 | * Use accumulated backoff yet. | 2052 | * Use accumulated backoff yet. |
2038 | */ | 2053 | */ |
2039 | if (!tp->probes_out) | 2054 | if (!icsk->icsk_probes_out) |
2040 | tp->probes_out=1; | 2055 | icsk->icsk_probes_out = 1; |
2041 | tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, | 2056 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
2042 | min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); | 2057 | min(icsk->icsk_rto << icsk->icsk_backoff, |
2058 | TCP_RESOURCE_PROBE_INTERVAL), | ||
2059 | TCP_RTO_MAX); | ||
2043 | } | 2060 | } |
2044 | } | 2061 | } |
2045 | 2062 | ||
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 70e108e15c71..327770bf5522 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c | |||
@@ -16,9 +16,10 @@ | |||
16 | #define TCP_SCALABLE_AI_CNT 50U | 16 | #define TCP_SCALABLE_AI_CNT 50U |
17 | #define TCP_SCALABLE_MD_SCALE 3 | 17 | #define TCP_SCALABLE_MD_SCALE 3 |
18 | 18 | ||
19 | static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | 19 | static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, |
20 | u32 in_flight, int flag) | 20 | u32 in_flight, int flag) |
21 | { | 21 | { |
22 | struct tcp_sock *tp = tcp_sk(sk); | ||
22 | if (in_flight < tp->snd_cwnd) | 23 | if (in_flight < tp->snd_cwnd) |
23 | return; | 24 | return; |
24 | 25 | ||
@@ -35,8 +36,9 @@ static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | |||
35 | tp->snd_cwnd_stamp = tcp_time_stamp; | 36 | tp->snd_cwnd_stamp = tcp_time_stamp; |
36 | } | 37 | } |
37 | 38 | ||
38 | static u32 tcp_scalable_ssthresh(struct tcp_sock *tp) | 39 | static u32 tcp_scalable_ssthresh(struct sock *sk) |
39 | { | 40 | { |
41 | const struct tcp_sock *tp = tcp_sk(sk); | ||
40 | return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); | 42 | return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); |
41 | } | 43 | } |
42 | 44 | ||
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0084227438c2..415ee47ac1c5 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -36,49 +36,13 @@ static void tcp_write_timer(unsigned long); | |||
36 | static void tcp_delack_timer(unsigned long); | 36 | static void tcp_delack_timer(unsigned long); |
37 | static void tcp_keepalive_timer (unsigned long data); | 37 | static void tcp_keepalive_timer (unsigned long data); |
38 | 38 | ||
39 | #ifdef TCP_DEBUG | ||
40 | const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; | ||
41 | EXPORT_SYMBOL(tcp_timer_bug_msg); | ||
42 | #endif | ||
43 | |||
44 | /* | ||
45 | * Using different timers for retransmit, delayed acks and probes | ||
46 | * We may wish use just one timer maintaining a list of expire jiffies | ||
47 | * to optimize. | ||
48 | */ | ||
49 | |||
50 | void tcp_init_xmit_timers(struct sock *sk) | 39 | void tcp_init_xmit_timers(struct sock *sk) |
51 | { | 40 | { |
52 | struct tcp_sock *tp = tcp_sk(sk); | 41 | inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, |
53 | 42 | &tcp_keepalive_timer); | |
54 | init_timer(&tp->retransmit_timer); | ||
55 | tp->retransmit_timer.function=&tcp_write_timer; | ||
56 | tp->retransmit_timer.data = (unsigned long) sk; | ||
57 | tp->pending = 0; | ||
58 | |||
59 | init_timer(&tp->delack_timer); | ||
60 | tp->delack_timer.function=&tcp_delack_timer; | ||
61 | tp->delack_timer.data = (unsigned long) sk; | ||
62 | tp->ack.pending = 0; | ||
63 | |||
64 | init_timer(&sk->sk_timer); | ||
65 | sk->sk_timer.function = &tcp_keepalive_timer; | ||
66 | sk->sk_timer.data = (unsigned long)sk; | ||
67 | } | 43 | } |
68 | 44 | ||
69 | void tcp_clear_xmit_timers(struct sock *sk) | 45 | EXPORT_SYMBOL(tcp_init_xmit_timers); |
70 | { | ||
71 | struct tcp_sock *tp = tcp_sk(sk); | ||
72 | |||
73 | tp->pending = 0; | ||
74 | sk_stop_timer(sk, &tp->retransmit_timer); | ||
75 | |||
76 | tp->ack.pending = 0; | ||
77 | tp->ack.blocked = 0; | ||
78 | sk_stop_timer(sk, &tp->delack_timer); | ||
79 | |||
80 | sk_stop_timer(sk, &sk->sk_timer); | ||
81 | } | ||
82 | 46 | ||
83 | static void tcp_write_err(struct sock *sk) | 47 | static void tcp_write_err(struct sock *sk) |
84 | { | 48 | { |
@@ -155,15 +119,15 @@ static int tcp_orphan_retries(struct sock *sk, int alive) | |||
155 | /* A write timeout has occurred. Process the after effects. */ | 119 | /* A write timeout has occurred. Process the after effects. */ |
156 | static int tcp_write_timeout(struct sock *sk) | 120 | static int tcp_write_timeout(struct sock *sk) |
157 | { | 121 | { |
158 | struct tcp_sock *tp = tcp_sk(sk); | 122 | const struct inet_connection_sock *icsk = inet_csk(sk); |
159 | int retry_until; | 123 | int retry_until; |
160 | 124 | ||
161 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { | 125 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { |
162 | if (tp->retransmits) | 126 | if (icsk->icsk_retransmits) |
163 | dst_negative_advice(&sk->sk_dst_cache); | 127 | dst_negative_advice(&sk->sk_dst_cache); |
164 | retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries; | 128 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
165 | } else { | 129 | } else { |
166 | if (tp->retransmits >= sysctl_tcp_retries1) { | 130 | if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { |
167 | /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black | 131 | /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black |
168 | hole detection. :-( | 132 | hole detection. :-( |
169 | 133 | ||
@@ -189,16 +153,16 @@ static int tcp_write_timeout(struct sock *sk) | |||
189 | 153 | ||
190 | retry_until = sysctl_tcp_retries2; | 154 | retry_until = sysctl_tcp_retries2; |
191 | if (sock_flag(sk, SOCK_DEAD)) { | 155 | if (sock_flag(sk, SOCK_DEAD)) { |
192 | int alive = (tp->rto < TCP_RTO_MAX); | 156 | const int alive = (icsk->icsk_rto < TCP_RTO_MAX); |
193 | 157 | ||
194 | retry_until = tcp_orphan_retries(sk, alive); | 158 | retry_until = tcp_orphan_retries(sk, alive); |
195 | 159 | ||
196 | if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until)) | 160 | if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until)) |
197 | return 1; | 161 | return 1; |
198 | } | 162 | } |
199 | } | 163 | } |
200 | 164 | ||
201 | if (tp->retransmits >= retry_until) { | 165 | if (icsk->icsk_retransmits >= retry_until) { |
202 | /* Has it gone just too far? */ | 166 | /* Has it gone just too far? */ |
203 | tcp_write_err(sk); | 167 | tcp_write_err(sk); |
204 | return 1; | 168 | return 1; |
@@ -210,26 +174,27 @@ static void tcp_delack_timer(unsigned long data) | |||
210 | { | 174 | { |
211 | struct sock *sk = (struct sock*)data; | 175 | struct sock *sk = (struct sock*)data; |
212 | struct tcp_sock *tp = tcp_sk(sk); | 176 | struct tcp_sock *tp = tcp_sk(sk); |
177 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
213 | 178 | ||
214 | bh_lock_sock(sk); | 179 | bh_lock_sock(sk); |
215 | if (sock_owned_by_user(sk)) { | 180 | if (sock_owned_by_user(sk)) { |
216 | /* Try again later. */ | 181 | /* Try again later. */ |
217 | tp->ack.blocked = 1; | 182 | icsk->icsk_ack.blocked = 1; |
218 | NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); | 183 | NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); |
219 | sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN); | 184 | sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); |
220 | goto out_unlock; | 185 | goto out_unlock; |
221 | } | 186 | } |
222 | 187 | ||
223 | sk_stream_mem_reclaim(sk); | 188 | sk_stream_mem_reclaim(sk); |
224 | 189 | ||
225 | if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER)) | 190 | if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) |
226 | goto out; | 191 | goto out; |
227 | 192 | ||
228 | if (time_after(tp->ack.timeout, jiffies)) { | 193 | if (time_after(icsk->icsk_ack.timeout, jiffies)) { |
229 | sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout); | 194 | sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); |
230 | goto out; | 195 | goto out; |
231 | } | 196 | } |
232 | tp->ack.pending &= ~TCP_ACK_TIMER; | 197 | icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; |
233 | 198 | ||
234 | if (!skb_queue_empty(&tp->ucopy.prequeue)) { | 199 | if (!skb_queue_empty(&tp->ucopy.prequeue)) { |
235 | struct sk_buff *skb; | 200 | struct sk_buff *skb; |
@@ -242,16 +207,16 @@ static void tcp_delack_timer(unsigned long data) | |||
242 | tp->ucopy.memory = 0; | 207 | tp->ucopy.memory = 0; |
243 | } | 208 | } |
244 | 209 | ||
245 | if (tcp_ack_scheduled(tp)) { | 210 | if (inet_csk_ack_scheduled(sk)) { |
246 | if (!tp->ack.pingpong) { | 211 | if (!icsk->icsk_ack.pingpong) { |
247 | /* Delayed ACK missed: inflate ATO. */ | 212 | /* Delayed ACK missed: inflate ATO. */ |
248 | tp->ack.ato = min(tp->ack.ato << 1, tp->rto); | 213 | icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); |
249 | } else { | 214 | } else { |
250 | /* Delayed ACK missed: leave pingpong mode and | 215 | /* Delayed ACK missed: leave pingpong mode and |
251 | * deflate ATO. | 216 | * deflate ATO. |
252 | */ | 217 | */ |
253 | tp->ack.pingpong = 0; | 218 | icsk->icsk_ack.pingpong = 0; |
254 | tp->ack.ato = TCP_ATO_MIN; | 219 | icsk->icsk_ack.ato = TCP_ATO_MIN; |
255 | } | 220 | } |
256 | tcp_send_ack(sk); | 221 | tcp_send_ack(sk); |
257 | NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); | 222 | NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); |
@@ -268,11 +233,12 @@ out_unlock: | |||
268 | 233 | ||
269 | static void tcp_probe_timer(struct sock *sk) | 234 | static void tcp_probe_timer(struct sock *sk) |
270 | { | 235 | { |
236 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
271 | struct tcp_sock *tp = tcp_sk(sk); | 237 | struct tcp_sock *tp = tcp_sk(sk); |
272 | int max_probes; | 238 | int max_probes; |
273 | 239 | ||
274 | if (tp->packets_out || !sk->sk_send_head) { | 240 | if (tp->packets_out || !sk->sk_send_head) { |
275 | tp->probes_out = 0; | 241 | icsk->icsk_probes_out = 0; |
276 | return; | 242 | return; |
277 | } | 243 | } |
278 | 244 | ||
@@ -283,7 +249,7 @@ static void tcp_probe_timer(struct sock *sk) | |||
283 | * FIXME: We ought not to do it, Solaris 2.5 actually has fixing | 249 | * FIXME: We ought not to do it, Solaris 2.5 actually has fixing |
284 | * this behaviour in Solaris down as a bug fix. [AC] | 250 | * this behaviour in Solaris down as a bug fix. [AC] |
285 | * | 251 | * |
286 | * Let me to explain. probes_out is zeroed by incoming ACKs | 252 | * Let me to explain. icsk_probes_out is zeroed by incoming ACKs |
287 | * even if they advertise zero window. Hence, connection is killed only | 253 | * even if they advertise zero window. Hence, connection is killed only |
288 | * if we received no ACKs for normal connection timeout. It is not killed | 254 | * if we received no ACKs for normal connection timeout. It is not killed |
289 | * only because window stays zero for some time, window may be zero | 255 | * only because window stays zero for some time, window may be zero |
@@ -294,15 +260,15 @@ static void tcp_probe_timer(struct sock *sk) | |||
294 | max_probes = sysctl_tcp_retries2; | 260 | max_probes = sysctl_tcp_retries2; |
295 | 261 | ||
296 | if (sock_flag(sk, SOCK_DEAD)) { | 262 | if (sock_flag(sk, SOCK_DEAD)) { |
297 | int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX); | 263 | const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); |
298 | 264 | ||
299 | max_probes = tcp_orphan_retries(sk, alive); | 265 | max_probes = tcp_orphan_retries(sk, alive); |
300 | 266 | ||
301 | if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes)) | 267 | if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) |
302 | return; | 268 | return; |
303 | } | 269 | } |
304 | 270 | ||
305 | if (tp->probes_out > max_probes) { | 271 | if (icsk->icsk_probes_out > max_probes) { |
306 | tcp_write_err(sk); | 272 | tcp_write_err(sk); |
307 | } else { | 273 | } else { |
308 | /* Only send another probe if we didn't close things up. */ | 274 | /* Only send another probe if we didn't close things up. */ |
@@ -317,6 +283,7 @@ static void tcp_probe_timer(struct sock *sk) | |||
317 | static void tcp_retransmit_timer(struct sock *sk) | 283 | static void tcp_retransmit_timer(struct sock *sk) |
318 | { | 284 | { |
319 | struct tcp_sock *tp = tcp_sk(sk); | 285 | struct tcp_sock *tp = tcp_sk(sk); |
286 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
320 | 287 | ||
321 | if (!tp->packets_out) | 288 | if (!tp->packets_out) |
322 | goto out; | 289 | goto out; |
@@ -351,20 +318,21 @@ static void tcp_retransmit_timer(struct sock *sk) | |||
351 | if (tcp_write_timeout(sk)) | 318 | if (tcp_write_timeout(sk)) |
352 | goto out; | 319 | goto out; |
353 | 320 | ||
354 | if (tp->retransmits == 0) { | 321 | if (icsk->icsk_retransmits == 0) { |
355 | if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { | 322 | if (icsk->icsk_ca_state == TCP_CA_Disorder || |
323 | icsk->icsk_ca_state == TCP_CA_Recovery) { | ||
356 | if (tp->rx_opt.sack_ok) { | 324 | if (tp->rx_opt.sack_ok) { |
357 | if (tp->ca_state == TCP_CA_Recovery) | 325 | if (icsk->icsk_ca_state == TCP_CA_Recovery) |
358 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); | 326 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); |
359 | else | 327 | else |
360 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); | 328 | NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); |
361 | } else { | 329 | } else { |
362 | if (tp->ca_state == TCP_CA_Recovery) | 330 | if (icsk->icsk_ca_state == TCP_CA_Recovery) |
363 | NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); | 331 | NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); |
364 | else | 332 | else |
365 | NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); | 333 | NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); |
366 | } | 334 | } |
367 | } else if (tp->ca_state == TCP_CA_Loss) { | 335 | } else if (icsk->icsk_ca_state == TCP_CA_Loss) { |
368 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); | 336 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); |
369 | } else { | 337 | } else { |
370 | NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); | 338 | NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); |
@@ -381,10 +349,11 @@ static void tcp_retransmit_timer(struct sock *sk) | |||
381 | /* Retransmission failed because of local congestion, | 349 | /* Retransmission failed because of local congestion, |
382 | * do not backoff. | 350 | * do not backoff. |
383 | */ | 351 | */ |
384 | if (!tp->retransmits) | 352 | if (!icsk->icsk_retransmits) |
385 | tp->retransmits=1; | 353 | icsk->icsk_retransmits = 1; |
386 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, | 354 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
387 | min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); | 355 | min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), |
356 | TCP_RTO_MAX); | ||
388 | goto out; | 357 | goto out; |
389 | } | 358 | } |
390 | 359 | ||
@@ -403,13 +372,13 @@ static void tcp_retransmit_timer(struct sock *sk) | |||
403 | * implemented ftp to mars will work nicely. We will have to fix | 372 | * implemented ftp to mars will work nicely. We will have to fix |
404 | * the 120 second clamps though! | 373 | * the 120 second clamps though! |
405 | */ | 374 | */ |
406 | tp->backoff++; | 375 | icsk->icsk_backoff++; |
407 | tp->retransmits++; | 376 | icsk->icsk_retransmits++; |
408 | 377 | ||
409 | out_reset_timer: | 378 | out_reset_timer: |
410 | tp->rto = min(tp->rto << 1, TCP_RTO_MAX); | 379 | icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); |
411 | tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); | 380 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); |
412 | if (tp->retransmits > sysctl_tcp_retries1) | 381 | if (icsk->icsk_retransmits > sysctl_tcp_retries1) |
413 | __sk_dst_reset(sk); | 382 | __sk_dst_reset(sk); |
414 | 383 | ||
415 | out:; | 384 | out:; |
@@ -418,32 +387,32 @@ out:; | |||
418 | static void tcp_write_timer(unsigned long data) | 387 | static void tcp_write_timer(unsigned long data) |
419 | { | 388 | { |
420 | struct sock *sk = (struct sock*)data; | 389 | struct sock *sk = (struct sock*)data; |
421 | struct tcp_sock *tp = tcp_sk(sk); | 390 | struct inet_connection_sock *icsk = inet_csk(sk); |
422 | int event; | 391 | int event; |
423 | 392 | ||
424 | bh_lock_sock(sk); | 393 | bh_lock_sock(sk); |
425 | if (sock_owned_by_user(sk)) { | 394 | if (sock_owned_by_user(sk)) { |
426 | /* Try again later */ | 395 | /* Try again later */ |
427 | sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20)); | 396 | sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); |
428 | goto out_unlock; | 397 | goto out_unlock; |
429 | } | 398 | } |
430 | 399 | ||
431 | if (sk->sk_state == TCP_CLOSE || !tp->pending) | 400 | if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) |
432 | goto out; | 401 | goto out; |
433 | 402 | ||
434 | if (time_after(tp->timeout, jiffies)) { | 403 | if (time_after(icsk->icsk_timeout, jiffies)) { |
435 | sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout); | 404 | sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); |
436 | goto out; | 405 | goto out; |
437 | } | 406 | } |
438 | 407 | ||
439 | event = tp->pending; | 408 | event = icsk->icsk_pending; |
440 | tp->pending = 0; | 409 | icsk->icsk_pending = 0; |
441 | 410 | ||
442 | switch (event) { | 411 | switch (event) { |
443 | case TCP_TIME_RETRANS: | 412 | case ICSK_TIME_RETRANS: |
444 | tcp_retransmit_timer(sk); | 413 | tcp_retransmit_timer(sk); |
445 | break; | 414 | break; |
446 | case TCP_TIME_PROBE0: | 415 | case ICSK_TIME_PROBE0: |
447 | tcp_probe_timer(sk); | 416 | tcp_probe_timer(sk); |
448 | break; | 417 | break; |
449 | } | 418 | } |
@@ -462,96 +431,8 @@ out_unlock: | |||
462 | 431 | ||
463 | static void tcp_synack_timer(struct sock *sk) | 432 | static void tcp_synack_timer(struct sock *sk) |
464 | { | 433 | { |
465 | struct tcp_sock *tp = tcp_sk(sk); | 434 | inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, |
466 | struct listen_sock *lopt = tp->accept_queue.listen_opt; | 435 | TCP_TIMEOUT_INIT, TCP_RTO_MAX); |
467 | int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries; | ||
468 | int thresh = max_retries; | ||
469 | unsigned long now = jiffies; | ||
470 | struct request_sock **reqp, *req; | ||
471 | int i, budget; | ||
472 | |||
473 | if (lopt == NULL || lopt->qlen == 0) | ||
474 | return; | ||
475 | |||
476 | /* Normally all the openreqs are young and become mature | ||
477 | * (i.e. converted to established socket) for first timeout. | ||
478 | * If synack was not acknowledged for 3 seconds, it means | ||
479 | * one of the following things: synack was lost, ack was lost, | ||
480 | * rtt is high or nobody planned to ack (i.e. synflood). | ||
481 | * When server is a bit loaded, queue is populated with old | ||
482 | * open requests, reducing effective size of queue. | ||
483 | * When server is well loaded, queue size reduces to zero | ||
484 | * after several minutes of work. It is not synflood, | ||
485 | * it is normal operation. The solution is pruning | ||
486 | * too old entries overriding normal timeout, when | ||
487 | * situation becomes dangerous. | ||
488 | * | ||
489 | * Essentially, we reserve half of room for young | ||
490 | * embrions; and abort old ones without pity, if old | ||
491 | * ones are about to clog our table. | ||
492 | */ | ||
493 | if (lopt->qlen>>(lopt->max_qlen_log-1)) { | ||
494 | int young = (lopt->qlen_young<<1); | ||
495 | |||
496 | while (thresh > 2) { | ||
497 | if (lopt->qlen < young) | ||
498 | break; | ||
499 | thresh--; | ||
500 | young <<= 1; | ||
501 | } | ||
502 | } | ||
503 | |||
504 | if (tp->defer_accept) | ||
505 | max_retries = tp->defer_accept; | ||
506 | |||
507 | budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL)); | ||
508 | i = lopt->clock_hand; | ||
509 | |||
510 | do { | ||
511 | reqp=&lopt->syn_table[i]; | ||
512 | while ((req = *reqp) != NULL) { | ||
513 | if (time_after_eq(now, req->expires)) { | ||
514 | if ((req->retrans < thresh || | ||
515 | (inet_rsk(req)->acked && req->retrans < max_retries)) | ||
516 | && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) { | ||
517 | unsigned long timeo; | ||
518 | |||
519 | if (req->retrans++ == 0) | ||
520 | lopt->qlen_young--; | ||
521 | timeo = min((TCP_TIMEOUT_INIT << req->retrans), | ||
522 | TCP_RTO_MAX); | ||
523 | req->expires = now + timeo; | ||
524 | reqp = &req->dl_next; | ||
525 | continue; | ||
526 | } | ||
527 | |||
528 | /* Drop this request */ | ||
529 | tcp_synq_unlink(tp, req, reqp); | ||
530 | reqsk_queue_removed(&tp->accept_queue, req); | ||
531 | reqsk_free(req); | ||
532 | continue; | ||
533 | } | ||
534 | reqp = &req->dl_next; | ||
535 | } | ||
536 | |||
537 | i = (i+1)&(TCP_SYNQ_HSIZE-1); | ||
538 | |||
539 | } while (--budget > 0); | ||
540 | |||
541 | lopt->clock_hand = i; | ||
542 | |||
543 | if (lopt->qlen) | ||
544 | tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); | ||
545 | } | ||
546 | |||
547 | void tcp_delete_keepalive_timer (struct sock *sk) | ||
548 | { | ||
549 | sk_stop_timer(sk, &sk->sk_timer); | ||
550 | } | ||
551 | |||
552 | void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len) | ||
553 | { | ||
554 | sk_reset_timer(sk, &sk->sk_timer, jiffies + len); | ||
555 | } | 436 | } |
556 | 437 | ||
557 | void tcp_set_keepalive(struct sock *sk, int val) | 438 | void tcp_set_keepalive(struct sock *sk, int val) |
@@ -560,15 +441,16 @@ void tcp_set_keepalive(struct sock *sk, int val) | |||
560 | return; | 441 | return; |
561 | 442 | ||
562 | if (val && !sock_flag(sk, SOCK_KEEPOPEN)) | 443 | if (val && !sock_flag(sk, SOCK_KEEPOPEN)) |
563 | tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); | 444 | inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); |
564 | else if (!val) | 445 | else if (!val) |
565 | tcp_delete_keepalive_timer(sk); | 446 | inet_csk_delete_keepalive_timer(sk); |
566 | } | 447 | } |
567 | 448 | ||
568 | 449 | ||
569 | static void tcp_keepalive_timer (unsigned long data) | 450 | static void tcp_keepalive_timer (unsigned long data) |
570 | { | 451 | { |
571 | struct sock *sk = (struct sock *) data; | 452 | struct sock *sk = (struct sock *) data; |
453 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
572 | struct tcp_sock *tp = tcp_sk(sk); | 454 | struct tcp_sock *tp = tcp_sk(sk); |
573 | __u32 elapsed; | 455 | __u32 elapsed; |
574 | 456 | ||
@@ -576,7 +458,7 @@ static void tcp_keepalive_timer (unsigned long data) | |||
576 | bh_lock_sock(sk); | 458 | bh_lock_sock(sk); |
577 | if (sock_owned_by_user(sk)) { | 459 | if (sock_owned_by_user(sk)) { |
578 | /* Try again later. */ | 460 | /* Try again later. */ |
579 | tcp_reset_keepalive_timer (sk, HZ/20); | 461 | inet_csk_reset_keepalive_timer (sk, HZ/20); |
580 | goto out; | 462 | goto out; |
581 | } | 463 | } |
582 | 464 | ||
@@ -587,7 +469,7 @@ static void tcp_keepalive_timer (unsigned long data) | |||
587 | 469 | ||
588 | if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { | 470 | if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { |
589 | if (tp->linger2 >= 0) { | 471 | if (tp->linger2 >= 0) { |
590 | int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; | 472 | const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; |
591 | 473 | ||
592 | if (tmo > 0) { | 474 | if (tmo > 0) { |
593 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); | 475 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); |
@@ -610,14 +492,14 @@ static void tcp_keepalive_timer (unsigned long data) | |||
610 | elapsed = tcp_time_stamp - tp->rcv_tstamp; | 492 | elapsed = tcp_time_stamp - tp->rcv_tstamp; |
611 | 493 | ||
612 | if (elapsed >= keepalive_time_when(tp)) { | 494 | if (elapsed >= keepalive_time_when(tp)) { |
613 | if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) || | 495 | if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) || |
614 | (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) { | 496 | (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) { |
615 | tcp_send_active_reset(sk, GFP_ATOMIC); | 497 | tcp_send_active_reset(sk, GFP_ATOMIC); |
616 | tcp_write_err(sk); | 498 | tcp_write_err(sk); |
617 | goto out; | 499 | goto out; |
618 | } | 500 | } |
619 | if (tcp_write_wakeup(sk) <= 0) { | 501 | if (tcp_write_wakeup(sk) <= 0) { |
620 | tp->probes_out++; | 502 | icsk->icsk_probes_out++; |
621 | elapsed = keepalive_intvl_when(tp); | 503 | elapsed = keepalive_intvl_when(tp); |
622 | } else { | 504 | } else { |
623 | /* If keepalive was lost due to local congestion, | 505 | /* If keepalive was lost due to local congestion, |
@@ -634,7 +516,7 @@ static void tcp_keepalive_timer (unsigned long data) | |||
634 | sk_stream_mem_reclaim(sk); | 516 | sk_stream_mem_reclaim(sk); |
635 | 517 | ||
636 | resched: | 518 | resched: |
637 | tcp_reset_keepalive_timer (sk, elapsed); | 519 | inet_csk_reset_keepalive_timer (sk, elapsed); |
638 | goto out; | 520 | goto out; |
639 | 521 | ||
640 | death: | 522 | death: |
@@ -644,8 +526,3 @@ out: | |||
644 | bh_unlock_sock(sk); | 526 | bh_unlock_sock(sk); |
645 | sock_put(sk); | 527 | sock_put(sk); |
646 | } | 528 | } |
647 | |||
648 | EXPORT_SYMBOL(tcp_clear_xmit_timers); | ||
649 | EXPORT_SYMBOL(tcp_delete_keepalive_timer); | ||
650 | EXPORT_SYMBOL(tcp_init_xmit_timers); | ||
651 | EXPORT_SYMBOL(tcp_reset_keepalive_timer); | ||
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 9bd443db5193..93c5f92070f9 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c | |||
@@ -35,7 +35,7 @@ | |||
35 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
36 | #include <linux/module.h> | 36 | #include <linux/module.h> |
37 | #include <linux/skbuff.h> | 37 | #include <linux/skbuff.h> |
38 | #include <linux/tcp_diag.h> | 38 | #include <linux/inet_diag.h> |
39 | 39 | ||
40 | #include <net/tcp.h> | 40 | #include <net/tcp.h> |
41 | 41 | ||
@@ -82,9 +82,10 @@ struct vegas { | |||
82 | * Instead we must wait until the completion of an RTT during | 82 | * Instead we must wait until the completion of an RTT during |
83 | * which we actually receive ACKs. | 83 | * which we actually receive ACKs. |
84 | */ | 84 | */ |
85 | static inline void vegas_enable(struct tcp_sock *tp) | 85 | static inline void vegas_enable(struct sock *sk) |
86 | { | 86 | { |
87 | struct vegas *vegas = tcp_ca(tp); | 87 | const struct tcp_sock *tp = tcp_sk(sk); |
88 | struct vegas *vegas = inet_csk_ca(sk); | ||
88 | 89 | ||
89 | /* Begin taking Vegas samples next time we send something. */ | 90 | /* Begin taking Vegas samples next time we send something. */ |
90 | vegas->doing_vegas_now = 1; | 91 | vegas->doing_vegas_now = 1; |
@@ -97,19 +98,19 @@ static inline void vegas_enable(struct tcp_sock *tp) | |||
97 | } | 98 | } |
98 | 99 | ||
99 | /* Stop taking Vegas samples for now. */ | 100 | /* Stop taking Vegas samples for now. */ |
100 | static inline void vegas_disable(struct tcp_sock *tp) | 101 | static inline void vegas_disable(struct sock *sk) |
101 | { | 102 | { |
102 | struct vegas *vegas = tcp_ca(tp); | 103 | struct vegas *vegas = inet_csk_ca(sk); |
103 | 104 | ||
104 | vegas->doing_vegas_now = 0; | 105 | vegas->doing_vegas_now = 0; |
105 | } | 106 | } |
106 | 107 | ||
107 | static void tcp_vegas_init(struct tcp_sock *tp) | 108 | static void tcp_vegas_init(struct sock *sk) |
108 | { | 109 | { |
109 | struct vegas *vegas = tcp_ca(tp); | 110 | struct vegas *vegas = inet_csk_ca(sk); |
110 | 111 | ||
111 | vegas->baseRTT = 0x7fffffff; | 112 | vegas->baseRTT = 0x7fffffff; |
112 | vegas_enable(tp); | 113 | vegas_enable(sk); |
113 | } | 114 | } |
114 | 115 | ||
115 | /* Do RTT sampling needed for Vegas. | 116 | /* Do RTT sampling needed for Vegas. |
@@ -120,9 +121,9 @@ static void tcp_vegas_init(struct tcp_sock *tp) | |||
120 | * o min-filter RTT samples from a much longer window (forever for now) | 121 | * o min-filter RTT samples from a much longer window (forever for now) |
121 | * to find the propagation delay (baseRTT) | 122 | * to find the propagation delay (baseRTT) |
122 | */ | 123 | */ |
123 | static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) | 124 | static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) |
124 | { | 125 | { |
125 | struct vegas *vegas = tcp_ca(tp); | 126 | struct vegas *vegas = inet_csk_ca(sk); |
126 | u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ | 127 | u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ |
127 | 128 | ||
128 | /* Filter to find propagation delay: */ | 129 | /* Filter to find propagation delay: */ |
@@ -136,13 +137,13 @@ static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) | |||
136 | vegas->cntRTT++; | 137 | vegas->cntRTT++; |
137 | } | 138 | } |
138 | 139 | ||
139 | static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) | 140 | static void tcp_vegas_state(struct sock *sk, u8 ca_state) |
140 | { | 141 | { |
141 | 142 | ||
142 | if (ca_state == TCP_CA_Open) | 143 | if (ca_state == TCP_CA_Open) |
143 | vegas_enable(tp); | 144 | vegas_enable(sk); |
144 | else | 145 | else |
145 | vegas_disable(tp); | 146 | vegas_disable(sk); |
146 | } | 147 | } |
147 | 148 | ||
148 | /* | 149 | /* |
@@ -154,20 +155,21 @@ static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) | |||
154 | * packets, _then_ we can make Vegas calculations | 155 | * packets, _then_ we can make Vegas calculations |
155 | * again. | 156 | * again. |
156 | */ | 157 | */ |
157 | static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) | 158 | static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) |
158 | { | 159 | { |
159 | if (event == CA_EVENT_CWND_RESTART || | 160 | if (event == CA_EVENT_CWND_RESTART || |
160 | event == CA_EVENT_TX_START) | 161 | event == CA_EVENT_TX_START) |
161 | tcp_vegas_init(tp); | 162 | tcp_vegas_init(sk); |
162 | } | 163 | } |
163 | 164 | ||
164 | static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, | 165 | static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, |
165 | u32 seq_rtt, u32 in_flight, int flag) | 166 | u32 seq_rtt, u32 in_flight, int flag) |
166 | { | 167 | { |
167 | struct vegas *vegas = tcp_ca(tp); | 168 | struct tcp_sock *tp = tcp_sk(sk); |
169 | struct vegas *vegas = inet_csk_ca(sk); | ||
168 | 170 | ||
169 | if (!vegas->doing_vegas_now) | 171 | if (!vegas->doing_vegas_now) |
170 | return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); | 172 | return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); |
171 | 173 | ||
172 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. | 174 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. |
173 | * | 175 | * |
@@ -219,7 +221,7 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, | |||
219 | * but that's not too awful, since we're taking the min, | 221 | * but that's not too awful, since we're taking the min, |
220 | * rather than averaging. | 222 | * rather than averaging. |
221 | */ | 223 | */ |
222 | tcp_vegas_rtt_calc(tp, seq_rtt*1000); | 224 | tcp_vegas_rtt_calc(sk, seq_rtt * 1000); |
223 | 225 | ||
224 | /* We do the Vegas calculations only if we got enough RTT | 226 | /* We do the Vegas calculations only if we got enough RTT |
225 | * samples that we can be reasonably sure that we got | 227 | * samples that we can be reasonably sure that we got |
@@ -359,14 +361,14 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, | |||
359 | } | 361 | } |
360 | 362 | ||
361 | /* Extract info for Tcp socket info provided via netlink. */ | 363 | /* Extract info for Tcp socket info provided via netlink. */ |
362 | static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext, | 364 | static void tcp_vegas_get_info(struct sock *sk, u32 ext, |
363 | struct sk_buff *skb) | 365 | struct sk_buff *skb) |
364 | { | 366 | { |
365 | const struct vegas *ca = tcp_ca(tp); | 367 | const struct vegas *ca = inet_csk_ca(sk); |
366 | if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { | 368 | if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { |
367 | struct tcpvegas_info *info; | 369 | struct tcpvegas_info *info; |
368 | 370 | ||
369 | info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO, | 371 | info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, |
370 | sizeof(*info))); | 372 | sizeof(*info))); |
371 | 373 | ||
372 | info->tcpv_enabled = ca->doing_vegas_now; | 374 | info->tcpv_enabled = ca->doing_vegas_now; |
@@ -393,7 +395,7 @@ static struct tcp_congestion_ops tcp_vegas = { | |||
393 | 395 | ||
394 | static int __init tcp_vegas_register(void) | 396 | static int __init tcp_vegas_register(void) |
395 | { | 397 | { |
396 | BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE); | 398 | BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE); |
397 | tcp_register_congestion_control(&tcp_vegas); | 399 | tcp_register_congestion_control(&tcp_vegas); |
398 | return 0; | 400 | return 0; |
399 | } | 401 | } |
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index ef827242c940..0c340c3756c2 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c | |||
@@ -8,7 +8,7 @@ | |||
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <linux/skbuff.h> | 10 | #include <linux/skbuff.h> |
11 | #include <linux/tcp_diag.h> | 11 | #include <linux/inet_diag.h> |
12 | #include <net/tcp.h> | 12 | #include <net/tcp.h> |
13 | 13 | ||
14 | /* TCP Westwood structure */ | 14 | /* TCP Westwood structure */ |
@@ -40,9 +40,9 @@ struct westwood { | |||
40 | * way as soon as possible. It will reasonably happen within the first | 40 | * way as soon as possible. It will reasonably happen within the first |
41 | * RTT period of the connection lifetime. | 41 | * RTT period of the connection lifetime. |
42 | */ | 42 | */ |
43 | static void tcp_westwood_init(struct tcp_sock *tp) | 43 | static void tcp_westwood_init(struct sock *sk) |
44 | { | 44 | { |
45 | struct westwood *w = tcp_ca(tp); | 45 | struct westwood *w = inet_csk_ca(sk); |
46 | 46 | ||
47 | w->bk = 0; | 47 | w->bk = 0; |
48 | w->bw_ns_est = 0; | 48 | w->bw_ns_est = 0; |
@@ -51,7 +51,7 @@ static void tcp_westwood_init(struct tcp_sock *tp) | |||
51 | w->cumul_ack = 0; | 51 | w->cumul_ack = 0; |
52 | w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; | 52 | w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; |
53 | w->rtt_win_sx = tcp_time_stamp; | 53 | w->rtt_win_sx = tcp_time_stamp; |
54 | w->snd_una = tp->snd_una; | 54 | w->snd_una = tcp_sk(sk)->snd_una; |
55 | } | 55 | } |
56 | 56 | ||
57 | /* | 57 | /* |
@@ -74,11 +74,11 @@ static inline void westwood_filter(struct westwood *w, u32 delta) | |||
74 | * Called after processing group of packets. | 74 | * Called after processing group of packets. |
75 | * but all westwood needs is the last sample of srtt. | 75 | * but all westwood needs is the last sample of srtt. |
76 | */ | 76 | */ |
77 | static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) | 77 | static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt) |
78 | { | 78 | { |
79 | struct westwood *w = tcp_ca(tp); | 79 | struct westwood *w = inet_csk_ca(sk); |
80 | if (cnt > 0) | 80 | if (cnt > 0) |
81 | w->rtt = tp->srtt >> 3; | 81 | w->rtt = tcp_sk(sk)->srtt >> 3; |
82 | } | 82 | } |
83 | 83 | ||
84 | /* | 84 | /* |
@@ -86,9 +86,9 @@ static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) | |||
86 | * It updates RTT evaluation window if it is the right moment to do | 86 | * It updates RTT evaluation window if it is the right moment to do |
87 | * it. If so it calls filter for evaluating bandwidth. | 87 | * it. If so it calls filter for evaluating bandwidth. |
88 | */ | 88 | */ |
89 | static void westwood_update_window(struct tcp_sock *tp) | 89 | static void westwood_update_window(struct sock *sk) |
90 | { | 90 | { |
91 | struct westwood *w = tcp_ca(tp); | 91 | struct westwood *w = inet_csk_ca(sk); |
92 | s32 delta = tcp_time_stamp - w->rtt_win_sx; | 92 | s32 delta = tcp_time_stamp - w->rtt_win_sx; |
93 | 93 | ||
94 | /* | 94 | /* |
@@ -114,11 +114,12 @@ static void westwood_update_window(struct tcp_sock *tp) | |||
114 | * header prediction is successful. In such case in fact update is | 114 | * header prediction is successful. In such case in fact update is |
115 | * straight forward and doesn't need any particular care. | 115 | * straight forward and doesn't need any particular care. |
116 | */ | 116 | */ |
117 | static inline void westwood_fast_bw(struct tcp_sock *tp) | 117 | static inline void westwood_fast_bw(struct sock *sk) |
118 | { | 118 | { |
119 | struct westwood *w = tcp_ca(tp); | 119 | const struct tcp_sock *tp = tcp_sk(sk); |
120 | struct westwood *w = inet_csk_ca(sk); | ||
120 | 121 | ||
121 | westwood_update_window(tp); | 122 | westwood_update_window(sk); |
122 | 123 | ||
123 | w->bk += tp->snd_una - w->snd_una; | 124 | w->bk += tp->snd_una - w->snd_una; |
124 | w->snd_una = tp->snd_una; | 125 | w->snd_una = tp->snd_una; |
@@ -130,9 +131,10 @@ static inline void westwood_fast_bw(struct tcp_sock *tp) | |||
130 | * This function evaluates cumul_ack for evaluating bk in case of | 131 | * This function evaluates cumul_ack for evaluating bk in case of |
131 | * delayed or partial acks. | 132 | * delayed or partial acks. |
132 | */ | 133 | */ |
133 | static inline u32 westwood_acked_count(struct tcp_sock *tp) | 134 | static inline u32 westwood_acked_count(struct sock *sk) |
134 | { | 135 | { |
135 | struct westwood *w = tcp_ca(tp); | 136 | const struct tcp_sock *tp = tcp_sk(sk); |
137 | struct westwood *w = inet_csk_ca(sk); | ||
136 | 138 | ||
137 | w->cumul_ack = tp->snd_una - w->snd_una; | 139 | w->cumul_ack = tp->snd_una - w->snd_una; |
138 | 140 | ||
@@ -160,9 +162,10 @@ static inline u32 westwood_acked_count(struct tcp_sock *tp) | |||
160 | return w->cumul_ack; | 162 | return w->cumul_ack; |
161 | } | 163 | } |
162 | 164 | ||
163 | static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) | 165 | static inline u32 westwood_bw_rttmin(const struct sock *sk) |
164 | { | 166 | { |
165 | struct westwood *w = tcp_ca(tp); | 167 | const struct tcp_sock *tp = tcp_sk(sk); |
168 | const struct westwood *w = inet_csk_ca(sk); | ||
166 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); | 169 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); |
167 | } | 170 | } |
168 | 171 | ||
@@ -172,31 +175,32 @@ static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) | |||
172 | * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 | 175 | * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 |
173 | * so avoids ever returning 0. | 176 | * so avoids ever returning 0. |
174 | */ | 177 | */ |
175 | static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) | 178 | static u32 tcp_westwood_cwnd_min(struct sock *sk) |
176 | { | 179 | { |
177 | return westwood_bw_rttmin(tp); | 180 | return westwood_bw_rttmin(sk); |
178 | } | 181 | } |
179 | 182 | ||
180 | static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) | 183 | static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) |
181 | { | 184 | { |
182 | struct westwood *w = tcp_ca(tp); | 185 | struct tcp_sock *tp = tcp_sk(sk); |
186 | struct westwood *w = inet_csk_ca(sk); | ||
183 | 187 | ||
184 | switch(event) { | 188 | switch(event) { |
185 | case CA_EVENT_FAST_ACK: | 189 | case CA_EVENT_FAST_ACK: |
186 | westwood_fast_bw(tp); | 190 | westwood_fast_bw(sk); |
187 | break; | 191 | break; |
188 | 192 | ||
189 | case CA_EVENT_COMPLETE_CWR: | 193 | case CA_EVENT_COMPLETE_CWR: |
190 | tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); | 194 | tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); |
191 | break; | 195 | break; |
192 | 196 | ||
193 | case CA_EVENT_FRTO: | 197 | case CA_EVENT_FRTO: |
194 | tp->snd_ssthresh = westwood_bw_rttmin(tp); | 198 | tp->snd_ssthresh = westwood_bw_rttmin(sk); |
195 | break; | 199 | break; |
196 | 200 | ||
197 | case CA_EVENT_SLOW_ACK: | 201 | case CA_EVENT_SLOW_ACK: |
198 | westwood_update_window(tp); | 202 | westwood_update_window(sk); |
199 | w->bk += westwood_acked_count(tp); | 203 | w->bk += westwood_acked_count(sk); |
200 | w->rtt_min = min(w->rtt, w->rtt_min); | 204 | w->rtt_min = min(w->rtt, w->rtt_min); |
201 | break; | 205 | break; |
202 | 206 | ||
@@ -208,15 +212,15 @@ static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) | |||
208 | 212 | ||
209 | 213 | ||
210 | /* Extract info for Tcp socket info provided via netlink. */ | 214 | /* Extract info for Tcp socket info provided via netlink. */ |
211 | static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, | 215 | static void tcp_westwood_info(struct sock *sk, u32 ext, |
212 | struct sk_buff *skb) | 216 | struct sk_buff *skb) |
213 | { | 217 | { |
214 | const struct westwood *ca = tcp_ca(tp); | 218 | const struct westwood *ca = inet_csk_ca(sk); |
215 | if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { | 219 | if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { |
216 | struct rtattr *rta; | 220 | struct rtattr *rta; |
217 | struct tcpvegas_info *info; | 221 | struct tcpvegas_info *info; |
218 | 222 | ||
219 | rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); | 223 | rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info)); |
220 | info = RTA_DATA(rta); | 224 | info = RTA_DATA(rta); |
221 | info->tcpv_enabled = 1; | 225 | info->tcpv_enabled = 1; |
222 | info->tcpv_rttcnt = 0; | 226 | info->tcpv_rttcnt = 0; |
@@ -242,7 +246,7 @@ static struct tcp_congestion_ops tcp_westwood = { | |||
242 | 246 | ||
243 | static int __init tcp_westwood_register(void) | 247 | static int __init tcp_westwood_register(void) |
244 | { | 248 | { |
245 | BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE); | 249 | BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE); |
246 | return tcp_register_congestion_control(&tcp_westwood); | 250 | return tcp_register_congestion_control(&tcp_westwood); |
247 | } | 251 | } |
248 | 252 | ||
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index dc4d07357e3a..e5beca7de86c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -95,7 +95,8 @@ | |||
95 | #include <linux/ipv6.h> | 95 | #include <linux/ipv6.h> |
96 | #include <linux/netdevice.h> | 96 | #include <linux/netdevice.h> |
97 | #include <net/snmp.h> | 97 | #include <net/snmp.h> |
98 | #include <net/tcp.h> | 98 | #include <net/ip.h> |
99 | #include <net/tcp_states.h> | ||
99 | #include <net/protocol.h> | 100 | #include <net/protocol.h> |
100 | #include <linux/skbuff.h> | 101 | #include <linux/skbuff.h> |
101 | #include <linux/proc_fs.h> | 102 | #include <linux/proc_fs.h> |
@@ -112,7 +113,7 @@ | |||
112 | * Snmp MIB for the UDP layer | 113 | * Snmp MIB for the UDP layer |
113 | */ | 114 | */ |
114 | 115 | ||
115 | DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); | 116 | DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; |
116 | 117 | ||
117 | struct hlist_head udp_hash[UDP_HTABLE_SIZE]; | 118 | struct hlist_head udp_hash[UDP_HTABLE_SIZE]; |
118 | DEFINE_RWLOCK(udp_hash_lock); | 119 | DEFINE_RWLOCK(udp_hash_lock); |
@@ -628,7 +629,7 @@ back_from_confirm: | |||
628 | /* ... which is an evident application bug. --ANK */ | 629 | /* ... which is an evident application bug. --ANK */ |
629 | release_sock(sk); | 630 | release_sock(sk); |
630 | 631 | ||
631 | LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); | 632 | LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); |
632 | err = -EINVAL; | 633 | err = -EINVAL; |
633 | goto out; | 634 | goto out; |
634 | } | 635 | } |
@@ -693,7 +694,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset, | |||
693 | if (unlikely(!up->pending)) { | 694 | if (unlikely(!up->pending)) { |
694 | release_sock(sk); | 695 | release_sock(sk); |
695 | 696 | ||
696 | LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 3\n")); | 697 | LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n"); |
697 | return -EINVAL; | 698 | return -EINVAL; |
698 | } | 699 | } |
699 | 700 | ||
@@ -1102,7 +1103,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, | |||
1102 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 1103 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
1103 | if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) | 1104 | if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) |
1104 | return 0; | 1105 | return 0; |
1105 | LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v4 hw csum failure.\n")); | 1106 | LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); |
1106 | skb->ip_summed = CHECKSUM_NONE; | 1107 | skb->ip_summed = CHECKSUM_NONE; |
1107 | } | 1108 | } |
1108 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) | 1109 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) |
@@ -1181,13 +1182,13 @@ int udp_rcv(struct sk_buff *skb) | |||
1181 | return(0); | 1182 | return(0); |
1182 | 1183 | ||
1183 | short_packet: | 1184 | short_packet: |
1184 | LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", | 1185 | LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", |
1185 | NIPQUAD(saddr), | 1186 | NIPQUAD(saddr), |
1186 | ntohs(uh->source), | 1187 | ntohs(uh->source), |
1187 | ulen, | 1188 | ulen, |
1188 | len, | 1189 | len, |
1189 | NIPQUAD(daddr), | 1190 | NIPQUAD(daddr), |
1190 | ntohs(uh->dest))); | 1191 | ntohs(uh->dest)); |
1191 | no_header: | 1192 | no_header: |
1192 | UDP_INC_STATS_BH(UDP_MIB_INERRORS); | 1193 | UDP_INC_STATS_BH(UDP_MIB_INERRORS); |
1193 | kfree_skb(skb); | 1194 | kfree_skb(skb); |
@@ -1198,12 +1199,12 @@ csum_error: | |||
1198 | * RFC1122: OK. Discards the bad packet silently (as far as | 1199 | * RFC1122: OK. Discards the bad packet silently (as far as |
1199 | * the network is concerned, anyway) as per 4.1.3.4 (MUST). | 1200 | * the network is concerned, anyway) as per 4.1.3.4 (MUST). |
1200 | */ | 1201 | */ |
1201 | LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", | 1202 | LIMIT_NETDEBUG(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", |
1202 | NIPQUAD(saddr), | 1203 | NIPQUAD(saddr), |
1203 | ntohs(uh->source), | 1204 | ntohs(uh->source), |
1204 | NIPQUAD(daddr), | 1205 | NIPQUAD(daddr), |
1205 | ntohs(uh->dest), | 1206 | ntohs(uh->dest), |
1206 | ulen)); | 1207 | ulen); |
1207 | drop: | 1208 | drop: |
1208 | UDP_INC_STATS_BH(UDP_MIB_INERRORS); | 1209 | UDP_INC_STATS_BH(UDP_MIB_INERRORS); |
1209 | kfree_skb(skb); | 1210 | kfree_skb(skb); |
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 050611d7a967..d23e07fc81fa 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c | |||
@@ -128,8 +128,10 @@ void __init xfrm4_state_init(void) | |||
128 | xfrm_state_register_afinfo(&xfrm4_state_afinfo); | 128 | xfrm_state_register_afinfo(&xfrm4_state_afinfo); |
129 | } | 129 | } |
130 | 130 | ||
131 | #if 0 | ||
131 | void __exit xfrm4_state_fini(void) | 132 | void __exit xfrm4_state_fini(void) |
132 | { | 133 | { |
133 | xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); | 134 | xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); |
134 | } | 135 | } |
136 | #endif /* 0 */ | ||
135 | 137 | ||
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index b39e04940590..6460eec834b7 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile | |||
@@ -8,7 +8,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ | |||
8 | route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ | 8 | route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ |
9 | protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ | 9 | protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ |
10 | exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ | 10 | exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ |
11 | ip6_flowlabel.o ipv6_syms.o | 11 | ip6_flowlabel.o ipv6_syms.o netfilter.o |
12 | 12 | ||
13 | ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ | 13 | ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ |
14 | xfrm6_output.o | 14 | xfrm6_output.o |
@@ -23,3 +23,5 @@ obj-$(CONFIG_NETFILTER) += netfilter/ | |||
23 | obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o | 23 | obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o |
24 | 24 | ||
25 | obj-y += exthdrs_core.o | 25 | obj-y += exthdrs_core.o |
26 | |||
27 | obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o | ||
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 77004b9456c0..937ad32db77c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c | |||
@@ -1041,9 +1041,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) | |||
1041 | const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; | 1041 | const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; |
1042 | const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); | 1042 | const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); |
1043 | u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; | 1043 | u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; |
1044 | u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); | 1044 | u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); |
1045 | int sk_ipv6only = ipv6_only_sock(sk); | 1045 | int sk_ipv6only = ipv6_only_sock(sk); |
1046 | int sk2_ipv6only = tcp_v6_ipv6only(sk2); | 1046 | int sk2_ipv6only = inet_v6_ipv6only(sk2); |
1047 | int addr_type = ipv6_addr_type(sk_rcv_saddr6); | 1047 | int addr_type = ipv6_addr_type(sk_rcv_saddr6); |
1048 | int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; | 1048 | int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; |
1049 | 1049 | ||
@@ -1126,7 +1126,7 @@ void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr) | |||
1126 | __ipv6_dev_mc_dec(idev, &maddr); | 1126 | __ipv6_dev_mc_dec(idev, &maddr); |
1127 | } | 1127 | } |
1128 | 1128 | ||
1129 | void addrconf_join_anycast(struct inet6_ifaddr *ifp) | 1129 | static void addrconf_join_anycast(struct inet6_ifaddr *ifp) |
1130 | { | 1130 | { |
1131 | struct in6_addr addr; | 1131 | struct in6_addr addr; |
1132 | ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); | 1132 | ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); |
@@ -1135,7 +1135,7 @@ void addrconf_join_anycast(struct inet6_ifaddr *ifp) | |||
1135 | ipv6_dev_ac_inc(ifp->idev->dev, &addr); | 1135 | ipv6_dev_ac_inc(ifp->idev->dev, &addr); |
1136 | } | 1136 | } |
1137 | 1137 | ||
1138 | void addrconf_leave_anycast(struct inet6_ifaddr *ifp) | 1138 | static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) |
1139 | { | 1139 | { |
1140 | struct in6_addr addr; | 1140 | struct in6_addr addr; |
1141 | ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); | 1141 | ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); |
@@ -2858,16 +2858,16 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) | |||
2858 | 2858 | ||
2859 | skb = alloc_skb(size, GFP_ATOMIC); | 2859 | skb = alloc_skb(size, GFP_ATOMIC); |
2860 | if (!skb) { | 2860 | if (!skb) { |
2861 | netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS); | 2861 | netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, ENOBUFS); |
2862 | return; | 2862 | return; |
2863 | } | 2863 | } |
2864 | if (inet6_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) { | 2864 | if (inet6_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) { |
2865 | kfree_skb(skb); | 2865 | kfree_skb(skb); |
2866 | netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL); | 2866 | netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, EINVAL); |
2867 | return; | 2867 | return; |
2868 | } | 2868 | } |
2869 | NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFADDR; | 2869 | NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFADDR; |
2870 | netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFADDR, GFP_ATOMIC); | 2870 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFADDR, GFP_ATOMIC); |
2871 | } | 2871 | } |
2872 | 2872 | ||
2873 | static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, | 2873 | static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, |
@@ -2994,16 +2994,16 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) | |||
2994 | 2994 | ||
2995 | skb = alloc_skb(size, GFP_ATOMIC); | 2995 | skb = alloc_skb(size, GFP_ATOMIC); |
2996 | if (!skb) { | 2996 | if (!skb) { |
2997 | netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS); | 2997 | netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, ENOBUFS); |
2998 | return; | 2998 | return; |
2999 | } | 2999 | } |
3000 | if (inet6_fill_ifinfo(skb, idev, current->pid, 0, event, 0) < 0) { | 3000 | if (inet6_fill_ifinfo(skb, idev, current->pid, 0, event, 0) < 0) { |
3001 | kfree_skb(skb); | 3001 | kfree_skb(skb); |
3002 | netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL); | 3002 | netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, EINVAL); |
3003 | return; | 3003 | return; |
3004 | } | 3004 | } |
3005 | NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFINFO; | 3005 | NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFINFO; |
3006 | netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFINFO, GFP_ATOMIC); | 3006 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC); |
3007 | } | 3007 | } |
3008 | 3008 | ||
3009 | static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, | 3009 | static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, |
@@ -3054,16 +3054,16 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, | |||
3054 | 3054 | ||
3055 | skb = alloc_skb(size, GFP_ATOMIC); | 3055 | skb = alloc_skb(size, GFP_ATOMIC); |
3056 | if (!skb) { | 3056 | if (!skb) { |
3057 | netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS); | 3057 | netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, ENOBUFS); |
3058 | return; | 3058 | return; |
3059 | } | 3059 | } |
3060 | if (inet6_fill_prefix(skb, idev, pinfo, current->pid, 0, event, 0) < 0) { | 3060 | if (inet6_fill_prefix(skb, idev, pinfo, current->pid, 0, event, 0) < 0) { |
3061 | kfree_skb(skb); | 3061 | kfree_skb(skb); |
3062 | netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL); | 3062 | netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, EINVAL); |
3063 | return; | 3063 | return; |
3064 | } | 3064 | } |
3065 | NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_PREFIX; | 3065 | NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_PREFIX; |
3066 | netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_PREFIX, GFP_ATOMIC); | 3066 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_PREFIX, GFP_ATOMIC); |
3067 | } | 3067 | } |
3068 | 3068 | ||
3069 | static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = { | 3069 | static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = { |
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 28d9bcab0970..4f8795af2edb 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include <linux/netdevice.h> | 44 | #include <linux/netdevice.h> |
45 | #include <linux/icmpv6.h> | 45 | #include <linux/icmpv6.h> |
46 | #include <linux/smp_lock.h> | 46 | #include <linux/smp_lock.h> |
47 | #include <linux/netfilter_ipv6.h> | ||
47 | 48 | ||
48 | #include <net/ip.h> | 49 | #include <net/ip.h> |
49 | #include <net/ipv6.h> | 50 | #include <net/ipv6.h> |
@@ -66,45 +67,14 @@ MODULE_AUTHOR("Cast of dozens"); | |||
66 | MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); | 67 | MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); |
67 | MODULE_LICENSE("GPL"); | 68 | MODULE_LICENSE("GPL"); |
68 | 69 | ||
69 | /* IPv6 procfs goodies... */ | ||
70 | |||
71 | #ifdef CONFIG_PROC_FS | ||
72 | extern int raw6_proc_init(void); | ||
73 | extern void raw6_proc_exit(void); | ||
74 | extern int tcp6_proc_init(void); | ||
75 | extern void tcp6_proc_exit(void); | ||
76 | extern int udp6_proc_init(void); | ||
77 | extern void udp6_proc_exit(void); | ||
78 | extern int ipv6_misc_proc_init(void); | ||
79 | extern void ipv6_misc_proc_exit(void); | ||
80 | extern int ac6_proc_init(void); | ||
81 | extern void ac6_proc_exit(void); | ||
82 | extern int if6_proc_init(void); | ||
83 | extern void if6_proc_exit(void); | ||
84 | #endif | ||
85 | |||
86 | int sysctl_ipv6_bindv6only; | 70 | int sysctl_ipv6_bindv6only; |
87 | 71 | ||
88 | #ifdef INET_REFCNT_DEBUG | ||
89 | atomic_t inet6_sock_nr; | ||
90 | EXPORT_SYMBOL(inet6_sock_nr); | ||
91 | #endif | ||
92 | |||
93 | /* The inetsw table contains everything that inet_create needs to | 72 | /* The inetsw table contains everything that inet_create needs to |
94 | * build a new socket. | 73 | * build a new socket. |
95 | */ | 74 | */ |
96 | static struct list_head inetsw6[SOCK_MAX]; | 75 | static struct list_head inetsw6[SOCK_MAX]; |
97 | static DEFINE_SPINLOCK(inetsw6_lock); | 76 | static DEFINE_SPINLOCK(inetsw6_lock); |
98 | 77 | ||
99 | static void inet6_sock_destruct(struct sock *sk) | ||
100 | { | ||
101 | inet_sock_destruct(sk); | ||
102 | |||
103 | #ifdef INET_REFCNT_DEBUG | ||
104 | atomic_dec(&inet6_sock_nr); | ||
105 | #endif | ||
106 | } | ||
107 | |||
108 | static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) | 78 | static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) |
109 | { | 79 | { |
110 | const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); | 80 | const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); |
@@ -185,7 +155,7 @@ static int inet6_create(struct socket *sock, int protocol) | |||
185 | inet->hdrincl = 1; | 155 | inet->hdrincl = 1; |
186 | } | 156 | } |
187 | 157 | ||
188 | sk->sk_destruct = inet6_sock_destruct; | 158 | sk->sk_destruct = inet_sock_destruct; |
189 | sk->sk_family = PF_INET6; | 159 | sk->sk_family = PF_INET6; |
190 | sk->sk_protocol = protocol; | 160 | sk->sk_protocol = protocol; |
191 | 161 | ||
@@ -212,12 +182,17 @@ static int inet6_create(struct socket *sock, int protocol) | |||
212 | inet->pmtudisc = IP_PMTUDISC_DONT; | 182 | inet->pmtudisc = IP_PMTUDISC_DONT; |
213 | else | 183 | else |
214 | inet->pmtudisc = IP_PMTUDISC_WANT; | 184 | inet->pmtudisc = IP_PMTUDISC_WANT; |
185 | /* | ||
186 | * Increment only the relevant sk_prot->socks debug field, this changes | ||
187 | * the previous behaviour of incrementing both the equivalent to | ||
188 | * answer->prot->socks (inet6_sock_nr) and inet_sock_nr. | ||
189 | * | ||
190 | * This allows better debug granularity as we'll know exactly how many | ||
191 | * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6 | ||
192 | * transport protocol socks. -acme | ||
193 | */ | ||
194 | sk_refcnt_debug_inc(sk); | ||
215 | 195 | ||
216 | |||
217 | #ifdef INET_REFCNT_DEBUG | ||
218 | atomic_inc(&inet6_sock_nr); | ||
219 | atomic_inc(&inet_sock_nr); | ||
220 | #endif | ||
221 | if (inet->num) { | 196 | if (inet->num) { |
222 | /* It assumes that any protocol which allows | 197 | /* It assumes that any protocol which allows |
223 | * the user to assign a number at socket | 198 | * the user to assign a number at socket |
@@ -513,11 +488,6 @@ static struct net_proto_family inet6_family_ops = { | |||
513 | .owner = THIS_MODULE, | 488 | .owner = THIS_MODULE, |
514 | }; | 489 | }; |
515 | 490 | ||
516 | #ifdef CONFIG_SYSCTL | ||
517 | extern void ipv6_sysctl_register(void); | ||
518 | extern void ipv6_sysctl_unregister(void); | ||
519 | #endif | ||
520 | |||
521 | /* Same as inet6_dgram_ops, sans udp_poll. */ | 491 | /* Same as inet6_dgram_ops, sans udp_poll. */ |
522 | static struct proto_ops inet6_sockraw_ops = { | 492 | static struct proto_ops inet6_sockraw_ops = { |
523 | .family = PF_INET6, | 493 | .family = PF_INET6, |
@@ -684,8 +654,6 @@ static void cleanup_ipv6_mibs(void) | |||
684 | snmp6_mib_free((void **)udp_stats_in6); | 654 | snmp6_mib_free((void **)udp_stats_in6); |
685 | } | 655 | } |
686 | 656 | ||
687 | extern int ipv6_misc_proc_init(void); | ||
688 | |||
689 | static int __init inet6_init(void) | 657 | static int __init inet6_init(void) |
690 | { | 658 | { |
691 | struct sk_buff *dummy_skb; | 659 | struct sk_buff *dummy_skb; |
@@ -757,6 +725,9 @@ static int __init inet6_init(void) | |||
757 | err = igmp6_init(&inet6_family_ops); | 725 | err = igmp6_init(&inet6_family_ops); |
758 | if (err) | 726 | if (err) |
759 | goto igmp_fail; | 727 | goto igmp_fail; |
728 | err = ipv6_netfilter_init(); | ||
729 | if (err) | ||
730 | goto netfilter_fail; | ||
760 | /* Create /proc/foo6 entries. */ | 731 | /* Create /proc/foo6 entries. */ |
761 | #ifdef CONFIG_PROC_FS | 732 | #ifdef CONFIG_PROC_FS |
762 | err = -ENOMEM; | 733 | err = -ENOMEM; |
@@ -813,6 +784,8 @@ proc_tcp6_fail: | |||
813 | raw6_proc_exit(); | 784 | raw6_proc_exit(); |
814 | proc_raw6_fail: | 785 | proc_raw6_fail: |
815 | #endif | 786 | #endif |
787 | ipv6_netfilter_fini(); | ||
788 | netfilter_fail: | ||
816 | igmp6_cleanup(); | 789 | igmp6_cleanup(); |
817 | igmp_fail: | 790 | igmp_fail: |
818 | ndisc_cleanup(); | 791 | ndisc_cleanup(); |
@@ -852,6 +825,7 @@ static void __exit inet6_exit(void) | |||
852 | ip6_route_cleanup(); | 825 | ip6_route_cleanup(); |
853 | ipv6_packet_cleanup(); | 826 | ipv6_packet_cleanup(); |
854 | igmp6_cleanup(); | 827 | igmp6_cleanup(); |
828 | ipv6_netfilter_fini(); | ||
855 | ndisc_cleanup(); | 829 | ndisc_cleanup(); |
856 | icmpv6_cleanup(); | 830 | icmpv6_cleanup(); |
857 | #ifdef CONFIG_SYSCTL | 831 | #ifdef CONFIG_SYSCTL |
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 986fdfdccbcd..0ebfad907a03 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c | |||
@@ -131,10 +131,10 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len) | |||
131 | case NEXTHDR_HOP: | 131 | case NEXTHDR_HOP: |
132 | case NEXTHDR_DEST: | 132 | case NEXTHDR_DEST: |
133 | if (!zero_out_mutable_opts(exthdr.opth)) { | 133 | if (!zero_out_mutable_opts(exthdr.opth)) { |
134 | LIMIT_NETDEBUG(printk( | 134 | LIMIT_NETDEBUG( |
135 | KERN_WARNING "overrun %sopts\n", | 135 | KERN_WARNING "overrun %sopts\n", |
136 | nexthdr == NEXTHDR_HOP ? | 136 | nexthdr == NEXTHDR_HOP ? |
137 | "hop" : "dest")); | 137 | "hop" : "dest"); |
138 | return -EINVAL; | 138 | return -EINVAL; |
139 | } | 139 | } |
140 | break; | 140 | break; |
@@ -293,8 +293,7 @@ static int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struc | |||
293 | skb_push(skb, skb->data - skb->nh.raw); | 293 | skb_push(skb, skb->data - skb->nh.raw); |
294 | ahp->icv(ahp, skb, ah->auth_data); | 294 | ahp->icv(ahp, skb, ah->auth_data); |
295 | if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { | 295 | if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { |
296 | LIMIT_NETDEBUG( | 296 | LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n"); |
297 | printk(KERN_WARNING "ipsec ah authentication error\n")); | ||
298 | x->stats.integrity_failed++; | 297 | x->stats.integrity_failed++; |
299 | goto free_out; | 298 | goto free_out; |
300 | } | 299 | } |
@@ -332,9 +331,9 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, | |||
332 | if (!x) | 331 | if (!x) |
333 | return; | 332 | return; |
334 | 333 | ||
335 | NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/" | 334 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/" |
336 | "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", | 335 | "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", |
337 | ntohl(ah->spi), NIP6(iph->daddr))); | 336 | ntohl(ah->spi), NIP6(iph->daddr)); |
338 | 337 | ||
339 | xfrm_state_put(x); | 338 | xfrm_state_put(x); |
340 | } | 339 | } |
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 5229365cd8b4..01468fab3d3d 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <net/addrconf.h> | 29 | #include <net/addrconf.h> |
30 | #include <net/transp_v6.h> | 30 | #include <net/transp_v6.h> |
31 | #include <net/ip6_route.h> | 31 | #include <net/ip6_route.h> |
32 | #include <net/tcp_states.h> | ||
32 | 33 | ||
33 | #include <linux/errqueue.h> | 34 | #include <linux/errqueue.h> |
34 | #include <asm/uaccess.h> | 35 | #include <asm/uaccess.h> |
@@ -588,8 +589,8 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, | |||
588 | break; | 589 | break; |
589 | 590 | ||
590 | default: | 591 | default: |
591 | LIMIT_NETDEBUG( | 592 | LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n", |
592 | printk(KERN_DEBUG "invalid cmsg type: %d\n", cmsg->cmsg_type)); | 593 | cmsg->cmsg_type); |
593 | err = -EINVAL; | 594 | err = -EINVAL; |
594 | break; | 595 | break; |
595 | }; | 596 | }; |
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 324db62515a2..e8bff9d3d96c 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c | |||
@@ -212,8 +212,7 @@ static int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, stru | |||
212 | 212 | ||
213 | padlen = nexthdr[0]; | 213 | padlen = nexthdr[0]; |
214 | if (padlen+2 >= elen) { | 214 | if (padlen+2 >= elen) { |
215 | LIMIT_NETDEBUG( | 215 | LIMIT_NETDEBUG(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen); |
216 | printk(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen)); | ||
217 | ret = -EINVAL; | 216 | ret = -EINVAL; |
218 | goto out; | 217 | goto out; |
219 | } | 218 | } |
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index e0839eafc3a9..5be6da2584ee 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c | |||
@@ -424,8 +424,8 @@ static int ipv6_hop_ra(struct sk_buff *skb, int optoff) | |||
424 | IP6CB(skb)->ra = optoff; | 424 | IP6CB(skb)->ra = optoff; |
425 | return 1; | 425 | return 1; |
426 | } | 426 | } |
427 | LIMIT_NETDEBUG( | 427 | LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", |
428 | printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", skb->nh.raw[optoff+1])); | 428 | skb->nh.raw[optoff+1]); |
429 | kfree_skb(skb); | 429 | kfree_skb(skb); |
430 | return 0; | 430 | return 0; |
431 | } | 431 | } |
@@ -437,8 +437,8 @@ static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff) | |||
437 | u32 pkt_len; | 437 | u32 pkt_len; |
438 | 438 | ||
439 | if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) { | 439 | if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) { |
440 | LIMIT_NETDEBUG( | 440 | LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", |
441 | printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", skb->nh.raw[optoff+1])); | 441 | skb->nh.raw[optoff+1]); |
442 | IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); | 442 | IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); |
443 | goto drop; | 443 | goto drop; |
444 | } | 444 | } |
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index ff3ec9822e36..5176fc655ea9 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c | |||
@@ -67,7 +67,7 @@ | |||
67 | #include <asm/uaccess.h> | 67 | #include <asm/uaccess.h> |
68 | #include <asm/system.h> | 68 | #include <asm/system.h> |
69 | 69 | ||
70 | DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics); | 70 | DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly; |
71 | 71 | ||
72 | /* | 72 | /* |
73 | * The ICMP socket(s). This is the most convenient way to flow control | 73 | * The ICMP socket(s). This is the most convenient way to flow control |
@@ -332,8 +332,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, | |||
332 | * for now we don't know that. | 332 | * for now we don't know that. |
333 | */ | 333 | */ |
334 | if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { | 334 | if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { |
335 | LIMIT_NETDEBUG( | 335 | LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"); |
336 | printk(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n")); | ||
337 | return; | 336 | return; |
338 | } | 337 | } |
339 | 338 | ||
@@ -341,8 +340,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, | |||
341 | * Never answer to a ICMP packet. | 340 | * Never answer to a ICMP packet. |
342 | */ | 341 | */ |
343 | if (is_ineligible(skb)) { | 342 | if (is_ineligible(skb)) { |
344 | LIMIT_NETDEBUG( | 343 | LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n"); |
345 | printk(KERN_DEBUG "icmpv6_send: no reply to icmp error\n")); | ||
346 | return; | 344 | return; |
347 | } | 345 | } |
348 | 346 | ||
@@ -393,8 +391,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, | |||
393 | len = skb->len - msg.offset; | 391 | len = skb->len - msg.offset; |
394 | len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr)); | 392 | len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr)); |
395 | if (len < 0) { | 393 | if (len < 0) { |
396 | LIMIT_NETDEBUG( | 394 | LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n"); |
397 | printk(KERN_DEBUG "icmp: len problem\n")); | ||
398 | goto out_dst_release; | 395 | goto out_dst_release; |
399 | } | 396 | } |
400 | 397 | ||
@@ -551,7 +548,8 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info) | |||
551 | 548 | ||
552 | read_lock(&raw_v6_lock); | 549 | read_lock(&raw_v6_lock); |
553 | if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) { | 550 | if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) { |
554 | while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr))) { | 551 | while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, |
552 | skb->dev->ifindex))) { | ||
555 | rawv6_err(sk, skb, NULL, type, code, inner_offset, info); | 553 | rawv6_err(sk, skb, NULL, type, code, inner_offset, info); |
556 | sk = sk_next(sk); | 554 | sk = sk_next(sk); |
557 | } | 555 | } |
@@ -583,17 +581,15 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
583 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 581 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
584 | if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, | 582 | if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, |
585 | skb->csum)) { | 583 | skb->csum)) { |
586 | LIMIT_NETDEBUG( | 584 | LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n"); |
587 | printk(KERN_DEBUG "ICMPv6 hw checksum failed\n")); | ||
588 | skb->ip_summed = CHECKSUM_NONE; | 585 | skb->ip_summed = CHECKSUM_NONE; |
589 | } | 586 | } |
590 | } | 587 | } |
591 | if (skb->ip_summed == CHECKSUM_NONE) { | 588 | if (skb->ip_summed == CHECKSUM_NONE) { |
592 | if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, | 589 | if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, |
593 | skb_checksum(skb, 0, skb->len, 0))) { | 590 | skb_checksum(skb, 0, skb->len, 0))) { |
594 | LIMIT_NETDEBUG( | 591 | LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", |
595 | printk(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", | 592 | NIP6(*saddr), NIP6(*daddr)); |
596 | NIP6(*saddr), NIP6(*daddr))); | ||
597 | goto discard_it; | 593 | goto discard_it; |
598 | } | 594 | } |
599 | } | 595 | } |
@@ -669,8 +665,7 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
669 | break; | 665 | break; |
670 | 666 | ||
671 | default: | 667 | default: |
672 | LIMIT_NETDEBUG( | 668 | LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n"); |
673 | printk(KERN_DEBUG "icmpv6: msg of unknown type\n")); | ||
674 | 669 | ||
675 | /* informational */ | 670 | /* informational */ |
676 | if (type & ICMPV6_INFOMSG_MASK) | 671 | if (type & ICMPV6_INFOMSG_MASK) |
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c new file mode 100644 index 000000000000..01d5f46d4e40 --- /dev/null +++ b/net/ipv6/inet6_hashtables.c | |||
@@ -0,0 +1,81 @@ | |||
1 | /* | ||
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
3 | * operating system. INET is implemented using the BSD Socket | ||
4 | * interface as the means of communication with the user level. | ||
5 | * | ||
6 | * Generic INET6 transport hashtables | ||
7 | * | ||
8 | * Authors: Lotsa people, from code originally in tcp | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public License | ||
12 | * as published by the Free Software Foundation; either version | ||
13 | * 2 of the License, or (at your option) any later version. | ||
14 | */ | ||
15 | |||
16 | #include <linux/config.h> | ||
17 | |||
18 | #include <linux/module.h> | ||
19 | |||
20 | #include <net/inet_connection_sock.h> | ||
21 | #include <net/inet_hashtables.h> | ||
22 | #include <net/inet6_hashtables.h> | ||
23 | |||
24 | struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, | ||
25 | const struct in6_addr *daddr, | ||
26 | const unsigned short hnum, const int dif) | ||
27 | { | ||
28 | struct sock *sk; | ||
29 | const struct hlist_node *node; | ||
30 | struct sock *result = NULL; | ||
31 | int score, hiscore = 0; | ||
32 | |||
33 | read_lock(&hashinfo->lhash_lock); | ||
34 | sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) { | ||
35 | if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { | ||
36 | const struct ipv6_pinfo *np = inet6_sk(sk); | ||
37 | |||
38 | score = 1; | ||
39 | if (!ipv6_addr_any(&np->rcv_saddr)) { | ||
40 | if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) | ||
41 | continue; | ||
42 | score++; | ||
43 | } | ||
44 | if (sk->sk_bound_dev_if) { | ||
45 | if (sk->sk_bound_dev_if != dif) | ||
46 | continue; | ||
47 | score++; | ||
48 | } | ||
49 | if (score == 3) { | ||
50 | result = sk; | ||
51 | break; | ||
52 | } | ||
53 | if (score > hiscore) { | ||
54 | hiscore = score; | ||
55 | result = sk; | ||
56 | } | ||
57 | } | ||
58 | } | ||
59 | if (result) | ||
60 | sock_hold(result); | ||
61 | read_unlock(&hashinfo->lhash_lock); | ||
62 | return result; | ||
63 | } | ||
64 | |||
65 | EXPORT_SYMBOL_GPL(inet6_lookup_listener); | ||
66 | |||
67 | struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, | ||
68 | const struct in6_addr *saddr, const u16 sport, | ||
69 | const struct in6_addr *daddr, const u16 dport, | ||
70 | const int dif) | ||
71 | { | ||
72 | struct sock *sk; | ||
73 | |||
74 | local_bh_disable(); | ||
75 | sk = __inet6_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif); | ||
76 | local_bh_enable(); | ||
77 | |||
78 | return sk; | ||
79 | } | ||
80 | |||
81 | EXPORT_SYMBOL_GPL(inet6_lookup); | ||
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1b354aa97934..16af874c9e8f 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c | |||
@@ -49,7 +49,7 @@ | |||
49 | 49 | ||
50 | struct rt6_statistics rt6_stats; | 50 | struct rt6_statistics rt6_stats; |
51 | 51 | ||
52 | static kmem_cache_t * fib6_node_kmem; | 52 | static kmem_cache_t * fib6_node_kmem __read_mostly; |
53 | 53 | ||
54 | enum fib_walk_state_t | 54 | enum fib_walk_state_t |
55 | { | 55 | { |
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 10fbb50daea4..6e3480426939 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c | |||
@@ -56,7 +56,7 @@ static inline int ip6_rcv_finish( struct sk_buff *skb) | |||
56 | return dst_input(skb); | 56 | return dst_input(skb); |
57 | } | 57 | } |
58 | 58 | ||
59 | int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 59 | int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
60 | { | 60 | { |
61 | struct ipv6hdr *hdr; | 61 | struct ipv6hdr *hdr; |
62 | u32 pkt_len; | 62 | u32 pkt_len; |
@@ -166,8 +166,8 @@ resubmit: | |||
166 | nexthdr = skb->nh.raw[nhoff]; | 166 | nexthdr = skb->nh.raw[nhoff]; |
167 | 167 | ||
168 | raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]); | 168 | raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]); |
169 | if (raw_sk) | 169 | if (raw_sk && !ipv6_raw_deliver(skb, nexthdr)) |
170 | ipv6_raw_deliver(skb, nexthdr); | 170 | raw_sk = NULL; |
171 | 171 | ||
172 | hash = nexthdr & (MAX_INET_PROTOS - 1); | 172 | hash = nexthdr & (MAX_INET_PROTOS - 1); |
173 | if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { | 173 | if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { |
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ae652ca14bc9..01ef94f7c7f1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c | |||
@@ -153,51 +153,6 @@ int ip6_output(struct sk_buff *skb) | |||
153 | return ip6_output2(skb); | 153 | return ip6_output2(skb); |
154 | } | 154 | } |
155 | 155 | ||
156 | #ifdef CONFIG_NETFILTER | ||
157 | int ip6_route_me_harder(struct sk_buff *skb) | ||
158 | { | ||
159 | struct ipv6hdr *iph = skb->nh.ipv6h; | ||
160 | struct dst_entry *dst; | ||
161 | struct flowi fl = { | ||
162 | .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, | ||
163 | .nl_u = | ||
164 | { .ip6_u = | ||
165 | { .daddr = iph->daddr, | ||
166 | .saddr = iph->saddr, } }, | ||
167 | .proto = iph->nexthdr, | ||
168 | }; | ||
169 | |||
170 | dst = ip6_route_output(skb->sk, &fl); | ||
171 | |||
172 | if (dst->error) { | ||
173 | IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); | ||
174 | LIMIT_NETDEBUG( | ||
175 | printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n")); | ||
176 | dst_release(dst); | ||
177 | return -EINVAL; | ||
178 | } | ||
179 | |||
180 | /* Drop old route. */ | ||
181 | dst_release(skb->dst); | ||
182 | |||
183 | skb->dst = dst; | ||
184 | return 0; | ||
185 | } | ||
186 | #endif | ||
187 | |||
188 | static inline int ip6_maybe_reroute(struct sk_buff *skb) | ||
189 | { | ||
190 | #ifdef CONFIG_NETFILTER | ||
191 | if (skb->nfcache & NFC_ALTERED){ | ||
192 | if (ip6_route_me_harder(skb) != 0){ | ||
193 | kfree_skb(skb); | ||
194 | return -EINVAL; | ||
195 | } | ||
196 | } | ||
197 | #endif /* CONFIG_NETFILTER */ | ||
198 | return dst_output(skb); | ||
199 | } | ||
200 | |||
201 | /* | 156 | /* |
202 | * xmit an sk_buff (used by TCP) | 157 | * xmit an sk_buff (used by TCP) |
203 | */ | 158 | */ |
@@ -266,7 +221,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, | |||
266 | mtu = dst_mtu(dst); | 221 | mtu = dst_mtu(dst); |
267 | if ((skb->len <= mtu) || ipfragok) { | 222 | if ((skb->len <= mtu) || ipfragok) { |
268 | IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); | 223 | IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); |
269 | return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute); | 224 | return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, |
225 | dst_output); | ||
270 | } | 226 | } |
271 | 227 | ||
272 | if (net_ratelimit()) | 228 | if (net_ratelimit()) |
@@ -321,7 +277,9 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel) | |||
321 | read_lock(&ip6_ra_lock); | 277 | read_lock(&ip6_ra_lock); |
322 | for (ra = ip6_ra_chain; ra; ra = ra->next) { | 278 | for (ra = ip6_ra_chain; ra; ra = ra->next) { |
323 | struct sock *sk = ra->sk; | 279 | struct sock *sk = ra->sk; |
324 | if (sk && ra->sel == sel) { | 280 | if (sk && ra->sel == sel && |
281 | (!sk->sk_bound_dev_if || | ||
282 | sk->sk_bound_dev_if == skb->dev->ifindex)) { | ||
325 | if (last) { | 283 | if (last) { |
326 | struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); | 284 | struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); |
327 | if (skb2) | 285 | if (skb2) |
@@ -667,7 +625,7 @@ slow_path: | |||
667 | */ | 625 | */ |
668 | 626 | ||
669 | if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { | 627 | if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { |
670 | NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n")); | 628 | NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); |
671 | IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); | 629 | IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); |
672 | err = -ENOMEM; | 630 | err = -ENOMEM; |
673 | goto fail; | 631 | goto fail; |
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 3bc144a79fa5..76466af8331e 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c | |||
@@ -55,7 +55,7 @@ | |||
55 | 55 | ||
56 | #include <asm/uaccess.h> | 56 | #include <asm/uaccess.h> |
57 | 57 | ||
58 | DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); | 58 | DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly; |
59 | 59 | ||
60 | static struct packet_type ipv6_packet_type = { | 60 | static struct packet_type ipv6_packet_type = { |
61 | .type = __constant_htons(ETH_P_IPV6), | 61 | .type = __constant_htons(ETH_P_IPV6), |
@@ -109,13 +109,6 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) | |||
109 | return 0; | 109 | return 0; |
110 | } | 110 | } |
111 | 111 | ||
112 | extern int ip6_mc_source(int add, int omode, struct sock *sk, | ||
113 | struct group_source_req *pgsr); | ||
114 | extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); | ||
115 | extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, | ||
116 | struct group_filter __user *optval, int __user *optlen); | ||
117 | |||
118 | |||
119 | int ipv6_setsockopt(struct sock *sk, int level, int optname, | 112 | int ipv6_setsockopt(struct sock *sk, int level, int optname, |
120 | char __user *optval, int optlen) | 113 | char __user *optval, int optlen) |
121 | { | 114 | { |
@@ -163,6 +156,13 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, | |||
163 | fl6_free_socklist(sk); | 156 | fl6_free_socklist(sk); |
164 | ipv6_sock_mc_close(sk); | 157 | ipv6_sock_mc_close(sk); |
165 | 158 | ||
159 | /* | ||
160 | * Sock is moving from IPv6 to IPv4 (sk_prot), so | ||
161 | * remove it from the refcnt debug socks count in the | ||
162 | * original family... | ||
163 | */ | ||
164 | sk_refcnt_debug_dec(sk); | ||
165 | |||
166 | if (sk->sk_protocol == IPPROTO_TCP) { | 166 | if (sk->sk_protocol == IPPROTO_TCP) { |
167 | struct tcp_sock *tp = tcp_sk(sk); | 167 | struct tcp_sock *tp = tcp_sk(sk); |
168 | 168 | ||
@@ -192,9 +192,11 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, | |||
192 | kfree_skb(pktopt); | 192 | kfree_skb(pktopt); |
193 | 193 | ||
194 | sk->sk_destruct = inet_sock_destruct; | 194 | sk->sk_destruct = inet_sock_destruct; |
195 | #ifdef INET_REFCNT_DEBUG | 195 | /* |
196 | atomic_dec(&inet6_sock_nr); | 196 | * ... and add it to the refcnt debug socks count |
197 | #endif | 197 | * in the new family. -acme |
198 | */ | ||
199 | sk_refcnt_debug_inc(sk); | ||
198 | module_put(THIS_MODULE); | 200 | module_put(THIS_MODULE); |
199 | retv = 0; | 201 | retv = 0; |
200 | break; | 202 | break; |
@@ -437,7 +439,6 @@ done: | |||
437 | } | 439 | } |
438 | case MCAST_MSFILTER: | 440 | case MCAST_MSFILTER: |
439 | { | 441 | { |
440 | extern int sysctl_optmem_max; | ||
441 | extern int sysctl_mld_max_msf; | 442 | extern int sysctl_mld_max_msf; |
442 | struct group_filter *gsf; | 443 | struct group_filter *gsf; |
443 | 444 | ||
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c index 5ade5a5d1990..37a4a99c9fe9 100644 --- a/net/ipv6/ipv6_syms.c +++ b/net/ipv6/ipv6_syms.c | |||
@@ -15,9 +15,6 @@ EXPORT_SYMBOL(ndisc_mc_map); | |||
15 | EXPORT_SYMBOL(register_inet6addr_notifier); | 15 | EXPORT_SYMBOL(register_inet6addr_notifier); |
16 | EXPORT_SYMBOL(unregister_inet6addr_notifier); | 16 | EXPORT_SYMBOL(unregister_inet6addr_notifier); |
17 | EXPORT_SYMBOL(ip6_route_output); | 17 | EXPORT_SYMBOL(ip6_route_output); |
18 | #ifdef CONFIG_NETFILTER | ||
19 | EXPORT_SYMBOL(ip6_route_me_harder); | ||
20 | #endif | ||
21 | EXPORT_SYMBOL(addrconf_lock); | 18 | EXPORT_SYMBOL(addrconf_lock); |
22 | EXPORT_SYMBOL(ipv6_setsockopt); | 19 | EXPORT_SYMBOL(ipv6_setsockopt); |
23 | EXPORT_SYMBOL(ipv6_getsockopt); | 20 | EXPORT_SYMBOL(ipv6_getsockopt); |
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7ae72d4c9bd2..a7eae30f4554 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c | |||
@@ -812,7 +812,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) | |||
812 | if (ipv6_chk_acast_addr(dev, &msg->target) || | 812 | if (ipv6_chk_acast_addr(dev, &msg->target) || |
813 | (idev->cnf.forwarding && | 813 | (idev->cnf.forwarding && |
814 | pneigh_lookup(&nd_tbl, &msg->target, dev, 0))) { | 814 | pneigh_lookup(&nd_tbl, &msg->target, dev, 0))) { |
815 | if (skb->stamp.tv_sec != LOCALLY_ENQUEUED && | 815 | if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && |
816 | skb->pkt_type != PACKET_HOST && | 816 | skb->pkt_type != PACKET_HOST && |
817 | inc != 0 && | 817 | inc != 0 && |
818 | idev->nd_parms->proxy_delay != 0) { | 818 | idev->nd_parms->proxy_delay != 0) { |
@@ -1487,6 +1487,8 @@ int ndisc_rcv(struct sk_buff *skb) | |||
1487 | return 0; | 1487 | return 0; |
1488 | } | 1488 | } |
1489 | 1489 | ||
1490 | memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); | ||
1491 | |||
1490 | switch (msg->icmph.icmp6_type) { | 1492 | switch (msg->icmph.icmp6_type) { |
1491 | case NDISC_NEIGHBOUR_SOLICITATION: | 1493 | case NDISC_NEIGHBOUR_SOLICITATION: |
1492 | ndisc_recv_ns(skb); | 1494 | ndisc_recv_ns(skb); |
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c new file mode 100644 index 000000000000..f8626ebf90fd --- /dev/null +++ b/net/ipv6/netfilter.c | |||
@@ -0,0 +1,104 @@ | |||
1 | #include <linux/config.h> | ||
2 | #include <linux/init.h> | ||
3 | |||
4 | #ifdef CONFIG_NETFILTER | ||
5 | |||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/ipv6.h> | ||
8 | #include <linux/netfilter.h> | ||
9 | #include <linux/netfilter_ipv6.h> | ||
10 | #include <net/dst.h> | ||
11 | #include <net/ipv6.h> | ||
12 | #include <net/ip6_route.h> | ||
13 | |||
14 | int ip6_route_me_harder(struct sk_buff *skb) | ||
15 | { | ||
16 | struct ipv6hdr *iph = skb->nh.ipv6h; | ||
17 | struct dst_entry *dst; | ||
18 | struct flowi fl = { | ||
19 | .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, | ||
20 | .nl_u = | ||
21 | { .ip6_u = | ||
22 | { .daddr = iph->daddr, | ||
23 | .saddr = iph->saddr, } }, | ||
24 | .proto = iph->nexthdr, | ||
25 | }; | ||
26 | |||
27 | dst = ip6_route_output(skb->sk, &fl); | ||
28 | |||
29 | if (dst->error) { | ||
30 | IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); | ||
31 | LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); | ||
32 | dst_release(dst); | ||
33 | return -EINVAL; | ||
34 | } | ||
35 | |||
36 | /* Drop old route. */ | ||
37 | dst_release(skb->dst); | ||
38 | |||
39 | skb->dst = dst; | ||
40 | return 0; | ||
41 | } | ||
42 | EXPORT_SYMBOL(ip6_route_me_harder); | ||
43 | |||
44 | /* | ||
45 | * Extra routing may needed on local out, as the QUEUE target never | ||
46 | * returns control to the table. | ||
47 | */ | ||
48 | |||
49 | struct ip6_rt_info { | ||
50 | struct in6_addr daddr; | ||
51 | struct in6_addr saddr; | ||
52 | }; | ||
53 | |||
54 | static void save(const struct sk_buff *skb, struct nf_info *info) | ||
55 | { | ||
56 | struct ip6_rt_info *rt_info = nf_info_reroute(info); | ||
57 | |||
58 | if (info->hook == NF_IP6_LOCAL_OUT) { | ||
59 | struct ipv6hdr *iph = skb->nh.ipv6h; | ||
60 | |||
61 | rt_info->daddr = iph->daddr; | ||
62 | rt_info->saddr = iph->saddr; | ||
63 | } | ||
64 | } | ||
65 | |||
66 | static int reroute(struct sk_buff **pskb, const struct nf_info *info) | ||
67 | { | ||
68 | struct ip6_rt_info *rt_info = nf_info_reroute(info); | ||
69 | |||
70 | if (info->hook == NF_IP6_LOCAL_OUT) { | ||
71 | struct ipv6hdr *iph = (*pskb)->nh.ipv6h; | ||
72 | if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || | ||
73 | !ipv6_addr_equal(&iph->saddr, &rt_info->saddr)) | ||
74 | return ip6_route_me_harder(*pskb); | ||
75 | } | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | static struct nf_queue_rerouter ip6_reroute = { | ||
80 | .rer_size = sizeof(struct ip6_rt_info), | ||
81 | .save = &save, | ||
82 | .reroute = &reroute, | ||
83 | }; | ||
84 | |||
85 | int __init ipv6_netfilter_init(void) | ||
86 | { | ||
87 | return nf_register_queue_rerouter(PF_INET6, &ip6_reroute); | ||
88 | } | ||
89 | |||
90 | void ipv6_netfilter_fini(void) | ||
91 | { | ||
92 | nf_unregister_queue_rerouter(PF_INET6); | ||
93 | } | ||
94 | |||
95 | #else /* CONFIG_NETFILTER */ | ||
96 | int __init ipv6_netfilter_init(void) | ||
97 | { | ||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | void ipv6_netfilter_fini(void) | ||
102 | { | ||
103 | } | ||
104 | #endif /* CONFIG_NETFILTER */ | ||
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 77ec704c9ee3..216fbe1ac65c 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig | |||
@@ -10,13 +10,16 @@ menu "IPv6: Netfilter Configuration (EXPERIMENTAL)" | |||
10 | # dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK | 10 | # dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK |
11 | #fi | 11 | #fi |
12 | config IP6_NF_QUEUE | 12 | config IP6_NF_QUEUE |
13 | tristate "Userspace queueing via NETLINK" | 13 | tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" |
14 | ---help--- | 14 | ---help--- |
15 | 15 | ||
16 | This option adds a queue handler to the kernel for IPv6 | 16 | This option adds a queue handler to the kernel for IPv6 |
17 | packets which lets us to receive the filtered packets | 17 | packets which enables users to receive the filtered packets |
18 | with QUEUE target using libiptc as we can do with | 18 | with QUEUE target using libipq. |
19 | the IPv4 now. | 19 | |
20 | THis option enables the old IPv6-only "ip6_queue" implementation | ||
21 | which has been obsoleted by the new "nfnetlink_queue" code (see | ||
22 | CONFIG_NETFILTER_NETLINK_QUEUE). | ||
20 | 23 | ||
21 | (C) Fernando Anton 2001 | 24 | (C) Fernando Anton 2001 |
22 | IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. | 25 | IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. |
@@ -196,6 +199,16 @@ config IP6_NF_TARGET_LOG | |||
196 | 199 | ||
197 | To compile it as a module, choose M here. If unsure, say N. | 200 | To compile it as a module, choose M here. If unsure, say N. |
198 | 201 | ||
202 | config IP6_NF_TARGET_REJECT | ||
203 | tristate "REJECT target support" | ||
204 | depends on IP6_NF_FILTER | ||
205 | help | ||
206 | The REJECT target allows a filtering rule to specify that an ICMPv6 | ||
207 | error should be issued in response to an incoming packet, rather | ||
208 | than silently being dropped. | ||
209 | |||
210 | To compile it as a module, choose M here. If unsure, say N. | ||
211 | |||
199 | # if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then | 212 | # if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then |
200 | # dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER | 213 | # dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER |
201 | # if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then | 214 | # if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then |
@@ -226,6 +239,22 @@ config IP6_NF_TARGET_MARK | |||
226 | 239 | ||
227 | To compile it as a module, choose M here. If unsure, say N. | 240 | To compile it as a module, choose M here. If unsure, say N. |
228 | 241 | ||
242 | config IP6_NF_TARGET_HL | ||
243 | tristate 'HL (hoplimit) target support' | ||
244 | depends on IP6_NF_MANGLE | ||
245 | help | ||
246 | This option adds a `HL' target, which enables the user to decrement | ||
247 | the hoplimit value of the IPv6 header or set it to a given (lower) | ||
248 | value. | ||
249 | |||
250 | While it is safe to decrement the hoplimit value, this option also | ||
251 | enables functionality to increment and set the hoplimit value of the | ||
252 | IPv6 header to arbitrary values. This is EXTREMELY DANGEROUS since | ||
253 | you can easily create immortal packets that loop forever on the | ||
254 | network. | ||
255 | |||
256 | To compile it as a module, choose M here. If unsure, say N. | ||
257 | |||
229 | #dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES | 258 | #dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES |
230 | config IP6_NF_RAW | 259 | config IP6_NF_RAW |
231 | tristate 'raw table support (required for TRACE)' | 260 | tristate 'raw table support (required for TRACE)' |
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 2e51714953b6..bd9a16a5cbba 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile | |||
@@ -20,7 +20,10 @@ obj-$(CONFIG_IP6_NF_MATCH_PHYSDEV) += ip6t_physdev.o | |||
20 | obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o | 20 | obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o |
21 | obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o | 21 | obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o |
22 | obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o | 22 | obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o |
23 | obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o | ||
23 | obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o | 24 | obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o |
24 | obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o | 25 | obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o |
25 | obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o | 26 | obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o |
26 | obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o | 27 | obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o |
28 | obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o | ||
29 | obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ip6t_NFQUEUE.o | ||
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index a16df5b27c84..aa11cf366efa 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c | |||
@@ -47,16 +47,10 @@ | |||
47 | #define NET_IPQ_QMAX 2088 | 47 | #define NET_IPQ_QMAX 2088 |
48 | #define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" | 48 | #define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" |
49 | 49 | ||
50 | struct ipq_rt_info { | ||
51 | struct in6_addr daddr; | ||
52 | struct in6_addr saddr; | ||
53 | }; | ||
54 | |||
55 | struct ipq_queue_entry { | 50 | struct ipq_queue_entry { |
56 | struct list_head list; | 51 | struct list_head list; |
57 | struct nf_info *info; | 52 | struct nf_info *info; |
58 | struct sk_buff *skb; | 53 | struct sk_buff *skb; |
59 | struct ipq_rt_info rt_info; | ||
60 | }; | 54 | }; |
61 | 55 | ||
62 | typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); | 56 | typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); |
@@ -244,8 +238,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) | |||
244 | 238 | ||
245 | pmsg->packet_id = (unsigned long )entry; | 239 | pmsg->packet_id = (unsigned long )entry; |
246 | pmsg->data_len = data_len; | 240 | pmsg->data_len = data_len; |
247 | pmsg->timestamp_sec = entry->skb->stamp.tv_sec; | 241 | pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec; |
248 | pmsg->timestamp_usec = entry->skb->stamp.tv_usec; | 242 | pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec; |
249 | pmsg->mark = entry->skb->nfmark; | 243 | pmsg->mark = entry->skb->nfmark; |
250 | pmsg->hook = entry->info->hook; | 244 | pmsg->hook = entry->info->hook; |
251 | pmsg->hw_protocol = entry->skb->protocol; | 245 | pmsg->hw_protocol = entry->skb->protocol; |
@@ -284,7 +278,8 @@ nlmsg_failure: | |||
284 | } | 278 | } |
285 | 279 | ||
286 | static int | 280 | static int |
287 | ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) | 281 | ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, |
282 | unsigned int queuenum, void *data) | ||
288 | { | 283 | { |
289 | int status = -EINVAL; | 284 | int status = -EINVAL; |
290 | struct sk_buff *nskb; | 285 | struct sk_buff *nskb; |
@@ -302,13 +297,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) | |||
302 | entry->info = info; | 297 | entry->info = info; |
303 | entry->skb = skb; | 298 | entry->skb = skb; |
304 | 299 | ||
305 | if (entry->info->hook == NF_IP_LOCAL_OUT) { | ||
306 | struct ipv6hdr *iph = skb->nh.ipv6h; | ||
307 | |||
308 | entry->rt_info.daddr = iph->daddr; | ||
309 | entry->rt_info.saddr = iph->saddr; | ||
310 | } | ||
311 | |||
312 | nskb = ipq_build_packet_message(entry, &status); | 300 | nskb = ipq_build_packet_message(entry, &status); |
313 | if (nskb == NULL) | 301 | if (nskb == NULL) |
314 | goto err_out_free; | 302 | goto err_out_free; |
@@ -384,23 +372,11 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) | |||
384 | } | 372 | } |
385 | skb_put(e->skb, diff); | 373 | skb_put(e->skb, diff); |
386 | } | 374 | } |
387 | if (!skb_ip_make_writable(&e->skb, v->data_len)) | 375 | if (!skb_make_writable(&e->skb, v->data_len)) |
388 | return -ENOMEM; | 376 | return -ENOMEM; |
389 | memcpy(e->skb->data, v->payload, v->data_len); | 377 | memcpy(e->skb->data, v->payload, v->data_len); |
390 | e->skb->ip_summed = CHECKSUM_NONE; | 378 | e->skb->ip_summed = CHECKSUM_NONE; |
391 | e->skb->nfcache |= NFC_ALTERED; | 379 | |
392 | |||
393 | /* | ||
394 | * Extra routing may needed on local out, as the QUEUE target never | ||
395 | * returns control to the table. | ||
396 | * Not a nice way to cmp, but works | ||
397 | */ | ||
398 | if (e->info->hook == NF_IP_LOCAL_OUT) { | ||
399 | struct ipv6hdr *iph = e->skb->nh.ipv6h; | ||
400 | if (!ipv6_addr_equal(&iph->daddr, &e->rt_info.daddr) || | ||
401 | !ipv6_addr_equal(&iph->saddr, &e->rt_info.saddr)) | ||
402 | return ip6_route_me_harder(e->skb); | ||
403 | } | ||
404 | return 0; | 380 | return 0; |
405 | } | 381 | } |
406 | 382 | ||
@@ -676,6 +652,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) | |||
676 | return len; | 652 | return len; |
677 | } | 653 | } |
678 | 654 | ||
655 | static struct nf_queue_handler nfqh = { | ||
656 | .name = "ip6_queue", | ||
657 | .outfn = &ipq_enqueue_packet, | ||
658 | }; | ||
659 | |||
679 | static int | 660 | static int |
680 | init_or_cleanup(int init) | 661 | init_or_cleanup(int init) |
681 | { | 662 | { |
@@ -686,7 +667,8 @@ init_or_cleanup(int init) | |||
686 | goto cleanup; | 667 | goto cleanup; |
687 | 668 | ||
688 | netlink_register_notifier(&ipq_nl_notifier); | 669 | netlink_register_notifier(&ipq_nl_notifier); |
689 | ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk); | 670 | ipqnl = netlink_kernel_create(NETLINK_IP6_FW, 0, ipq_rcv_sk, |
671 | THIS_MODULE); | ||
690 | if (ipqnl == NULL) { | 672 | if (ipqnl == NULL) { |
691 | printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); | 673 | printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); |
692 | goto cleanup_netlink_notifier; | 674 | goto cleanup_netlink_notifier; |
@@ -703,7 +685,7 @@ init_or_cleanup(int init) | |||
703 | register_netdevice_notifier(&ipq_dev_notifier); | 685 | register_netdevice_notifier(&ipq_dev_notifier); |
704 | ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); | 686 | ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); |
705 | 687 | ||
706 | status = nf_register_queue_handler(PF_INET6, ipq_enqueue_packet, NULL); | 688 | status = nf_register_queue_handler(PF_INET6, &nfqh); |
707 | if (status < 0) { | 689 | if (status < 0) { |
708 | printk(KERN_ERR "ip6_queue: failed to register queue handler\n"); | 690 | printk(KERN_ERR "ip6_queue: failed to register queue handler\n"); |
709 | goto cleanup_sysctl; | 691 | goto cleanup_sysctl; |
@@ -711,7 +693,7 @@ init_or_cleanup(int init) | |||
711 | return status; | 693 | return status; |
712 | 694 | ||
713 | cleanup: | 695 | cleanup: |
714 | nf_unregister_queue_handler(PF_INET6); | 696 | nf_unregister_queue_handlers(&nfqh); |
715 | synchronize_net(); | 697 | synchronize_net(); |
716 | ipq_flush(NF_DROP); | 698 | ipq_flush(NF_DROP); |
717 | 699 | ||
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 73034511c8db..1cb8adb2787f 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c | |||
@@ -401,7 +401,6 @@ ip6t_do_table(struct sk_buff **pskb, | |||
401 | do { | 401 | do { |
402 | IP_NF_ASSERT(e); | 402 | IP_NF_ASSERT(e); |
403 | IP_NF_ASSERT(back); | 403 | IP_NF_ASSERT(back); |
404 | (*pskb)->nfcache |= e->nfcache; | ||
405 | if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6, | 404 | if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6, |
406 | &protoff, &offset)) { | 405 | &protoff, &offset)) { |
407 | struct ip6t_entry_target *t; | 406 | struct ip6t_entry_target *t; |
@@ -434,8 +433,8 @@ ip6t_do_table(struct sk_buff **pskb, | |||
434 | back->comefrom); | 433 | back->comefrom); |
435 | continue; | 434 | continue; |
436 | } | 435 | } |
437 | if (table_base + v | 436 | if (table_base + v != (void *)e + e->next_offset |
438 | != (void *)e + e->next_offset) { | 437 | && !(e->ipv6.flags & IP6T_F_GOTO)) { |
439 | /* Save old back ptr in next entry */ | 438 | /* Save old back ptr in next entry */ |
440 | struct ip6t_entry *next | 439 | struct ip6t_entry *next |
441 | = (void *)e + e->next_offset; | 440 | = (void *)e + e->next_offset; |
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c new file mode 100644 index 000000000000..8f5549b72720 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_HL.c | |||
@@ -0,0 +1,118 @@ | |||
1 | /* | ||
2 | * Hop Limit modification target for ip6tables | ||
3 | * Maciej Soltysiak <solt@dns.toxicfilms.tv> | ||
4 | * Based on HW's TTL module | ||
5 | * | ||
6 | * This software is distributed under the terms of GNU GPL | ||
7 | */ | ||
8 | |||
9 | #include <linux/module.h> | ||
10 | #include <linux/skbuff.h> | ||
11 | #include <linux/ip.h> | ||
12 | |||
13 | #include <linux/netfilter_ipv6/ip6_tables.h> | ||
14 | #include <linux/netfilter_ipv6/ip6t_HL.h> | ||
15 | |||
16 | MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); | ||
17 | MODULE_DESCRIPTION("IP tables Hop Limit modification module"); | ||
18 | MODULE_LICENSE("GPL"); | ||
19 | |||
20 | static unsigned int ip6t_hl_target(struct sk_buff **pskb, | ||
21 | const struct net_device *in, | ||
22 | const struct net_device *out, | ||
23 | unsigned int hooknum, | ||
24 | const void *targinfo, void *userinfo) | ||
25 | { | ||
26 | struct ipv6hdr *ip6h; | ||
27 | const struct ip6t_HL_info *info = targinfo; | ||
28 | u_int16_t diffs[2]; | ||
29 | int new_hl; | ||
30 | |||
31 | if (!skb_make_writable(pskb, (*pskb)->len)) | ||
32 | return NF_DROP; | ||
33 | |||
34 | ip6h = (*pskb)->nh.ipv6h; | ||
35 | |||
36 | switch (info->mode) { | ||
37 | case IP6T_HL_SET: | ||
38 | new_hl = info->hop_limit; | ||
39 | break; | ||
40 | case IP6T_HL_INC: | ||
41 | new_hl = ip6h->hop_limit + info->hop_limit; | ||
42 | if (new_hl > 255) | ||
43 | new_hl = 255; | ||
44 | break; | ||
45 | case IP6T_HL_DEC: | ||
46 | new_hl = ip6h->hop_limit - info->hop_limit; | ||
47 | if (new_hl < 0) | ||
48 | new_hl = 0; | ||
49 | break; | ||
50 | default: | ||
51 | new_hl = ip6h->hop_limit; | ||
52 | break; | ||
53 | } | ||
54 | |||
55 | if (new_hl != ip6h->hop_limit) { | ||
56 | diffs[0] = htons(((unsigned)ip6h->hop_limit) << 8) ^ 0xFFFF; | ||
57 | ip6h->hop_limit = new_hl; | ||
58 | diffs[1] = htons(((unsigned)ip6h->hop_limit) << 8); | ||
59 | } | ||
60 | |||
61 | return IP6T_CONTINUE; | ||
62 | } | ||
63 | |||
64 | static int ip6t_hl_checkentry(const char *tablename, | ||
65 | const struct ip6t_entry *e, | ||
66 | void *targinfo, | ||
67 | unsigned int targinfosize, | ||
68 | unsigned int hook_mask) | ||
69 | { | ||
70 | struct ip6t_HL_info *info = targinfo; | ||
71 | |||
72 | if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_HL_info))) { | ||
73 | printk(KERN_WARNING "ip6t_HL: targinfosize %u != %Zu\n", | ||
74 | targinfosize, | ||
75 | IP6T_ALIGN(sizeof(struct ip6t_HL_info))); | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | if (strcmp(tablename, "mangle")) { | ||
80 | printk(KERN_WARNING "ip6t_HL: can only be called from " | ||
81 | "\"mangle\" table, not \"%s\"\n", tablename); | ||
82 | return 0; | ||
83 | } | ||
84 | |||
85 | if (info->mode > IP6T_HL_MAXMODE) { | ||
86 | printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n", | ||
87 | info->mode); | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | if ((info->mode != IP6T_HL_SET) && (info->hop_limit == 0)) { | ||
92 | printk(KERN_WARNING "ip6t_HL: increment/decrement doesn't " | ||
93 | "make sense with value 0\n"); | ||
94 | return 0; | ||
95 | } | ||
96 | |||
97 | return 1; | ||
98 | } | ||
99 | |||
100 | static struct ip6t_target ip6t_HL = { | ||
101 | .name = "HL", | ||
102 | .target = ip6t_hl_target, | ||
103 | .checkentry = ip6t_hl_checkentry, | ||
104 | .me = THIS_MODULE | ||
105 | }; | ||
106 | |||
107 | static int __init init(void) | ||
108 | { | ||
109 | return ip6t_register_target(&ip6t_HL); | ||
110 | } | ||
111 | |||
112 | static void __exit fini(void) | ||
113 | { | ||
114 | ip6t_unregister_target(&ip6t_HL); | ||
115 | } | ||
116 | |||
117 | module_init(init); | ||
118 | module_exit(fini); | ||
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index a692e26a4fa3..0cd1d1bd9033 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c | |||
@@ -26,10 +26,6 @@ MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); | |||
26 | MODULE_DESCRIPTION("IP6 tables LOG target module"); | 26 | MODULE_DESCRIPTION("IP6 tables LOG target module"); |
27 | MODULE_LICENSE("GPL"); | 27 | MODULE_LICENSE("GPL"); |
28 | 28 | ||
29 | static unsigned int nflog = 1; | ||
30 | module_param(nflog, int, 0400); | ||
31 | MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); | ||
32 | |||
33 | struct in_device; | 29 | struct in_device; |
34 | #include <net/route.h> | 30 | #include <net/route.h> |
35 | #include <linux/netfilter_ipv6/ip6t_LOG.h> | 31 | #include <linux/netfilter_ipv6/ip6t_LOG.h> |
@@ -44,7 +40,7 @@ struct in_device; | |||
44 | static DEFINE_SPINLOCK(log_lock); | 40 | static DEFINE_SPINLOCK(log_lock); |
45 | 41 | ||
46 | /* One level of recursion won't kill us */ | 42 | /* One level of recursion won't kill us */ |
47 | static void dump_packet(const struct ip6t_log_info *info, | 43 | static void dump_packet(const struct nf_loginfo *info, |
48 | const struct sk_buff *skb, unsigned int ip6hoff, | 44 | const struct sk_buff *skb, unsigned int ip6hoff, |
49 | int recurse) | 45 | int recurse) |
50 | { | 46 | { |
@@ -53,6 +49,12 @@ static void dump_packet(const struct ip6t_log_info *info, | |||
53 | struct ipv6hdr _ip6h, *ih; | 49 | struct ipv6hdr _ip6h, *ih; |
54 | unsigned int ptr; | 50 | unsigned int ptr; |
55 | unsigned int hdrlen = 0; | 51 | unsigned int hdrlen = 0; |
52 | unsigned int logflags; | ||
53 | |||
54 | if (info->type == NF_LOG_TYPE_LOG) | ||
55 | logflags = info->u.log.logflags; | ||
56 | else | ||
57 | logflags = NF_LOG_MASK; | ||
56 | 58 | ||
57 | ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); | 59 | ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); |
58 | if (ih == NULL) { | 60 | if (ih == NULL) { |
@@ -84,7 +86,7 @@ static void dump_packet(const struct ip6t_log_info *info, | |||
84 | } | 86 | } |
85 | 87 | ||
86 | /* Max length: 48 "OPT (...) " */ | 88 | /* Max length: 48 "OPT (...) " */ |
87 | if (info->logflags & IP6T_LOG_IPOPT) | 89 | if (logflags & IP6T_LOG_IPOPT) |
88 | printk("OPT ( "); | 90 | printk("OPT ( "); |
89 | 91 | ||
90 | switch (currenthdr) { | 92 | switch (currenthdr) { |
@@ -119,7 +121,7 @@ static void dump_packet(const struct ip6t_log_info *info, | |||
119 | case IPPROTO_ROUTING: | 121 | case IPPROTO_ROUTING: |
120 | case IPPROTO_HOPOPTS: | 122 | case IPPROTO_HOPOPTS: |
121 | if (fragment) { | 123 | if (fragment) { |
122 | if (info->logflags & IP6T_LOG_IPOPT) | 124 | if (logflags & IP6T_LOG_IPOPT) |
123 | printk(")"); | 125 | printk(")"); |
124 | return; | 126 | return; |
125 | } | 127 | } |
@@ -127,7 +129,7 @@ static void dump_packet(const struct ip6t_log_info *info, | |||
127 | break; | 129 | break; |
128 | /* Max Length */ | 130 | /* Max Length */ |
129 | case IPPROTO_AH: | 131 | case IPPROTO_AH: |
130 | if (info->logflags & IP6T_LOG_IPOPT) { | 132 | if (logflags & IP6T_LOG_IPOPT) { |
131 | struct ip_auth_hdr _ahdr, *ah; | 133 | struct ip_auth_hdr _ahdr, *ah; |
132 | 134 | ||
133 | /* Max length: 3 "AH " */ | 135 | /* Max length: 3 "AH " */ |
@@ -158,7 +160,7 @@ static void dump_packet(const struct ip6t_log_info *info, | |||
158 | hdrlen = (hp->hdrlen+2)<<2; | 160 | hdrlen = (hp->hdrlen+2)<<2; |
159 | break; | 161 | break; |
160 | case IPPROTO_ESP: | 162 | case IPPROTO_ESP: |
161 | if (info->logflags & IP6T_LOG_IPOPT) { | 163 | if (logflags & IP6T_LOG_IPOPT) { |
162 | struct ip_esp_hdr _esph, *eh; | 164 | struct ip_esp_hdr _esph, *eh; |
163 | 165 | ||
164 | /* Max length: 4 "ESP " */ | 166 | /* Max length: 4 "ESP " */ |
@@ -190,7 +192,7 @@ static void dump_packet(const struct ip6t_log_info *info, | |||
190 | printk("Unknown Ext Hdr %u", currenthdr); | 192 | printk("Unknown Ext Hdr %u", currenthdr); |
191 | return; | 193 | return; |
192 | } | 194 | } |
193 | if (info->logflags & IP6T_LOG_IPOPT) | 195 | if (logflags & IP6T_LOG_IPOPT) |
194 | printk(") "); | 196 | printk(") "); |
195 | 197 | ||
196 | currenthdr = hp->nexthdr; | 198 | currenthdr = hp->nexthdr; |
@@ -218,7 +220,7 @@ static void dump_packet(const struct ip6t_log_info *info, | |||
218 | printk("SPT=%u DPT=%u ", | 220 | printk("SPT=%u DPT=%u ", |
219 | ntohs(th->source), ntohs(th->dest)); | 221 | ntohs(th->source), ntohs(th->dest)); |
220 | /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ | 222 | /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ |
221 | if (info->logflags & IP6T_LOG_TCPSEQ) | 223 | if (logflags & IP6T_LOG_TCPSEQ) |
222 | printk("SEQ=%u ACK=%u ", | 224 | printk("SEQ=%u ACK=%u ", |
223 | ntohl(th->seq), ntohl(th->ack_seq)); | 225 | ntohl(th->seq), ntohl(th->ack_seq)); |
224 | /* Max length: 13 "WINDOW=65535 " */ | 226 | /* Max length: 13 "WINDOW=65535 " */ |
@@ -245,7 +247,7 @@ static void dump_packet(const struct ip6t_log_info *info, | |||
245 | /* Max length: 11 "URGP=65535 " */ | 247 | /* Max length: 11 "URGP=65535 " */ |
246 | printk("URGP=%u ", ntohs(th->urg_ptr)); | 248 | printk("URGP=%u ", ntohs(th->urg_ptr)); |
247 | 249 | ||
248 | if ((info->logflags & IP6T_LOG_TCPOPT) | 250 | if ((logflags & IP6T_LOG_TCPOPT) |
249 | && th->doff * 4 > sizeof(struct tcphdr)) { | 251 | && th->doff * 4 > sizeof(struct tcphdr)) { |
250 | u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; | 252 | u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; |
251 | unsigned int i; | 253 | unsigned int i; |
@@ -349,7 +351,7 @@ static void dump_packet(const struct ip6t_log_info *info, | |||
349 | } | 351 | } |
350 | 352 | ||
351 | /* Max length: 15 "UID=4294967295 " */ | 353 | /* Max length: 15 "UID=4294967295 " */ |
352 | if ((info->logflags & IP6T_LOG_UID) && recurse && skb->sk) { | 354 | if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) { |
353 | read_lock_bh(&skb->sk->sk_callback_lock); | 355 | read_lock_bh(&skb->sk->sk_callback_lock); |
354 | if (skb->sk->sk_socket && skb->sk->sk_socket->file) | 356 | if (skb->sk->sk_socket && skb->sk->sk_socket->file) |
355 | printk("UID=%u ", skb->sk->sk_socket->file->f_uid); | 357 | printk("UID=%u ", skb->sk->sk_socket->file->f_uid); |
@@ -357,19 +359,31 @@ static void dump_packet(const struct ip6t_log_info *info, | |||
357 | } | 359 | } |
358 | } | 360 | } |
359 | 361 | ||
362 | static struct nf_loginfo default_loginfo = { | ||
363 | .type = NF_LOG_TYPE_LOG, | ||
364 | .u = { | ||
365 | .log = { | ||
366 | .level = 0, | ||
367 | .logflags = NF_LOG_MASK, | ||
368 | }, | ||
369 | }, | ||
370 | }; | ||
371 | |||
360 | static void | 372 | static void |
361 | ip6t_log_packet(unsigned int hooknum, | 373 | ip6t_log_packet(unsigned int pf, |
374 | unsigned int hooknum, | ||
362 | const struct sk_buff *skb, | 375 | const struct sk_buff *skb, |
363 | const struct net_device *in, | 376 | const struct net_device *in, |
364 | const struct net_device *out, | 377 | const struct net_device *out, |
365 | const struct ip6t_log_info *loginfo, | 378 | const struct nf_loginfo *loginfo, |
366 | const char *level_string, | ||
367 | const char *prefix) | 379 | const char *prefix) |
368 | { | 380 | { |
381 | if (!loginfo) | ||
382 | loginfo = &default_loginfo; | ||
383 | |||
369 | spin_lock_bh(&log_lock); | 384 | spin_lock_bh(&log_lock); |
370 | printk(level_string); | 385 | printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, |
371 | printk("%sIN=%s OUT=%s ", | 386 | prefix, |
372 | prefix == NULL ? loginfo->prefix : prefix, | ||
373 | in ? in->name : "", | 387 | in ? in->name : "", |
374 | out ? out->name : ""); | 388 | out ? out->name : ""); |
375 | if (in && !out) { | 389 | if (in && !out) { |
@@ -416,29 +430,17 @@ ip6t_log_target(struct sk_buff **pskb, | |||
416 | void *userinfo) | 430 | void *userinfo) |
417 | { | 431 | { |
418 | const struct ip6t_log_info *loginfo = targinfo; | 432 | const struct ip6t_log_info *loginfo = targinfo; |
419 | char level_string[4] = "< >"; | 433 | struct nf_loginfo li; |
434 | |||
435 | li.type = NF_LOG_TYPE_LOG; | ||
436 | li.u.log.level = loginfo->level; | ||
437 | li.u.log.logflags = loginfo->logflags; | ||
420 | 438 | ||
421 | level_string[1] = '0' + (loginfo->level % 8); | 439 | nf_log_packet(PF_INET6, hooknum, *pskb, in, out, &li, loginfo->prefix); |
422 | ip6t_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); | ||
423 | 440 | ||
424 | return IP6T_CONTINUE; | 441 | return IP6T_CONTINUE; |
425 | } | 442 | } |
426 | 443 | ||
427 | static void | ||
428 | ip6t_logfn(unsigned int hooknum, | ||
429 | const struct sk_buff *skb, | ||
430 | const struct net_device *in, | ||
431 | const struct net_device *out, | ||
432 | const char *prefix) | ||
433 | { | ||
434 | struct ip6t_log_info loginfo = { | ||
435 | .level = 0, | ||
436 | .logflags = IP6T_LOG_MASK, | ||
437 | .prefix = "" | ||
438 | }; | ||
439 | |||
440 | ip6t_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); | ||
441 | } | ||
442 | 444 | ||
443 | static int ip6t_log_checkentry(const char *tablename, | 445 | static int ip6t_log_checkentry(const char *tablename, |
444 | const struct ip6t_entry *e, | 446 | const struct ip6t_entry *e, |
@@ -475,20 +477,29 @@ static struct ip6t_target ip6t_log_reg = { | |||
475 | .me = THIS_MODULE, | 477 | .me = THIS_MODULE, |
476 | }; | 478 | }; |
477 | 479 | ||
480 | static struct nf_logger ip6t_logger = { | ||
481 | .name = "ip6t_LOG", | ||
482 | .logfn = &ip6t_log_packet, | ||
483 | .me = THIS_MODULE, | ||
484 | }; | ||
485 | |||
478 | static int __init init(void) | 486 | static int __init init(void) |
479 | { | 487 | { |
480 | if (ip6t_register_target(&ip6t_log_reg)) | 488 | if (ip6t_register_target(&ip6t_log_reg)) |
481 | return -EINVAL; | 489 | return -EINVAL; |
482 | if (nflog) | 490 | if (nf_log_register(PF_INET6, &ip6t_logger) < 0) { |
483 | nf_log_register(PF_INET6, &ip6t_logfn); | 491 | printk(KERN_WARNING "ip6t_LOG: not logging via system console " |
492 | "since somebody else already registered for PF_INET6\n"); | ||
493 | /* we cannot make module load fail here, since otherwise | ||
494 | * ip6tables userspace would abort */ | ||
495 | } | ||
484 | 496 | ||
485 | return 0; | 497 | return 0; |
486 | } | 498 | } |
487 | 499 | ||
488 | static void __exit fini(void) | 500 | static void __exit fini(void) |
489 | { | 501 | { |
490 | if (nflog) | 502 | nf_log_unregister_logger(&ip6t_logger); |
491 | nf_log_unregister(PF_INET6, &ip6t_logfn); | ||
492 | ip6t_unregister_target(&ip6t_log_reg); | 503 | ip6t_unregister_target(&ip6t_log_reg); |
493 | } | 504 | } |
494 | 505 | ||
diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c index d09ceb05013a..81924fcc5857 100644 --- a/net/ipv6/netfilter/ip6t_MARK.c +++ b/net/ipv6/netfilter/ip6t_MARK.c | |||
@@ -28,10 +28,9 @@ target(struct sk_buff **pskb, | |||
28 | { | 28 | { |
29 | const struct ip6t_mark_target_info *markinfo = targinfo; | 29 | const struct ip6t_mark_target_info *markinfo = targinfo; |
30 | 30 | ||
31 | if((*pskb)->nfmark != markinfo->mark) { | 31 | if((*pskb)->nfmark != markinfo->mark) |
32 | (*pskb)->nfmark = markinfo->mark; | 32 | (*pskb)->nfmark = markinfo->mark; |
33 | (*pskb)->nfcache |= NFC_ALTERED; | 33 | |
34 | } | ||
35 | return IP6T_CONTINUE; | 34 | return IP6T_CONTINUE; |
36 | } | 35 | } |
37 | 36 | ||
diff --git a/net/ipv6/netfilter/ip6t_NFQUEUE.c b/net/ipv6/netfilter/ip6t_NFQUEUE.c new file mode 100644 index 000000000000..c6e3730e7409 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_NFQUEUE.c | |||
@@ -0,0 +1,70 @@ | |||
1 | /* ip6tables module for using new netfilter netlink queue | ||
2 | * | ||
3 | * (C) 2005 by Harald Welte <laforge@netfilter.org> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License version 2 as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | */ | ||
10 | |||
11 | #include <linux/module.h> | ||
12 | #include <linux/skbuff.h> | ||
13 | |||
14 | #include <linux/netfilter.h> | ||
15 | #include <linux/netfilter_ipv6/ip6_tables.h> | ||
16 | #include <linux/netfilter_ipv4/ipt_NFQUEUE.h> | ||
17 | |||
18 | MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | ||
19 | MODULE_DESCRIPTION("ip6tables NFQUEUE target"); | ||
20 | MODULE_LICENSE("GPL"); | ||
21 | |||
22 | static unsigned int | ||
23 | target(struct sk_buff **pskb, | ||
24 | const struct net_device *in, | ||
25 | const struct net_device *out, | ||
26 | unsigned int hooknum, | ||
27 | const void *targinfo, | ||
28 | void *userinfo) | ||
29 | { | ||
30 | const struct ipt_NFQ_info *tinfo = targinfo; | ||
31 | |||
32 | return NF_QUEUE_NR(tinfo->queuenum); | ||
33 | } | ||
34 | |||
35 | static int | ||
36 | checkentry(const char *tablename, | ||
37 | const struct ip6t_entry *e, | ||
38 | void *targinfo, | ||
39 | unsigned int targinfosize, | ||
40 | unsigned int hook_mask) | ||
41 | { | ||
42 | if (targinfosize != IP6T_ALIGN(sizeof(struct ipt_NFQ_info))) { | ||
43 | printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", | ||
44 | targinfosize, | ||
45 | IP6T_ALIGN(sizeof(struct ipt_NFQ_info))); | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | return 1; | ||
50 | } | ||
51 | |||
52 | static struct ip6t_target ipt_NFQ_reg = { | ||
53 | .name = "NFQUEUE", | ||
54 | .target = target, | ||
55 | .checkentry = checkentry, | ||
56 | .me = THIS_MODULE, | ||
57 | }; | ||
58 | |||
59 | static int __init init(void) | ||
60 | { | ||
61 | return ip6t_register_target(&ipt_NFQ_reg); | ||
62 | } | ||
63 | |||
64 | static void __exit fini(void) | ||
65 | { | ||
66 | ip6t_unregister_target(&ipt_NFQ_reg); | ||
67 | } | ||
68 | |||
69 | module_init(init); | ||
70 | module_exit(fini); | ||
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c new file mode 100644 index 000000000000..14316c3ebde4 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_REJECT.c | |||
@@ -0,0 +1,284 @@ | |||
1 | /* | ||
2 | * IP6 tables REJECT target module | ||
3 | * Linux INET6 implementation | ||
4 | * | ||
5 | * Copyright (C)2003 USAGI/WIDE Project | ||
6 | * | ||
7 | * Authors: | ||
8 | * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp> | ||
9 | * | ||
10 | * Based on net/ipv4/netfilter/ipt_REJECT.c | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License | ||
14 | * as published by the Free Software Foundation; either version | ||
15 | * 2 of the License, or (at your option) any later version. | ||
16 | */ | ||
17 | |||
18 | #include <linux/config.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/skbuff.h> | ||
21 | #include <linux/icmpv6.h> | ||
22 | #include <linux/netdevice.h> | ||
23 | #include <net/ipv6.h> | ||
24 | #include <net/tcp.h> | ||
25 | #include <net/icmp.h> | ||
26 | #include <net/ip6_checksum.h> | ||
27 | #include <net/ip6_fib.h> | ||
28 | #include <net/ip6_route.h> | ||
29 | #include <net/flow.h> | ||
30 | #include <linux/netfilter_ipv6/ip6_tables.h> | ||
31 | #include <linux/netfilter_ipv6/ip6t_REJECT.h> | ||
32 | |||
33 | MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>"); | ||
34 | MODULE_DESCRIPTION("IP6 tables REJECT target module"); | ||
35 | MODULE_LICENSE("GPL"); | ||
36 | |||
37 | #if 0 | ||
38 | #define DEBUGP printk | ||
39 | #else | ||
40 | #define DEBUGP(format, args...) | ||
41 | #endif | ||
42 | |||
43 | /* Send RST reply */ | ||
44 | static void send_reset(struct sk_buff *oldskb) | ||
45 | { | ||
46 | struct sk_buff *nskb; | ||
47 | struct tcphdr otcph, *tcph; | ||
48 | unsigned int otcplen, hh_len; | ||
49 | int tcphoff, needs_ack; | ||
50 | struct ipv6hdr *oip6h = oldskb->nh.ipv6h, *ip6h; | ||
51 | struct dst_entry *dst = NULL; | ||
52 | u8 proto; | ||
53 | struct flowi fl; | ||
54 | |||
55 | if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || | ||
56 | (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { | ||
57 | DEBUGP("ip6t_REJECT: addr is not unicast.\n"); | ||
58 | return; | ||
59 | } | ||
60 | |||
61 | proto = oip6h->nexthdr; | ||
62 | tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto); | ||
63 | |||
64 | if ((tcphoff < 0) || (tcphoff > oldskb->len)) { | ||
65 | DEBUGP("ip6t_REJECT: Can't get TCP header.\n"); | ||
66 | return; | ||
67 | } | ||
68 | |||
69 | otcplen = oldskb->len - tcphoff; | ||
70 | |||
71 | /* IP header checks: fragment, too short. */ | ||
72 | if ((proto != IPPROTO_TCP) || (otcplen < sizeof(struct tcphdr))) { | ||
73 | DEBUGP("ip6t_REJECT: proto(%d) != IPPROTO_TCP, or too short. otcplen = %d\n", | ||
74 | proto, otcplen); | ||
75 | return; | ||
76 | } | ||
77 | |||
78 | if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr))) | ||
79 | BUG(); | ||
80 | |||
81 | /* No RST for RST. */ | ||
82 | if (otcph.rst) { | ||
83 | DEBUGP("ip6t_REJECT: RST is set\n"); | ||
84 | return; | ||
85 | } | ||
86 | |||
87 | /* Check checksum. */ | ||
88 | if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP, | ||
89 | skb_checksum(oldskb, tcphoff, otcplen, 0))) { | ||
90 | DEBUGP("ip6t_REJECT: TCP checksum is invalid\n"); | ||
91 | return; | ||
92 | } | ||
93 | |||
94 | memset(&fl, 0, sizeof(fl)); | ||
95 | fl.proto = IPPROTO_TCP; | ||
96 | ipv6_addr_copy(&fl.fl6_src, &oip6h->daddr); | ||
97 | ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr); | ||
98 | fl.fl_ip_sport = otcph.dest; | ||
99 | fl.fl_ip_dport = otcph.source; | ||
100 | dst = ip6_route_output(NULL, &fl); | ||
101 | if (dst == NULL) | ||
102 | return; | ||
103 | if (dst->error || | ||
104 | xfrm_lookup(&dst, &fl, NULL, 0)) { | ||
105 | dst_release(dst); | ||
106 | return; | ||
107 | } | ||
108 | |||
109 | hh_len = (dst->dev->hard_header_len + 15)&~15; | ||
110 | nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) | ||
111 | + sizeof(struct tcphdr) + dst->trailer_len, | ||
112 | GFP_ATOMIC); | ||
113 | |||
114 | if (!nskb) { | ||
115 | if (net_ratelimit()) | ||
116 | printk("ip6t_REJECT: Can't alloc skb\n"); | ||
117 | dst_release(dst); | ||
118 | return; | ||
119 | } | ||
120 | |||
121 | nskb->dst = dst; | ||
122 | |||
123 | skb_reserve(nskb, hh_len + dst->header_len); | ||
124 | |||
125 | ip6h = nskb->nh.ipv6h = (struct ipv6hdr *) | ||
126 | skb_put(nskb, sizeof(struct ipv6hdr)); | ||
127 | ip6h->version = 6; | ||
128 | ip6h->hop_limit = dst_metric(dst, RTAX_HOPLIMIT); | ||
129 | ip6h->nexthdr = IPPROTO_TCP; | ||
130 | ip6h->payload_len = htons(sizeof(struct tcphdr)); | ||
131 | ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr); | ||
132 | ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr); | ||
133 | |||
134 | tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); | ||
135 | /* Truncate to length (no data) */ | ||
136 | tcph->doff = sizeof(struct tcphdr)/4; | ||
137 | tcph->source = otcph.dest; | ||
138 | tcph->dest = otcph.source; | ||
139 | |||
140 | if (otcph.ack) { | ||
141 | needs_ack = 0; | ||
142 | tcph->seq = otcph.ack_seq; | ||
143 | tcph->ack_seq = 0; | ||
144 | } else { | ||
145 | needs_ack = 1; | ||
146 | tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin | ||
147 | + otcplen - (otcph.doff<<2)); | ||
148 | tcph->seq = 0; | ||
149 | } | ||
150 | |||
151 | /* Reset flags */ | ||
152 | ((u_int8_t *)tcph)[13] = 0; | ||
153 | tcph->rst = 1; | ||
154 | tcph->ack = needs_ack; | ||
155 | tcph->window = 0; | ||
156 | tcph->urg_ptr = 0; | ||
157 | tcph->check = 0; | ||
158 | |||
159 | /* Adjust TCP checksum */ | ||
160 | tcph->check = csum_ipv6_magic(&nskb->nh.ipv6h->saddr, | ||
161 | &nskb->nh.ipv6h->daddr, | ||
162 | sizeof(struct tcphdr), IPPROTO_TCP, | ||
163 | csum_partial((char *)tcph, | ||
164 | sizeof(struct tcphdr), 0)); | ||
165 | |||
166 | NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, nskb, NULL, nskb->dst->dev, | ||
167 | dst_output); | ||
168 | } | ||
169 | |||
170 | static inline void | ||
171 | send_unreach(struct sk_buff *skb_in, unsigned char code, unsigned int hooknum) | ||
172 | { | ||
173 | if (hooknum == NF_IP6_LOCAL_OUT && skb_in->dev == NULL) | ||
174 | skb_in->dev = &loopback_dev; | ||
175 | |||
176 | icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0, NULL); | ||
177 | } | ||
178 | |||
179 | static unsigned int reject6_target(struct sk_buff **pskb, | ||
180 | const struct net_device *in, | ||
181 | const struct net_device *out, | ||
182 | unsigned int hooknum, | ||
183 | const void *targinfo, | ||
184 | void *userinfo) | ||
185 | { | ||
186 | const struct ip6t_reject_info *reject = targinfo; | ||
187 | |||
188 | DEBUGP(KERN_DEBUG "%s: medium point\n", __FUNCTION__); | ||
189 | /* WARNING: This code causes reentry within ip6tables. | ||
190 | This means that the ip6tables jump stack is now crap. We | ||
191 | must return an absolute verdict. --RR */ | ||
192 | switch (reject->with) { | ||
193 | case IP6T_ICMP6_NO_ROUTE: | ||
194 | send_unreach(*pskb, ICMPV6_NOROUTE, hooknum); | ||
195 | break; | ||
196 | case IP6T_ICMP6_ADM_PROHIBITED: | ||
197 | send_unreach(*pskb, ICMPV6_ADM_PROHIBITED, hooknum); | ||
198 | break; | ||
199 | case IP6T_ICMP6_NOT_NEIGHBOUR: | ||
200 | send_unreach(*pskb, ICMPV6_NOT_NEIGHBOUR, hooknum); | ||
201 | break; | ||
202 | case IP6T_ICMP6_ADDR_UNREACH: | ||
203 | send_unreach(*pskb, ICMPV6_ADDR_UNREACH, hooknum); | ||
204 | break; | ||
205 | case IP6T_ICMP6_PORT_UNREACH: | ||
206 | send_unreach(*pskb, ICMPV6_PORT_UNREACH, hooknum); | ||
207 | break; | ||
208 | case IP6T_ICMP6_ECHOREPLY: | ||
209 | /* Do nothing */ | ||
210 | break; | ||
211 | case IP6T_TCP_RESET: | ||
212 | send_reset(*pskb); | ||
213 | break; | ||
214 | default: | ||
215 | if (net_ratelimit()) | ||
216 | printk(KERN_WARNING "ip6t_REJECT: case %u not handled yet\n", reject->with); | ||
217 | break; | ||
218 | } | ||
219 | |||
220 | return NF_DROP; | ||
221 | } | ||
222 | |||
223 | static int check(const char *tablename, | ||
224 | const struct ip6t_entry *e, | ||
225 | void *targinfo, | ||
226 | unsigned int targinfosize, | ||
227 | unsigned int hook_mask) | ||
228 | { | ||
229 | const struct ip6t_reject_info *rejinfo = targinfo; | ||
230 | |||
231 | if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_reject_info))) { | ||
232 | DEBUGP("ip6t_REJECT: targinfosize %u != 0\n", targinfosize); | ||
233 | return 0; | ||
234 | } | ||
235 | |||
236 | /* Only allow these for packet filtering. */ | ||
237 | if (strcmp(tablename, "filter") != 0) { | ||
238 | DEBUGP("ip6t_REJECT: bad table `%s'.\n", tablename); | ||
239 | return 0; | ||
240 | } | ||
241 | |||
242 | if ((hook_mask & ~((1 << NF_IP6_LOCAL_IN) | ||
243 | | (1 << NF_IP6_FORWARD) | ||
244 | | (1 << NF_IP6_LOCAL_OUT))) != 0) { | ||
245 | DEBUGP("ip6t_REJECT: bad hook mask %X\n", hook_mask); | ||
246 | return 0; | ||
247 | } | ||
248 | |||
249 | if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) { | ||
250 | printk("ip6t_REJECT: ECHOREPLY is not supported.\n"); | ||
251 | return 0; | ||
252 | } else if (rejinfo->with == IP6T_TCP_RESET) { | ||
253 | /* Must specify that it's a TCP packet */ | ||
254 | if (e->ipv6.proto != IPPROTO_TCP | ||
255 | || (e->ipv6.invflags & IP6T_INV_PROTO)) { | ||
256 | DEBUGP("ip6t_REJECT: TCP_RESET illegal for non-tcp\n"); | ||
257 | return 0; | ||
258 | } | ||
259 | } | ||
260 | |||
261 | return 1; | ||
262 | } | ||
263 | |||
264 | static struct ip6t_target ip6t_reject_reg = { | ||
265 | .name = "REJECT", | ||
266 | .target = reject6_target, | ||
267 | .checkentry = check, | ||
268 | .me = THIS_MODULE | ||
269 | }; | ||
270 | |||
271 | static int __init init(void) | ||
272 | { | ||
273 | if (ip6t_register_target(&ip6t_reject_reg)) | ||
274 | return -EINVAL; | ||
275 | return 0; | ||
276 | } | ||
277 | |||
278 | static void __exit fini(void) | ||
279 | { | ||
280 | ip6t_unregister_target(&ip6t_reject_reg); | ||
281 | } | ||
282 | |||
283 | module_init(init); | ||
284 | module_exit(fini); | ||
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c index ab0e32d3de46..9b91decbfddb 100644 --- a/net/ipv6/netfilter/ip6t_owner.c +++ b/net/ipv6/netfilter/ip6t_owner.c | |||
@@ -20,71 +20,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); | |||
20 | MODULE_DESCRIPTION("IP6 tables owner matching module"); | 20 | MODULE_DESCRIPTION("IP6 tables owner matching module"); |
21 | MODULE_LICENSE("GPL"); | 21 | MODULE_LICENSE("GPL"); |
22 | 22 | ||
23 | static int | ||
24 | match_pid(const struct sk_buff *skb, pid_t pid) | ||
25 | { | ||
26 | struct task_struct *p; | ||
27 | struct files_struct *files; | ||
28 | int i; | ||
29 | |||
30 | read_lock(&tasklist_lock); | ||
31 | p = find_task_by_pid(pid); | ||
32 | if (!p) | ||
33 | goto out; | ||
34 | task_lock(p); | ||
35 | files = p->files; | ||
36 | if(files) { | ||
37 | spin_lock(&files->file_lock); | ||
38 | for (i=0; i < files->max_fds; i++) { | ||
39 | if (fcheck_files(files, i) == skb->sk->sk_socket->file) { | ||
40 | spin_unlock(&files->file_lock); | ||
41 | task_unlock(p); | ||
42 | read_unlock(&tasklist_lock); | ||
43 | return 1; | ||
44 | } | ||
45 | } | ||
46 | spin_unlock(&files->file_lock); | ||
47 | } | ||
48 | task_unlock(p); | ||
49 | out: | ||
50 | read_unlock(&tasklist_lock); | ||
51 | return 0; | ||
52 | } | ||
53 | |||
54 | static int | ||
55 | match_sid(const struct sk_buff *skb, pid_t sid) | ||
56 | { | ||
57 | struct task_struct *g, *p; | ||
58 | struct file *file = skb->sk->sk_socket->file; | ||
59 | int i, found=0; | ||
60 | |||
61 | read_lock(&tasklist_lock); | ||
62 | do_each_thread(g, p) { | ||
63 | struct files_struct *files; | ||
64 | if (p->signal->session != sid) | ||
65 | continue; | ||
66 | |||
67 | task_lock(p); | ||
68 | files = p->files; | ||
69 | if (files) { | ||
70 | spin_lock(&files->file_lock); | ||
71 | for (i=0; i < files->max_fds; i++) { | ||
72 | if (fcheck_files(files, i) == file) { | ||
73 | found = 1; | ||
74 | break; | ||
75 | } | ||
76 | } | ||
77 | spin_unlock(&files->file_lock); | ||
78 | } | ||
79 | task_unlock(p); | ||
80 | if (found) | ||
81 | goto out; | ||
82 | } while_each_thread(g, p); | ||
83 | out: | ||
84 | read_unlock(&tasklist_lock); | ||
85 | |||
86 | return found; | ||
87 | } | ||
88 | 23 | ||
89 | static int | 24 | static int |
90 | match(const struct sk_buff *skb, | 25 | match(const struct sk_buff *skb, |
@@ -112,18 +47,6 @@ match(const struct sk_buff *skb, | |||
112 | return 0; | 47 | return 0; |
113 | } | 48 | } |
114 | 49 | ||
115 | if(info->match & IP6T_OWNER_PID) { | ||
116 | if (!match_pid(skb, info->pid) ^ | ||
117 | !!(info->invert & IP6T_OWNER_PID)) | ||
118 | return 0; | ||
119 | } | ||
120 | |||
121 | if(info->match & IP6T_OWNER_SID) { | ||
122 | if (!match_sid(skb, info->sid) ^ | ||
123 | !!(info->invert & IP6T_OWNER_SID)) | ||
124 | return 0; | ||
125 | } | ||
126 | |||
127 | return 1; | 50 | return 1; |
128 | } | 51 | } |
129 | 52 | ||
@@ -134,6 +57,8 @@ checkentry(const char *tablename, | |||
134 | unsigned int matchsize, | 57 | unsigned int matchsize, |
135 | unsigned int hook_mask) | 58 | unsigned int hook_mask) |
136 | { | 59 | { |
60 | const struct ip6t_owner_info *info = matchinfo; | ||
61 | |||
137 | if (hook_mask | 62 | if (hook_mask |
138 | & ~((1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING))) { | 63 | & ~((1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING))) { |
139 | printk("ip6t_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); | 64 | printk("ip6t_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); |
@@ -142,14 +67,13 @@ checkentry(const char *tablename, | |||
142 | 67 | ||
143 | if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_owner_info))) | 68 | if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_owner_info))) |
144 | return 0; | 69 | return 0; |
145 | #ifdef CONFIG_SMP | 70 | |
146 | /* files->file_lock can not be used in a BH */ | 71 | if (info->match & (IP6T_OWNER_PID|IP6T_OWNER_SID)) { |
147 | if (((struct ip6t_owner_info *)matchinfo)->match | 72 | printk("ipt_owner: pid and sid matching " |
148 | & (IP6T_OWNER_PID|IP6T_OWNER_SID)) { | 73 | "not supported anymore\n"); |
149 | printk("ip6t_owner: pid and sid matching is broken on SMP.\n"); | ||
150 | return 0; | 74 | return 0; |
151 | } | 75 | } |
152 | #endif | 76 | |
153 | return 1; | 77 | return 1; |
154 | } | 78 | } |
155 | 79 | ||
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 1d4d75b34d32..7a5863298f3f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <net/transp_v6.h> | 49 | #include <net/transp_v6.h> |
50 | #include <net/udp.h> | 50 | #include <net/udp.h> |
51 | #include <net/inet_common.h> | 51 | #include <net/inet_common.h> |
52 | #include <net/tcp_states.h> | ||
52 | 53 | ||
53 | #include <net/rawv6.h> | 54 | #include <net/rawv6.h> |
54 | #include <net/xfrm.h> | 55 | #include <net/xfrm.h> |
@@ -81,7 +82,8 @@ static void raw_v6_unhash(struct sock *sk) | |||
81 | 82 | ||
82 | /* Grumble... icmp and ip_input want to get at this... */ | 83 | /* Grumble... icmp and ip_input want to get at this... */ |
83 | struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, | 84 | struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, |
84 | struct in6_addr *loc_addr, struct in6_addr *rmt_addr) | 85 | struct in6_addr *loc_addr, struct in6_addr *rmt_addr, |
86 | int dif) | ||
85 | { | 87 | { |
86 | struct hlist_node *node; | 88 | struct hlist_node *node; |
87 | int is_multicast = ipv6_addr_is_multicast(loc_addr); | 89 | int is_multicast = ipv6_addr_is_multicast(loc_addr); |
@@ -94,6 +96,9 @@ struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, | |||
94 | !ipv6_addr_equal(&np->daddr, rmt_addr)) | 96 | !ipv6_addr_equal(&np->daddr, rmt_addr)) |
95 | continue; | 97 | continue; |
96 | 98 | ||
99 | if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) | ||
100 | continue; | ||
101 | |||
97 | if (!ipv6_addr_any(&np->rcv_saddr)) { | 102 | if (!ipv6_addr_any(&np->rcv_saddr)) { |
98 | if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) | 103 | if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) |
99 | goto found; | 104 | goto found; |
@@ -137,11 +142,12 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) | |||
137 | * | 142 | * |
138 | * Caller owns SKB so we must make clones. | 143 | * Caller owns SKB so we must make clones. |
139 | */ | 144 | */ |
140 | void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) | 145 | int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) |
141 | { | 146 | { |
142 | struct in6_addr *saddr; | 147 | struct in6_addr *saddr; |
143 | struct in6_addr *daddr; | 148 | struct in6_addr *daddr; |
144 | struct sock *sk; | 149 | struct sock *sk; |
150 | int delivered = 0; | ||
145 | __u8 hash; | 151 | __u8 hash; |
146 | 152 | ||
147 | saddr = &skb->nh.ipv6h->saddr; | 153 | saddr = &skb->nh.ipv6h->saddr; |
@@ -160,9 +166,10 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) | |||
160 | if (sk == NULL) | 166 | if (sk == NULL) |
161 | goto out; | 167 | goto out; |
162 | 168 | ||
163 | sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr); | 169 | sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, skb->dev->ifindex); |
164 | 170 | ||
165 | while (sk) { | 171 | while (sk) { |
172 | delivered = 1; | ||
166 | if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) { | 173 | if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) { |
167 | struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); | 174 | struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); |
168 | 175 | ||
@@ -170,10 +177,12 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) | |||
170 | if (clone) | 177 | if (clone) |
171 | rawv6_rcv(sk, clone); | 178 | rawv6_rcv(sk, clone); |
172 | } | 179 | } |
173 | sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr); | 180 | sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr, |
181 | skb->dev->ifindex); | ||
174 | } | 182 | } |
175 | out: | 183 | out: |
176 | read_unlock(&raw_v6_lock); | 184 | read_unlock(&raw_v6_lock); |
185 | return delivered; | ||
177 | } | 186 | } |
178 | 187 | ||
179 | /* This cleans up af_inet6 a bit. -DaveM */ | 188 | /* This cleans up af_inet6 a bit. -DaveM */ |
@@ -334,8 +343,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) | |||
334 | if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, | 343 | if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, |
335 | &skb->nh.ipv6h->daddr, | 344 | &skb->nh.ipv6h->daddr, |
336 | skb->len, inet->num, skb->csum)) { | 345 | skb->len, inet->num, skb->csum)) { |
337 | LIMIT_NETDEBUG( | 346 | LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n"); |
338 | printk(KERN_DEBUG "raw v6 hw csum failure.\n")); | ||
339 | skb->ip_summed = CHECKSUM_NONE; | 347 | skb->ip_summed = CHECKSUM_NONE; |
340 | } | 348 | } |
341 | } | 349 | } |
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 59e7c6317872..9d9e04344c77 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c | |||
@@ -562,7 +562,7 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, | |||
562 | if (skb->dev) | 562 | if (skb->dev) |
563 | fq->iif = skb->dev->ifindex; | 563 | fq->iif = skb->dev->ifindex; |
564 | skb->dev = NULL; | 564 | skb->dev = NULL; |
565 | fq->stamp = skb->stamp; | 565 | skb_get_timestamp(skb, &fq->stamp); |
566 | fq->meat += skb->len; | 566 | fq->meat += skb->len; |
567 | atomic_add(skb->truesize, &ip6_frag_mem); | 567 | atomic_add(skb->truesize, &ip6_frag_mem); |
568 | 568 | ||
@@ -664,7 +664,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, | |||
664 | 664 | ||
665 | head->next = NULL; | 665 | head->next = NULL; |
666 | head->dev = dev; | 666 | head->dev = dev; |
667 | head->stamp = fq->stamp; | 667 | skb_set_timestamp(head, &fq->stamp); |
668 | head->nh.ipv6h->payload_len = htons(payload_len); | 668 | head->nh.ipv6h->payload_len = htons(payload_len); |
669 | 669 | ||
670 | *skb_in = head; | 670 | *skb_in = head; |
diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 878789b3122d..5d5bbb49ec78 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c | |||
@@ -1372,7 +1372,7 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg) | |||
1372 | * Drop the packet on the floor | 1372 | * Drop the packet on the floor |
1373 | */ | 1373 | */ |
1374 | 1374 | ||
1375 | int ip6_pkt_discard(struct sk_buff *skb) | 1375 | static int ip6_pkt_discard(struct sk_buff *skb) |
1376 | { | 1376 | { |
1377 | IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); | 1377 | IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); |
1378 | icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev); | 1378 | icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev); |
@@ -1380,7 +1380,7 @@ int ip6_pkt_discard(struct sk_buff *skb) | |||
1380 | return 0; | 1380 | return 0; |
1381 | } | 1381 | } |
1382 | 1382 | ||
1383 | int ip6_pkt_discard_out(struct sk_buff *skb) | 1383 | static int ip6_pkt_discard_out(struct sk_buff *skb) |
1384 | { | 1384 | { |
1385 | skb->dev = skb->dst->dev; | 1385 | skb->dev = skb->dst->dev; |
1386 | return ip6_pkt_discard(skb); | 1386 | return ip6_pkt_discard(skb); |
@@ -1850,16 +1850,16 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, | |||
1850 | 1850 | ||
1851 | skb = alloc_skb(size, gfp_any()); | 1851 | skb = alloc_skb(size, gfp_any()); |
1852 | if (!skb) { | 1852 | if (!skb) { |
1853 | netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS); | 1853 | netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS); |
1854 | return; | 1854 | return; |
1855 | } | 1855 | } |
1856 | if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) { | 1856 | if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) { |
1857 | kfree_skb(skb); | 1857 | kfree_skb(skb); |
1858 | netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL); | 1858 | netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL); |
1859 | return; | 1859 | return; |
1860 | } | 1860 | } |
1861 | NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE; | 1861 | NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE; |
1862 | netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any()); | 1862 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any()); |
1863 | } | 1863 | } |
1864 | 1864 | ||
1865 | /* | 1865 | /* |
@@ -1960,8 +1960,6 @@ static int rt6_proc_info(char *buffer, char **start, off_t offset, int length) | |||
1960 | return arg.len; | 1960 | return arg.len; |
1961 | } | 1961 | } |
1962 | 1962 | ||
1963 | extern struct rt6_statistics rt6_stats; | ||
1964 | |||
1965 | static int rt6_stats_seq_show(struct seq_file *seq, void *v) | 1963 | static int rt6_stats_seq_show(struct seq_file *seq, void *v) |
1966 | { | 1964 | { |
1967 | seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", | 1965 | seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", |
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index e553e5b80d6e..c3123c9e1a8e 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c | |||
@@ -770,7 +770,7 @@ static int ipip6_tunnel_init(struct net_device *dev) | |||
770 | return 0; | 770 | return 0; |
771 | } | 771 | } |
772 | 772 | ||
773 | int __init ipip6_fb_tunnel_init(struct net_device *dev) | 773 | static int __init ipip6_fb_tunnel_init(struct net_device *dev) |
774 | { | 774 | { |
775 | struct ip_tunnel *tunnel = dev->priv; | 775 | struct ip_tunnel *tunnel = dev->priv; |
776 | struct iphdr *iph = &tunnel->parms.iph; | 776 | struct iphdr *iph = &tunnel->parms.iph; |
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 3a18e0e6ffed..8eff9fa1e983 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c | |||
@@ -14,9 +14,6 @@ | |||
14 | #include <net/ipv6.h> | 14 | #include <net/ipv6.h> |
15 | #include <net/addrconf.h> | 15 | #include <net/addrconf.h> |
16 | 16 | ||
17 | extern ctl_table ipv6_route_table[]; | ||
18 | extern ctl_table ipv6_icmp_table[]; | ||
19 | |||
20 | #ifdef CONFIG_SYSCTL | 17 | #ifdef CONFIG_SYSCTL |
21 | 18 | ||
22 | static ctl_table ipv6_table[] = { | 19 | static ctl_table ipv6_table[] = { |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ef29cfd936d3..794734f1d230 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -47,6 +47,7 @@ | |||
47 | 47 | ||
48 | #include <net/tcp.h> | 48 | #include <net/tcp.h> |
49 | #include <net/ndisc.h> | 49 | #include <net/ndisc.h> |
50 | #include <net/inet6_hashtables.h> | ||
50 | #include <net/ipv6.h> | 51 | #include <net/ipv6.h> |
51 | #include <net/transp_v6.h> | 52 | #include <net/transp_v6.h> |
52 | #include <net/addrconf.h> | 53 | #include <net/addrconf.h> |
@@ -75,34 +76,11 @@ static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); | |||
75 | static struct tcp_func ipv6_mapped; | 76 | static struct tcp_func ipv6_mapped; |
76 | static struct tcp_func ipv6_specific; | 77 | static struct tcp_func ipv6_specific; |
77 | 78 | ||
78 | /* I have no idea if this is a good hash for v6 or not. -DaveM */ | 79 | static inline int tcp_v6_bind_conflict(const struct sock *sk, |
79 | static __inline__ int tcp_v6_hashfn(struct in6_addr *laddr, u16 lport, | 80 | const struct inet_bind_bucket *tb) |
80 | struct in6_addr *faddr, u16 fport) | ||
81 | { | 81 | { |
82 | int hashent = (lport ^ fport); | 82 | const struct sock *sk2; |
83 | 83 | const struct hlist_node *node; | |
84 | hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]); | ||
85 | hashent ^= hashent>>16; | ||
86 | hashent ^= hashent>>8; | ||
87 | return (hashent & (tcp_ehash_size - 1)); | ||
88 | } | ||
89 | |||
90 | static __inline__ int tcp_v6_sk_hashfn(struct sock *sk) | ||
91 | { | ||
92 | struct inet_sock *inet = inet_sk(sk); | ||
93 | struct ipv6_pinfo *np = inet6_sk(sk); | ||
94 | struct in6_addr *laddr = &np->rcv_saddr; | ||
95 | struct in6_addr *faddr = &np->daddr; | ||
96 | __u16 lport = inet->num; | ||
97 | __u16 fport = inet->dport; | ||
98 | return tcp_v6_hashfn(laddr, lport, faddr, fport); | ||
99 | } | ||
100 | |||
101 | static inline int tcp_v6_bind_conflict(struct sock *sk, | ||
102 | struct tcp_bind_bucket *tb) | ||
103 | { | ||
104 | struct sock *sk2; | ||
105 | struct hlist_node *node; | ||
106 | 84 | ||
107 | /* We must walk the whole port owner list in this case. -DaveM */ | 85 | /* We must walk the whole port owner list in this case. -DaveM */ |
108 | sk_for_each_bound(sk2, node, &tb->owners) { | 86 | sk_for_each_bound(sk2, node, &tb->owners) { |
@@ -126,8 +104,8 @@ static inline int tcp_v6_bind_conflict(struct sock *sk, | |||
126 | */ | 104 | */ |
127 | static int tcp_v6_get_port(struct sock *sk, unsigned short snum) | 105 | static int tcp_v6_get_port(struct sock *sk, unsigned short snum) |
128 | { | 106 | { |
129 | struct tcp_bind_hashbucket *head; | 107 | struct inet_bind_hashbucket *head; |
130 | struct tcp_bind_bucket *tb; | 108 | struct inet_bind_bucket *tb; |
131 | struct hlist_node *node; | 109 | struct hlist_node *node; |
132 | int ret; | 110 | int ret; |
133 | 111 | ||
@@ -138,25 +116,25 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) | |||
138 | int remaining = (high - low) + 1; | 116 | int remaining = (high - low) + 1; |
139 | int rover; | 117 | int rover; |
140 | 118 | ||
141 | spin_lock(&tcp_portalloc_lock); | 119 | spin_lock(&tcp_hashinfo.portalloc_lock); |
142 | if (tcp_port_rover < low) | 120 | if (tcp_hashinfo.port_rover < low) |
143 | rover = low; | 121 | rover = low; |
144 | else | 122 | else |
145 | rover = tcp_port_rover; | 123 | rover = tcp_hashinfo.port_rover; |
146 | do { rover++; | 124 | do { rover++; |
147 | if (rover > high) | 125 | if (rover > high) |
148 | rover = low; | 126 | rover = low; |
149 | head = &tcp_bhash[tcp_bhashfn(rover)]; | 127 | head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; |
150 | spin_lock(&head->lock); | 128 | spin_lock(&head->lock); |
151 | tb_for_each(tb, node, &head->chain) | 129 | inet_bind_bucket_for_each(tb, node, &head->chain) |
152 | if (tb->port == rover) | 130 | if (tb->port == rover) |
153 | goto next; | 131 | goto next; |
154 | break; | 132 | break; |
155 | next: | 133 | next: |
156 | spin_unlock(&head->lock); | 134 | spin_unlock(&head->lock); |
157 | } while (--remaining > 0); | 135 | } while (--remaining > 0); |
158 | tcp_port_rover = rover; | 136 | tcp_hashinfo.port_rover = rover; |
159 | spin_unlock(&tcp_portalloc_lock); | 137 | spin_unlock(&tcp_hashinfo.portalloc_lock); |
160 | 138 | ||
161 | /* Exhausted local port range during search? It is not | 139 | /* Exhausted local port range during search? It is not |
162 | * possible for us to be holding one of the bind hash | 140 | * possible for us to be holding one of the bind hash |
@@ -171,9 +149,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) | |||
171 | /* OK, here is the one we will use. */ | 149 | /* OK, here is the one we will use. */ |
172 | snum = rover; | 150 | snum = rover; |
173 | } else { | 151 | } else { |
174 | head = &tcp_bhash[tcp_bhashfn(snum)]; | 152 | head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; |
175 | spin_lock(&head->lock); | 153 | spin_lock(&head->lock); |
176 | tb_for_each(tb, node, &head->chain) | 154 | inet_bind_bucket_for_each(tb, node, &head->chain) |
177 | if (tb->port == snum) | 155 | if (tb->port == snum) |
178 | goto tb_found; | 156 | goto tb_found; |
179 | } | 157 | } |
@@ -192,8 +170,11 @@ tb_found: | |||
192 | } | 170 | } |
193 | tb_not_found: | 171 | tb_not_found: |
194 | ret = 1; | 172 | ret = 1; |
195 | if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) | 173 | if (tb == NULL) { |
196 | goto fail_unlock; | 174 | tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum); |
175 | if (tb == NULL) | ||
176 | goto fail_unlock; | ||
177 | } | ||
197 | if (hlist_empty(&tb->owners)) { | 178 | if (hlist_empty(&tb->owners)) { |
198 | if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) | 179 | if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) |
199 | tb->fastreuse = 1; | 180 | tb->fastreuse = 1; |
@@ -204,9 +185,9 @@ tb_not_found: | |||
204 | tb->fastreuse = 0; | 185 | tb->fastreuse = 0; |
205 | 186 | ||
206 | success: | 187 | success: |
207 | if (!tcp_sk(sk)->bind_hash) | 188 | if (!inet_csk(sk)->icsk_bind_hash) |
208 | tcp_bind_hash(sk, tb, snum); | 189 | inet_bind_hash(sk, tb, snum); |
209 | BUG_TRAP(tcp_sk(sk)->bind_hash == tb); | 190 | BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); |
210 | ret = 0; | 191 | ret = 0; |
211 | 192 | ||
212 | fail_unlock: | 193 | fail_unlock: |
@@ -224,13 +205,13 @@ static __inline__ void __tcp_v6_hash(struct sock *sk) | |||
224 | BUG_TRAP(sk_unhashed(sk)); | 205 | BUG_TRAP(sk_unhashed(sk)); |
225 | 206 | ||
226 | if (sk->sk_state == TCP_LISTEN) { | 207 | if (sk->sk_state == TCP_LISTEN) { |
227 | list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; | 208 | list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; |
228 | lock = &tcp_lhash_lock; | 209 | lock = &tcp_hashinfo.lhash_lock; |
229 | tcp_listen_wlock(); | 210 | inet_listen_wlock(&tcp_hashinfo); |
230 | } else { | 211 | } else { |
231 | sk->sk_hashent = tcp_v6_sk_hashfn(sk); | 212 | sk->sk_hashent = inet6_sk_ehashfn(sk, tcp_hashinfo.ehash_size); |
232 | list = &tcp_ehash[sk->sk_hashent].chain; | 213 | list = &tcp_hashinfo.ehash[sk->sk_hashent].chain; |
233 | lock = &tcp_ehash[sk->sk_hashent].lock; | 214 | lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock; |
234 | write_lock(lock); | 215 | write_lock(lock); |
235 | } | 216 | } |
236 | 217 | ||
@@ -255,131 +236,11 @@ static void tcp_v6_hash(struct sock *sk) | |||
255 | } | 236 | } |
256 | } | 237 | } |
257 | 238 | ||
258 | static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned short hnum, int dif) | ||
259 | { | ||
260 | struct sock *sk; | ||
261 | struct hlist_node *node; | ||
262 | struct sock *result = NULL; | ||
263 | int score, hiscore; | ||
264 | |||
265 | hiscore=0; | ||
266 | read_lock(&tcp_lhash_lock); | ||
267 | sk_for_each(sk, node, &tcp_listening_hash[tcp_lhashfn(hnum)]) { | ||
268 | if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { | ||
269 | struct ipv6_pinfo *np = inet6_sk(sk); | ||
270 | |||
271 | score = 1; | ||
272 | if (!ipv6_addr_any(&np->rcv_saddr)) { | ||
273 | if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) | ||
274 | continue; | ||
275 | score++; | ||
276 | } | ||
277 | if (sk->sk_bound_dev_if) { | ||
278 | if (sk->sk_bound_dev_if != dif) | ||
279 | continue; | ||
280 | score++; | ||
281 | } | ||
282 | if (score == 3) { | ||
283 | result = sk; | ||
284 | break; | ||
285 | } | ||
286 | if (score > hiscore) { | ||
287 | hiscore = score; | ||
288 | result = sk; | ||
289 | } | ||
290 | } | ||
291 | } | ||
292 | if (result) | ||
293 | sock_hold(result); | ||
294 | read_unlock(&tcp_lhash_lock); | ||
295 | return result; | ||
296 | } | ||
297 | |||
298 | /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so | ||
299 | * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM | ||
300 | * | ||
301 | * The sockhash lock must be held as a reader here. | ||
302 | */ | ||
303 | |||
304 | static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u16 sport, | ||
305 | struct in6_addr *daddr, u16 hnum, | ||
306 | int dif) | ||
307 | { | ||
308 | struct tcp_ehash_bucket *head; | ||
309 | struct sock *sk; | ||
310 | struct hlist_node *node; | ||
311 | __u32 ports = TCP_COMBINED_PORTS(sport, hnum); | ||
312 | int hash; | ||
313 | |||
314 | /* Optimize here for direct hit, only listening connections can | ||
315 | * have wildcards anyways. | ||
316 | */ | ||
317 | hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); | ||
318 | head = &tcp_ehash[hash]; | ||
319 | read_lock(&head->lock); | ||
320 | sk_for_each(sk, node, &head->chain) { | ||
321 | /* For IPV6 do the cheaper port and family tests first. */ | ||
322 | if(TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif)) | ||
323 | goto hit; /* You sunk my battleship! */ | ||
324 | } | ||
325 | /* Must check for a TIME_WAIT'er before going to listener hash. */ | ||
326 | sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { | ||
327 | /* FIXME: acme: check this... */ | ||
328 | struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; | ||
329 | |||
330 | if(*((__u32 *)&(tw->tw_dport)) == ports && | ||
331 | sk->sk_family == PF_INET6) { | ||
332 | if(ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && | ||
333 | ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && | ||
334 | (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) | ||
335 | goto hit; | ||
336 | } | ||
337 | } | ||
338 | read_unlock(&head->lock); | ||
339 | return NULL; | ||
340 | |||
341 | hit: | ||
342 | sock_hold(sk); | ||
343 | read_unlock(&head->lock); | ||
344 | return sk; | ||
345 | } | ||
346 | |||
347 | |||
348 | static inline struct sock *__tcp_v6_lookup(struct in6_addr *saddr, u16 sport, | ||
349 | struct in6_addr *daddr, u16 hnum, | ||
350 | int dif) | ||
351 | { | ||
352 | struct sock *sk; | ||
353 | |||
354 | sk = __tcp_v6_lookup_established(saddr, sport, daddr, hnum, dif); | ||
355 | |||
356 | if (sk) | ||
357 | return sk; | ||
358 | |||
359 | return tcp_v6_lookup_listener(daddr, hnum, dif); | ||
360 | } | ||
361 | |||
362 | inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, | ||
363 | struct in6_addr *daddr, u16 dport, | ||
364 | int dif) | ||
365 | { | ||
366 | struct sock *sk; | ||
367 | |||
368 | local_bh_disable(); | ||
369 | sk = __tcp_v6_lookup(saddr, sport, daddr, ntohs(dport), dif); | ||
370 | local_bh_enable(); | ||
371 | |||
372 | return sk; | ||
373 | } | ||
374 | |||
375 | EXPORT_SYMBOL_GPL(tcp_v6_lookup); | ||
376 | |||
377 | |||
378 | /* | 239 | /* |
379 | * Open request hash tables. | 240 | * Open request hash tables. |
380 | */ | 241 | */ |
381 | 242 | ||
382 | static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd) | 243 | static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd) |
383 | { | 244 | { |
384 | u32 a, b, c; | 245 | u32 a, b, c; |
385 | 246 | ||
@@ -399,14 +260,15 @@ static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd) | |||
399 | return c & (TCP_SYNQ_HSIZE - 1); | 260 | return c & (TCP_SYNQ_HSIZE - 1); |
400 | } | 261 | } |
401 | 262 | ||
402 | static struct request_sock *tcp_v6_search_req(struct tcp_sock *tp, | 263 | static struct request_sock *tcp_v6_search_req(const struct sock *sk, |
403 | struct request_sock ***prevp, | 264 | struct request_sock ***prevp, |
404 | __u16 rport, | 265 | __u16 rport, |
405 | struct in6_addr *raddr, | 266 | struct in6_addr *raddr, |
406 | struct in6_addr *laddr, | 267 | struct in6_addr *laddr, |
407 | int iif) | 268 | int iif) |
408 | { | 269 | { |
409 | struct listen_sock *lopt = tp->accept_queue.listen_opt; | 270 | const struct inet_connection_sock *icsk = inet_csk(sk); |
271 | struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; | ||
410 | struct request_sock *req, **prev; | 272 | struct request_sock *req, **prev; |
411 | 273 | ||
412 | for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; | 274 | for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; |
@@ -451,44 +313,48 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) | |||
451 | } | 313 | } |
452 | } | 314 | } |
453 | 315 | ||
454 | static int __tcp_v6_check_established(struct sock *sk, __u16 lport, | 316 | static int __tcp_v6_check_established(struct sock *sk, const __u16 lport, |
455 | struct tcp_tw_bucket **twp) | 317 | struct inet_timewait_sock **twp) |
456 | { | 318 | { |
457 | struct inet_sock *inet = inet_sk(sk); | 319 | struct inet_sock *inet = inet_sk(sk); |
458 | struct ipv6_pinfo *np = inet6_sk(sk); | 320 | const struct ipv6_pinfo *np = inet6_sk(sk); |
459 | struct in6_addr *daddr = &np->rcv_saddr; | 321 | const struct in6_addr *daddr = &np->rcv_saddr; |
460 | struct in6_addr *saddr = &np->daddr; | 322 | const struct in6_addr *saddr = &np->daddr; |
461 | int dif = sk->sk_bound_dev_if; | 323 | const int dif = sk->sk_bound_dev_if; |
462 | u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); | 324 | const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); |
463 | int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport); | 325 | const int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport, |
464 | struct tcp_ehash_bucket *head = &tcp_ehash[hash]; | 326 | tcp_hashinfo.ehash_size); |
327 | struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; | ||
465 | struct sock *sk2; | 328 | struct sock *sk2; |
466 | struct hlist_node *node; | 329 | const struct hlist_node *node; |
467 | struct tcp_tw_bucket *tw; | 330 | struct inet_timewait_sock *tw; |
468 | 331 | ||
469 | write_lock(&head->lock); | 332 | write_lock(&head->lock); |
470 | 333 | ||
471 | /* Check TIME-WAIT sockets first. */ | 334 | /* Check TIME-WAIT sockets first. */ |
472 | sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { | 335 | sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { |
473 | tw = (struct tcp_tw_bucket*)sk2; | 336 | const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2); |
337 | |||
338 | tw = inet_twsk(sk2); | ||
474 | 339 | ||
475 | if(*((__u32 *)&(tw->tw_dport)) == ports && | 340 | if(*((__u32 *)&(tw->tw_dport)) == ports && |
476 | sk2->sk_family == PF_INET6 && | 341 | sk2->sk_family == PF_INET6 && |
477 | ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && | 342 | ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && |
478 | ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && | 343 | ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && |
479 | sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { | 344 | sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { |
345 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); | ||
480 | struct tcp_sock *tp = tcp_sk(sk); | 346 | struct tcp_sock *tp = tcp_sk(sk); |
481 | 347 | ||
482 | if (tw->tw_ts_recent_stamp && | 348 | if (tcptw->tw_ts_recent_stamp && |
483 | (!twp || (sysctl_tcp_tw_reuse && | 349 | (!twp || |
484 | xtime.tv_sec - | 350 | (sysctl_tcp_tw_reuse && |
485 | tw->tw_ts_recent_stamp > 1))) { | 351 | xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { |
486 | /* See comment in tcp_ipv4.c */ | 352 | /* See comment in tcp_ipv4.c */ |
487 | tp->write_seq = tw->tw_snd_nxt + 65535 + 2; | 353 | tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; |
488 | if (!tp->write_seq) | 354 | if (!tp->write_seq) |
489 | tp->write_seq = 1; | 355 | tp->write_seq = 1; |
490 | tp->rx_opt.ts_recent = tw->tw_ts_recent; | 356 | tp->rx_opt.ts_recent = tcptw->tw_ts_recent; |
491 | tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; | 357 | tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; |
492 | sock_hold(sk2); | 358 | sock_hold(sk2); |
493 | goto unique; | 359 | goto unique; |
494 | } else | 360 | } else |
@@ -499,7 +365,7 @@ static int __tcp_v6_check_established(struct sock *sk, __u16 lport, | |||
499 | 365 | ||
500 | /* And established part... */ | 366 | /* And established part... */ |
501 | sk_for_each(sk2, node, &head->chain) { | 367 | sk_for_each(sk2, node, &head->chain) { |
502 | if(TCP_IPV6_MATCH(sk2, saddr, daddr, ports, dif)) | 368 | if (INET6_MATCH(sk2, saddr, daddr, ports, dif)) |
503 | goto not_unique; | 369 | goto not_unique; |
504 | } | 370 | } |
505 | 371 | ||
@@ -515,10 +381,10 @@ unique: | |||
515 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); | 381 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); |
516 | } else if (tw) { | 382 | } else if (tw) { |
517 | /* Silly. Should hash-dance instead... */ | 383 | /* Silly. Should hash-dance instead... */ |
518 | tcp_tw_deschedule(tw); | 384 | inet_twsk_deschedule(tw, &tcp_death_row); |
519 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); | 385 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); |
520 | 386 | ||
521 | tcp_tw_put(tw); | 387 | inet_twsk_put(tw); |
522 | } | 388 | } |
523 | return 0; | 389 | return 0; |
524 | 390 | ||
@@ -540,8 +406,8 @@ static inline u32 tcpv6_port_offset(const struct sock *sk) | |||
540 | static int tcp_v6_hash_connect(struct sock *sk) | 406 | static int tcp_v6_hash_connect(struct sock *sk) |
541 | { | 407 | { |
542 | unsigned short snum = inet_sk(sk)->num; | 408 | unsigned short snum = inet_sk(sk)->num; |
543 | struct tcp_bind_hashbucket *head; | 409 | struct inet_bind_hashbucket *head; |
544 | struct tcp_bind_bucket *tb; | 410 | struct inet_bind_bucket *tb; |
545 | int ret; | 411 | int ret; |
546 | 412 | ||
547 | if (!snum) { | 413 | if (!snum) { |
@@ -553,19 +419,19 @@ static int tcp_v6_hash_connect(struct sock *sk) | |||
553 | static u32 hint; | 419 | static u32 hint; |
554 | u32 offset = hint + tcpv6_port_offset(sk); | 420 | u32 offset = hint + tcpv6_port_offset(sk); |
555 | struct hlist_node *node; | 421 | struct hlist_node *node; |
556 | struct tcp_tw_bucket *tw = NULL; | 422 | struct inet_timewait_sock *tw = NULL; |
557 | 423 | ||
558 | local_bh_disable(); | 424 | local_bh_disable(); |
559 | for (i = 1; i <= range; i++) { | 425 | for (i = 1; i <= range; i++) { |
560 | port = low + (i + offset) % range; | 426 | port = low + (i + offset) % range; |
561 | head = &tcp_bhash[tcp_bhashfn(port)]; | 427 | head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; |
562 | spin_lock(&head->lock); | 428 | spin_lock(&head->lock); |
563 | 429 | ||
564 | /* Does not bother with rcv_saddr checks, | 430 | /* Does not bother with rcv_saddr checks, |
565 | * because the established check is already | 431 | * because the established check is already |
566 | * unique enough. | 432 | * unique enough. |
567 | */ | 433 | */ |
568 | tb_for_each(tb, node, &head->chain) { | 434 | inet_bind_bucket_for_each(tb, node, &head->chain) { |
569 | if (tb->port == port) { | 435 | if (tb->port == port) { |
570 | BUG_TRAP(!hlist_empty(&tb->owners)); | 436 | BUG_TRAP(!hlist_empty(&tb->owners)); |
571 | if (tb->fastreuse >= 0) | 437 | if (tb->fastreuse >= 0) |
@@ -578,7 +444,7 @@ static int tcp_v6_hash_connect(struct sock *sk) | |||
578 | } | 444 | } |
579 | } | 445 | } |
580 | 446 | ||
581 | tb = tcp_bucket_create(head, port); | 447 | tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); |
582 | if (!tb) { | 448 | if (!tb) { |
583 | spin_unlock(&head->lock); | 449 | spin_unlock(&head->lock); |
584 | break; | 450 | break; |
@@ -597,7 +463,7 @@ ok: | |||
597 | hint += i; | 463 | hint += i; |
598 | 464 | ||
599 | /* Head lock still held and bh's disabled */ | 465 | /* Head lock still held and bh's disabled */ |
600 | tcp_bind_hash(sk, tb, port); | 466 | inet_bind_hash(sk, tb, port); |
601 | if (sk_unhashed(sk)) { | 467 | if (sk_unhashed(sk)) { |
602 | inet_sk(sk)->sport = htons(port); | 468 | inet_sk(sk)->sport = htons(port); |
603 | __tcp_v6_hash(sk); | 469 | __tcp_v6_hash(sk); |
@@ -605,16 +471,16 @@ ok: | |||
605 | spin_unlock(&head->lock); | 471 | spin_unlock(&head->lock); |
606 | 472 | ||
607 | if (tw) { | 473 | if (tw) { |
608 | tcp_tw_deschedule(tw); | 474 | inet_twsk_deschedule(tw, &tcp_death_row); |
609 | tcp_tw_put(tw); | 475 | inet_twsk_put(tw); |
610 | } | 476 | } |
611 | 477 | ||
612 | ret = 0; | 478 | ret = 0; |
613 | goto out; | 479 | goto out; |
614 | } | 480 | } |
615 | 481 | ||
616 | head = &tcp_bhash[tcp_bhashfn(snum)]; | 482 | head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; |
617 | tb = tcp_sk(sk)->bind_hash; | 483 | tb = inet_csk(sk)->icsk_bind_hash; |
618 | spin_lock_bh(&head->lock); | 484 | spin_lock_bh(&head->lock); |
619 | 485 | ||
620 | if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { | 486 | if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { |
@@ -631,11 +497,6 @@ out: | |||
631 | } | 497 | } |
632 | } | 498 | } |
633 | 499 | ||
634 | static __inline__ int tcp_v6_iif(struct sk_buff *skb) | ||
635 | { | ||
636 | return IP6CB(skb)->iif; | ||
637 | } | ||
638 | |||
639 | static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, | 500 | static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, |
640 | int addr_len) | 501 | int addr_len) |
641 | { | 502 | { |
@@ -827,14 +688,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, | |||
827 | int type, int code, int offset, __u32 info) | 688 | int type, int code, int offset, __u32 info) |
828 | { | 689 | { |
829 | struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; | 690 | struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; |
830 | struct tcphdr *th = (struct tcphdr *)(skb->data+offset); | 691 | const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); |
831 | struct ipv6_pinfo *np; | 692 | struct ipv6_pinfo *np; |
832 | struct sock *sk; | 693 | struct sock *sk; |
833 | int err; | 694 | int err; |
834 | struct tcp_sock *tp; | 695 | struct tcp_sock *tp; |
835 | __u32 seq; | 696 | __u32 seq; |
836 | 697 | ||
837 | sk = tcp_v6_lookup(&hdr->daddr, th->dest, &hdr->saddr, th->source, skb->dev->ifindex); | 698 | sk = inet6_lookup(&tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr, |
699 | th->source, skb->dev->ifindex); | ||
838 | 700 | ||
839 | if (sk == NULL) { | 701 | if (sk == NULL) { |
840 | ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); | 702 | ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); |
@@ -842,7 +704,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, | |||
842 | } | 704 | } |
843 | 705 | ||
844 | if (sk->sk_state == TCP_TIME_WAIT) { | 706 | if (sk->sk_state == TCP_TIME_WAIT) { |
845 | tcp_tw_put((struct tcp_tw_bucket*)sk); | 707 | inet_twsk_put((struct inet_timewait_sock *)sk); |
846 | return; | 708 | return; |
847 | } | 709 | } |
848 | 710 | ||
@@ -920,8 +782,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, | |||
920 | if (sock_owned_by_user(sk)) | 782 | if (sock_owned_by_user(sk)) |
921 | goto out; | 783 | goto out; |
922 | 784 | ||
923 | req = tcp_v6_search_req(tp, &prev, th->dest, &hdr->daddr, | 785 | req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr, |
924 | &hdr->saddr, tcp_v6_iif(skb)); | 786 | &hdr->saddr, inet6_iif(skb)); |
925 | if (!req) | 787 | if (!req) |
926 | goto out; | 788 | goto out; |
927 | 789 | ||
@@ -935,7 +797,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, | |||
935 | goto out; | 797 | goto out; |
936 | } | 798 | } |
937 | 799 | ||
938 | tcp_synq_drop(sk, req, prev); | 800 | inet_csk_reqsk_queue_drop(sk, req, prev); |
939 | goto out; | 801 | goto out; |
940 | 802 | ||
941 | case TCP_SYN_SENT: | 803 | case TCP_SYN_SENT: |
@@ -1132,7 +994,7 @@ static void tcp_v6_send_reset(struct sk_buff *skb) | |||
1132 | buff->csum); | 994 | buff->csum); |
1133 | 995 | ||
1134 | fl.proto = IPPROTO_TCP; | 996 | fl.proto = IPPROTO_TCP; |
1135 | fl.oif = tcp_v6_iif(skb); | 997 | fl.oif = inet6_iif(skb); |
1136 | fl.fl_ip_dport = t1->dest; | 998 | fl.fl_ip_dport = t1->dest; |
1137 | fl.fl_ip_sport = t1->source; | 999 | fl.fl_ip_sport = t1->source; |
1138 | 1000 | ||
@@ -1201,7 +1063,7 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 | |||
1201 | buff->csum); | 1063 | buff->csum); |
1202 | 1064 | ||
1203 | fl.proto = IPPROTO_TCP; | 1065 | fl.proto = IPPROTO_TCP; |
1204 | fl.oif = tcp_v6_iif(skb); | 1066 | fl.oif = inet6_iif(skb); |
1205 | fl.fl_ip_dport = t1->dest; | 1067 | fl.fl_ip_dport = t1->dest; |
1206 | fl.fl_ip_sport = t1->source; | 1068 | fl.fl_ip_sport = t1->source; |
1207 | 1069 | ||
@@ -1220,12 +1082,14 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 | |||
1220 | 1082 | ||
1221 | static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) | 1083 | static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) |
1222 | { | 1084 | { |
1223 | struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; | 1085 | struct inet_timewait_sock *tw = inet_twsk(sk); |
1086 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); | ||
1224 | 1087 | ||
1225 | tcp_v6_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, | 1088 | tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, |
1226 | tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); | 1089 | tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, |
1090 | tcptw->tw_ts_recent); | ||
1227 | 1091 | ||
1228 | tcp_tw_put(tw); | 1092 | inet_twsk_put(tw); |
1229 | } | 1093 | } |
1230 | 1094 | ||
1231 | static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) | 1095 | static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) |
@@ -1237,28 +1101,25 @@ static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) | |||
1237 | static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) | 1101 | static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) |
1238 | { | 1102 | { |
1239 | struct request_sock *req, **prev; | 1103 | struct request_sock *req, **prev; |
1240 | struct tcphdr *th = skb->h.th; | 1104 | const struct tcphdr *th = skb->h.th; |
1241 | struct tcp_sock *tp = tcp_sk(sk); | ||
1242 | struct sock *nsk; | 1105 | struct sock *nsk; |
1243 | 1106 | ||
1244 | /* Find possible connection requests. */ | 1107 | /* Find possible connection requests. */ |
1245 | req = tcp_v6_search_req(tp, &prev, th->source, &skb->nh.ipv6h->saddr, | 1108 | req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr, |
1246 | &skb->nh.ipv6h->daddr, tcp_v6_iif(skb)); | 1109 | &skb->nh.ipv6h->daddr, inet6_iif(skb)); |
1247 | if (req) | 1110 | if (req) |
1248 | return tcp_check_req(sk, skb, req, prev); | 1111 | return tcp_check_req(sk, skb, req, prev); |
1249 | 1112 | ||
1250 | nsk = __tcp_v6_lookup_established(&skb->nh.ipv6h->saddr, | 1113 | nsk = __inet6_lookup_established(&tcp_hashinfo, &skb->nh.ipv6h->saddr, |
1251 | th->source, | 1114 | th->source, &skb->nh.ipv6h->daddr, |
1252 | &skb->nh.ipv6h->daddr, | 1115 | ntohs(th->dest), inet6_iif(skb)); |
1253 | ntohs(th->dest), | ||
1254 | tcp_v6_iif(skb)); | ||
1255 | 1116 | ||
1256 | if (nsk) { | 1117 | if (nsk) { |
1257 | if (nsk->sk_state != TCP_TIME_WAIT) { | 1118 | if (nsk->sk_state != TCP_TIME_WAIT) { |
1258 | bh_lock_sock(nsk); | 1119 | bh_lock_sock(nsk); |
1259 | return nsk; | 1120 | return nsk; |
1260 | } | 1121 | } |
1261 | tcp_tw_put((struct tcp_tw_bucket*)nsk); | 1122 | inet_twsk_put((struct inet_timewait_sock *)nsk); |
1262 | return NULL; | 1123 | return NULL; |
1263 | } | 1124 | } |
1264 | 1125 | ||
@@ -1271,12 +1132,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) | |||
1271 | 1132 | ||
1272 | static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req) | 1133 | static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req) |
1273 | { | 1134 | { |
1274 | struct tcp_sock *tp = tcp_sk(sk); | 1135 | struct inet_connection_sock *icsk = inet_csk(sk); |
1275 | struct listen_sock *lopt = tp->accept_queue.listen_opt; | 1136 | struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; |
1276 | u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); | 1137 | const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); |
1277 | 1138 | ||
1278 | reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT); | 1139 | reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); |
1279 | tcp_synq_added(sk); | 1140 | inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); |
1280 | } | 1141 | } |
1281 | 1142 | ||
1282 | 1143 | ||
@@ -1301,13 +1162,13 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1301 | /* | 1162 | /* |
1302 | * There are no SYN attacks on IPv6, yet... | 1163 | * There are no SYN attacks on IPv6, yet... |
1303 | */ | 1164 | */ |
1304 | if (tcp_synq_is_full(sk) && !isn) { | 1165 | if (inet_csk_reqsk_queue_is_full(sk) && !isn) { |
1305 | if (net_ratelimit()) | 1166 | if (net_ratelimit()) |
1306 | printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n"); | 1167 | printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n"); |
1307 | goto drop; | 1168 | goto drop; |
1308 | } | 1169 | } |
1309 | 1170 | ||
1310 | if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) | 1171 | if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) |
1311 | goto drop; | 1172 | goto drop; |
1312 | 1173 | ||
1313 | req = reqsk_alloc(&tcp6_request_sock_ops); | 1174 | req = reqsk_alloc(&tcp6_request_sock_ops); |
@@ -1339,7 +1200,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1339 | /* So that link locals have meaning */ | 1200 | /* So that link locals have meaning */ |
1340 | if (!sk->sk_bound_dev_if && | 1201 | if (!sk->sk_bound_dev_if && |
1341 | ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) | 1202 | ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) |
1342 | treq->iif = tcp_v6_iif(skb); | 1203 | treq->iif = inet6_iif(skb); |
1343 | 1204 | ||
1344 | if (isn == 0) | 1205 | if (isn == 0) |
1345 | isn = tcp_v6_init_sequence(sk,skb); | 1206 | isn = tcp_v6_init_sequence(sk,skb); |
@@ -1404,15 +1265,14 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1404 | newsk->sk_backlog_rcv = tcp_v4_do_rcv; | 1265 | newsk->sk_backlog_rcv = tcp_v4_do_rcv; |
1405 | newnp->pktoptions = NULL; | 1266 | newnp->pktoptions = NULL; |
1406 | newnp->opt = NULL; | 1267 | newnp->opt = NULL; |
1407 | newnp->mcast_oif = tcp_v6_iif(skb); | 1268 | newnp->mcast_oif = inet6_iif(skb); |
1408 | newnp->mcast_hops = skb->nh.ipv6h->hop_limit; | 1269 | newnp->mcast_hops = skb->nh.ipv6h->hop_limit; |
1409 | 1270 | ||
1410 | /* Charge newly allocated IPv6 socket. Though it is mapped, | 1271 | /* |
1411 | * it is IPv6 yet. | 1272 | * No need to charge this sock to the relevant IPv6 refcnt debug socks count |
1273 | * here, tcp_create_openreq_child now does this for us, see the comment in | ||
1274 | * that function for the gory details. -acme | ||
1412 | */ | 1275 | */ |
1413 | #ifdef INET_REFCNT_DEBUG | ||
1414 | atomic_inc(&inet6_sock_nr); | ||
1415 | #endif | ||
1416 | 1276 | ||
1417 | /* It is tricky place. Until this moment IPv4 tcp | 1277 | /* It is tricky place. Until this moment IPv4 tcp |
1418 | worked with IPv6 af_tcp.af_specific. | 1278 | worked with IPv6 af_tcp.af_specific. |
@@ -1467,10 +1327,11 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1467 | if (newsk == NULL) | 1327 | if (newsk == NULL) |
1468 | goto out; | 1328 | goto out; |
1469 | 1329 | ||
1470 | /* Charge newly allocated IPv6 socket */ | 1330 | /* |
1471 | #ifdef INET_REFCNT_DEBUG | 1331 | * No need to charge this sock to the relevant IPv6 refcnt debug socks |
1472 | atomic_inc(&inet6_sock_nr); | 1332 | * count here, tcp_create_openreq_child now does this for us, see the |
1473 | #endif | 1333 | * comment in that function for the gory details. -acme |
1334 | */ | ||
1474 | 1335 | ||
1475 | ip6_dst_store(newsk, dst, NULL); | 1336 | ip6_dst_store(newsk, dst, NULL); |
1476 | newsk->sk_route_caps = dst->dev->features & | 1337 | newsk->sk_route_caps = dst->dev->features & |
@@ -1509,7 +1370,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1509 | skb_set_owner_r(newnp->pktoptions, newsk); | 1370 | skb_set_owner_r(newnp->pktoptions, newsk); |
1510 | } | 1371 | } |
1511 | newnp->opt = NULL; | 1372 | newnp->opt = NULL; |
1512 | newnp->mcast_oif = tcp_v6_iif(skb); | 1373 | newnp->mcast_oif = inet6_iif(skb); |
1513 | newnp->mcast_hops = skb->nh.ipv6h->hop_limit; | 1374 | newnp->mcast_hops = skb->nh.ipv6h->hop_limit; |
1514 | 1375 | ||
1515 | /* Clone native IPv6 options from listening socket (if any) | 1376 | /* Clone native IPv6 options from listening socket (if any) |
@@ -1536,7 +1397,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1536 | newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; | 1397 | newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; |
1537 | 1398 | ||
1538 | __tcp_v6_hash(newsk); | 1399 | __tcp_v6_hash(newsk); |
1539 | tcp_inherit_port(sk, newsk); | 1400 | inet_inherit_port(&tcp_hashinfo, sk, newsk); |
1540 | 1401 | ||
1541 | return newsk; | 1402 | return newsk; |
1542 | 1403 | ||
@@ -1557,7 +1418,7 @@ static int tcp_v6_checksum_init(struct sk_buff *skb) | |||
1557 | if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, | 1418 | if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, |
1558 | &skb->nh.ipv6h->daddr,skb->csum)) | 1419 | &skb->nh.ipv6h->daddr,skb->csum)) |
1559 | return 0; | 1420 | return 0; |
1560 | LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v6 csum failed\n")); | 1421 | LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n"); |
1561 | } | 1422 | } |
1562 | if (skb->len <= 76) { | 1423 | if (skb->len <= 76) { |
1563 | if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, | 1424 | if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, |
@@ -1684,7 +1545,7 @@ ipv6_pktoptions: | |||
1684 | if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && | 1545 | if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && |
1685 | !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { | 1546 | !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { |
1686 | if (np->rxopt.bits.rxinfo) | 1547 | if (np->rxopt.bits.rxinfo) |
1687 | np->mcast_oif = tcp_v6_iif(opt_skb); | 1548 | np->mcast_oif = inet6_iif(opt_skb); |
1688 | if (np->rxopt.bits.rxhlim) | 1549 | if (np->rxopt.bits.rxhlim) |
1689 | np->mcast_hops = opt_skb->nh.ipv6h->hop_limit; | 1550 | np->mcast_hops = opt_skb->nh.ipv6h->hop_limit; |
1690 | if (ipv6_opt_accepted(sk, opt_skb)) { | 1551 | if (ipv6_opt_accepted(sk, opt_skb)) { |
@@ -1739,8 +1600,9 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
1739 | TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h); | 1600 | TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h); |
1740 | TCP_SKB_CB(skb)->sacked = 0; | 1601 | TCP_SKB_CB(skb)->sacked = 0; |
1741 | 1602 | ||
1742 | sk = __tcp_v6_lookup(&skb->nh.ipv6h->saddr, th->source, | 1603 | sk = __inet6_lookup(&tcp_hashinfo, &skb->nh.ipv6h->saddr, th->source, |
1743 | &skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); | 1604 | &skb->nh.ipv6h->daddr, ntohs(th->dest), |
1605 | inet6_iif(skb)); | ||
1744 | 1606 | ||
1745 | if (!sk) | 1607 | if (!sk) |
1746 | goto no_tcp_socket; | 1608 | goto no_tcp_socket; |
@@ -1795,26 +1657,29 @@ discard_and_relse: | |||
1795 | 1657 | ||
1796 | do_time_wait: | 1658 | do_time_wait: |
1797 | if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { | 1659 | if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { |
1798 | tcp_tw_put((struct tcp_tw_bucket *) sk); | 1660 | inet_twsk_put((struct inet_timewait_sock *)sk); |
1799 | goto discard_it; | 1661 | goto discard_it; |
1800 | } | 1662 | } |
1801 | 1663 | ||
1802 | if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { | 1664 | if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { |
1803 | TCP_INC_STATS_BH(TCP_MIB_INERRS); | 1665 | TCP_INC_STATS_BH(TCP_MIB_INERRS); |
1804 | tcp_tw_put((struct tcp_tw_bucket *) sk); | 1666 | inet_twsk_put((struct inet_timewait_sock *)sk); |
1805 | goto discard_it; | 1667 | goto discard_it; |
1806 | } | 1668 | } |
1807 | 1669 | ||
1808 | switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, | 1670 | switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk, |
1809 | skb, th, skb->len)) { | 1671 | skb, th)) { |
1810 | case TCP_TW_SYN: | 1672 | case TCP_TW_SYN: |
1811 | { | 1673 | { |
1812 | struct sock *sk2; | 1674 | struct sock *sk2; |
1813 | 1675 | ||
1814 | sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); | 1676 | sk2 = inet6_lookup_listener(&tcp_hashinfo, |
1677 | &skb->nh.ipv6h->daddr, | ||
1678 | ntohs(th->dest), inet6_iif(skb)); | ||
1815 | if (sk2 != NULL) { | 1679 | if (sk2 != NULL) { |
1816 | tcp_tw_deschedule((struct tcp_tw_bucket *)sk); | 1680 | struct inet_timewait_sock *tw = inet_twsk(sk); |
1817 | tcp_tw_put((struct tcp_tw_bucket *)sk); | 1681 | inet_twsk_deschedule(tw, &tcp_death_row); |
1682 | inet_twsk_put(tw); | ||
1818 | sk = sk2; | 1683 | sk = sk2; |
1819 | goto process; | 1684 | goto process; |
1820 | } | 1685 | } |
@@ -1983,7 +1848,7 @@ static struct tcp_func ipv6_specific = { | |||
1983 | static struct tcp_func ipv6_mapped = { | 1848 | static struct tcp_func ipv6_mapped = { |
1984 | .queue_xmit = ip_queue_xmit, | 1849 | .queue_xmit = ip_queue_xmit, |
1985 | .send_check = tcp_v4_send_check, | 1850 | .send_check = tcp_v4_send_check, |
1986 | .rebuild_header = tcp_v4_rebuild_header, | 1851 | .rebuild_header = inet_sk_rebuild_header, |
1987 | .conn_request = tcp_v6_conn_request, | 1852 | .conn_request = tcp_v6_conn_request, |
1988 | .syn_recv_sock = tcp_v6_syn_recv_sock, | 1853 | .syn_recv_sock = tcp_v6_syn_recv_sock, |
1989 | .remember_stamp = tcp_v4_remember_stamp, | 1854 | .remember_stamp = tcp_v4_remember_stamp, |
@@ -2002,13 +1867,14 @@ static struct tcp_func ipv6_mapped = { | |||
2002 | */ | 1867 | */ |
2003 | static int tcp_v6_init_sock(struct sock *sk) | 1868 | static int tcp_v6_init_sock(struct sock *sk) |
2004 | { | 1869 | { |
1870 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
2005 | struct tcp_sock *tp = tcp_sk(sk); | 1871 | struct tcp_sock *tp = tcp_sk(sk); |
2006 | 1872 | ||
2007 | skb_queue_head_init(&tp->out_of_order_queue); | 1873 | skb_queue_head_init(&tp->out_of_order_queue); |
2008 | tcp_init_xmit_timers(sk); | 1874 | tcp_init_xmit_timers(sk); |
2009 | tcp_prequeue_init(tp); | 1875 | tcp_prequeue_init(tp); |
2010 | 1876 | ||
2011 | tp->rto = TCP_TIMEOUT_INIT; | 1877 | icsk->icsk_rto = TCP_TIMEOUT_INIT; |
2012 | tp->mdev = TCP_TIMEOUT_INIT; | 1878 | tp->mdev = TCP_TIMEOUT_INIT; |
2013 | 1879 | ||
2014 | /* So many TCP implementations out there (incorrectly) count the | 1880 | /* So many TCP implementations out there (incorrectly) count the |
@@ -2030,7 +1896,7 @@ static int tcp_v6_init_sock(struct sock *sk) | |||
2030 | sk->sk_state = TCP_CLOSE; | 1896 | sk->sk_state = TCP_CLOSE; |
2031 | 1897 | ||
2032 | tp->af_specific = &ipv6_specific; | 1898 | tp->af_specific = &ipv6_specific; |
2033 | tp->ca_ops = &tcp_init_congestion_ops; | 1899 | icsk->icsk_ca_ops = &tcp_init_congestion_ops; |
2034 | sk->sk_write_space = sk_stream_write_space; | 1900 | sk->sk_write_space = sk_stream_write_space; |
2035 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); | 1901 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); |
2036 | 1902 | ||
@@ -2044,8 +1910,6 @@ static int tcp_v6_init_sock(struct sock *sk) | |||
2044 | 1910 | ||
2045 | static int tcp_v6_destroy_sock(struct sock *sk) | 1911 | static int tcp_v6_destroy_sock(struct sock *sk) |
2046 | { | 1912 | { |
2047 | extern int tcp_v4_destroy_sock(struct sock *sk); | ||
2048 | |||
2049 | tcp_v4_destroy_sock(sk); | 1913 | tcp_v4_destroy_sock(sk); |
2050 | return inet6_destroy_sock(sk); | 1914 | return inet6_destroy_sock(sk); |
2051 | } | 1915 | } |
@@ -2091,18 +1955,20 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) | |||
2091 | unsigned long timer_expires; | 1955 | unsigned long timer_expires; |
2092 | struct inet_sock *inet = inet_sk(sp); | 1956 | struct inet_sock *inet = inet_sk(sp); |
2093 | struct tcp_sock *tp = tcp_sk(sp); | 1957 | struct tcp_sock *tp = tcp_sk(sp); |
1958 | const struct inet_connection_sock *icsk = inet_csk(sp); | ||
2094 | struct ipv6_pinfo *np = inet6_sk(sp); | 1959 | struct ipv6_pinfo *np = inet6_sk(sp); |
2095 | 1960 | ||
2096 | dest = &np->daddr; | 1961 | dest = &np->daddr; |
2097 | src = &np->rcv_saddr; | 1962 | src = &np->rcv_saddr; |
2098 | destp = ntohs(inet->dport); | 1963 | destp = ntohs(inet->dport); |
2099 | srcp = ntohs(inet->sport); | 1964 | srcp = ntohs(inet->sport); |
2100 | if (tp->pending == TCP_TIME_RETRANS) { | 1965 | |
1966 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) { | ||
2101 | timer_active = 1; | 1967 | timer_active = 1; |
2102 | timer_expires = tp->timeout; | 1968 | timer_expires = icsk->icsk_timeout; |
2103 | } else if (tp->pending == TCP_TIME_PROBE0) { | 1969 | } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { |
2104 | timer_active = 4; | 1970 | timer_active = 4; |
2105 | timer_expires = tp->timeout; | 1971 | timer_expires = icsk->icsk_timeout; |
2106 | } else if (timer_pending(&sp->sk_timer)) { | 1972 | } else if (timer_pending(&sp->sk_timer)) { |
2107 | timer_active = 2; | 1973 | timer_active = 2; |
2108 | timer_expires = sp->sk_timer.expires; | 1974 | timer_expires = sp->sk_timer.expires; |
@@ -2123,28 +1989,31 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) | |||
2123 | tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, | 1989 | tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, |
2124 | timer_active, | 1990 | timer_active, |
2125 | jiffies_to_clock_t(timer_expires - jiffies), | 1991 | jiffies_to_clock_t(timer_expires - jiffies), |
2126 | tp->retransmits, | 1992 | icsk->icsk_retransmits, |
2127 | sock_i_uid(sp), | 1993 | sock_i_uid(sp), |
2128 | tp->probes_out, | 1994 | icsk->icsk_probes_out, |
2129 | sock_i_ino(sp), | 1995 | sock_i_ino(sp), |
2130 | atomic_read(&sp->sk_refcnt), sp, | 1996 | atomic_read(&sp->sk_refcnt), sp, |
2131 | tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong, | 1997 | icsk->icsk_rto, |
1998 | icsk->icsk_ack.ato, | ||
1999 | (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong, | ||
2132 | tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh | 2000 | tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh |
2133 | ); | 2001 | ); |
2134 | } | 2002 | } |
2135 | 2003 | ||
2136 | static void get_timewait6_sock(struct seq_file *seq, | 2004 | static void get_timewait6_sock(struct seq_file *seq, |
2137 | struct tcp_tw_bucket *tw, int i) | 2005 | struct inet_timewait_sock *tw, int i) |
2138 | { | 2006 | { |
2139 | struct in6_addr *dest, *src; | 2007 | struct in6_addr *dest, *src; |
2140 | __u16 destp, srcp; | 2008 | __u16 destp, srcp; |
2009 | struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); | ||
2141 | int ttd = tw->tw_ttd - jiffies; | 2010 | int ttd = tw->tw_ttd - jiffies; |
2142 | 2011 | ||
2143 | if (ttd < 0) | 2012 | if (ttd < 0) |
2144 | ttd = 0; | 2013 | ttd = 0; |
2145 | 2014 | ||
2146 | dest = &tw->tw_v6_daddr; | 2015 | dest = &tcp6tw->tw_v6_daddr; |
2147 | src = &tw->tw_v6_rcv_saddr; | 2016 | src = &tcp6tw->tw_v6_rcv_saddr; |
2148 | destp = ntohs(tw->tw_dport); | 2017 | destp = ntohs(tw->tw_dport); |
2149 | srcp = ntohs(tw->tw_sport); | 2018 | srcp = ntohs(tw->tw_sport); |
2150 | 2019 | ||
@@ -2219,7 +2088,7 @@ struct proto tcpv6_prot = { | |||
2219 | .close = tcp_close, | 2088 | .close = tcp_close, |
2220 | .connect = tcp_v6_connect, | 2089 | .connect = tcp_v6_connect, |
2221 | .disconnect = tcp_disconnect, | 2090 | .disconnect = tcp_disconnect, |
2222 | .accept = tcp_accept, | 2091 | .accept = inet_csk_accept, |
2223 | .ioctl = tcp_ioctl, | 2092 | .ioctl = tcp_ioctl, |
2224 | .init = tcp_v6_init_sock, | 2093 | .init = tcp_v6_init_sock, |
2225 | .destroy = tcp_v6_destroy_sock, | 2094 | .destroy = tcp_v6_destroy_sock, |
@@ -2236,11 +2105,13 @@ struct proto tcpv6_prot = { | |||
2236 | .sockets_allocated = &tcp_sockets_allocated, | 2105 | .sockets_allocated = &tcp_sockets_allocated, |
2237 | .memory_allocated = &tcp_memory_allocated, | 2106 | .memory_allocated = &tcp_memory_allocated, |
2238 | .memory_pressure = &tcp_memory_pressure, | 2107 | .memory_pressure = &tcp_memory_pressure, |
2108 | .orphan_count = &tcp_orphan_count, | ||
2239 | .sysctl_mem = sysctl_tcp_mem, | 2109 | .sysctl_mem = sysctl_tcp_mem, |
2240 | .sysctl_wmem = sysctl_tcp_wmem, | 2110 | .sysctl_wmem = sysctl_tcp_wmem, |
2241 | .sysctl_rmem = sysctl_tcp_rmem, | 2111 | .sysctl_rmem = sysctl_tcp_rmem, |
2242 | .max_header = MAX_TCP_HEADER, | 2112 | .max_header = MAX_TCP_HEADER, |
2243 | .obj_size = sizeof(struct tcp6_sock), | 2113 | .obj_size = sizeof(struct tcp6_sock), |
2114 | .twsk_obj_size = sizeof(struct tcp6_timewait_sock), | ||
2244 | .rsk_prot = &tcp6_request_sock_ops, | 2115 | .rsk_prot = &tcp6_request_sock_ops, |
2245 | }; | 2116 | }; |
2246 | 2117 | ||
@@ -2250,8 +2121,6 @@ static struct inet6_protocol tcpv6_protocol = { | |||
2250 | .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, | 2121 | .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, |
2251 | }; | 2122 | }; |
2252 | 2123 | ||
2253 | extern struct proto_ops inet6_stream_ops; | ||
2254 | |||
2255 | static struct inet_protosw tcpv6_protosw = { | 2124 | static struct inet_protosw tcpv6_protosw = { |
2256 | .type = SOCK_STREAM, | 2125 | .type = SOCK_STREAM, |
2257 | .protocol = IPPROTO_TCP, | 2126 | .protocol = IPPROTO_TCP, |
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index eff050ac7049..390d750449ce 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <net/udp.h> | 51 | #include <net/udp.h> |
52 | #include <net/raw.h> | 52 | #include <net/raw.h> |
53 | #include <net/inet_common.h> | 53 | #include <net/inet_common.h> |
54 | #include <net/tcp_states.h> | ||
54 | 55 | ||
55 | #include <net/ip6_checksum.h> | 56 | #include <net/ip6_checksum.h> |
56 | #include <net/xfrm.h> | 57 | #include <net/xfrm.h> |
@@ -58,7 +59,7 @@ | |||
58 | #include <linux/proc_fs.h> | 59 | #include <linux/proc_fs.h> |
59 | #include <linux/seq_file.h> | 60 | #include <linux/seq_file.h> |
60 | 61 | ||
61 | DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6); | 62 | DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly; |
62 | 63 | ||
63 | /* Grrr, addr_type already calculated by caller, but I don't want | 64 | /* Grrr, addr_type already calculated by caller, but I don't want |
64 | * to add some silly "cookie" argument to this method just for that. | 65 | * to add some silly "cookie" argument to this method just for that. |
@@ -477,8 +478,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
477 | /* RFC 2460 section 8.1 says that we SHOULD log | 478 | /* RFC 2460 section 8.1 says that we SHOULD log |
478 | this error. Well, it is reasonable. | 479 | this error. Well, it is reasonable. |
479 | */ | 480 | */ |
480 | LIMIT_NETDEBUG( | 481 | LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n"); |
481 | printk(KERN_INFO "IPv6: udp checksum is 0\n")); | ||
482 | goto discard; | 482 | goto discard; |
483 | } | 483 | } |
484 | 484 | ||
@@ -493,7 +493,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
493 | if (skb->ip_summed==CHECKSUM_HW) { | 493 | if (skb->ip_summed==CHECKSUM_HW) { |
494 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 494 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
495 | if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { | 495 | if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { |
496 | LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v6 hw csum failure.\n")); | 496 | LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n"); |
497 | skb->ip_summed = CHECKSUM_NONE; | 497 | skb->ip_summed = CHECKSUM_NONE; |
498 | } | 498 | } |
499 | } | 499 | } |
@@ -825,7 +825,7 @@ back_from_confirm: | |||
825 | /* ... which is an evident application bug. --ANK */ | 825 | /* ... which is an evident application bug. --ANK */ |
826 | release_sock(sk); | 826 | release_sock(sk); |
827 | 827 | ||
828 | LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); | 828 | LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); |
829 | err = -EINVAL; | 829 | err = -EINVAL; |
830 | goto out; | 830 | goto out; |
831 | } | 831 | } |
@@ -1054,8 +1054,6 @@ struct proto udpv6_prot = { | |||
1054 | .obj_size = sizeof(struct udp6_sock), | 1054 | .obj_size = sizeof(struct udp6_sock), |
1055 | }; | 1055 | }; |
1056 | 1056 | ||
1057 | extern struct proto_ops inet6_dgram_ops; | ||
1058 | |||
1059 | static struct inet_protosw udpv6_protosw = { | 1057 | static struct inet_protosw udpv6_protosw = { |
1060 | .type = SOCK_DGRAM, | 1058 | .type = SOCK_DGRAM, |
1061 | .protocol = IPPROTO_UDP, | 1059 | .protocol = IPPROTO_UDP, |
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 60c26c87277e..fbef7826a74f 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c | |||
@@ -79,7 +79,7 @@ static u32 xfrm6_tunnel_spi; | |||
79 | #define XFRM6_TUNNEL_SPI_MIN 1 | 79 | #define XFRM6_TUNNEL_SPI_MIN 1 |
80 | #define XFRM6_TUNNEL_SPI_MAX 0xffffffff | 80 | #define XFRM6_TUNNEL_SPI_MAX 0xffffffff |
81 | 81 | ||
82 | static kmem_cache_t *xfrm6_tunnel_spi_kmem; | 82 | static kmem_cache_t *xfrm6_tunnel_spi_kmem __read_mostly; |
83 | 83 | ||
84 | #define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 | 84 | #define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 |
85 | #define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 | 85 | #define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 |
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 5a27e5df5886..34b3bb868409 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c | |||
@@ -44,7 +44,6 @@ | |||
44 | #include <linux/socket.h> | 44 | #include <linux/socket.h> |
45 | #include <linux/sockios.h> | 45 | #include <linux/sockios.h> |
46 | #include <linux/string.h> | 46 | #include <linux/string.h> |
47 | #include <linux/tcp.h> | ||
48 | #include <linux/types.h> | 47 | #include <linux/types.h> |
49 | #include <linux/termios.h> | 48 | #include <linux/termios.h> |
50 | 49 | ||
@@ -52,6 +51,7 @@ | |||
52 | #include <net/p8022.h> | 51 | #include <net/p8022.h> |
53 | #include <net/psnap.h> | 52 | #include <net/psnap.h> |
54 | #include <net/sock.h> | 53 | #include <net/sock.h> |
54 | #include <net/tcp_states.h> | ||
55 | 55 | ||
56 | #include <asm/uaccess.h> | 56 | #include <asm/uaccess.h> |
57 | 57 | ||
@@ -1627,7 +1627,7 @@ out: | |||
1627 | return rc; | 1627 | return rc; |
1628 | } | 1628 | } |
1629 | 1629 | ||
1630 | static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 1630 | static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
1631 | { | 1631 | { |
1632 | /* NULL here for pt means the packet was looped back */ | 1632 | /* NULL here for pt means the packet was looped back */ |
1633 | struct ipx_interface *intrfc; | 1633 | struct ipx_interface *intrfc; |
@@ -1796,8 +1796,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
1796 | copied); | 1796 | copied); |
1797 | if (rc) | 1797 | if (rc) |
1798 | goto out_free; | 1798 | goto out_free; |
1799 | if (skb->stamp.tv_sec) | 1799 | if (skb->tstamp.off_sec) |
1800 | sk->sk_stamp = skb->stamp; | 1800 | skb_get_timestamp(skb, &sk->sk_stamp); |
1801 | 1801 | ||
1802 | msg->msg_namelen = sizeof(*sipx); | 1802 | msg->msg_namelen = sizeof(*sipx); |
1803 | 1803 | ||
@@ -1940,9 +1940,7 @@ static struct notifier_block ipx_dev_notifier = { | |||
1940 | }; | 1940 | }; |
1941 | 1941 | ||
1942 | extern struct datalink_proto *make_EII_client(void); | 1942 | extern struct datalink_proto *make_EII_client(void); |
1943 | extern struct datalink_proto *make_8023_client(void); | ||
1944 | extern void destroy_EII_client(struct datalink_proto *); | 1943 | extern void destroy_EII_client(struct datalink_proto *); |
1945 | extern void destroy_8023_client(struct datalink_proto *); | ||
1946 | 1944 | ||
1947 | static unsigned char ipx_8022_type = 0xE0; | 1945 | static unsigned char ipx_8022_type = 0xE0; |
1948 | static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; | 1946 | static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; |
diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c index b6761913445a..1f73d9ea434d 100644 --- a/net/ipx/ipx_proc.c +++ b/net/ipx/ipx_proc.c | |||
@@ -10,7 +10,7 @@ | |||
10 | #include <linux/proc_fs.h> | 10 | #include <linux/proc_fs.h> |
11 | #include <linux/spinlock.h> | 11 | #include <linux/spinlock.h> |
12 | #include <linux/seq_file.h> | 12 | #include <linux/seq_file.h> |
13 | #include <linux/tcp.h> | 13 | #include <net/tcp_states.h> |
14 | #include <net/ipx.h> | 14 | #include <net/ipx.h> |
15 | 15 | ||
16 | static __inline__ struct ipx_interface *ipx_get_interface_idx(loff_t pos) | 16 | static __inline__ struct ipx_interface *ipx_get_interface_idx(loff_t pos) |
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 92c6e8d4e731..6f92f9c62990 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c | |||
@@ -56,7 +56,7 @@ | |||
56 | #include <asm/uaccess.h> | 56 | #include <asm/uaccess.h> |
57 | 57 | ||
58 | #include <net/sock.h> | 58 | #include <net/sock.h> |
59 | #include <net/tcp.h> | 59 | #include <net/tcp_states.h> |
60 | 60 | ||
61 | #include <net/irda/af_irda.h> | 61 | #include <net/irda/af_irda.h> |
62 | 62 | ||
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c index 6dafbb43b529..3e9a06abbdd0 100644 --- a/net/irda/irlap_frame.c +++ b/net/irda/irlap_frame.c | |||
@@ -988,9 +988,6 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command) | |||
988 | IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); | 988 | IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); |
989 | return; | 989 | return; |
990 | } | 990 | } |
991 | /* Unlink tx_skb from list */ | ||
992 | tx_skb->next = tx_skb->prev = NULL; | ||
993 | tx_skb->list = NULL; | ||
994 | 991 | ||
995 | /* Clear old Nr field + poll bit */ | 992 | /* Clear old Nr field + poll bit */ |
996 | tx_skb->data[1] &= 0x0f; | 993 | tx_skb->data[1] &= 0x0f; |
@@ -1063,9 +1060,6 @@ void irlap_resend_rejected_frame(struct irlap_cb *self, int command) | |||
1063 | IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); | 1060 | IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); |
1064 | return; | 1061 | return; |
1065 | } | 1062 | } |
1066 | /* Unlink tx_skb from list */ | ||
1067 | tx_skb->next = tx_skb->prev = NULL; | ||
1068 | tx_skb->list = NULL; | ||
1069 | 1063 | ||
1070 | /* Clear old Nr field + poll bit */ | 1064 | /* Clear old Nr field + poll bit */ |
1071 | tx_skb->data[1] &= 0x0f; | 1065 | tx_skb->data[1] &= 0x0f; |
@@ -1309,7 +1303,7 @@ static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb, | |||
1309 | * Jean II | 1303 | * Jean II |
1310 | */ | 1304 | */ |
1311 | int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev, | 1305 | int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev, |
1312 | struct packet_type *ptype) | 1306 | struct packet_type *ptype, struct net_device *orig_dev) |
1313 | { | 1307 | { |
1314 | struct irlap_info info; | 1308 | struct irlap_info info; |
1315 | struct irlap_cb *self; | 1309 | struct irlap_cb *self; |
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index 7a4a4d7fbe66..c19e9ce05a3a 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c | |||
@@ -53,7 +53,6 @@ struct irlmp_cb *irlmp = NULL; | |||
53 | /* These can be altered by the sysctl interface */ | 53 | /* These can be altered by the sysctl interface */ |
54 | int sysctl_discovery = 0; | 54 | int sysctl_discovery = 0; |
55 | int sysctl_discovery_timeout = 3; /* 3 seconds by default */ | 55 | int sysctl_discovery_timeout = 3; /* 3 seconds by default */ |
56 | EXPORT_SYMBOL(sysctl_discovery_timeout); | ||
57 | int sysctl_discovery_slots = 6; /* 6 slots by default */ | 56 | int sysctl_discovery_slots = 6; /* 6 slots by default */ |
58 | int sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ; | 57 | int sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ; |
59 | char sysctl_devname[65]; | 58 | char sysctl_devname[65]; |
@@ -67,7 +66,6 @@ const char *irlmp_reasons[] = { | |||
67 | "LM_INIT_DISCONNECT", | 66 | "LM_INIT_DISCONNECT", |
68 | "ERROR, NOT USED", | 67 | "ERROR, NOT USED", |
69 | }; | 68 | }; |
70 | EXPORT_SYMBOL(irlmp_reasons); | ||
71 | 69 | ||
72 | /* | 70 | /* |
73 | * Function irlmp_init (void) | 71 | * Function irlmp_init (void) |
@@ -675,7 +673,6 @@ struct lsap_cb *irlmp_dup(struct lsap_cb *orig, void *instance) | |||
675 | 673 | ||
676 | return new; | 674 | return new; |
677 | } | 675 | } |
678 | EXPORT_SYMBOL(irlmp_dup); | ||
679 | 676 | ||
680 | /* | 677 | /* |
681 | * Function irlmp_disconnect_request (handle, userdata) | 678 | * Function irlmp_disconnect_request (handle, userdata) |
diff --git a/net/irda/irmod.c b/net/irda/irmod.c index 6ffaed4544e9..634901dd156f 100644 --- a/net/irda/irmod.c +++ b/net/irda/irmod.c | |||
@@ -54,7 +54,7 @@ extern int irsock_init(void); | |||
54 | extern void irsock_cleanup(void); | 54 | extern void irsock_cleanup(void); |
55 | /* irlap_frame.c */ | 55 | /* irlap_frame.c */ |
56 | extern int irlap_driver_rcv(struct sk_buff *, struct net_device *, | 56 | extern int irlap_driver_rcv(struct sk_buff *, struct net_device *, |
57 | struct packet_type *); | 57 | struct packet_type *, struct net_device *); |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * Module parameters | 60 | * Module parameters |
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h index 9004f7349a76..b391cb3893d4 100644 --- a/net/irda/irnet/irnet.h +++ b/net/irda/irnet/irnet.h | |||
@@ -517,9 +517,6 @@ extern int | |||
517 | irda_irnet_init(void); /* Initialise IrDA part of IrNET */ | 517 | irda_irnet_init(void); /* Initialise IrDA part of IrNET */ |
518 | extern void | 518 | extern void |
519 | irda_irnet_cleanup(void); /* Teardown IrDA part of IrNET */ | 519 | irda_irnet_cleanup(void); /* Teardown IrDA part of IrNET */ |
520 | /* ---------------------------- MODULE ---------------------------- */ | ||
521 | extern int | ||
522 | irnet_init(void); /* Initialise IrNET module */ | ||
523 | 520 | ||
524 | /**************************** VARIABLES ****************************/ | 521 | /**************************** VARIABLES ****************************/ |
525 | 522 | ||
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index f8f984bb9922..e53bf9e0053e 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c | |||
@@ -1107,7 +1107,7 @@ ppp_irnet_cleanup(void) | |||
1107 | /* | 1107 | /* |
1108 | * Module main entry point | 1108 | * Module main entry point |
1109 | */ | 1109 | */ |
1110 | int __init | 1110 | static int __init |
1111 | irnet_init(void) | 1111 | irnet_init(void) |
1112 | { | 1112 | { |
1113 | int err; | 1113 | int err; |
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c index b0dd3ea35999..1ba8c7106639 100644 --- a/net/irda/irqueue.c +++ b/net/irda/irqueue.c | |||
@@ -822,7 +822,6 @@ void* hashbin_find_next( hashbin_t* hashbin, long hashv, const char* name, | |||
822 | 822 | ||
823 | return entry; | 823 | return entry; |
824 | } | 824 | } |
825 | EXPORT_SYMBOL(hashbin_find_next); | ||
826 | 825 | ||
827 | /* | 826 | /* |
828 | * Function hashbin_get_first (hashbin) | 827 | * Function hashbin_get_first (hashbin) |
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c index 5de05a0bc0ff..8b5eefd70f03 100644 --- a/net/lapb/lapb_subr.c +++ b/net/lapb/lapb_subr.c | |||
@@ -78,7 +78,7 @@ void lapb_requeue_frames(struct lapb_cb *lapb) | |||
78 | if (!skb_prev) | 78 | if (!skb_prev) |
79 | skb_queue_head(&lapb->write_queue, skb); | 79 | skb_queue_head(&lapb->write_queue, skb); |
80 | else | 80 | else |
81 | skb_append(skb_prev, skb); | 81 | skb_append(skb_prev, skb, &lapb->write_queue); |
82 | skb_prev = skb; | 82 | skb_prev = skb; |
83 | } | 83 | } |
84 | } | 84 | } |
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 20b4cfebd74c..66f55e514b56 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c | |||
@@ -23,13 +23,13 @@ | |||
23 | #include <linux/config.h> | 23 | #include <linux/config.h> |
24 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
25 | #include <linux/module.h> | 25 | #include <linux/module.h> |
26 | #include <linux/tcp.h> | ||
27 | #include <linux/rtnetlink.h> | 26 | #include <linux/rtnetlink.h> |
28 | #include <linux/init.h> | 27 | #include <linux/init.h> |
29 | #include <net/llc.h> | 28 | #include <net/llc.h> |
30 | #include <net/llc_sap.h> | 29 | #include <net/llc_sap.h> |
31 | #include <net/llc_pdu.h> | 30 | #include <net/llc_pdu.h> |
32 | #include <net/llc_conn.h> | 31 | #include <net/llc_conn.h> |
32 | #include <net/tcp_states.h> | ||
33 | 33 | ||
34 | /* remember: uninitialized global data is zeroed because its in .bss */ | 34 | /* remember: uninitialized global data is zeroed because its in .bss */ |
35 | static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; | 35 | static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; |
@@ -714,7 +714,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
714 | if (uaddr) | 714 | if (uaddr) |
715 | memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); | 715 | memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); |
716 | msg->msg_namelen = sizeof(*uaddr); | 716 | msg->msg_namelen = sizeof(*uaddr); |
717 | if (!skb->list) { | 717 | if (!skb->next) { |
718 | dgram_free: | 718 | dgram_free: |
719 | kfree_skb(skb); | 719 | kfree_skb(skb); |
720 | } | 720 | } |
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index eba812a9c69c..4c644bc70eae 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c | |||
@@ -16,7 +16,7 @@ | |||
16 | #include <net/llc_sap.h> | 16 | #include <net/llc_sap.h> |
17 | #include <net/llc_conn.h> | 17 | #include <net/llc_conn.h> |
18 | #include <net/sock.h> | 18 | #include <net/sock.h> |
19 | #include <linux/tcp.h> | 19 | #include <net/tcp_states.h> |
20 | #include <net/llc_c_ev.h> | 20 | #include <net/llc_c_ev.h> |
21 | #include <net/llc_c_ac.h> | 21 | #include <net/llc_c_ac.h> |
22 | #include <net/llc_c_st.h> | 22 | #include <net/llc_c_st.h> |
@@ -71,7 +71,11 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) | |||
71 | 71 | ||
72 | if (!ev->ind_prim && !ev->cfm_prim) { | 72 | if (!ev->ind_prim && !ev->cfm_prim) { |
73 | /* indicate or confirm not required */ | 73 | /* indicate or confirm not required */ |
74 | if (!skb->list) | 74 | /* XXX this is not very pretty, perhaps we should store |
75 | * XXX indicate/confirm-needed state in the llc_conn_state_ev | ||
76 | * XXX control block of the SKB instead? -DaveM | ||
77 | */ | ||
78 | if (!skb->next) | ||
75 | goto out_kfree_skb; | 79 | goto out_kfree_skb; |
76 | goto out_skb_put; | 80 | goto out_skb_put; |
77 | } | 81 | } |
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index 5ff02c080a0b..9727455bf0e7 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c | |||
@@ -103,7 +103,8 @@ out: | |||
103 | struct llc_sap *llc_sap_open(unsigned char lsap, | 103 | struct llc_sap *llc_sap_open(unsigned char lsap, |
104 | int (*func)(struct sk_buff *skb, | 104 | int (*func)(struct sk_buff *skb, |
105 | struct net_device *dev, | 105 | struct net_device *dev, |
106 | struct packet_type *pt)) | 106 | struct packet_type *pt, |
107 | struct net_device *orig_dev)) | ||
107 | { | 108 | { |
108 | struct llc_sap *sap = llc_sap_find(lsap); | 109 | struct llc_sap *sap = llc_sap_find(lsap); |
109 | 110 | ||
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index 0f9fc48aeaf9..0f84f66018e4 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c | |||
@@ -15,7 +15,6 @@ | |||
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/netdevice.h> | 17 | #include <linux/netdevice.h> |
18 | #include <linux/tcp.h> | ||
19 | #include <asm/errno.h> | 18 | #include <asm/errno.h> |
20 | #include <net/llc_if.h> | 19 | #include <net/llc_if.h> |
21 | #include <net/llc_sap.h> | 20 | #include <net/llc_sap.h> |
@@ -25,6 +24,7 @@ | |||
25 | #include <net/llc_c_ev.h> | 24 | #include <net/llc_c_ev.h> |
26 | #include <net/llc_c_ac.h> | 25 | #include <net/llc_c_ac.h> |
27 | #include <net/llc_c_st.h> | 26 | #include <net/llc_c_st.h> |
27 | #include <net/tcp_states.h> | ||
28 | 28 | ||
29 | u8 llc_mac_null_var[IFHWADDRLEN]; | 29 | u8 llc_mac_null_var[IFHWADDRLEN]; |
30 | 30 | ||
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 4da6976efc9c..13b46240b7a1 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c | |||
@@ -132,7 +132,7 @@ static inline int llc_fixup_skb(struct sk_buff *skb) | |||
132 | * data now), it queues this frame in the connection's backlog. | 132 | * data now), it queues this frame in the connection's backlog. |
133 | */ | 133 | */ |
134 | int llc_rcv(struct sk_buff *skb, struct net_device *dev, | 134 | int llc_rcv(struct sk_buff *skb, struct net_device *dev, |
135 | struct packet_type *pt) | 135 | struct packet_type *pt, struct net_device *orig_dev) |
136 | { | 136 | { |
137 | struct llc_sap *sap; | 137 | struct llc_sap *sap; |
138 | struct llc_pdu_sn *pdu; | 138 | struct llc_pdu_sn *pdu; |
@@ -165,7 +165,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, | |||
165 | * LLC functionality | 165 | * LLC functionality |
166 | */ | 166 | */ |
167 | if (sap->rcv_func) { | 167 | if (sap->rcv_func) { |
168 | sap->rcv_func(skb, dev, pt); | 168 | sap->rcv_func(skb, dev, pt, orig_dev); |
169 | goto out; | 169 | goto out; |
170 | } | 170 | } |
171 | dest = llc_pdu_type(skb); | 171 | dest = llc_pdu_type(skb); |
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 965c94eb4bbc..34228ef14985 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c | |||
@@ -21,7 +21,7 @@ | |||
21 | #include <net/llc_s_ev.h> | 21 | #include <net/llc_s_ev.h> |
22 | #include <net/llc_s_st.h> | 22 | #include <net/llc_s_st.h> |
23 | #include <net/sock.h> | 23 | #include <net/sock.h> |
24 | #include <linux/tcp.h> | 24 | #include <net/tcp_states.h> |
25 | #include <linux/llc.h> | 25 | #include <linux/llc.h> |
26 | 26 | ||
27 | /** | 27 | /** |
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig new file mode 100644 index 000000000000..8296b38bf270 --- /dev/null +++ b/net/netfilter/Kconfig | |||
@@ -0,0 +1,24 @@ | |||
1 | config NETFILTER_NETLINK | ||
2 | tristate "Netfilter netlink interface" | ||
3 | help | ||
4 | If this option is enabled, the kernel will include support | ||
5 | for the new netfilter netlink interface. | ||
6 | |||
7 | config NETFILTER_NETLINK_QUEUE | ||
8 | tristate "Netfilter NFQUEUE over NFNETLINK interface" | ||
9 | depends on NETFILTER_NETLINK | ||
10 | help | ||
11 | If this option isenabled, the kernel will include support | ||
12 | for queueing packets via NFNETLINK. | ||
13 | |||
14 | config NETFILTER_NETLINK_LOG | ||
15 | tristate "Netfilter LOG over NFNETLINK interface" | ||
16 | depends on NETFILTER_NETLINK | ||
17 | help | ||
18 | If this option is enabled, the kernel will include support | ||
19 | for logging packets via NFNETLINK. | ||
20 | |||
21 | This obsoletes the existing ipt_ULOG and ebg_ulog mechanisms, | ||
22 | and is also scheduled to replace the old syslog-based ipt_LOG | ||
23 | and ip6t_LOG modules. | ||
24 | |||
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile new file mode 100644 index 000000000000..b3b44f8b415a --- /dev/null +++ b/net/netfilter/Makefile | |||
@@ -0,0 +1,7 @@ | |||
1 | netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o | ||
2 | |||
3 | obj-$(CONFIG_NETFILTER) = netfilter.o | ||
4 | |||
5 | obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o | ||
6 | obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o | ||
7 | obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o | ||
diff --git a/net/netfilter/core.c b/net/netfilter/core.c new file mode 100644 index 000000000000..1ceb1a6c254b --- /dev/null +++ b/net/netfilter/core.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* netfilter.c: look after the filters for various protocols. | ||
2 | * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. | ||
3 | * | ||
4 | * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any | ||
5 | * way. | ||
6 | * | ||
7 | * Rusty Russell (C)2000 -- This code is GPL. | ||
8 | * | ||
9 | * February 2000: Modified by James Morris to have 1 queue per protocol. | ||
10 | * 15-Mar-2000: Added NF_REPEAT --RR. | ||
11 | * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik. | ||
12 | */ | ||
13 | #include <linux/config.h> | ||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/netfilter.h> | ||
16 | #include <net/protocol.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <linux/skbuff.h> | ||
19 | #include <linux/wait.h> | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/interrupt.h> | ||
22 | #include <linux/if.h> | ||
23 | #include <linux/netdevice.h> | ||
24 | #include <linux/inetdevice.h> | ||
25 | #include <linux/proc_fs.h> | ||
26 | #include <net/sock.h> | ||
27 | |||
28 | #include "nf_internals.h" | ||
29 | |||
30 | /* In this code, we can be waiting indefinitely for userspace to | ||
31 | * service a packet if a hook returns NF_QUEUE. We could keep a count | ||
32 | * of skbuffs queued for userspace, and not deregister a hook unless | ||
33 | * this is zero, but that sucks. Now, we simply check when the | ||
34 | * packets come back: if the hook is gone, the packet is discarded. */ | ||
35 | struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; | ||
36 | EXPORT_SYMBOL(nf_hooks); | ||
37 | static DEFINE_SPINLOCK(nf_hook_lock); | ||
38 | |||
/* Insert @reg into the hook list for its (pf, hooknum) slot, keeping the
 * list sorted by ascending priority.  Insertion is published with
 * list_add_rcu() for lockless readers, and synchronize_net() waits for
 * all pre-existing readers before returning.  Always returns 0. */
int nf_register_hook(struct nf_hook_ops *reg)
{
	struct list_head *i;

	spin_lock_bh(&nf_hook_lock);
	list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
		/* Stop at the first entry with a larger priority value;
		 * inserting before it keeps the list sorted. */
		if (reg->priority < ((struct nf_hook_ops *)i)->priority)
			break;
	}
	/* Add after i->prev, i.e. immediately before i (or at the tail
	 * when the loop ran to completion). */
	list_add_rcu(&reg->list, i->prev);
	spin_unlock_bh(&nf_hook_lock);

	synchronize_net();
	return 0;
}
EXPORT_SYMBOL(nf_register_hook);
55 | |||
/* Remove @reg from its hook list.  synchronize_net() guarantees that no
 * CPU is still traversing the entry when this returns, so the caller may
 * safely free or reuse @reg afterwards. */
void nf_unregister_hook(struct nf_hook_ops *reg)
{
	spin_lock_bh(&nf_hook_lock);
	list_del_rcu(&reg->list);
	spin_unlock_bh(&nf_hook_lock);

	synchronize_net();
}
EXPORT_SYMBOL(nf_unregister_hook);
65 | |||
/* Walk the hook chain continuing after *i, invoking each hook whose
 * priority is >= hook_thresh.  Returns the first verdict other than
 * NF_ACCEPT/NF_REPEAT, or NF_ACCEPT if every hook accepted.  *i is left
 * pointing at the element that produced the verdict so the caller can
 * resume traversal later (e.g. after an NF_QUEUE round trip). */
unsigned int nf_iterate(struct list_head *head,
			struct sk_buff **skb,
			int hook,
			const struct net_device *indev,
			const struct net_device *outdev,
			struct list_head **i,
			int (*okfn)(struct sk_buff *),
			int hook_thresh)
{
	unsigned int verdict;

	/*
	 * The caller must not block between calls to this
	 * function because of risk of continuing from deleted element.
	 */
	list_for_each_continue_rcu(*i, head) {
		struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;

		/* Skip hooks below the threshold (used when traversal
		 * resumes part-way through the chain). */
		if (hook_thresh > elem->priority)
			continue;

		/* Optimization: we don't need to hold module
		   reference here, since function can't sleep. --RR */
		verdict = elem->hook(hook, skb, indev, outdev, okfn);
		if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
			if (unlikely((verdict & NF_VERDICT_MASK)
							> NF_MAX_VERDICT)) {
				NFDEBUG("Evil return from %p(%u).\n",
					elem->hook, hook);
				continue;
			}
#endif
			if (verdict != NF_REPEAT)
				return verdict;
			/* NF_REPEAT: step back one element so the same
			 * hook runs again on the next iteration. */
			*i = (*i)->prev;
		}
	}
	return NF_ACCEPT;
}
106 | |||
107 | |||
/* Returns 1 if okfn() needs to be executed by the caller,
 * -EPERM for NF_DROP, 0 otherwise (packet was queued or stolen). */
int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
		 struct net_device *indev,
		 struct net_device *outdev,
		 int (*okfn)(struct sk_buff *),
		 int hook_thresh)
{
	struct list_head *elem;
	unsigned int verdict;
	int ret = 0;

	/* We may already have this, but read-locks nest anyway */
	rcu_read_lock();

	elem = &nf_hooks[pf][hook];
next_hook:
	verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
			     outdev, &elem, okfn, hook_thresh);
	if (verdict == NF_ACCEPT || verdict == NF_STOP) {
		ret = 1;
		goto unlock;
	} else if (verdict == NF_DROP) {
		kfree_skb(*pskb);
		ret = -EPERM;
	} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
		NFDEBUG("nf_hook: Verdict = QUEUE.\n");
		/* nf_queue() returns 0 when the queued-to module went
		 * away; resume traversal from the current element. */
		if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn,
			      verdict >> NF_VERDICT_BITS))
			goto next_hook;
	}
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(nf_hook_slow);
144 | |||
145 | |||
/* Ensure the first @writable_len bytes of *pskb may be modified,
 * copying the skb when it is shared or cloned.  On copy, *pskb is
 * replaced and the old skb freed, so callers must not keep pointers
 * into the old data.  Returns 1 on success, 0 on failure. */
int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len)
{
	struct sk_buff *nskb;

	if (writable_len > (*pskb)->len)
		return 0;

	/* Not exclusive use of packet?  Must copy. */
	if (skb_shared(*pskb) || skb_cloned(*pskb))
		goto copy_skb;

	return pskb_may_pull(*pskb, writable_len);

copy_skb:
	nskb = skb_copy(*pskb, GFP_ATOMIC);
	if (!nskb)
		return 0;
	BUG_ON(skb_is_nonlinear(nskb));

	/* Rest of kernel will get very unhappy if we pass it a
	   suddenly-orphaned skbuff */
	if ((*pskb)->sk)
		skb_set_owner_w(nskb, (*pskb)->sk);
	kfree_skb(*pskb);
	*pskb = nskb;
	return 1;
}
EXPORT_SYMBOL(skb_make_writable);
174 | |||
175 | |||
/* This does not belong here, but locally generated errors need it if connection
   tracking in use: without this, connection may not be in hash table, and hence
   manufactured ICMP or RST packets will not be associated with it. */
void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
EXPORT_SYMBOL(ip_ct_attach);

/* Copy conntrack state from @skb onto @new via the ip_ct_attach
 * callback, when conntrack is loaded and @skb carries state. */
void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
{
	void (*attach)(struct sk_buff *, struct sk_buff *);

	/* Snapshot the function pointer once: the conntrack module may
	 * clear it concurrently while unloading. */
	if (skb->nfct && (attach = ip_ct_attach) != NULL) {
		mb(); /* Just to be sure: must be read before executing this */
		attach(new, skb);
	}
}
EXPORT_SYMBOL(nf_ct_attach);
192 | |||
#ifdef CONFIG_PROC_FS
/* Parent directory /proc/net/netfilter for the nf_log/nf_queue files. */
struct proc_dir_entry *proc_net_netfilter;
EXPORT_SYMBOL(proc_net_netfilter);
#endif

/* Boot-time initialisation: empty every per-(protocol, hook) list,
 * create the proc directory, then bring up the queue and log
 * subsystems.  Any failure here is fatal (panic) — netfilter cannot
 * run half-initialised. */
void __init netfilter_init(void)
{
	int i, h;
	for (i = 0; i < NPROTO; i++) {
		for (h = 0; h < NF_MAX_HOOKS; h++)
			INIT_LIST_HEAD(&nf_hooks[i][h]);
	}

#ifdef CONFIG_PROC_FS
	proc_net_netfilter = proc_mkdir("netfilter", proc_net);
	if (!proc_net_netfilter)
		panic("cannot create netfilter proc entry");
#endif

	if (netfilter_queue_init() < 0)
		panic("cannot initialize nf_queue");
	if (netfilter_log_init() < 0)
		panic("cannot initialize nf_log");
}
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h new file mode 100644 index 000000000000..6bdee2910617 --- /dev/null +++ b/net/netfilter/nf_internals.h | |||
@@ -0,0 +1,39 @@ | |||
/* Internal declarations shared between the netfilter core files
 * (core.c, nf_queue.c, nf_log.c).  Not for use outside net/netfilter. */
#ifndef _NF_INTERNALS_H
#define _NF_INTERNALS_H

#include <linux/config.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>

/* Debug printk, compiled away unless CONFIG_NETFILTER_DEBUG is set. */
#ifdef CONFIG_NETFILTER_DEBUG
#define NFDEBUG(format, args...)  printk(format , ## args)
#else
#define NFDEBUG(format, args...)
#endif


/* core.c */
/* Walk a hook chain; see core.c for verdict/threshold semantics. */
extern unsigned int nf_iterate(struct list_head *head,
			       struct sk_buff **skb,
			       int hook,
			       const struct net_device *indev,
			       const struct net_device *outdev,
			       struct list_head **i,
			       int (*okfn)(struct sk_buff *),
			       int hook_thresh);

/* nf_queue.c */
/* Hand a packet to the queue handler registered for @pf. */
extern int nf_queue(struct sk_buff **skb,
		    struct list_head *elem,
		    int pf, unsigned int hook,
		    struct net_device *indev,
		    struct net_device *outdev,
		    int (*okfn)(struct sk_buff *),
		    unsigned int queuenum);
extern int __init netfilter_queue_init(void);

/* nf_log.c */
extern int __init netfilter_log_init(void);

#endif
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c new file mode 100644 index 000000000000..3e76bd0824a2 --- /dev/null +++ b/net/netfilter/nf_log.c | |||
@@ -0,0 +1,178 @@ | |||
1 | #include <linux/config.h> | ||
2 | #include <linux/kernel.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/proc_fs.h> | ||
6 | #include <linux/skbuff.h> | ||
7 | #include <linux/netfilter.h> | ||
8 | #include <linux/seq_file.h> | ||
9 | #include <net/protocol.h> | ||
10 | |||
11 | #include "nf_internals.h" | ||
12 | |||
13 | /* Internal logging interface, which relies on the real | ||
14 | LOG target modules */ | ||
15 | |||
16 | #define NF_LOG_PREFIXLEN 128 | ||
17 | |||
18 | static struct nf_logger *nf_logging[NPROTO]; /* = NULL */ | ||
19 | static DEFINE_SPINLOCK(nf_log_lock); | ||
20 | |||
/* return -EBUSY if somebody else is registered, -EEXIST if the same
 * logger is already registered, 0 on success. */
int nf_log_register(int pf, struct nf_logger *logger)
{
	int ret = -EBUSY;

	if (pf >= NPROTO)
		return -EINVAL;

	/* Any setup of logging members must be done before
	 * substituting pointer. */
	spin_lock(&nf_log_lock);
	if (!nf_logging[pf]) {
		/* Publish with rcu_assign_pointer so lockless readers
		 * in nf_log_packet() see a fully-initialised logger. */
		rcu_assign_pointer(nf_logging[pf], logger);
		ret = 0;
	} else if (nf_logging[pf] == logger)
		ret = -EEXIST;

	spin_unlock(&nf_log_lock);
	return ret;
}
EXPORT_SYMBOL(nf_log_register);
43 | |||
/* Clear the logger slot for @pf and wait for in-flight RCU readers to
 * drain, so the caller may free the logger afterwards. */
int nf_log_unregister_pf(int pf)
{
	if (pf >= NPROTO)
		return -EINVAL;

	spin_lock(&nf_log_lock);
	nf_logging[pf] = NULL;
	spin_unlock(&nf_log_lock);

	/* Give time to concurrent readers. */
	synchronize_net();

	return 0;
}
EXPORT_SYMBOL(nf_log_unregister_pf);
59 | |||
/* Remove @logger from every protocol-family slot it occupies (one
 * logger may be registered for several families), then wait for RCU
 * readers before returning. */
void nf_log_unregister_logger(struct nf_logger *logger)
{
	int i;

	spin_lock(&nf_log_lock);
	for (i = 0; i < NPROTO; i++) {
		if (nf_logging[i] == logger)
			nf_logging[i] = NULL;
	}
	spin_unlock(&nf_log_lock);

	synchronize_net();
}
EXPORT_SYMBOL(nf_log_unregister_logger);
74 | |||
/* Format the printf-style prefix (truncated to NF_LOG_PREFIXLEN bytes)
 * and hand the packet to the logger registered for @pf, if any;
 * otherwise emit a rate-limited warning.  Runs under the RCU read lock
 * so the logger cannot be freed mid-call. */
void nf_log_packet(int pf,
		   unsigned int hooknum,
		   const struct sk_buff *skb,
		   const struct net_device *in,
		   const struct net_device *out,
		   struct nf_loginfo *loginfo,
		   const char *fmt, ...)
{
	va_list args;
	char prefix[NF_LOG_PREFIXLEN];
	struct nf_logger *logger;

	rcu_read_lock();
	logger = rcu_dereference(nf_logging[pf]);
	if (logger) {
		va_start(args, fmt);
		vsnprintf(prefix, sizeof(prefix), fmt, args);
		va_end(args);
		/* We must read logging before nf_logfn[pf] */
		logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix);
	} else if (net_ratelimit()) {
		printk(KERN_WARNING "nf_log_packet: can\'t log since "
		       "no backend logging module loaded in! Please either "
		       "load one, or disable logging explicitly\n");
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(nf_log_packet);
103 | |||
#ifdef CONFIG_PROC_FS
/* /proc/net/netfilter/nf_log: one line per protocol family showing the
 * registered logger's name (or NONE).  Iterator positions are the
 * family numbers 0..NPROTO-1; the RCU read lock is held across the
 * whole traversal so logger pointers stay valid while printed. */
static void *seq_start(struct seq_file *seq, loff_t *pos)
{
	rcu_read_lock();

	if (*pos >= NPROTO)
		return NULL;

	return pos;
}

static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
	(*pos)++;

	if (*pos >= NPROTO)
		return NULL;

	return pos;
}

static void seq_stop(struct seq_file *s, void *v)
{
	/* Matches the rcu_read_lock() taken in seq_start(). */
	rcu_read_unlock();
}

static int seq_show(struct seq_file *s, void *v)
{
	loff_t *pos = v;
	const struct nf_logger *logger;

	logger = rcu_dereference(nf_logging[*pos]);

	if (!logger)
		return seq_printf(s, "%2lld NONE\n", *pos);

	return seq_printf(s, "%2lld %s\n", *pos, logger->name);
}

static struct seq_operations nflog_seq_ops = {
	.start	= seq_start,
	.next	= seq_next,
	.stop	= seq_stop,
	.show	= seq_show,
};

static int nflog_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &nflog_seq_ops);
}

static struct file_operations nflog_file_ops = {
	.owner	 = THIS_MODULE,
	.open	 = nflog_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#endif /* PROC_FS */
164 | |||
165 | |||
166 | int __init netfilter_log_init(void) | ||
167 | { | ||
168 | #ifdef CONFIG_PROC_FS | ||
169 | struct proc_dir_entry *pde; | ||
170 | |||
171 | pde = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter); | ||
172 | if (!pde) | ||
173 | return -1; | ||
174 | |||
175 | pde->proc_fops = &nflog_file_ops; | ||
176 | #endif | ||
177 | return 0; | ||
178 | } | ||
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c new file mode 100644 index 000000000000..d10d552d9c40 --- /dev/null +++ b/net/netfilter/nf_queue.c | |||
@@ -0,0 +1,343 @@ | |||
1 | #include <linux/config.h> | ||
2 | #include <linux/kernel.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/proc_fs.h> | ||
6 | #include <linux/skbuff.h> | ||
7 | #include <linux/netfilter.h> | ||
8 | #include <linux/seq_file.h> | ||
9 | #include <net/protocol.h> | ||
10 | |||
11 | #include "nf_internals.h" | ||
12 | |||
13 | /* | ||
14 | * A queue handler may be registered for each protocol. Each is protected by | ||
15 | * long term mutex. The handler must provide an outfn() to accept packets | ||
16 | * for queueing and must reinject all packets it receives, no matter what. | ||
17 | */ | ||
18 | static struct nf_queue_handler *queue_handler[NPROTO]; | ||
19 | static struct nf_queue_rerouter *queue_rerouter; | ||
20 | |||
21 | static DEFINE_RWLOCK(queue_handler_lock); | ||
22 | |||
/* return -EBUSY when somebody else is registered, -EEXIST if the
 * same handler is already registered, 0 in case of success. */
int nf_register_queue_handler(int pf, struct nf_queue_handler *qh)
{
	int ret;

	if (pf >= NPROTO)
		return -EINVAL;

	write_lock_bh(&queue_handler_lock);
	if (queue_handler[pf] == qh)
		ret = -EEXIST;
	else if (queue_handler[pf])
		ret = -EBUSY;
	else {
		queue_handler[pf] = qh;
		ret = 0;
	}
	write_unlock_bh(&queue_handler_lock);

	return ret;
}
EXPORT_SYMBOL(nf_register_queue_handler);
46 | |||
/* The caller must flush their queue before this.
 * Clears the queue handler slot for @pf. */
int nf_unregister_queue_handler(int pf)
{
	if (pf >= NPROTO)
		return -EINVAL;

	write_lock_bh(&queue_handler_lock);
	queue_handler[pf] = NULL;
	write_unlock_bh(&queue_handler_lock);

	return 0;
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
60 | |||
/* Install the save/reroute callbacks for @pf.  The struct is copied
 * into the table, so the caller's copy need not stay around. */
int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer)
{
	if (pf >= NPROTO)
		return -EINVAL;

	write_lock_bh(&queue_handler_lock);
	memcpy(&queue_rerouter[pf], rer, sizeof(queue_rerouter[pf]));
	write_unlock_bh(&queue_handler_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_register_queue_rerouter);
73 | |||
/* Zero the rerouter slot for @pf, disabling its save/reroute callbacks. */
int nf_unregister_queue_rerouter(int pf)
{
	if (pf >= NPROTO)
		return -EINVAL;

	write_lock_bh(&queue_handler_lock);
	memset(&queue_rerouter[pf], 0, sizeof(queue_rerouter[pf]));
	write_unlock_bh(&queue_handler_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter);
85 | |||
/* Drop @qh from every protocol-family slot it occupies — used when a
 * queue handler module unloads. */
void nf_unregister_queue_handlers(struct nf_queue_handler *qh)
{
	int pf;

	write_lock_bh(&queue_handler_lock);
	for (pf = 0; pf < NPROTO; pf++)  {
		if (queue_handler[pf] == qh)
			queue_handler[pf] = NULL;
	}
	write_unlock_bh(&queue_handler_lock);
}
EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers);
98 | |||
99 | /* | ||
100 | * Any packet that leaves via this function must come back | ||
101 | * through nf_reinject(). | ||
102 | */ | ||
103 | int nf_queue(struct sk_buff **skb, | ||
104 | struct list_head *elem, | ||
105 | int pf, unsigned int hook, | ||
106 | struct net_device *indev, | ||
107 | struct net_device *outdev, | ||
108 | int (*okfn)(struct sk_buff *), | ||
109 | unsigned int queuenum) | ||
110 | { | ||
111 | int status; | ||
112 | struct nf_info *info; | ||
113 | #ifdef CONFIG_BRIDGE_NETFILTER | ||
114 | struct net_device *physindev = NULL; | ||
115 | struct net_device *physoutdev = NULL; | ||
116 | #endif | ||
117 | |||
118 | /* QUEUE == DROP if noone is waiting, to be safe. */ | ||
119 | read_lock(&queue_handler_lock); | ||
120 | if (!queue_handler[pf]->outfn) { | ||
121 | read_unlock(&queue_handler_lock); | ||
122 | kfree_skb(*skb); | ||
123 | return 1; | ||
124 | } | ||
125 | |||
126 | info = kmalloc(sizeof(*info)+queue_rerouter[pf].rer_size, GFP_ATOMIC); | ||
127 | if (!info) { | ||
128 | if (net_ratelimit()) | ||
129 | printk(KERN_ERR "OOM queueing packet %p\n", | ||
130 | *skb); | ||
131 | read_unlock(&queue_handler_lock); | ||
132 | kfree_skb(*skb); | ||
133 | return 1; | ||
134 | } | ||
135 | |||
136 | *info = (struct nf_info) { | ||
137 | (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn }; | ||
138 | |||
139 | /* If it's going away, ignore hook. */ | ||
140 | if (!try_module_get(info->elem->owner)) { | ||
141 | read_unlock(&queue_handler_lock); | ||
142 | kfree(info); | ||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | /* Bump dev refs so they don't vanish while packet is out */ | ||
147 | if (indev) dev_hold(indev); | ||
148 | if (outdev) dev_hold(outdev); | ||
149 | |||
150 | #ifdef CONFIG_BRIDGE_NETFILTER | ||
151 | if ((*skb)->nf_bridge) { | ||
152 | physindev = (*skb)->nf_bridge->physindev; | ||
153 | if (physindev) dev_hold(physindev); | ||
154 | physoutdev = (*skb)->nf_bridge->physoutdev; | ||
155 | if (physoutdev) dev_hold(physoutdev); | ||
156 | } | ||
157 | #endif | ||
158 | if (queue_rerouter[pf].save) | ||
159 | queue_rerouter[pf].save(*skb, info); | ||
160 | |||
161 | status = queue_handler[pf]->outfn(*skb, info, queuenum, | ||
162 | queue_handler[pf]->data); | ||
163 | |||
164 | if (status >= 0 && queue_rerouter[pf].reroute) | ||
165 | status = queue_rerouter[pf].reroute(skb, info); | ||
166 | |||
167 | read_unlock(&queue_handler_lock); | ||
168 | |||
169 | if (status < 0) { | ||
170 | /* James M doesn't say fuck enough. */ | ||
171 | if (indev) dev_put(indev); | ||
172 | if (outdev) dev_put(outdev); | ||
173 | #ifdef CONFIG_BRIDGE_NETFILTER | ||
174 | if (physindev) dev_put(physindev); | ||
175 | if (physoutdev) dev_put(physoutdev); | ||
176 | #endif | ||
177 | module_put(info->elem->owner); | ||
178 | kfree(info); | ||
179 | kfree_skb(*skb); | ||
180 | |||
181 | return 1; | ||
182 | } | ||
183 | |||
184 | return 1; | ||
185 | } | ||
186 | |||
187 | void nf_reinject(struct sk_buff *skb, struct nf_info *info, | ||
188 | unsigned int verdict) | ||
189 | { | ||
190 | struct list_head *elem = &info->elem->list; | ||
191 | struct list_head *i; | ||
192 | |||
193 | rcu_read_lock(); | ||
194 | |||
195 | /* Release those devices we held, or Alexey will kill me. */ | ||
196 | if (info->indev) dev_put(info->indev); | ||
197 | if (info->outdev) dev_put(info->outdev); | ||
198 | #ifdef CONFIG_BRIDGE_NETFILTER | ||
199 | if (skb->nf_bridge) { | ||
200 | if (skb->nf_bridge->physindev) | ||
201 | dev_put(skb->nf_bridge->physindev); | ||
202 | if (skb->nf_bridge->physoutdev) | ||
203 | dev_put(skb->nf_bridge->physoutdev); | ||
204 | } | ||
205 | #endif | ||
206 | |||
207 | /* Drop reference to owner of hook which queued us. */ | ||
208 | module_put(info->elem->owner); | ||
209 | |||
210 | list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { | ||
211 | if (i == elem) | ||
212 | break; | ||
213 | } | ||
214 | |||
215 | if (elem == &nf_hooks[info->pf][info->hook]) { | ||
216 | /* The module which sent it to userspace is gone. */ | ||
217 | NFDEBUG("%s: module disappeared, dropping packet.\n", | ||
218 | __FUNCTION__); | ||
219 | verdict = NF_DROP; | ||
220 | } | ||
221 | |||
222 | /* Continue traversal iff userspace said ok... */ | ||
223 | if (verdict == NF_REPEAT) { | ||
224 | elem = elem->prev; | ||
225 | verdict = NF_ACCEPT; | ||
226 | } | ||
227 | |||
228 | if (verdict == NF_ACCEPT) { | ||
229 | next_hook: | ||
230 | verdict = nf_iterate(&nf_hooks[info->pf][info->hook], | ||
231 | &skb, info->hook, | ||
232 | info->indev, info->outdev, &elem, | ||
233 | info->okfn, INT_MIN); | ||
234 | } | ||
235 | |||
236 | switch (verdict & NF_VERDICT_MASK) { | ||
237 | case NF_ACCEPT: | ||
238 | info->okfn(skb); | ||
239 | break; | ||
240 | |||
241 | case NF_QUEUE: | ||
242 | if (!nf_queue(&skb, elem, info->pf, info->hook, | ||
243 | info->indev, info->outdev, info->okfn, | ||
244 | verdict >> NF_VERDICT_BITS)) | ||
245 | goto next_hook; | ||
246 | break; | ||
247 | } | ||
248 | rcu_read_unlock(); | ||
249 | |||
250 | if (verdict == NF_DROP) | ||
251 | kfree_skb(skb); | ||
252 | |||
253 | kfree(info); | ||
254 | return; | ||
255 | } | ||
256 | EXPORT_SYMBOL(nf_reinject); | ||
257 | |||
#ifdef CONFIG_PROC_FS
/* /proc/net/netfilter/nf_queue: one line per protocol family showing
 * the registered queue handler's name (or NONE).  Iterator positions
 * are the family numbers 0..NPROTO-1. */
static void *seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos >= NPROTO)
		return NULL;

	return pos;
}

static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
	(*pos)++;

	if (*pos >= NPROTO)
		return NULL;

	return pos;
}

static void seq_stop(struct seq_file *s, void *v)
{
	/* Nothing to release: seq_show() takes the lock per entry. */
}

static int seq_show(struct seq_file *s, void *v)
{
	int ret;
	loff_t *pos = v;
	struct nf_queue_handler *qh;

	read_lock_bh(&queue_handler_lock);
	qh = queue_handler[*pos];
	if (!qh)
		ret = seq_printf(s, "%2lld NONE\n", *pos);
	else
		ret = seq_printf(s, "%2lld %s\n", *pos, qh->name);
	read_unlock_bh(&queue_handler_lock);

	return ret;
}

static struct seq_operations nfqueue_seq_ops = {
	.start	= seq_start,
	.next	= seq_next,
	.stop	= seq_stop,
	.show	= seq_show,
};

static int nfqueue_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &nfqueue_seq_ops);
}

static struct file_operations nfqueue_file_ops = {
	.owner	 = THIS_MODULE,
	.open	 = nfqueue_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#endif /* PROC_FS */
319 | |||
320 | |||
321 | int __init netfilter_queue_init(void) | ||
322 | { | ||
323 | #ifdef CONFIG_PROC_FS | ||
324 | struct proc_dir_entry *pde; | ||
325 | #endif | ||
326 | queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter), | ||
327 | GFP_KERNEL); | ||
328 | if (!queue_rerouter) | ||
329 | return -ENOMEM; | ||
330 | |||
331 | #ifdef CONFIG_PROC_FS | ||
332 | pde = create_proc_entry("nf_queue", S_IRUGO, proc_net_netfilter); | ||
333 | if (!pde) { | ||
334 | kfree(queue_rerouter); | ||
335 | return -1; | ||
336 | } | ||
337 | pde->proc_fops = &nfqueue_file_ops; | ||
338 | #endif | ||
339 | memset(queue_rerouter, 0, NPROTO * sizeof(struct nf_queue_rerouter)); | ||
340 | |||
341 | return 0; | ||
342 | } | ||
343 | |||
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c new file mode 100644 index 000000000000..61a833a9caa6 --- /dev/null +++ b/net/netfilter/nf_sockopt.c | |||
@@ -0,0 +1,132 @@ | |||
1 | #include <linux/config.h> | ||
2 | #include <linux/kernel.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/skbuff.h> | ||
6 | #include <linux/netfilter.h> | ||
7 | #include <net/sock.h> | ||
8 | |||
9 | #include "nf_internals.h" | ||
10 | |||
11 | /* Sockopts only registered and called from user context, so | ||
12 | net locking would be overkill. Also, [gs]etsockopt calls may | ||
13 | sleep. */ | ||
14 | static DECLARE_MUTEX(nf_sockopt_mutex); | ||
15 | static LIST_HEAD(nf_sockopts); | ||
16 | |||
/* Report whether the two exclusive (half-open) integer ranges
 * [min1, max1) and [min2, max2) share at least one value. */
static inline int overlap(int min1, int max1, int min2, int max2)
{
	/* Each range must start before the other one ends. */
	return min1 < max2 && min2 < max1;
}
/* Functions to register sockopt ranges (exclusive). */
/* Add @reg to the global sockopt list after checking that neither its
 * get nor its set range overlaps an already-registered handler for the
 * same protocol family.  Returns 0 on success, -EBUSY on overlap, or
 * -EINTR if interrupted while acquiring the mutex. */
int nf_register_sockopt(struct nf_sockopt_ops *reg)
{
	struct list_head *i;
	int ret = 0;

	if (down_interruptible(&nf_sockopt_mutex) != 0)
		return -EINTR;

	list_for_each(i, &nf_sockopts) {
		struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
		if (ops->pf == reg->pf
		    && (overlap(ops->set_optmin, ops->set_optmax,
				reg->set_optmin, reg->set_optmax)
			|| overlap(ops->get_optmin, ops->get_optmax,
				   reg->get_optmin, reg->get_optmax))) {
			NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
				ops->set_optmin, ops->set_optmax,
				ops->get_optmin, ops->get_optmax,
				reg->set_optmin, reg->set_optmax,
				reg->get_optmin, reg->get_optmax);
			ret = -EBUSY;
			goto out;
		}
	}

	list_add(&reg->list, &nf_sockopts);
out:
	up(&nf_sockopt_mutex);
	return ret;
}
EXPORT_SYMBOL(nf_register_sockopt);
55 | |||
/* Remove @reg from the sockopt list, sleeping until no call through it
 * is in flight (reg->use reaches zero).  nf_sockopt() wakes us via
 * reg->cleanup_task when the last user finishes. */
void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
{
	/* No point being interruptible: we're probably in cleanup_module() */
 restart:
	down(&nf_sockopt_mutex);
	if (reg->use != 0) {
		/* To be woken by nf_sockopt call... */
		/* FIXME: Stuart Young's name appears gratuitously. */
		set_current_state(TASK_UNINTERRUPTIBLE);
		reg->cleanup_task = current;
		up(&nf_sockopt_mutex);
		schedule();
		goto restart;
	}
	list_del(&reg->list);
	up(&nf_sockopt_mutex);
}
EXPORT_SYMBOL(nf_unregister_sockopt);
74 | |||
/* Call get/setsockopt() */
/* Dispatch a [gs]etsockopt call for family @pf to the registered
 * handler whose range covers option @val.  The use-count keeps the
 * handler alive across the (possibly sleeping) callback without
 * holding the mutex; a waiting unregister is woken when we finish.
 * Returns the handler's result, -ENOPROTOOPT when no handler matches,
 * or -EINTR if interrupted while acquiring the mutex. */
static int nf_sockopt(struct sock *sk, int pf, int val,
		      char __user *opt, int *len, int get)
{
	struct list_head *i;
	struct nf_sockopt_ops *ops;
	int ret;

	if (down_interruptible(&nf_sockopt_mutex) != 0)
		return -EINTR;

	list_for_each(i, &nf_sockopts) {
		ops = (struct nf_sockopt_ops *)i;
		if (ops->pf == pf) {
			if (get) {
				if (val >= ops->get_optmin
				    && val < ops->get_optmax) {
					ops->use++;
					up(&nf_sockopt_mutex);
					ret = ops->get(sk, val, opt, len);
					goto out;
				}
			} else {
				if (val >= ops->set_optmin
				    && val < ops->set_optmax) {
					ops->use++;
					up(&nf_sockopt_mutex);
					ret = ops->set(sk, val, opt, *len);
					goto out;
				}
			}
		}
	}
	up(&nf_sockopt_mutex);
	return -ENOPROTOOPT;

 out:
	down(&nf_sockopt_mutex);
	ops->use--;
	if (ops->cleanup_task)
		wake_up_process(ops->cleanup_task);
	up(&nf_sockopt_mutex);
	return ret;
}
119 | |||
/* Public entry point: dispatch a setsockopt for (pf, val) to the
 * registered netfilter handler. */
int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
		  int len)
{
	return nf_sockopt(sk, pf, val, opt, &len, 0);
}
EXPORT_SYMBOL(nf_setsockopt);
126 | |||
/* Public entry point: dispatch a getsockopt for (pf, val); *len may be
 * updated by the handler. */
int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
{
	return nf_sockopt(sk, pf, val, opt, len, 1);
}
EXPORT_SYMBOL(nf_getsockopt);
132 | |||
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c new file mode 100644 index 000000000000..e089f17bb803 --- /dev/null +++ b/net/netfilter/nfnetlink.c | |||
@@ -0,0 +1,376 @@ | |||
1 | /* Netfilter messages via netlink socket. Allows for user space | ||
2 | * protocol helpers and general trouble making from userspace. | ||
3 | * | ||
4 | * (C) 2001 by Jay Schulist <jschlst@samba.org>, | ||
5 | * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> | ||
6 | * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net> | ||
7 | * | ||
8 | * Initial netfilter messages via netlink development funded and | ||
9 | * generally made possible by Network Robots, Inc. (www.networkrobots.com) | ||
10 | * | ||
11 | * Further development of this code funded by Astaro AG (http://www.astaro.com) | ||
12 | * | ||
13 | * This software may be used and distributed according to the terms | ||
14 | * of the GNU General Public License, incorporated herein by reference. | ||
15 | */ | ||
16 | |||
17 | #include <linux/config.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/types.h> | ||
20 | #include <linux/socket.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/major.h> | ||
23 | #include <linux/sched.h> | ||
24 | #include <linux/timer.h> | ||
25 | #include <linux/string.h> | ||
26 | #include <linux/sockios.h> | ||
27 | #include <linux/net.h> | ||
28 | #include <linux/fcntl.h> | ||
29 | #include <linux/skbuff.h> | ||
30 | #include <asm/uaccess.h> | ||
31 | #include <asm/system.h> | ||
32 | #include <net/sock.h> | ||
33 | #include <linux/init.h> | ||
34 | #include <linux/spinlock.h> | ||
35 | |||
36 | #include <linux/netfilter.h> | ||
37 | #include <linux/netlink.h> | ||
38 | #include <linux/netfilter/nfnetlink.h> | ||
39 | |||
40 | MODULE_LICENSE("GPL"); | ||
41 | MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | ||
42 | MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); | ||
43 | |||
44 | static char __initdata nfversion[] = "0.30"; | ||
45 | |||
46 | #if 0 | ||
47 | #define DEBUGP(format, args...) \ | ||
48 | printk(KERN_DEBUG "%s(%d):%s(): " format, __FILE__, \ | ||
49 | __LINE__, __FUNCTION__, ## args) | ||
50 | #else | ||
51 | #define DEBUGP(format, args...) | ||
52 | #endif | ||
53 | |||
54 | static struct sock *nfnl = NULL; | ||
55 | static struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT]; | ||
56 | DECLARE_MUTEX(nfnl_sem); | ||
57 | |||
/* Take the nfnetlink mutex; serializes subsystem (un)registration
 * against message processing. */
void nfnl_lock(void)
{
	nfnl_shlock();
}
62 | |||
/* Release the nfnetlink mutex taken by nfnl_lock(). */
void nfnl_unlock(void)
{
	nfnl_shunlock();
}
67 | |||
68 | int nfnetlink_subsys_register(struct nfnetlink_subsystem *n) | ||
69 | { | ||
70 | DEBUGP("registering subsystem ID %u\n", n->subsys_id); | ||
71 | |||
72 | nfnl_lock(); | ||
73 | if (subsys_table[n->subsys_id]) { | ||
74 | nfnl_unlock(); | ||
75 | return -EBUSY; | ||
76 | } | ||
77 | subsys_table[n->subsys_id] = n; | ||
78 | nfnl_unlock(); | ||
79 | |||
80 | return 0; | ||
81 | } | ||
82 | |||
/* Remove subsystem @n from the dispatch table.  New messages for its
 * id will fail lookup afterwards; in-flight processing is excluded by
 * the nfnl mutex.  Always returns 0. */
int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n)
{
	DEBUGP("unregistering subsystem ID %u\n", n->subsys_id);

	nfnl_lock();
	subsys_table[n->subsys_id] = NULL;
	nfnl_unlock();

	return 0;
}
93 | |||
94 | static inline struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) | ||
95 | { | ||
96 | u_int8_t subsys_id = NFNL_SUBSYS_ID(type); | ||
97 | |||
98 | if (subsys_id >= NFNL_SUBSYS_COUNT | ||
99 | || subsys_table[subsys_id] == NULL) | ||
100 | return NULL; | ||
101 | |||
102 | return subsys_table[subsys_id]; | ||
103 | } | ||
104 | |||
105 | static inline struct nfnl_callback * | ||
106 | nfnetlink_find_client(u_int16_t type, struct nfnetlink_subsystem *ss) | ||
107 | { | ||
108 | u_int8_t cb_id = NFNL_MSG_TYPE(type); | ||
109 | |||
110 | if (cb_id >= ss->cb_count) { | ||
111 | DEBUGP("msgtype %u >= %u, returning\n", type, ss->cb_count); | ||
112 | return NULL; | ||
113 | } | ||
114 | |||
115 | return &ss->cb[cb_id]; | ||
116 | } | ||
117 | |||
118 | void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen, | ||
119 | const void *data) | ||
120 | { | ||
121 | struct nfattr *nfa; | ||
122 | int size = NFA_LENGTH(attrlen); | ||
123 | |||
124 | nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size)); | ||
125 | nfa->nfa_type = attrtype; | ||
126 | nfa->nfa_len = size; | ||
127 | memcpy(NFA_DATA(nfa), data, attrlen); | ||
128 | memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size); | ||
129 | } | ||
130 | |||
131 | int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) | ||
132 | { | ||
133 | memset(tb, 0, sizeof(struct nfattr *) * maxattr); | ||
134 | |||
135 | while (NFA_OK(nfa, len)) { | ||
136 | unsigned flavor = nfa->nfa_type; | ||
137 | if (flavor && flavor <= maxattr) | ||
138 | tb[flavor-1] = nfa; | ||
139 | nfa = NFA_NEXT(nfa, len); | ||
140 | } | ||
141 | |||
142 | return 0; | ||
143 | } | ||
144 | |||
/**
 * nfnetlink_check_attributes - check and parse nfnetlink attributes
 *
 * subsys: nfnl subsystem for which this message is to be parsed
 * nlmsghdr: netlink message to be checked/parsed
 * cda: array of pointers, needs to be at least subsys->attr_count big
 *
 * Validates the callback id and minimum message length, then fills
 * cda[] with pointers into the message payload, indexed by attribute
 * type - 1.  Returns 0 on success (possibly with an all-NULL cda[] if
 * the message carried no attributes) or -EINVAL on malformed input.
 */
static int
nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys,
			   struct nlmsghdr *nlh, struct nfattr *cda[])
{
	int min_len;
	u_int16_t attr_count;
	u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);

	if (unlikely(cb_id >= subsys->cb_count)) {
		DEBUGP("msgtype %u >= %u, returning\n",
			cb_id, subsys->cb_count);
		return -EINVAL;
	}

	/* every nfnetlink message starts with a struct nfgenmsg */
	min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg));
	if (unlikely(nlh->nlmsg_len < min_len))
		return -EINVAL;

	attr_count = subsys->cb[cb_id].attr_count;
	memset(cda, 0, sizeof(struct nfattr *) * attr_count);

	/* check attribute lengths. */
	if (likely(nlh->nlmsg_len > min_len)) {
		struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh));
		int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);

		while (NFA_OK(attr, attrlen)) {
			unsigned flavor = attr->nfa_type;
			if (flavor) {
				/* unknown high attribute types are a hard
				 * error rather than being skipped */
				if (flavor > attr_count)
					return -EINVAL;
				cda[flavor - 1] = attr;
			}
			attr = NFA_NEXT(attr, attrlen);
		}
	}

	/* implicit: if nlmsg_len == min_len, we return 0, and an empty
	 * (zeroed) cda[] array. The message is valid, but empty. */

	return 0;
}
195 | |||
196 | int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) | ||
197 | { | ||
198 | int allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL; | ||
199 | int err = 0; | ||
200 | |||
201 | NETLINK_CB(skb).dst_group = group; | ||
202 | if (echo) | ||
203 | atomic_inc(&skb->users); | ||
204 | netlink_broadcast(nfnl, skb, pid, group, allocation); | ||
205 | if (echo) | ||
206 | err = netlink_unicast(nfnl, skb, pid, MSG_DONTWAIT); | ||
207 | |||
208 | return err; | ||
209 | } | ||
210 | |||
/* Send @skb to the single netlink peer @pid over the nfnetlink socket;
 * thin wrapper around netlink_unicast(). */
int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags)
{
	return netlink_unicast(nfnl, skb, pid, flags);
}
215 | |||
/* Process one complete nfnetlink message.  Called with the nfnl mutex
 * held (shared).  Returns 0 when the message was handled or silently
 * ignored, negative when the caller should netlink_ack() with *errp. */
static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
				    struct nlmsghdr *nlh, int *errp)
{
	struct nfnl_callback *nc;
	struct nfnetlink_subsystem *ss;
	int type, err = 0;

	DEBUGP("entered; subsys=%u, msgtype=%u\n",
		 NFNL_SUBSYS_ID(nlh->nlmsg_type),
		 NFNL_MSG_TYPE(nlh->nlmsg_type));

	/* Only requests are handled by kernel now. */
	if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
		DEBUGP("received non-request message\n");
		return 0;
	}

	/* All the messages must at least contain nfgenmsg */
	if (nlh->nlmsg_len <
			NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) {
		DEBUGP("received message was too short\n");
		return 0;
	}

	type = nlh->nlmsg_type;
	ss = nfnetlink_get_subsys(type);
	if (!ss) {
#ifdef CONFIG_KMOD
		/* don't call nfnl_shunlock, since it would reenter
		 * with further packet processing */
		up(&nfnl_sem);
		request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
		nfnl_shlock();
		/* retry the lookup: the module load may have registered
		 * the subsystem in the meantime */
		ss = nfnetlink_get_subsys(type);
		if (!ss)
#endif
			goto err_inval;
	}

	nc = nfnetlink_find_client(type, ss);
	if (!nc) {
		DEBUGP("unable to find client for type %d\n", type);
		goto err_inval;
	}

	/* enforce the callback's per-message capability requirement */
	if (nc->cap_required &&
	    !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) {
		DEBUGP("permission denied for type %d\n", type);
		*errp = -EPERM;
		return -1;
	}

	{
		u_int16_t attr_count =
			ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count;
		/* variable-length array of attribute pointers; sized by
		 * the callback's declared attribute count */
		struct nfattr *cda[attr_count];

		memset(cda, 0, sizeof(struct nfattr *) * attr_count);

		err = nfnetlink_check_attributes(ss, nlh, cda);
		if (err < 0)
			goto err_inval;

		DEBUGP("calling handler\n");
		err = nc->call(nfnl, skb, nlh, cda, errp);
		*errp = err;
		return err;
	}

err_inval:
	DEBUGP("returning -EINVAL\n");
	*errp = -EINVAL;
	return -1;
}
291 | |||
/* Process one packet of messages.  Walks all netlink messages in @skb,
 * dispatching each to nfnetlink_rcv_msg() and acking where required.
 * Returns -1 when processing must be retried later (handler signalled
 * a restart via a zero err), 0 otherwise. */
static inline int nfnetlink_rcv_skb(struct sk_buff *skb)
{
	int err;
	struct nlmsghdr *nlh;

	while (skb->len >= NLMSG_SPACE(0)) {
		u32 rlen;

		nlh = (struct nlmsghdr *)skb->data;
		/* stop quietly on a truncated or malformed header */
		if (nlh->nlmsg_len < sizeof(struct nlmsghdr)
		    || skb->len < nlh->nlmsg_len)
			return 0;
		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
		if (rlen > skb->len)
			rlen = skb->len;
		if (nfnetlink_rcv_msg(skb, nlh, &err)) {
			/* failure with err == 0 means "try again later" */
			if (!err)
				return -1;
			netlink_ack(skb, nlh, err);
		} else
			if (nlh->nlmsg_flags & NLM_F_ACK)
				netlink_ack(skb, nlh, 0);
		skb_pull(skb, rlen);
	}

	return 0;
}
320 | |||
/* Socket data-ready callback: drain the receive queue, processing each
 * skb under the shared nfnl lock.  Re-checks the queue after dropping
 * the lock in case more data arrived meanwhile. */
static void nfnetlink_rcv(struct sock *sk, int len)
{
	do {
		struct sk_buff *skb;

		/* give up if the mutex is busy (e.g. registration in
		 * progress); we are called again on the next packet */
		if (nfnl_shlock_nowait())
			return;

		while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
			if (nfnetlink_rcv_skb(skb)) {
				/* partially processed: requeue what is
				 * left and retry on the next pass */
				if (skb->len)
					skb_queue_head(&sk->sk_receive_queue,
						       skb);
				else
					kfree_skb(skb);
				break;
			}
			kfree_skb(skb);
		}

		/* don't call nfnl_shunlock, since it would reenter
		 * with further packet processing */
		up(&nfnl_sem);
	} while (nfnl && nfnl->sk_receive_queue.qlen);
}
346 | |||
347 | void __exit nfnetlink_exit(void) | ||
348 | { | ||
349 | printk("Removing netfilter NETLINK layer.\n"); | ||
350 | sock_release(nfnl->sk_socket); | ||
351 | return; | ||
352 | } | ||
353 | |||
354 | int __init nfnetlink_init(void) | ||
355 | { | ||
356 | printk("Netfilter messages via NETLINK v%s.\n", nfversion); | ||
357 | |||
358 | nfnl = netlink_kernel_create(NETLINK_NETFILTER, NFNLGRP_MAX, | ||
359 | nfnetlink_rcv, THIS_MODULE); | ||
360 | if (!nfnl) { | ||
361 | printk(KERN_ERR "cannot initialize nfnetlink!\n"); | ||
362 | return -1; | ||
363 | } | ||
364 | |||
365 | return 0; | ||
366 | } | ||
367 | |||
module_init(nfnetlink_init);
module_exit(nfnetlink_exit);

/* exported for use by the nfnetlink subsystems (log, queue, ...) */
EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);
EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
EXPORT_SYMBOL_GPL(nfnetlink_send);
EXPORT_SYMBOL_GPL(nfnetlink_unicast);
EXPORT_SYMBOL_GPL(nfattr_parse);
EXPORT_SYMBOL_GPL(__nfa_fill);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c new file mode 100644 index 000000000000..ff5601ceedcb --- /dev/null +++ b/net/netfilter/nfnetlink_log.c | |||
@@ -0,0 +1,1055 @@ | |||
1 | /* | ||
2 | * This is a module which is used for logging packets to userspace via | ||
3 | * nfetlink. | ||
4 | * | ||
5 | * (C) 2005 by Harald Welte <laforge@netfilter.org> | ||
6 | * | ||
7 | * Based on the old ipv4-only ipt_ULOG.c: | ||
8 | * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License version 2 as | ||
12 | * published by the Free Software Foundation. | ||
13 | * | ||
14 | */ | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/skbuff.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <linux/ip.h> | ||
19 | #include <linux/ipv6.h> | ||
20 | #include <linux/netdevice.h> | ||
21 | #include <linux/netfilter.h> | ||
22 | #include <linux/netlink.h> | ||
23 | #include <linux/netfilter/nfnetlink.h> | ||
24 | #include <linux/netfilter/nfnetlink_log.h> | ||
25 | #include <linux/spinlock.h> | ||
26 | #include <linux/sysctl.h> | ||
27 | #include <linux/proc_fs.h> | ||
28 | #include <linux/security.h> | ||
29 | #include <linux/list.h> | ||
30 | #include <linux/jhash.h> | ||
31 | #include <linux/random.h> | ||
32 | #include <net/sock.h> | ||
33 | |||
34 | #include <asm/atomic.h> | ||
35 | |||
36 | #ifdef CONFIG_BRIDGE_NETFILTER | ||
37 | #include "../bridge/br_private.h" | ||
38 | #endif | ||
39 | |||
40 | #define NFULNL_NLBUFSIZ_DEFAULT 4096 | ||
41 | #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ | ||
42 | #define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ | ||
43 | |||
44 | #define PRINTR(x, args...) do { if (net_ratelimit()) \ | ||
45 | printk(x, ## args); } while (0); | ||
46 | |||
47 | #if 0 | ||
48 | #define UDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \ | ||
49 | __FILE__, __LINE__, __FUNCTION__, \ | ||
50 | ## args) | ||
51 | #else | ||
52 | #define UDEBUG(x, ...) | ||
53 | #endif | ||
54 | |||
55 | struct nfulnl_instance { | ||
56 | struct hlist_node hlist; /* global list of instances */ | ||
57 | spinlock_t lock; | ||
58 | atomic_t use; /* use count */ | ||
59 | |||
60 | unsigned int qlen; /* number of nlmsgs in skb */ | ||
61 | struct sk_buff *skb; /* pre-allocatd skb */ | ||
62 | struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ | ||
63 | struct timer_list timer; | ||
64 | int peer_pid; /* PID of the peer process */ | ||
65 | |||
66 | /* configurable parameters */ | ||
67 | unsigned int flushtimeout; /* timeout until queue flush */ | ||
68 | unsigned int nlbufsiz; /* netlink buffer allocation size */ | ||
69 | unsigned int qthreshold; /* threshold of the queue */ | ||
70 | u_int32_t copy_range; | ||
71 | u_int16_t group_num; /* number of this queue */ | ||
72 | u_int8_t copy_mode; | ||
73 | }; | ||
74 | |||
75 | static DEFINE_RWLOCK(instances_lock); | ||
76 | |||
77 | #define INSTANCE_BUCKETS 16 | ||
78 | static struct hlist_head instance_table[INSTANCE_BUCKETS]; | ||
79 | static unsigned int hash_init; | ||
80 | |||
81 | static inline u_int8_t instance_hashfn(u_int16_t group_num) | ||
82 | { | ||
83 | return ((group_num & 0xff) % INSTANCE_BUCKETS); | ||
84 | } | ||
85 | |||
/* Find the logging instance for @group_num in the hash table.
 * Caller must hold instances_lock; no reference is taken. */
static struct nfulnl_instance *
__instance_lookup(u_int16_t group_num)
{
	struct hlist_head *head;
	struct hlist_node *pos;
	struct nfulnl_instance *inst;

	UDEBUG("entering (group_num=%u)\n", group_num);

	head = &instance_table[instance_hashfn(group_num)];
	hlist_for_each_entry(inst, pos, head, hlist) {
		if (inst->group_num == group_num)
			return inst;
	}
	return NULL;
}
102 | |||
/* Take an additional reference on @inst. */
static inline void
instance_get(struct nfulnl_instance *inst)
{
	atomic_inc(&inst->use);
}
108 | |||
/* Look up the instance for @group_num and return it with its refcount
 * bumped, or NULL when no such instance exists.  The caller must drop
 * the reference with instance_put(). */
static struct nfulnl_instance *
instance_lookup_get(u_int16_t group_num)
{
	struct nfulnl_instance *inst;

	read_lock_bh(&instances_lock);
	inst = __instance_lookup(group_num);
	if (inst)
		instance_get(inst);
	read_unlock_bh(&instances_lock);

	return inst;
}
122 | |||
123 | static void | ||
124 | instance_put(struct nfulnl_instance *inst) | ||
125 | { | ||
126 | if (inst && atomic_dec_and_test(&inst->use)) { | ||
127 | UDEBUG("kfree(inst=%p)\n", inst); | ||
128 | kfree(inst); | ||
129 | } | ||
130 | } | ||
131 | |||
132 | static void nfulnl_timer(unsigned long data); | ||
133 | |||
134 | static struct nfulnl_instance * | ||
135 | instance_create(u_int16_t group_num, int pid) | ||
136 | { | ||
137 | struct nfulnl_instance *inst; | ||
138 | |||
139 | UDEBUG("entering (group_num=%u, pid=%d)\n", group_num, | ||
140 | pid); | ||
141 | |||
142 | write_lock_bh(&instances_lock); | ||
143 | if (__instance_lookup(group_num)) { | ||
144 | inst = NULL; | ||
145 | UDEBUG("aborting, instance already exists\n"); | ||
146 | goto out_unlock; | ||
147 | } | ||
148 | |||
149 | inst = kmalloc(sizeof(*inst), GFP_ATOMIC); | ||
150 | if (!inst) | ||
151 | goto out_unlock; | ||
152 | |||
153 | memset(inst, 0, sizeof(*inst)); | ||
154 | INIT_HLIST_NODE(&inst->hlist); | ||
155 | inst->lock = SPIN_LOCK_UNLOCKED; | ||
156 | /* needs to be two, since we _put() after creation */ | ||
157 | atomic_set(&inst->use, 2); | ||
158 | |||
159 | init_timer(&inst->timer); | ||
160 | inst->timer.function = nfulnl_timer; | ||
161 | inst->timer.data = (unsigned long)inst; | ||
162 | /* don't start timer yet. (re)start it with every packet */ | ||
163 | |||
164 | inst->peer_pid = pid; | ||
165 | inst->group_num = group_num; | ||
166 | |||
167 | inst->qthreshold = NFULNL_QTHRESH_DEFAULT; | ||
168 | inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT; | ||
169 | inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT; | ||
170 | inst->copy_mode = NFULNL_COPY_PACKET; | ||
171 | inst->copy_range = 0xffff; | ||
172 | |||
173 | if (!try_module_get(THIS_MODULE)) | ||
174 | goto out_free; | ||
175 | |||
176 | hlist_add_head(&inst->hlist, | ||
177 | &instance_table[instance_hashfn(group_num)]); | ||
178 | |||
179 | UDEBUG("newly added node: %p, next=%p\n", &inst->hlist, | ||
180 | inst->hlist.next); | ||
181 | |||
182 | write_unlock_bh(&instances_lock); | ||
183 | |||
184 | return inst; | ||
185 | |||
186 | out_free: | ||
187 | instance_put(inst); | ||
188 | out_unlock: | ||
189 | write_unlock_bh(&instances_lock); | ||
190 | return NULL; | ||
191 | } | ||
192 | |||
193 | static int __nfulnl_send(struct nfulnl_instance *inst); | ||
194 | |||
/* Unlink @inst from the hash, flush any queued messages and drop the
 * hash table's reference plus the module reference.  @lock says
 * whether instances_lock must be taken here (0 when the caller
 * already holds it). */
static void
_instance_destroy2(struct nfulnl_instance *inst, int lock)
{
	/* first pull it out of the global list */
	if (lock)
		write_lock_bh(&instances_lock);

	UDEBUG("removing instance %p (queuenum=%u) from hash\n",
		inst, inst->group_num);

	hlist_del(&inst->hlist);

	if (lock)
		write_unlock_bh(&instances_lock);

	/* then flush all pending packets from skb */

	spin_lock_bh(&inst->lock);
	if (inst->skb) {
		/* deliver whatever is still queued before dropping it */
		if (inst->qlen)
			__nfulnl_send(inst);
		/* __nfulnl_send() clears inst->skb on success; free any
		 * buffer that is still left */
		if (inst->skb) {
			kfree_skb(inst->skb);
			inst->skb = NULL;
		}
	}
	spin_unlock_bh(&inst->lock);

	/* and finally put the refcount */
	instance_put(inst);

	module_put(THIS_MODULE);
}
228 | |||
/* Destroy variant for callers already holding instances_lock. */
static inline void
__instance_destroy(struct nfulnl_instance *inst)
{
	_instance_destroy2(inst, 0);
}
234 | |||
/* Destroy @inst, taking instances_lock internally. */
static inline void
instance_destroy(struct nfulnl_instance *inst)
{
	_instance_destroy2(inst, 1);
}
240 | |||
241 | static int | ||
242 | nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode, | ||
243 | unsigned int range) | ||
244 | { | ||
245 | int status = 0; | ||
246 | |||
247 | spin_lock_bh(&inst->lock); | ||
248 | |||
249 | switch (mode) { | ||
250 | case NFULNL_COPY_NONE: | ||
251 | case NFULNL_COPY_META: | ||
252 | inst->copy_mode = mode; | ||
253 | inst->copy_range = 0; | ||
254 | break; | ||
255 | |||
256 | case NFULNL_COPY_PACKET: | ||
257 | inst->copy_mode = mode; | ||
258 | /* we're using struct nfattr which has 16bit nfa_len */ | ||
259 | if (range > 0xffff) | ||
260 | inst->copy_range = 0xffff; | ||
261 | else | ||
262 | inst->copy_range = range; | ||
263 | break; | ||
264 | |||
265 | default: | ||
266 | status = -EINVAL; | ||
267 | break; | ||
268 | } | ||
269 | |||
270 | spin_unlock_bh(&inst->lock); | ||
271 | |||
272 | return status; | ||
273 | } | ||
274 | |||
275 | static int | ||
276 | nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz) | ||
277 | { | ||
278 | int status; | ||
279 | |||
280 | spin_lock_bh(&inst->lock); | ||
281 | if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT) | ||
282 | status = -ERANGE; | ||
283 | else if (nlbufsiz > 131072) | ||
284 | status = -ERANGE; | ||
285 | else { | ||
286 | inst->nlbufsiz = nlbufsiz; | ||
287 | status = 0; | ||
288 | } | ||
289 | spin_unlock_bh(&inst->lock); | ||
290 | |||
291 | return status; | ||
292 | } | ||
293 | |||
/* Set the buffer flush timeout for @inst (value is taken as-is from
 * the peer; unit not visible here -- see NFULNL_TIMEOUT_DEFAULT).
 * Always returns 0. */
static int
nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout)
{
	spin_lock_bh(&inst->lock);
	inst->flushtimeout = timeout;
	spin_unlock_bh(&inst->lock);

	return 0;
}
303 | |||
/* Set the queue-length threshold that triggers an immediate flush for
 * @inst.  Always returns 0. */
static int
nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh)
{
	spin_lock_bh(&inst->lock);
	inst->qthreshold = qthresh;
	spin_unlock_bh(&inst->lock);

	return 0;
}
313 | |||
314 | static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size, | ||
315 | unsigned int pkt_size) | ||
316 | { | ||
317 | struct sk_buff *skb; | ||
318 | |||
319 | UDEBUG("entered (%u, %u)\n", inst_size, pkt_size); | ||
320 | |||
321 | /* alloc skb which should be big enough for a whole multipart | ||
322 | * message. WARNING: has to be <= 128k due to slab restrictions */ | ||
323 | |||
324 | skb = alloc_skb(inst_size, GFP_ATOMIC); | ||
325 | if (!skb) { | ||
326 | PRINTR("nfnetlink_log: can't alloc whole buffer (%u bytes)\n", | ||
327 | inst_size); | ||
328 | |||
329 | /* try to allocate only as much as we need for current | ||
330 | * packet */ | ||
331 | |||
332 | skb = alloc_skb(pkt_size, GFP_ATOMIC); | ||
333 | if (!skb) | ||
334 | PRINTR("nfnetlink_log: can't even alloc %u bytes\n", | ||
335 | pkt_size); | ||
336 | } | ||
337 | |||
338 | return skb; | ||
339 | } | ||
340 | |||
341 | static int | ||
342 | __nfulnl_send(struct nfulnl_instance *inst) | ||
343 | { | ||
344 | int status; | ||
345 | |||
346 | if (timer_pending(&inst->timer)) | ||
347 | del_timer(&inst->timer); | ||
348 | |||
349 | if (inst->qlen > 1) | ||
350 | inst->lastnlh->nlmsg_type = NLMSG_DONE; | ||
351 | |||
352 | status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT); | ||
353 | if (status < 0) { | ||
354 | UDEBUG("netlink_unicast() failed\n"); | ||
355 | /* FIXME: statistics */ | ||
356 | } | ||
357 | |||
358 | inst->qlen = 0; | ||
359 | inst->skb = NULL; | ||
360 | inst->lastnlh = NULL; | ||
361 | |||
362 | return status; | ||
363 | } | ||
364 | |||
365 | static void nfulnl_timer(unsigned long data) | ||
366 | { | ||
367 | struct nfulnl_instance *inst = (struct nfulnl_instance *)data; | ||
368 | |||
369 | UDEBUG("timer function called, flushing buffer\n"); | ||
370 | |||
371 | spin_lock_bh(&inst->lock); | ||
372 | __nfulnl_send(inst); | ||
373 | instance_put(inst); | ||
374 | spin_unlock_bh(&inst->lock); | ||
375 | } | ||
376 | |||
/* Append one NFULNL_MSG_PACKET netlink message describing @skb to the
 * instance's aggregation buffer.  Caller must hold inst->lock and must
 * have sized inst->skb (via nfulnl_alloc_skb) for the metadata
 * attributes; only the payload tailroom is re-checked here.  Returns 0
 * on success, -1 when the message could not be built (the NFA_PUT /
 * NLMSG_PUT macros jump to the failure labels on insufficient room). */
static inline int
__build_packet_message(struct nfulnl_instance *inst,
			const struct sk_buff *skb,
			unsigned int data_len,
			unsigned int pf,
			unsigned int hooknum,
			const struct net_device *indev,
			const struct net_device *outdev,
			const struct nf_loginfo *li,
			const char *prefix)
{
	unsigned char *old_tail;
	struct nfulnl_msg_packet_hdr pmsg;
	struct nlmsghdr *nlh;
	struct nfgenmsg *nfmsg;
	u_int32_t tmp_uint;

	UDEBUG("entered\n");

	old_tail = inst->skb->tail;
	nlh = NLMSG_PUT(inst->skb, 0, 0,
			NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET,
			sizeof(struct nfgenmsg));
	nfmsg = NLMSG_DATA(nlh);
	nfmsg->nfgen_family = pf;
	nfmsg->version = NFNETLINK_V0;
	nfmsg->res_id = htons(inst->group_num);

	/* NOTE(review): skb->protocol is normally already in network
	 * byte order; htons() here looks like a double conversion on
	 * little-endian -- confirm against the nfulnl ABI */
	pmsg.hw_protocol = htons(skb->protocol);
	pmsg.hook = hooknum;

	NFA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg);

	if (prefix) {
		int slen = strlen(prefix);
		/* truncate overlong prefixes to the ABI limit */
		if (slen > NFULNL_PREFIXLEN)
			slen = NFULNL_PREFIXLEN;
		NFA_PUT(inst->skb, NFULA_PREFIX, slen, prefix);
	}

	if (indev) {
		tmp_uint = htonl(indev->ifindex);
#ifndef CONFIG_BRIDGE_NETFILTER
		NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, sizeof(tmp_uint),
			&tmp_uint);
#else
		if (pf == PF_BRIDGE) {
			/* Case 1: outdev is physical input device, we need to
			 * look for bridge group (when called from
			 * netfilter_bridge) */
			NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV,
				sizeof(tmp_uint), &tmp_uint);
			/* this is the bridge group "brX" */
			tmp_uint = htonl(indev->br_port->br->dev->ifindex);
			NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV,
				sizeof(tmp_uint), &tmp_uint);
		} else {
			/* Case 2: indev is bridge group, we need to look for
			 * physical device (when called from ipv4) */
			NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV,
				sizeof(tmp_uint), &tmp_uint);
			if (skb->nf_bridge && skb->nf_bridge->physindev) {
				tmp_uint =
				    htonl(skb->nf_bridge->physindev->ifindex);
				NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV,
					sizeof(tmp_uint), &tmp_uint);
			}
		}
#endif
	}

	if (outdev) {
		tmp_uint = htonl(outdev->ifindex);
#ifndef CONFIG_BRIDGE_NETFILTER
		NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, sizeof(tmp_uint),
			&tmp_uint);
#else
		if (pf == PF_BRIDGE) {
			/* Case 1: outdev is physical output device, we need to
			 * look for bridge group (when called from
			 * netfilter_bridge) */
			NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
				sizeof(tmp_uint), &tmp_uint);
			/* this is the bridge group "brX" */
			tmp_uint = htonl(outdev->br_port->br->dev->ifindex);
			NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV,
				sizeof(tmp_uint), &tmp_uint);
		} else {
			/* Case 2: indev is a bridge group, we need to look
			 * for physical device (when called from ipv4) */
			NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV,
				sizeof(tmp_uint), &tmp_uint);
			/* NOTE(review): unlike the indev branch above,
			 * nf_bridge->physoutdev is dereferenced without a
			 * NULL check -- confirm it cannot be NULL here */
			if (skb->nf_bridge) {
				tmp_uint =
				    htonl(skb->nf_bridge->physoutdev->ifindex);
				NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
					sizeof(tmp_uint), &tmp_uint);
			}
		}
#endif
	}

	if (skb->nfmark) {
		tmp_uint = htonl(skb->nfmark);
		NFA_PUT(inst->skb, NFULA_MARK, sizeof(tmp_uint), &tmp_uint);
	}

	if (indev && skb->dev && skb->dev->hard_header_parse) {
		struct nfulnl_msg_packet_hw phw;

		phw.hw_addrlen =
			skb->dev->hard_header_parse((struct sk_buff *)skb,
						    phw.hw_addr);
		phw.hw_addrlen = htons(phw.hw_addrlen);
		NFA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw);
	}

	if (skb->tstamp.off_sec) {
		struct nfulnl_msg_packet_timestamp ts;

		ts.sec = cpu_to_be64(skb_tv_base.tv_sec + skb->tstamp.off_sec);
		ts.usec = cpu_to_be64(skb_tv_base.tv_usec + skb->tstamp.off_usec);

		NFA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts);
	}

	/* UID */
	if (skb->sk) {
		read_lock_bh(&skb->sk->sk_callback_lock);
		if (skb->sk->sk_socket && skb->sk->sk_socket->file) {
			u_int32_t uid = htonl(skb->sk->sk_socket->file->f_uid);
			/* need to unlock here since NFA_PUT may goto */
			read_unlock_bh(&skb->sk->sk_callback_lock);
			NFA_PUT(inst->skb, NFULA_UID, sizeof(uid), &uid);
		} else
			read_unlock_bh(&skb->sk->sk_callback_lock);
	}

	if (data_len) {
		struct nfattr *nfa;
		int size = NFA_LENGTH(data_len);

		if (skb_tailroom(inst->skb) < (int)NFA_SPACE(data_len)) {
			printk(KERN_WARNING "nfnetlink_log: no tailroom!\n");
			goto nlmsg_failure;
		}

		/* open-coded NFA_PUT so the payload can be copied with
		 * skb_copy_bits() straight from the (possibly paged) skb */
		nfa = (struct nfattr *)skb_put(inst->skb, NFA_ALIGN(size));
		nfa->nfa_type = NFULA_PAYLOAD;
		nfa->nfa_len = size;

		if (skb_copy_bits(skb, 0, NFA_DATA(nfa), data_len))
			BUG();
	}

	nlh->nlmsg_len = inst->skb->tail - old_tail;
	return 0;

nlmsg_failure:
	UDEBUG("nlmsg_failure\n");
nfattr_failure:
	PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");
	return -1;
}
541 | |||
/* Ack the current message with @err and abort the receive handler. */
#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)

/* Fallback logging parameters used when a rule supplies no nf_loginfo
 * (or one of a different type): copy the whole packet, group 0, flush
 * after every packet. */
static struct nf_loginfo default_loginfo = {
	.type = NF_LOG_TYPE_ULOG,
	.u = {
		.ulog = {
			.copy_len = 0xffff,
			.group = 0,
			.qthreshold = 1,
		},
	},
};
554 | |||
/* log handler for internal netfilter logging api
 *
 * Batches log messages for one packet into the instance's pending skb;
 * the skb is flushed to userspace either here (queue threshold reached /
 * no tailroom left) or later by the flush timer. */
static void
nfulnl_log_packet(unsigned int pf,
		  unsigned int hooknum,
		  const struct sk_buff *skb,
		  const struct net_device *in,
		  const struct net_device *out,
		  const struct nf_loginfo *li_user,
		  const char *prefix)
{
	unsigned int size, data_len;
	struct nfulnl_instance *inst;
	const struct nf_loginfo *li;
	unsigned int qthreshold;
	unsigned int nlbufsiz;

	/* use the rule's loginfo only if it really is ULOG-typed */
	if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
		li = li_user;
	else
		li = &default_loginfo;

	/* resolve the target instance; group 0 acts as a catch-all */
	inst = instance_lookup_get(li->u.ulog.group);
	if (!inst)
		inst = instance_lookup_get(0);
	if (!inst) {
		PRINTR("nfnetlink_log: trying to log packet, "
		       "but no instance for group %u\n", li->u.ulog.group);
		return;
	}

	/* all macros expand to constant values at compile time */
	/* FIXME: do we want to make the size calculation conditional based on
	 * what is actually present? way more branches and checks, but more
	 * memory efficient... */
	size = NLMSG_SPACE(sizeof(struct nfgenmsg))
		+ NFA_SPACE(sizeof(struct nfulnl_msg_packet_hdr))
		+ NFA_SPACE(sizeof(u_int32_t))	/* ifindex */
		+ NFA_SPACE(sizeof(u_int32_t))	/* ifindex */
#ifdef CONFIG_BRIDGE_NETFILTER
		+ NFA_SPACE(sizeof(u_int32_t))	/* ifindex */
		+ NFA_SPACE(sizeof(u_int32_t))	/* ifindex */
#endif
		+ NFA_SPACE(sizeof(u_int32_t))	/* mark */
		+ NFA_SPACE(sizeof(u_int32_t))	/* uid */
		+ NFA_SPACE(NFULNL_PREFIXLEN)	/* prefix */
		+ NFA_SPACE(sizeof(struct nfulnl_msg_packet_hw))
		+ NFA_SPACE(sizeof(struct nfulnl_msg_packet_timestamp));

	UDEBUG("initial size=%u\n", size);

	spin_lock_bh(&inst->lock);

	qthreshold = inst->qthreshold;
	/* per-rule qthreshold overrides per-instance */
	if (qthreshold > li->u.ulog.qthreshold)
		qthreshold = li->u.ulog.qthreshold;

	switch (inst->copy_mode) {
	case NFULNL_COPY_META:
	case NFULNL_COPY_NONE:
		data_len = 0;
		break;

	case NFULNL_COPY_PACKET:
		/* copy_range == 0 means "whole packet" */
		if (inst->copy_range == 0
		    || inst->copy_range > skb->len)
			data_len = skb->len;
		else
			data_len = inst->copy_range;

		size += NFA_SPACE(data_len);
		UDEBUG("copy_packet, therefore size now %u\n", size);
		break;

	default:
		/* unknown copy mode: silently drop the log request */
		spin_unlock_bh(&inst->lock);
		instance_put(inst);
		return;
	}

	/* a single oversized message temporarily grows the buffer */
	if (size > inst->nlbufsiz)
		nlbufsiz = size;
	else
		nlbufsiz = inst->nlbufsiz;

	if (!inst->skb) {
		if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) {
			/* NOTE(review): this prints inst->nlbufsiz although
			 * the allocation above used nlbufsiz — debug-only,
			 * but slightly misleading */
			UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n",
				inst->nlbufsiz, size);
			goto alloc_failure;
		}
	} else if (inst->qlen >= qthreshold ||
		   size > skb_tailroom(inst->skb)) {
		/* either the queue len is too high or we don't have
		 * enough room in the skb left. flush to userspace. */
		UDEBUG("flushing old skb\n");

		__nfulnl_send(inst);

		if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) {
			UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n",
				inst->nlbufsiz, size);
			goto alloc_failure;
		}
	}

	UDEBUG("qlen %d, qthreshold %d\n", inst->qlen, qthreshold);
	inst->qlen++;

	__build_packet_message(inst, skb, data_len, pf,
				hooknum, in, out, li, prefix);

	/* timer_pending always called within inst->lock, so there
	 * is no chance of a race here */
	if (!timer_pending(&inst->timer)) {
		/* the running timer holds its own instance reference */
		instance_get(inst);
		inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100);
		add_timer(&inst->timer);
	}
	spin_unlock_bh(&inst->lock);

	return;

alloc_failure:
	spin_unlock_bh(&inst->lock);
	instance_put(inst);
	UDEBUG("error allocating skb\n");
	/* FIXME: statistics */
}
684 | |||
685 | static int | ||
686 | nfulnl_rcv_nl_event(struct notifier_block *this, | ||
687 | unsigned long event, void *ptr) | ||
688 | { | ||
689 | struct netlink_notify *n = ptr; | ||
690 | |||
691 | if (event == NETLINK_URELEASE && | ||
692 | n->protocol == NETLINK_NETFILTER && n->pid) { | ||
693 | int i; | ||
694 | |||
695 | /* destroy all instances for this pid */ | ||
696 | write_lock_bh(&instances_lock); | ||
697 | for (i = 0; i < INSTANCE_BUCKETS; i++) { | ||
698 | struct hlist_node *tmp, *t2; | ||
699 | struct nfulnl_instance *inst; | ||
700 | struct hlist_head *head = &instance_table[i]; | ||
701 | |||
702 | hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { | ||
703 | UDEBUG("node = %p\n", inst); | ||
704 | if (n->pid == inst->peer_pid) | ||
705 | __instance_destroy(inst); | ||
706 | } | ||
707 | } | ||
708 | write_unlock_bh(&instances_lock); | ||
709 | } | ||
710 | return NOTIFY_DONE; | ||
711 | } | ||
712 | |||
/* Registered with the netlink core so dying sockets tear down their
 * logging instances (see nfulnl_rcv_nl_event above). */
static struct notifier_block nfulnl_rtnl_notifier = {
	.notifier_call	= nfulnl_rcv_nl_event,
};
716 | |||
/* NFULNL_MSG_PACKET flows kernel->userspace only; reject any attempt by
 * userspace to send one to us. */
static int
nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
		  struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
{
	return -ENOTSUPP;
}
723 | |||
/* nf_log backend; registered per protocol family via
 * NFULNL_CFG_CMD_PF_BIND in nfulnl_recv_config(). */
static struct nf_logger nfulnl_logger = {
	.name	= "nfnetlink_log",
	.logfn	= &nfulnl_log_packet,
	.me	= THIS_MODULE,
};
729 | |||
/* Minimum payload size per NFULA_* attribute (0 = no minimum); enforced
 * by nfattr_bad_size() before attributes are interpreted. */
static const int nfula_min[NFULA_MAX] = {
	[NFULA_PACKET_HDR-1]	= sizeof(struct nfulnl_msg_packet_hdr),
	[NFULA_MARK-1]		= sizeof(u_int32_t),
	[NFULA_TIMESTAMP-1]	= sizeof(struct nfulnl_msg_packet_timestamp),
	[NFULA_IFINDEX_INDEV-1]	= sizeof(u_int32_t),
	[NFULA_IFINDEX_OUTDEV-1]= sizeof(u_int32_t),
	[NFULA_HWADDR-1]	= sizeof(struct nfulnl_msg_packet_hw),
	[NFULA_PAYLOAD-1]	= 0,
	[NFULA_PREFIX-1]	= 0,
	[NFULA_UID-1]		= sizeof(u_int32_t),
};
741 | |||
/* Minimum payload size per NFULA_CFG_* config attribute; note that
 * TIMEOUT/QTHRESH/NLBUFSIZ are all 32bit big-endian quantities. */
static const int nfula_cfg_min[NFULA_CFG_MAX] = {
	[NFULA_CFG_CMD-1]	= sizeof(struct nfulnl_msg_config_cmd),
	[NFULA_CFG_MODE-1]	= sizeof(struct nfulnl_msg_config_mode),
	[NFULA_CFG_TIMEOUT-1]	= sizeof(u_int32_t),
	[NFULA_CFG_QTHRESH-1]	= sizeof(u_int32_t),
	[NFULA_CFG_NLBUFSIZ-1]	= sizeof(u_int32_t),
};
749 | |||
750 | static int | ||
751 | nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, | ||
752 | struct nlmsghdr *nlh, struct nfattr *nfula[], int *errp) | ||
753 | { | ||
754 | struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); | ||
755 | u_int16_t group_num = ntohs(nfmsg->res_id); | ||
756 | struct nfulnl_instance *inst; | ||
757 | int ret = 0; | ||
758 | |||
759 | UDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); | ||
760 | |||
761 | if (nfattr_bad_size(nfula, NFULA_CFG_MAX, nfula_cfg_min)) { | ||
762 | UDEBUG("bad attribute size\n"); | ||
763 | return -EINVAL; | ||
764 | } | ||
765 | |||
766 | inst = instance_lookup_get(group_num); | ||
767 | if (nfula[NFULA_CFG_CMD-1]) { | ||
768 | u_int8_t pf = nfmsg->nfgen_family; | ||
769 | struct nfulnl_msg_config_cmd *cmd; | ||
770 | cmd = NFA_DATA(nfula[NFULA_CFG_CMD-1]); | ||
771 | UDEBUG("found CFG_CMD for\n"); | ||
772 | |||
773 | switch (cmd->command) { | ||
774 | case NFULNL_CFG_CMD_BIND: | ||
775 | if (inst) { | ||
776 | ret = -EBUSY; | ||
777 | goto out_put; | ||
778 | } | ||
779 | |||
780 | inst = instance_create(group_num, | ||
781 | NETLINK_CB(skb).pid); | ||
782 | if (!inst) { | ||
783 | ret = -EINVAL; | ||
784 | goto out_put; | ||
785 | } | ||
786 | break; | ||
787 | case NFULNL_CFG_CMD_UNBIND: | ||
788 | if (!inst) { | ||
789 | ret = -ENODEV; | ||
790 | goto out_put; | ||
791 | } | ||
792 | |||
793 | if (inst->peer_pid != NETLINK_CB(skb).pid) { | ||
794 | ret = -EPERM; | ||
795 | goto out_put; | ||
796 | } | ||
797 | |||
798 | instance_destroy(inst); | ||
799 | break; | ||
800 | case NFULNL_CFG_CMD_PF_BIND: | ||
801 | UDEBUG("registering log handler for pf=%u\n", pf); | ||
802 | ret = nf_log_register(pf, &nfulnl_logger); | ||
803 | break; | ||
804 | case NFULNL_CFG_CMD_PF_UNBIND: | ||
805 | UDEBUG("unregistering log handler for pf=%u\n", pf); | ||
806 | /* This is a bug and a feature. We cannot unregister | ||
807 | * other handlers, like nfnetlink_inst can */ | ||
808 | nf_log_unregister_pf(pf); | ||
809 | break; | ||
810 | default: | ||
811 | ret = -EINVAL; | ||
812 | break; | ||
813 | } | ||
814 | } else { | ||
815 | if (!inst) { | ||
816 | UDEBUG("no config command, and no instance for " | ||
817 | "group=%u pid=%u =>ENOENT\n", | ||
818 | group_num, NETLINK_CB(skb).pid); | ||
819 | ret = -ENOENT; | ||
820 | goto out_put; | ||
821 | } | ||
822 | |||
823 | if (inst->peer_pid != NETLINK_CB(skb).pid) { | ||
824 | UDEBUG("no config command, and wrong pid\n"); | ||
825 | ret = -EPERM; | ||
826 | goto out_put; | ||
827 | } | ||
828 | } | ||
829 | |||
830 | if (nfula[NFULA_CFG_MODE-1]) { | ||
831 | struct nfulnl_msg_config_mode *params; | ||
832 | params = NFA_DATA(nfula[NFULA_CFG_MODE-1]); | ||
833 | |||
834 | nfulnl_set_mode(inst, params->copy_mode, | ||
835 | ntohs(params->copy_range)); | ||
836 | } | ||
837 | |||
838 | if (nfula[NFULA_CFG_TIMEOUT-1]) { | ||
839 | u_int32_t timeout = | ||
840 | *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_TIMEOUT-1]); | ||
841 | |||
842 | nfulnl_set_timeout(inst, ntohl(timeout)); | ||
843 | } | ||
844 | |||
845 | if (nfula[NFULA_CFG_NLBUFSIZ-1]) { | ||
846 | u_int32_t nlbufsiz = | ||
847 | *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_NLBUFSIZ-1]); | ||
848 | |||
849 | nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); | ||
850 | } | ||
851 | |||
852 | if (nfula[NFULA_CFG_QTHRESH-1]) { | ||
853 | u_int32_t qthresh = | ||
854 | *(u_int16_t *)NFA_DATA(nfula[NFULA_CFG_QTHRESH-1]); | ||
855 | |||
856 | nfulnl_set_qthresh(inst, ntohl(qthresh)); | ||
857 | } | ||
858 | |||
859 | out_put: | ||
860 | instance_put(inst); | ||
861 | return ret; | ||
862 | } | ||
863 | |||
/* Dispatch table for messages of the nfnetlink "log" subsystem; both
 * message types require CAP_NET_ADMIN. */
static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
	[NFULNL_MSG_PACKET]	= { .call = nfulnl_recv_unsupp,
				    .attr_count = NFULA_MAX,
				    .cap_required = CAP_NET_ADMIN, },
	[NFULNL_MSG_CONFIG]	= { .call = nfulnl_recv_config,
				    .attr_count = NFULA_CFG_MAX,
				    .cap_required = CAP_NET_ADMIN },
};
872 | |||
/* Registration record handed to the nfnetlink core in init_or_cleanup(). */
static struct nfnetlink_subsystem nfulnl_subsys = {
	.name		= "log",
	.subsys_id	= NFNL_SUBSYS_ULOG,
	.cb_count	= NFULNL_MSG_MAX,
	.cb		= nfulnl_cb,
};
879 | |||
880 | #ifdef CONFIG_PROC_FS | ||
/* Per-open seq_file cursor: index of the hash bucket being walked. */
struct iter_state {
	unsigned int bucket;
};
884 | |||
885 | static struct hlist_node *get_first(struct seq_file *seq) | ||
886 | { | ||
887 | struct iter_state *st = seq->private; | ||
888 | |||
889 | if (!st) | ||
890 | return NULL; | ||
891 | |||
892 | for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { | ||
893 | if (!hlist_empty(&instance_table[st->bucket])) | ||
894 | return instance_table[st->bucket].first; | ||
895 | } | ||
896 | return NULL; | ||
897 | } | ||
898 | |||
899 | static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) | ||
900 | { | ||
901 | struct iter_state *st = seq->private; | ||
902 | |||
903 | h = h->next; | ||
904 | while (!h) { | ||
905 | if (++st->bucket >= INSTANCE_BUCKETS) | ||
906 | return NULL; | ||
907 | |||
908 | h = instance_table[st->bucket].first; | ||
909 | } | ||
910 | return h; | ||
911 | } | ||
912 | |||
913 | static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) | ||
914 | { | ||
915 | struct hlist_node *head; | ||
916 | head = get_first(seq); | ||
917 | |||
918 | if (head) | ||
919 | while (pos && (head = get_next(seq, head))) | ||
920 | pos--; | ||
921 | return pos ? NULL : head; | ||
922 | } | ||
923 | |||
/* seq_file start: take instances_lock for the whole walk (released in
 * seq_stop()) and position the cursor at *pos. */
static void *seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock_bh(&instances_lock);
	return get_idx(seq, *pos);
}
929 | |||
/* seq_file next: advance the cursor one instance and bump *pos. */
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
	(*pos)++;
	return get_next(s, v);
}
935 | |||
/* seq_file stop: drop the lock taken in seq_start(). */
static void seq_stop(struct seq_file *s, void *v)
{
	read_unlock_bh(&instances_lock);
}
940 | |||
/* Emit one /proc line per instance: group, pid, queue length, copy mode,
 * copy range, flush timeout and current refcount. */
static int seq_show(struct seq_file *s, void *v)
{
	const struct nfulnl_instance *inst = v;

	return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n",
			  inst->group_num,
			  inst->peer_pid, inst->qlen,
			  inst->copy_mode, inst->copy_range,
			  inst->flushtimeout, atomic_read(&inst->use));
}
951 | |||
/* seq_file iterator operations for /proc/net/netfilter/nfnetlink_log. */
static struct seq_operations nful_seq_ops = {
	.start	= seq_start,
	.next	= seq_next,
	.stop	= seq_stop,
	.show	= seq_show,
};
958 | |||
959 | static int nful_open(struct inode *inode, struct file *file) | ||
960 | { | ||
961 | struct seq_file *seq; | ||
962 | struct iter_state *is; | ||
963 | int ret; | ||
964 | |||
965 | is = kmalloc(sizeof(*is), GFP_KERNEL); | ||
966 | if (!is) | ||
967 | return -ENOMEM; | ||
968 | memset(is, 0, sizeof(*is)); | ||
969 | ret = seq_open(file, &nful_seq_ops); | ||
970 | if (ret < 0) | ||
971 | goto out_free; | ||
972 | seq = file->private_data; | ||
973 | seq->private = is; | ||
974 | return ret; | ||
975 | out_free: | ||
976 | kfree(is); | ||
977 | return ret; | ||
978 | } | ||
979 | |||
/* file_operations for the proc entry; seq_release_private() also frees
 * the iter_state allocated in nful_open(). */
static struct file_operations nful_file_ops = {
	.owner	 = THIS_MODULE,
	.open	 = nful_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};
987 | |||
988 | #endif /* PROC_FS */ | ||
989 | |||
990 | static int | ||
991 | init_or_cleanup(int init) | ||
992 | { | ||
993 | int i, status = -ENOMEM; | ||
994 | #ifdef CONFIG_PROC_FS | ||
995 | struct proc_dir_entry *proc_nful; | ||
996 | #endif | ||
997 | |||
998 | if (!init) | ||
999 | goto cleanup; | ||
1000 | |||
1001 | for (i = 0; i < INSTANCE_BUCKETS; i++) | ||
1002 | INIT_HLIST_HEAD(&instance_table[i]); | ||
1003 | |||
1004 | /* it's not really all that important to have a random value, so | ||
1005 | * we can do this from the init function, even if there hasn't | ||
1006 | * been that much entropy yet */ | ||
1007 | get_random_bytes(&hash_init, sizeof(hash_init)); | ||
1008 | |||
1009 | netlink_register_notifier(&nfulnl_rtnl_notifier); | ||
1010 | status = nfnetlink_subsys_register(&nfulnl_subsys); | ||
1011 | if (status < 0) { | ||
1012 | printk(KERN_ERR "log: failed to create netlink socket\n"); | ||
1013 | goto cleanup_netlink_notifier; | ||
1014 | } | ||
1015 | |||
1016 | #ifdef CONFIG_PROC_FS | ||
1017 | proc_nful = create_proc_entry("nfnetlink_log", 0440, | ||
1018 | proc_net_netfilter); | ||
1019 | if (!proc_nful) | ||
1020 | goto cleanup_subsys; | ||
1021 | proc_nful->proc_fops = &nful_file_ops; | ||
1022 | #endif | ||
1023 | |||
1024 | return status; | ||
1025 | |||
1026 | cleanup: | ||
1027 | nf_log_unregister_logger(&nfulnl_logger); | ||
1028 | #ifdef CONFIG_PROC_FS | ||
1029 | remove_proc_entry("nfnetlink_log", proc_net_netfilter); | ||
1030 | cleanup_subsys: | ||
1031 | #endif | ||
1032 | nfnetlink_subsys_unregister(&nfulnl_subsys); | ||
1033 | cleanup_netlink_notifier: | ||
1034 | netlink_unregister_notifier(&nfulnl_rtnl_notifier); | ||
1035 | return status; | ||
1036 | } | ||
1037 | |||
/* Module entry point: delegate to the shared init/cleanup helper. */
static int __init init(void)
{

	return init_or_cleanup(1);
}
1043 | |||
/* Module exit: run the teardown half of init_or_cleanup(). */
static void __exit fini(void)
{
	init_or_cleanup(0);
}
1048 | |||
1049 | MODULE_DESCRIPTION("netfilter userspace logging"); | ||
1050 | MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | ||
1051 | MODULE_LICENSE("GPL"); | ||
1052 | MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG); | ||
1053 | |||
1054 | module_init(init); | ||
1055 | module_exit(fini); | ||
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c new file mode 100644 index 000000000000..e3a5285329af --- /dev/null +++ b/net/netfilter/nfnetlink_queue.c | |||
@@ -0,0 +1,1132 @@ | |||
1 | /* | ||
2 | * This is a module which is used for queueing packets and communicating with | ||
 * userspace via nfnetlink.
4 | * | ||
5 | * (C) 2005 by Harald Welte <laforge@netfilter.org> | ||
6 | * | ||
7 | * Based on the old ipv4-only ip_queue.c: | ||
8 | * (C) 2000-2002 James Morris <jmorris@intercode.com.au> | ||
9 | * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License version 2 as | ||
13 | * published by the Free Software Foundation. | ||
14 | * | ||
15 | */ | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/skbuff.h> | ||
18 | #include <linux/init.h> | ||
19 | #include <linux/spinlock.h> | ||
20 | #include <linux/notifier.h> | ||
21 | #include <linux/netdevice.h> | ||
22 | #include <linux/netfilter.h> | ||
23 | #include <linux/proc_fs.h> | ||
24 | #include <linux/netfilter_ipv4.h> | ||
25 | #include <linux/netfilter_ipv6.h> | ||
26 | #include <linux/netfilter/nfnetlink.h> | ||
27 | #include <linux/netfilter/nfnetlink_queue.h> | ||
28 | #include <linux/list.h> | ||
29 | #include <net/sock.h> | ||
30 | |||
31 | #include <asm/atomic.h> | ||
32 | |||
33 | #ifdef CONFIG_BRIDGE_NETFILTER | ||
34 | #include "../bridge/br_private.h" | ||
35 | #endif | ||
36 | |||
37 | #define NFQNL_QMAX_DEFAULT 1024 | ||
38 | |||
39 | #if 0 | ||
40 | #define QDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \ | ||
41 | __FILE__, __LINE__, __FUNCTION__, \ | ||
42 | ## args) | ||
43 | #else | ||
44 | #define QDEBUG(x, ...) | ||
45 | #endif | ||
46 | |||
/* One packet waiting in a queue for a userspace verdict. */
struct nfqnl_queue_entry {
	struct list_head list;	/* link in nfqnl_instance.queue_list */
	struct nf_info *info;	/* reinjection info handed in by nf_queue */
	struct sk_buff *skb;	/* the queued packet itself */
	unsigned int id;	/* packet id reported to userspace */
};
53 | |||
/* One verdict queue, bound to the userspace netlink socket 'peer_pid'. */
struct nfqnl_instance {
	struct hlist_node hlist;		/* global list of queues */
	atomic_t use;				/* reference count */

	int peer_pid;				/* netlink pid of the binder */
	unsigned int queue_maxlen;		/* max packets queued at once */
	unsigned int copy_range;		/* max bytes copied to userspace */
	unsigned int queue_total;		/* current queue length */
	unsigned int queue_dropped;		/* dropped: queue was full */
	unsigned int queue_user_dropped;	/* dropped: send to user failed */

	atomic_t id_sequence;			/* 'sequence' of pkt ids */

	u_int16_t queue_num;			/* number of this queue */
	u_int8_t copy_mode;			/* NFQNL_COPY_* */

	spinlock_t lock;			/* protects queue_list/counters */

	struct list_head queue_list;		/* packets in queue */
};
74 | |||
75 | typedef int (*nfqnl_cmpfn)(struct nfqnl_queue_entry *, unsigned long); | ||
76 | |||
77 | static DEFINE_RWLOCK(instances_lock); | ||
78 | |||
/* Convert a 64bit quantity to network (big-endian) byte order by
 * reversing its in-memory byte order, as intended for little-endian
 * hosts (the kernel's __cpu_to_be64() would be the canonical helper).
 *
 * Fix: the old loop stored every source byte into the SAME destination
 * index (sizeof(u_int64_t)-1), leaving the other seven bytes of 'out'
 * uninitialized; the destination index must step down as the source
 * index steps up.
 *
 * NOTE(review): this still swaps unconditionally and is therefore only
 * correct on little-endian hosts — confirm against __cpu_to_be64(). */
u_int64_t htonll(u_int64_t in)
{
	u_int64_t out;
	int i;

	for (i = 0; i < sizeof(u_int64_t); i++)
		((u_int8_t *)&out)[sizeof(u_int64_t)-1-i] = ((u_int8_t *)&in)[i];

	return out;
}
89 | |||
90 | #define INSTANCE_BUCKETS 16 | ||
91 | static struct hlist_head instance_table[INSTANCE_BUCKETS]; | ||
92 | |||
93 | static inline u_int8_t instance_hashfn(u_int16_t queue_num) | ||
94 | { | ||
95 | return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS; | ||
96 | } | ||
97 | |||
98 | static struct nfqnl_instance * | ||
99 | __instance_lookup(u_int16_t queue_num) | ||
100 | { | ||
101 | struct hlist_head *head; | ||
102 | struct hlist_node *pos; | ||
103 | struct nfqnl_instance *inst; | ||
104 | |||
105 | head = &instance_table[instance_hashfn(queue_num)]; | ||
106 | hlist_for_each_entry(inst, pos, head, hlist) { | ||
107 | if (inst->queue_num == queue_num) | ||
108 | return inst; | ||
109 | } | ||
110 | return NULL; | ||
111 | } | ||
112 | |||
113 | static struct nfqnl_instance * | ||
114 | instance_lookup_get(u_int16_t queue_num) | ||
115 | { | ||
116 | struct nfqnl_instance *inst; | ||
117 | |||
118 | read_lock_bh(&instances_lock); | ||
119 | inst = __instance_lookup(queue_num); | ||
120 | if (inst) | ||
121 | atomic_inc(&inst->use); | ||
122 | read_unlock_bh(&instances_lock); | ||
123 | |||
124 | return inst; | ||
125 | } | ||
126 | |||
127 | static void | ||
128 | instance_put(struct nfqnl_instance *inst) | ||
129 | { | ||
130 | if (inst && atomic_dec_and_test(&inst->use)) { | ||
131 | QDEBUG("kfree(inst=%p)\n", inst); | ||
132 | kfree(inst); | ||
133 | } | ||
134 | } | ||
135 | |||
136 | static struct nfqnl_instance * | ||
137 | instance_create(u_int16_t queue_num, int pid) | ||
138 | { | ||
139 | struct nfqnl_instance *inst; | ||
140 | |||
141 | QDEBUG("entering for queue_num=%u, pid=%d\n", queue_num, pid); | ||
142 | |||
143 | write_lock_bh(&instances_lock); | ||
144 | if (__instance_lookup(queue_num)) { | ||
145 | inst = NULL; | ||
146 | QDEBUG("aborting, instance already exists\n"); | ||
147 | goto out_unlock; | ||
148 | } | ||
149 | |||
150 | inst = kmalloc(sizeof(*inst), GFP_ATOMIC); | ||
151 | if (!inst) | ||
152 | goto out_unlock; | ||
153 | |||
154 | memset(inst, 0, sizeof(*inst)); | ||
155 | inst->queue_num = queue_num; | ||
156 | inst->peer_pid = pid; | ||
157 | inst->queue_maxlen = NFQNL_QMAX_DEFAULT; | ||
158 | inst->copy_range = 0xfffff; | ||
159 | inst->copy_mode = NFQNL_COPY_NONE; | ||
160 | atomic_set(&inst->id_sequence, 0); | ||
161 | /* needs to be two, since we _put() after creation */ | ||
162 | atomic_set(&inst->use, 2); | ||
163 | inst->lock = SPIN_LOCK_UNLOCKED; | ||
164 | INIT_LIST_HEAD(&inst->queue_list); | ||
165 | |||
166 | if (!try_module_get(THIS_MODULE)) | ||
167 | goto out_free; | ||
168 | |||
169 | hlist_add_head(&inst->hlist, | ||
170 | &instance_table[instance_hashfn(queue_num)]); | ||
171 | |||
172 | write_unlock_bh(&instances_lock); | ||
173 | |||
174 | QDEBUG("successfully created new instance\n"); | ||
175 | |||
176 | return inst; | ||
177 | |||
178 | out_free: | ||
179 | kfree(inst); | ||
180 | out_unlock: | ||
181 | write_unlock_bh(&instances_lock); | ||
182 | return NULL; | ||
183 | } | ||
184 | |||
185 | static void nfqnl_flush(struct nfqnl_instance *queue, int verdict); | ||
186 | |||
/* Unhash an instance, drop all queued packets with NF_DROP, release the
 * hash table's reference and the module reference.  'lock' selects
 * whether instances_lock is taken here or is already held (write-locked)
 * by the caller. */
static void
_instance_destroy2(struct nfqnl_instance *inst, int lock)
{
	/* first pull it out of the global list */
	if (lock)
		write_lock_bh(&instances_lock);

	QDEBUG("removing instance %p (queuenum=%u) from hash\n",
		inst, inst->queue_num);
	hlist_del(&inst->hlist);

	if (lock)
		write_unlock_bh(&instances_lock);

	/* then flush all pending skbs from the queue */
	nfqnl_flush(inst, NF_DROP);

	/* and finally put the refcount */
	instance_put(inst);

	module_put(THIS_MODULE);
}
209 | |||
/* Destroy variant for callers already holding instances_lock. */
static inline void
__instance_destroy(struct nfqnl_instance *inst)
{
	_instance_destroy2(inst, 0);
}
215 | |||
/* Destroy variant that takes instances_lock itself. */
static inline void
instance_destroy(struct nfqnl_instance *inst)
{
	_instance_destroy2(inst, 1);
}
221 | |||
222 | |||
223 | |||
/* Reinject a queued packet into the stack with the given verdict and
 * free its queue entry. */
static void
issue_verdict(struct nfqnl_queue_entry *entry, int verdict)
{
	QDEBUG("entering for entry %p, verdict %u\n", entry, verdict);

	/* TCP input path (and probably other bits) assume to be called
	 * from softirq context, not from syscall, like issue_verdict is
	 * called.  TCP input path deadlocks with locks taken from timer
	 * softirq, e.g.  We therefore emulate this by local_bh_disable() */

	local_bh_disable();
	nf_reinject(entry->skb, entry->info, verdict);
	local_bh_enable();

	kfree(entry);
}
240 | |||
/* Add an entry at the head of the queue list and bump the counter;
 * caller must hold queue->lock. */
static inline void
__enqueue_entry(struct nfqnl_instance *queue,
		      struct nfqnl_queue_entry *entry)
{
	list_add(&entry->list, &queue->queue_list);
	queue->queue_total++;
}
248 | |||
249 | /* | ||
250 | * Find and return a queued entry matched by cmpfn, or return the last | ||
251 | * entry if cmpfn is NULL. | ||
252 | */ | ||
253 | static inline struct nfqnl_queue_entry * | ||
254 | __find_entry(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, | ||
255 | unsigned long data) | ||
256 | { | ||
257 | struct list_head *p; | ||
258 | |||
259 | list_for_each_prev(p, &queue->queue_list) { | ||
260 | struct nfqnl_queue_entry *entry = (struct nfqnl_queue_entry *)p; | ||
261 | |||
262 | if (!cmpfn || cmpfn(entry, data)) | ||
263 | return entry; | ||
264 | } | ||
265 | return NULL; | ||
266 | } | ||
267 | |||
/* Unlink an entry from the queue list and decrement the counter;
 * caller must hold q->lock. */
static inline void
__dequeue_entry(struct nfqnl_instance *q, struct nfqnl_queue_entry *entry)
{
	list_del(&entry->list);
	q->queue_total--;
}
274 | |||
275 | static inline struct nfqnl_queue_entry * | ||
276 | __find_dequeue_entry(struct nfqnl_instance *queue, | ||
277 | nfqnl_cmpfn cmpfn, unsigned long data) | ||
278 | { | ||
279 | struct nfqnl_queue_entry *entry; | ||
280 | |||
281 | entry = __find_entry(queue, cmpfn, data); | ||
282 | if (entry == NULL) | ||
283 | return NULL; | ||
284 | |||
285 | __dequeue_entry(queue, entry); | ||
286 | return entry; | ||
287 | } | ||
288 | |||
289 | |||
/* Issue 'verdict' on every queued entry, emptying the queue;
 * caller must hold queue->lock. */
static inline void
__nfqnl_flush(struct nfqnl_instance *queue, int verdict)
{
	struct nfqnl_queue_entry *entry;

	while ((entry = __find_dequeue_entry(queue, NULL, 0)))
		issue_verdict(entry, verdict);
}
298 | |||
299 | static inline int | ||
300 | __nfqnl_set_mode(struct nfqnl_instance *queue, | ||
301 | unsigned char mode, unsigned int range) | ||
302 | { | ||
303 | int status = 0; | ||
304 | |||
305 | switch (mode) { | ||
306 | case NFQNL_COPY_NONE: | ||
307 | case NFQNL_COPY_META: | ||
308 | queue->copy_mode = mode; | ||
309 | queue->copy_range = 0; | ||
310 | break; | ||
311 | |||
312 | case NFQNL_COPY_PACKET: | ||
313 | queue->copy_mode = mode; | ||
314 | /* we're using struct nfattr which has 16bit nfa_len */ | ||
315 | if (range > 0xffff) | ||
316 | queue->copy_range = 0xffff; | ||
317 | else | ||
318 | queue->copy_range = range; | ||
319 | break; | ||
320 | |||
321 | default: | ||
322 | status = -EINVAL; | ||
323 | |||
324 | } | ||
325 | return status; | ||
326 | } | ||
327 | |||
/* Locked wrapper around __find_dequeue_entry(). */
static struct nfqnl_queue_entry *
find_dequeue_entry(struct nfqnl_instance *queue,
			 nfqnl_cmpfn cmpfn, unsigned long data)
{
	struct nfqnl_queue_entry *entry;

	spin_lock_bh(&queue->lock);
	entry = __find_dequeue_entry(queue, cmpfn, data);
	spin_unlock_bh(&queue->lock);

	return entry;
}
340 | |||
/* Locked wrapper around __nfqnl_flush(). */
static void
nfqnl_flush(struct nfqnl_instance *queue, int verdict)
{
	spin_lock_bh(&queue->lock);
	__nfqnl_flush(queue, verdict);
	spin_unlock_bh(&queue->lock);
}
348 | |||
349 | static struct sk_buff * | ||
350 | nfqnl_build_packet_message(struct nfqnl_instance *queue, | ||
351 | struct nfqnl_queue_entry *entry, int *errp) | ||
352 | { | ||
353 | unsigned char *old_tail; | ||
354 | size_t size; | ||
355 | size_t data_len = 0; | ||
356 | struct sk_buff *skb; | ||
357 | struct nfqnl_msg_packet_hdr pmsg; | ||
358 | struct nlmsghdr *nlh; | ||
359 | struct nfgenmsg *nfmsg; | ||
360 | unsigned int tmp_uint; | ||
361 | |||
362 | QDEBUG("entered\n"); | ||
363 | |||
364 | /* all macros expand to constant values at compile time */ | ||
365 | size = NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hdr)) | ||
366 | + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ | ||
367 | + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ | ||
368 | #ifdef CONFIG_BRIDGE_NETFILTER | ||
369 | + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ | ||
370 | + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ | ||
371 | #endif | ||
372 | + NLMSG_SPACE(sizeof(u_int32_t)) /* mark */ | ||
373 | + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hw)) | ||
374 | + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_timestamp)); | ||
375 | |||
376 | spin_lock_bh(&queue->lock); | ||
377 | |||
378 | switch (queue->copy_mode) { | ||
379 | case NFQNL_COPY_META: | ||
380 | case NFQNL_COPY_NONE: | ||
381 | data_len = 0; | ||
382 | break; | ||
383 | |||
384 | case NFQNL_COPY_PACKET: | ||
385 | if (queue->copy_range == 0 | ||
386 | || queue->copy_range > entry->skb->len) | ||
387 | data_len = entry->skb->len; | ||
388 | else | ||
389 | data_len = queue->copy_range; | ||
390 | |||
391 | size += NLMSG_SPACE(data_len); | ||
392 | break; | ||
393 | |||
394 | default: | ||
395 | *errp = -EINVAL; | ||
396 | spin_unlock_bh(&queue->lock); | ||
397 | return NULL; | ||
398 | } | ||
399 | |||
400 | spin_unlock_bh(&queue->lock); | ||
401 | |||
402 | skb = alloc_skb(size, GFP_ATOMIC); | ||
403 | if (!skb) | ||
404 | goto nlmsg_failure; | ||
405 | |||
406 | old_tail= skb->tail; | ||
407 | nlh = NLMSG_PUT(skb, 0, 0, | ||
408 | NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, | ||
409 | sizeof(struct nfgenmsg)); | ||
410 | nfmsg = NLMSG_DATA(nlh); | ||
411 | nfmsg->nfgen_family = entry->info->pf; | ||
412 | nfmsg->version = NFNETLINK_V0; | ||
413 | nfmsg->res_id = htons(queue->queue_num); | ||
414 | |||
415 | pmsg.packet_id = htonl(entry->id); | ||
416 | pmsg.hw_protocol = htons(entry->skb->protocol); | ||
417 | pmsg.hook = entry->info->hook; | ||
418 | |||
419 | NFA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg); | ||
420 | |||
421 | if (entry->info->indev) { | ||
422 | tmp_uint = htonl(entry->info->indev->ifindex); | ||
423 | #ifndef CONFIG_BRIDGE_NETFILTER | ||
424 | NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); | ||
425 | #else | ||
426 | if (entry->info->pf == PF_BRIDGE) { | ||
427 | /* Case 1: indev is physical input device, we need to | ||
428 | * look for bridge group (when called from | ||
429 | * netfilter_bridge) */ | ||
430 | NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint), | ||
431 | &tmp_uint); | ||
432 | /* this is the bridge group "brX" */ | ||
433 | tmp_uint = htonl(entry->info->indev->br_port->br->dev->ifindex); | ||
434 | NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), | ||
435 | &tmp_uint); | ||
436 | } else { | ||
437 | /* Case 2: indev is bridge group, we need to look for | ||
438 | * physical device (when called from ipv4) */ | ||
439 | NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), | ||
440 | &tmp_uint); | ||
441 | if (entry->skb->nf_bridge | ||
442 | && entry->skb->nf_bridge->physindev) { | ||
443 | tmp_uint = htonl(entry->skb->nf_bridge->physindev->ifindex); | ||
444 | NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, | ||
445 | sizeof(tmp_uint), &tmp_uint); | ||
446 | } | ||
447 | } | ||
448 | #endif | ||
449 | } | ||
450 | |||
451 | if (entry->info->outdev) { | ||
452 | tmp_uint = htonl(entry->info->outdev->ifindex); | ||
453 | #ifndef CONFIG_BRIDGE_NETFILTER | ||
454 | NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); | ||
455 | #else | ||
456 | if (entry->info->pf == PF_BRIDGE) { | ||
457 | /* Case 1: outdev is physical output device, we need to | ||
458 | * look for bridge group (when called from | ||
459 | * netfilter_bridge) */ | ||
460 | NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint), | ||
461 | &tmp_uint); | ||
462 | /* this is the bridge group "brX" */ | ||
463 | tmp_uint = htonl(entry->info->outdev->br_port->br->dev->ifindex); | ||
464 | NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), | ||
465 | &tmp_uint); | ||
466 | } else { | ||
467 | /* Case 2: outdev is bridge group, we need to look for | ||
468 | * physical output device (when called from ipv4) */ | ||
469 | NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), | ||
470 | &tmp_uint); | ||
471 | if (entry->skb->nf_bridge | ||
472 | && entry->skb->nf_bridge->physoutdev) { | ||
473 | tmp_uint = htonl(entry->skb->nf_bridge->physoutdev->ifindex); | ||
474 | NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, | ||
475 | sizeof(tmp_uint), &tmp_uint); | ||
476 | } | ||
477 | } | ||
478 | #endif | ||
479 | } | ||
480 | |||
481 | if (entry->skb->nfmark) { | ||
482 | tmp_uint = htonl(entry->skb->nfmark); | ||
483 | NFA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint); | ||
484 | } | ||
485 | |||
486 | if (entry->info->indev && entry->skb->dev | ||
487 | && entry->skb->dev->hard_header_parse) { | ||
488 | struct nfqnl_msg_packet_hw phw; | ||
489 | |||
490 | phw.hw_addrlen = | ||
491 | entry->skb->dev->hard_header_parse(entry->skb, | ||
492 | phw.hw_addr); | ||
493 | phw.hw_addrlen = htons(phw.hw_addrlen); | ||
494 | NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); | ||
495 | } | ||
496 | |||
497 | if (entry->skb->tstamp.off_sec) { | ||
498 | struct nfqnl_msg_packet_timestamp ts; | ||
499 | |||
500 | ts.sec = htonll(skb_tv_base.tv_sec + entry->skb->tstamp.off_sec); | ||
501 | ts.usec = htonll(skb_tv_base.tv_usec + entry->skb->tstamp.off_usec); | ||
502 | |||
503 | NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); | ||
504 | } | ||
505 | |||
506 | if (data_len) { | ||
507 | struct nfattr *nfa; | ||
508 | int size = NFA_LENGTH(data_len); | ||
509 | |||
510 | if (skb_tailroom(skb) < (int)NFA_SPACE(data_len)) { | ||
511 | printk(KERN_WARNING "nf_queue: no tailroom!\n"); | ||
512 | goto nlmsg_failure; | ||
513 | } | ||
514 | |||
515 | nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size)); | ||
516 | nfa->nfa_type = NFQA_PAYLOAD; | ||
517 | nfa->nfa_len = size; | ||
518 | |||
519 | if (skb_copy_bits(entry->skb, 0, NFA_DATA(nfa), data_len)) | ||
520 | BUG(); | ||
521 | } | ||
522 | |||
523 | nlh->nlmsg_len = skb->tail - old_tail; | ||
524 | return skb; | ||
525 | |||
526 | nlmsg_failure: | ||
527 | nfattr_failure: | ||
528 | if (skb) | ||
529 | kfree_skb(skb); | ||
530 | *errp = -EINVAL; | ||
531 | if (net_ratelimit()) | ||
532 | printk(KERN_ERR "nf_queue: error creating packet message\n"); | ||
533 | return NULL; | ||
534 | } | ||
535 | |||
536 | static int | ||
537 | nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, | ||
538 | unsigned int queuenum, void *data) | ||
539 | { | ||
540 | int status = -EINVAL; | ||
541 | struct sk_buff *nskb; | ||
542 | struct nfqnl_instance *queue; | ||
543 | struct nfqnl_queue_entry *entry; | ||
544 | |||
545 | QDEBUG("entered\n"); | ||
546 | |||
547 | queue = instance_lookup_get(queuenum); | ||
548 | if (!queue) { | ||
549 | QDEBUG("no queue instance matching\n"); | ||
550 | return -EINVAL; | ||
551 | } | ||
552 | |||
553 | if (queue->copy_mode == NFQNL_COPY_NONE) { | ||
554 | QDEBUG("mode COPY_NONE, aborting\n"); | ||
555 | status = -EAGAIN; | ||
556 | goto err_out_put; | ||
557 | } | ||
558 | |||
559 | entry = kmalloc(sizeof(*entry), GFP_ATOMIC); | ||
560 | if (entry == NULL) { | ||
561 | if (net_ratelimit()) | ||
562 | printk(KERN_ERR | ||
563 | "nf_queue: OOM in nfqnl_enqueue_packet()\n"); | ||
564 | status = -ENOMEM; | ||
565 | goto err_out_put; | ||
566 | } | ||
567 | |||
568 | entry->info = info; | ||
569 | entry->skb = skb; | ||
570 | entry->id = atomic_inc_return(&queue->id_sequence); | ||
571 | |||
572 | nskb = nfqnl_build_packet_message(queue, entry, &status); | ||
573 | if (nskb == NULL) | ||
574 | goto err_out_free; | ||
575 | |||
576 | spin_lock_bh(&queue->lock); | ||
577 | |||
578 | if (!queue->peer_pid) | ||
579 | goto err_out_free_nskb; | ||
580 | |||
581 | if (queue->queue_total >= queue->queue_maxlen) { | ||
582 | queue->queue_dropped++; | ||
583 | status = -ENOSPC; | ||
584 | if (net_ratelimit()) | ||
585 | printk(KERN_WARNING "ip_queue: full at %d entries, " | ||
586 | "dropping packets(s). Dropped: %d\n", | ||
587 | queue->queue_total, queue->queue_dropped); | ||
588 | goto err_out_free_nskb; | ||
589 | } | ||
590 | |||
591 | /* nfnetlink_unicast will either free the nskb or add it to a socket */ | ||
592 | status = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT); | ||
593 | if (status < 0) { | ||
594 | queue->queue_user_dropped++; | ||
595 | goto err_out_unlock; | ||
596 | } | ||
597 | |||
598 | __enqueue_entry(queue, entry); | ||
599 | |||
600 | spin_unlock_bh(&queue->lock); | ||
601 | instance_put(queue); | ||
602 | return status; | ||
603 | |||
604 | err_out_free_nskb: | ||
605 | kfree_skb(nskb); | ||
606 | |||
607 | err_out_unlock: | ||
608 | spin_unlock_bh(&queue->lock); | ||
609 | |||
610 | err_out_free: | ||
611 | kfree(entry); | ||
612 | err_out_put: | ||
613 | instance_put(queue); | ||
614 | return status; | ||
615 | } | ||
616 | |||
617 | static int | ||
618 | nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e) | ||
619 | { | ||
620 | int diff; | ||
621 | |||
622 | diff = data_len - e->skb->len; | ||
623 | if (diff < 0) | ||
624 | skb_trim(e->skb, data_len); | ||
625 | else if (diff > 0) { | ||
626 | if (data_len > 0xFFFF) | ||
627 | return -EINVAL; | ||
628 | if (diff > skb_tailroom(e->skb)) { | ||
629 | struct sk_buff *newskb; | ||
630 | |||
631 | newskb = skb_copy_expand(e->skb, | ||
632 | skb_headroom(e->skb), | ||
633 | diff, | ||
634 | GFP_ATOMIC); | ||
635 | if (newskb == NULL) { | ||
636 | printk(KERN_WARNING "ip_queue: OOM " | ||
637 | "in mangle, dropping packet\n"); | ||
638 | return -ENOMEM; | ||
639 | } | ||
640 | if (e->skb->sk) | ||
641 | skb_set_owner_w(newskb, e->skb->sk); | ||
642 | kfree_skb(e->skb); | ||
643 | e->skb = newskb; | ||
644 | } | ||
645 | skb_put(e->skb, diff); | ||
646 | } | ||
647 | if (!skb_make_writable(&e->skb, data_len)) | ||
648 | return -ENOMEM; | ||
649 | memcpy(e->skb->data, data, data_len); | ||
650 | |||
651 | return 0; | ||
652 | } | ||
653 | |||
654 | static inline int | ||
655 | id_cmp(struct nfqnl_queue_entry *e, unsigned long id) | ||
656 | { | ||
657 | return (id == e->id); | ||
658 | } | ||
659 | |||
660 | static int | ||
661 | nfqnl_set_mode(struct nfqnl_instance *queue, | ||
662 | unsigned char mode, unsigned int range) | ||
663 | { | ||
664 | int status; | ||
665 | |||
666 | spin_lock_bh(&queue->lock); | ||
667 | status = __nfqnl_set_mode(queue, mode, range); | ||
668 | spin_unlock_bh(&queue->lock); | ||
669 | |||
670 | return status; | ||
671 | } | ||
672 | |||
673 | static int | ||
674 | dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex) | ||
675 | { | ||
676 | if (entry->info->indev) | ||
677 | if (entry->info->indev->ifindex == ifindex) | ||
678 | return 1; | ||
679 | |||
680 | if (entry->info->outdev) | ||
681 | if (entry->info->outdev->ifindex == ifindex) | ||
682 | return 1; | ||
683 | |||
684 | return 0; | ||
685 | } | ||
686 | |||
687 | /* drop all packets with either indev or outdev == ifindex from all queue | ||
688 | * instances */ | ||
689 | static void | ||
690 | nfqnl_dev_drop(int ifindex) | ||
691 | { | ||
692 | int i; | ||
693 | |||
694 | QDEBUG("entering for ifindex %u\n", ifindex); | ||
695 | |||
696 | /* this only looks like we have to hold the readlock for a way too long | ||
697 | * time, issue_verdict(), nf_reinject(), ... - but we always only | ||
698 | * issue NF_DROP, which is processed directly in nf_reinject() */ | ||
699 | read_lock_bh(&instances_lock); | ||
700 | |||
701 | for (i = 0; i < INSTANCE_BUCKETS; i++) { | ||
702 | struct hlist_node *tmp; | ||
703 | struct nfqnl_instance *inst; | ||
704 | struct hlist_head *head = &instance_table[i]; | ||
705 | |||
706 | hlist_for_each_entry(inst, tmp, head, hlist) { | ||
707 | struct nfqnl_queue_entry *entry; | ||
708 | while ((entry = find_dequeue_entry(inst, dev_cmp, | ||
709 | ifindex)) != NULL) | ||
710 | issue_verdict(entry, NF_DROP); | ||
711 | } | ||
712 | } | ||
713 | |||
714 | read_unlock_bh(&instances_lock); | ||
715 | } | ||
716 | |||
717 | #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) | ||
718 | |||
719 | static int | ||
720 | nfqnl_rcv_dev_event(struct notifier_block *this, | ||
721 | unsigned long event, void *ptr) | ||
722 | { | ||
723 | struct net_device *dev = ptr; | ||
724 | |||
725 | /* Drop any packets associated with the downed device */ | ||
726 | if (event == NETDEV_DOWN) | ||
727 | nfqnl_dev_drop(dev->ifindex); | ||
728 | return NOTIFY_DONE; | ||
729 | } | ||
730 | |||
731 | static struct notifier_block nfqnl_dev_notifier = { | ||
732 | .notifier_call = nfqnl_rcv_dev_event, | ||
733 | }; | ||
734 | |||
735 | static int | ||
736 | nfqnl_rcv_nl_event(struct notifier_block *this, | ||
737 | unsigned long event, void *ptr) | ||
738 | { | ||
739 | struct netlink_notify *n = ptr; | ||
740 | |||
741 | if (event == NETLINK_URELEASE && | ||
742 | n->protocol == NETLINK_NETFILTER && n->pid) { | ||
743 | int i; | ||
744 | |||
745 | /* destroy all instances for this pid */ | ||
746 | write_lock_bh(&instances_lock); | ||
747 | for (i = 0; i < INSTANCE_BUCKETS; i++) { | ||
748 | struct hlist_node *tmp, *t2; | ||
749 | struct nfqnl_instance *inst; | ||
750 | struct hlist_head *head = &instance_table[i]; | ||
751 | |||
752 | hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { | ||
753 | if (n->pid == inst->peer_pid) | ||
754 | __instance_destroy(inst); | ||
755 | } | ||
756 | } | ||
757 | write_unlock_bh(&instances_lock); | ||
758 | } | ||
759 | return NOTIFY_DONE; | ||
760 | } | ||
761 | |||
762 | static struct notifier_block nfqnl_rtnl_notifier = { | ||
763 | .notifier_call = nfqnl_rcv_nl_event, | ||
764 | }; | ||
765 | |||
766 | static const int nfqa_verdict_min[NFQA_MAX] = { | ||
767 | [NFQA_VERDICT_HDR-1] = sizeof(struct nfqnl_msg_verdict_hdr), | ||
768 | [NFQA_MARK-1] = sizeof(u_int32_t), | ||
769 | [NFQA_PAYLOAD-1] = 0, | ||
770 | }; | ||
771 | |||
772 | static int | ||
773 | nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, | ||
774 | struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) | ||
775 | { | ||
776 | struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); | ||
777 | u_int16_t queue_num = ntohs(nfmsg->res_id); | ||
778 | |||
779 | struct nfqnl_msg_verdict_hdr *vhdr; | ||
780 | struct nfqnl_instance *queue; | ||
781 | unsigned int verdict; | ||
782 | struct nfqnl_queue_entry *entry; | ||
783 | int err; | ||
784 | |||
785 | if (nfattr_bad_size(nfqa, NFQA_MAX, nfqa_verdict_min)) { | ||
786 | QDEBUG("bad attribute size\n"); | ||
787 | return -EINVAL; | ||
788 | } | ||
789 | |||
790 | queue = instance_lookup_get(queue_num); | ||
791 | if (!queue) | ||
792 | return -ENODEV; | ||
793 | |||
794 | if (queue->peer_pid != NETLINK_CB(skb).pid) { | ||
795 | err = -EPERM; | ||
796 | goto err_out_put; | ||
797 | } | ||
798 | |||
799 | if (!nfqa[NFQA_VERDICT_HDR-1]) { | ||
800 | err = -EINVAL; | ||
801 | goto err_out_put; | ||
802 | } | ||
803 | |||
804 | vhdr = NFA_DATA(nfqa[NFQA_VERDICT_HDR-1]); | ||
805 | verdict = ntohl(vhdr->verdict); | ||
806 | |||
807 | if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) { | ||
808 | err = -EINVAL; | ||
809 | goto err_out_put; | ||
810 | } | ||
811 | |||
812 | entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id)); | ||
813 | if (entry == NULL) { | ||
814 | err = -ENOENT; | ||
815 | goto err_out_put; | ||
816 | } | ||
817 | |||
818 | if (nfqa[NFQA_PAYLOAD-1]) { | ||
819 | if (nfqnl_mangle(NFA_DATA(nfqa[NFQA_PAYLOAD-1]), | ||
820 | NFA_PAYLOAD(nfqa[NFQA_PAYLOAD-1]), entry) < 0) | ||
821 | verdict = NF_DROP; | ||
822 | } | ||
823 | |||
824 | if (nfqa[NFQA_MARK-1]) | ||
825 | skb->nfmark = ntohl(*(u_int32_t *)NFA_DATA(nfqa[NFQA_MARK-1])); | ||
826 | |||
827 | issue_verdict(entry, verdict); | ||
828 | instance_put(queue); | ||
829 | return 0; | ||
830 | |||
831 | err_out_put: | ||
832 | instance_put(queue); | ||
833 | return err; | ||
834 | } | ||
835 | |||
836 | static int | ||
837 | nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, | ||
838 | struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) | ||
839 | { | ||
840 | return -ENOTSUPP; | ||
841 | } | ||
842 | |||
843 | static const int nfqa_cfg_min[NFQA_CFG_MAX] = { | ||
844 | [NFQA_CFG_CMD-1] = sizeof(struct nfqnl_msg_config_cmd), | ||
845 | [NFQA_CFG_PARAMS-1] = sizeof(struct nfqnl_msg_config_params), | ||
846 | }; | ||
847 | |||
848 | static struct nf_queue_handler nfqh = { | ||
849 | .name = "nf_queue", | ||
850 | .outfn = &nfqnl_enqueue_packet, | ||
851 | }; | ||
852 | |||
853 | static int | ||
854 | nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, | ||
855 | struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) | ||
856 | { | ||
857 | struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); | ||
858 | u_int16_t queue_num = ntohs(nfmsg->res_id); | ||
859 | struct nfqnl_instance *queue; | ||
860 | int ret = 0; | ||
861 | |||
862 | QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); | ||
863 | |||
864 | if (nfattr_bad_size(nfqa, NFQA_CFG_MAX, nfqa_cfg_min)) { | ||
865 | QDEBUG("bad attribute size\n"); | ||
866 | return -EINVAL; | ||
867 | } | ||
868 | |||
869 | queue = instance_lookup_get(queue_num); | ||
870 | if (nfqa[NFQA_CFG_CMD-1]) { | ||
871 | struct nfqnl_msg_config_cmd *cmd; | ||
872 | cmd = NFA_DATA(nfqa[NFQA_CFG_CMD-1]); | ||
873 | QDEBUG("found CFG_CMD\n"); | ||
874 | |||
875 | switch (cmd->command) { | ||
876 | case NFQNL_CFG_CMD_BIND: | ||
877 | if (queue) | ||
878 | return -EBUSY; | ||
879 | |||
880 | queue = instance_create(queue_num, NETLINK_CB(skb).pid); | ||
881 | if (!queue) | ||
882 | return -EINVAL; | ||
883 | break; | ||
884 | case NFQNL_CFG_CMD_UNBIND: | ||
885 | if (!queue) | ||
886 | return -ENODEV; | ||
887 | |||
888 | if (queue->peer_pid != NETLINK_CB(skb).pid) { | ||
889 | ret = -EPERM; | ||
890 | goto out_put; | ||
891 | } | ||
892 | |||
893 | instance_destroy(queue); | ||
894 | break; | ||
895 | case NFQNL_CFG_CMD_PF_BIND: | ||
896 | QDEBUG("registering queue handler for pf=%u\n", | ||
897 | ntohs(cmd->pf)); | ||
898 | ret = nf_register_queue_handler(ntohs(cmd->pf), &nfqh); | ||
899 | break; | ||
900 | case NFQNL_CFG_CMD_PF_UNBIND: | ||
901 | QDEBUG("unregistering queue handler for pf=%u\n", | ||
902 | ntohs(cmd->pf)); | ||
903 | /* This is a bug and a feature. We can unregister | ||
904 | * other handlers(!) */ | ||
905 | ret = nf_unregister_queue_handler(ntohs(cmd->pf)); | ||
906 | break; | ||
907 | default: | ||
908 | ret = -EINVAL; | ||
909 | break; | ||
910 | } | ||
911 | } else { | ||
912 | if (!queue) { | ||
913 | QDEBUG("no config command, and no instance ENOENT\n"); | ||
914 | ret = -ENOENT; | ||
915 | goto out_put; | ||
916 | } | ||
917 | |||
918 | if (queue->peer_pid != NETLINK_CB(skb).pid) { | ||
919 | QDEBUG("no config command, and wrong pid\n"); | ||
920 | ret = -EPERM; | ||
921 | goto out_put; | ||
922 | } | ||
923 | } | ||
924 | |||
925 | if (nfqa[NFQA_CFG_PARAMS-1]) { | ||
926 | struct nfqnl_msg_config_params *params; | ||
927 | params = NFA_DATA(nfqa[NFQA_CFG_PARAMS-1]); | ||
928 | |||
929 | nfqnl_set_mode(queue, params->copy_mode, | ||
930 | ntohl(params->copy_range)); | ||
931 | } | ||
932 | |||
933 | out_put: | ||
934 | instance_put(queue); | ||
935 | return ret; | ||
936 | } | ||
937 | |||
938 | static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { | ||
939 | [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, | ||
940 | .attr_count = NFQA_MAX, | ||
941 | .cap_required = CAP_NET_ADMIN }, | ||
942 | [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, | ||
943 | .attr_count = NFQA_MAX, | ||
944 | .cap_required = CAP_NET_ADMIN }, | ||
945 | [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, | ||
946 | .attr_count = NFQA_CFG_MAX, | ||
947 | .cap_required = CAP_NET_ADMIN }, | ||
948 | }; | ||
949 | |||
950 | static struct nfnetlink_subsystem nfqnl_subsys = { | ||
951 | .name = "nf_queue", | ||
952 | .subsys_id = NFNL_SUBSYS_QUEUE, | ||
953 | .cb_count = NFQNL_MSG_MAX, | ||
954 | .cb = nfqnl_cb, | ||
955 | }; | ||
956 | |||
957 | #ifdef CONFIG_PROC_FS | ||
958 | struct iter_state { | ||
959 | unsigned int bucket; | ||
960 | }; | ||
961 | |||
962 | static struct hlist_node *get_first(struct seq_file *seq) | ||
963 | { | ||
964 | struct iter_state *st = seq->private; | ||
965 | |||
966 | if (!st) | ||
967 | return NULL; | ||
968 | |||
969 | for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { | ||
970 | if (!hlist_empty(&instance_table[st->bucket])) | ||
971 | return instance_table[st->bucket].first; | ||
972 | } | ||
973 | return NULL; | ||
974 | } | ||
975 | |||
976 | static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) | ||
977 | { | ||
978 | struct iter_state *st = seq->private; | ||
979 | |||
980 | h = h->next; | ||
981 | while (!h) { | ||
982 | if (++st->bucket >= INSTANCE_BUCKETS) | ||
983 | return NULL; | ||
984 | |||
985 | h = instance_table[st->bucket].first; | ||
986 | } | ||
987 | return h; | ||
988 | } | ||
989 | |||
990 | static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) | ||
991 | { | ||
992 | struct hlist_node *head; | ||
993 | head = get_first(seq); | ||
994 | |||
995 | if (head) | ||
996 | while (pos && (head = get_next(seq, head))) | ||
997 | pos--; | ||
998 | return pos ? NULL : head; | ||
999 | } | ||
1000 | |||
1001 | static void *seq_start(struct seq_file *seq, loff_t *pos) | ||
1002 | { | ||
1003 | read_lock_bh(&instances_lock); | ||
1004 | return get_idx(seq, *pos); | ||
1005 | } | ||
1006 | |||
1007 | static void *seq_next(struct seq_file *s, void *v, loff_t *pos) | ||
1008 | { | ||
1009 | (*pos)++; | ||
1010 | return get_next(s, v); | ||
1011 | } | ||
1012 | |||
1013 | static void seq_stop(struct seq_file *s, void *v) | ||
1014 | { | ||
1015 | read_unlock_bh(&instances_lock); | ||
1016 | } | ||
1017 | |||
1018 | static int seq_show(struct seq_file *s, void *v) | ||
1019 | { | ||
1020 | const struct nfqnl_instance *inst = v; | ||
1021 | |||
1022 | return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", | ||
1023 | inst->queue_num, | ||
1024 | inst->peer_pid, inst->queue_total, | ||
1025 | inst->copy_mode, inst->copy_range, | ||
1026 | inst->queue_dropped, inst->queue_user_dropped, | ||
1027 | atomic_read(&inst->id_sequence), | ||
1028 | atomic_read(&inst->use)); | ||
1029 | } | ||
1030 | |||
1031 | static struct seq_operations nfqnl_seq_ops = { | ||
1032 | .start = seq_start, | ||
1033 | .next = seq_next, | ||
1034 | .stop = seq_stop, | ||
1035 | .show = seq_show, | ||
1036 | }; | ||
1037 | |||
1038 | static int nfqnl_open(struct inode *inode, struct file *file) | ||
1039 | { | ||
1040 | struct seq_file *seq; | ||
1041 | struct iter_state *is; | ||
1042 | int ret; | ||
1043 | |||
1044 | is = kmalloc(sizeof(*is), GFP_KERNEL); | ||
1045 | if (!is) | ||
1046 | return -ENOMEM; | ||
1047 | memset(is, 0, sizeof(*is)); | ||
1048 | ret = seq_open(file, &nfqnl_seq_ops); | ||
1049 | if (ret < 0) | ||
1050 | goto out_free; | ||
1051 | seq = file->private_data; | ||
1052 | seq->private = is; | ||
1053 | return ret; | ||
1054 | out_free: | ||
1055 | kfree(is); | ||
1056 | return ret; | ||
1057 | } | ||
1058 | |||
1059 | static struct file_operations nfqnl_file_ops = { | ||
1060 | .owner = THIS_MODULE, | ||
1061 | .open = nfqnl_open, | ||
1062 | .read = seq_read, | ||
1063 | .llseek = seq_lseek, | ||
1064 | .release = seq_release_private, | ||
1065 | }; | ||
1066 | |||
1067 | #endif /* PROC_FS */ | ||
1068 | |||
1069 | static int | ||
1070 | init_or_cleanup(int init) | ||
1071 | { | ||
1072 | int i, status = -ENOMEM; | ||
1073 | #ifdef CONFIG_PROC_FS | ||
1074 | struct proc_dir_entry *proc_nfqueue; | ||
1075 | #endif | ||
1076 | |||
1077 | if (!init) | ||
1078 | goto cleanup; | ||
1079 | |||
1080 | for (i = 0; i < INSTANCE_BUCKETS; i++) | ||
1081 | INIT_HLIST_HEAD(&instance_table[i]); | ||
1082 | |||
1083 | netlink_register_notifier(&nfqnl_rtnl_notifier); | ||
1084 | status = nfnetlink_subsys_register(&nfqnl_subsys); | ||
1085 | if (status < 0) { | ||
1086 | printk(KERN_ERR "nf_queue: failed to create netlink socket\n"); | ||
1087 | goto cleanup_netlink_notifier; | ||
1088 | } | ||
1089 | |||
1090 | #ifdef CONFIG_PROC_FS | ||
1091 | proc_nfqueue = create_proc_entry("nfnetlink_queue", 0440, | ||
1092 | proc_net_netfilter); | ||
1093 | if (!proc_nfqueue) | ||
1094 | goto cleanup_subsys; | ||
1095 | proc_nfqueue->proc_fops = &nfqnl_file_ops; | ||
1096 | #endif | ||
1097 | |||
1098 | register_netdevice_notifier(&nfqnl_dev_notifier); | ||
1099 | |||
1100 | return status; | ||
1101 | |||
1102 | cleanup: | ||
1103 | nf_unregister_queue_handlers(&nfqh); | ||
1104 | unregister_netdevice_notifier(&nfqnl_dev_notifier); | ||
1105 | #ifdef CONFIG_PROC_FS | ||
1106 | remove_proc_entry("nfnetlink_queue", proc_net_netfilter); | ||
1107 | cleanup_subsys: | ||
1108 | #endif | ||
1109 | nfnetlink_subsys_unregister(&nfqnl_subsys); | ||
1110 | cleanup_netlink_notifier: | ||
1111 | netlink_unregister_notifier(&nfqnl_rtnl_notifier); | ||
1112 | return status; | ||
1113 | } | ||
1114 | |||
1115 | static int __init init(void) | ||
1116 | { | ||
1117 | |||
1118 | return init_or_cleanup(1); | ||
1119 | } | ||
1120 | |||
1121 | static void __exit fini(void) | ||
1122 | { | ||
1123 | init_or_cleanup(0); | ||
1124 | } | ||
1125 | |||
1126 | MODULE_DESCRIPTION("netfilter packet queue handler"); | ||
1127 | MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | ||
1128 | MODULE_LICENSE("GPL"); | ||
1129 | MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); | ||
1130 | |||
1131 | module_init(init); | ||
1132 | module_exit(fini); | ||
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ff774a06c89d..62435ffc6184 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c | |||
@@ -13,7 +13,12 @@ | |||
13 | * added netlink_proto_exit | 13 | * added netlink_proto_exit |
14 | * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br> | 14 | * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br> |
15 | * use nlk_sk, as sk->protinfo is on a diet 8) | 15 | * use nlk_sk, as sk->protinfo is on a diet 8) |
16 | * | 16 | * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org> |
17 | * - inc module use count of module that owns | ||
18 | * the kernel socket in case userspace opens | ||
19 | * socket of same protocol | ||
20 | * - remove all module support, since netlink is | ||
21 | * mandatory if CONFIG_NET=y these days | ||
17 | */ | 22 | */ |
18 | 23 | ||
19 | #include <linux/config.h> | 24 | #include <linux/config.h> |
@@ -55,21 +60,29 @@ | |||
55 | #include <net/scm.h> | 60 | #include <net/scm.h> |
56 | 61 | ||
57 | #define Nprintk(a...) | 62 | #define Nprintk(a...) |
63 | #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) | ||
58 | 64 | ||
59 | struct netlink_sock { | 65 | struct netlink_sock { |
60 | /* struct sock has to be the first member of netlink_sock */ | 66 | /* struct sock has to be the first member of netlink_sock */ |
61 | struct sock sk; | 67 | struct sock sk; |
62 | u32 pid; | 68 | u32 pid; |
63 | unsigned int groups; | ||
64 | u32 dst_pid; | 69 | u32 dst_pid; |
65 | unsigned int dst_groups; | 70 | u32 dst_group; |
71 | u32 flags; | ||
72 | u32 subscriptions; | ||
73 | u32 ngroups; | ||
74 | unsigned long *groups; | ||
66 | unsigned long state; | 75 | unsigned long state; |
67 | wait_queue_head_t wait; | 76 | wait_queue_head_t wait; |
68 | struct netlink_callback *cb; | 77 | struct netlink_callback *cb; |
69 | spinlock_t cb_lock; | 78 | spinlock_t cb_lock; |
70 | void (*data_ready)(struct sock *sk, int bytes); | 79 | void (*data_ready)(struct sock *sk, int bytes); |
80 | struct module *module; | ||
71 | }; | 81 | }; |
72 | 82 | ||
83 | #define NETLINK_KERNEL_SOCKET 0x1 | ||
84 | #define NETLINK_RECV_PKTINFO 0x2 | ||
85 | |||
73 | static inline struct netlink_sock *nlk_sk(struct sock *sk) | 86 | static inline struct netlink_sock *nlk_sk(struct sock *sk) |
74 | { | 87 | { |
75 | return (struct netlink_sock *)sk; | 88 | return (struct netlink_sock *)sk; |
@@ -92,6 +105,9 @@ struct netlink_table { | |||
92 | struct nl_pid_hash hash; | 105 | struct nl_pid_hash hash; |
93 | struct hlist_head mc_list; | 106 | struct hlist_head mc_list; |
94 | unsigned int nl_nonroot; | 107 | unsigned int nl_nonroot; |
108 | unsigned int groups; | ||
109 | struct module *module; | ||
110 | int registered; | ||
95 | }; | 111 | }; |
96 | 112 | ||
97 | static struct netlink_table *nl_table; | 113 | static struct netlink_table *nl_table; |
@@ -106,6 +122,11 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); | |||
106 | 122 | ||
107 | static struct notifier_block *netlink_chain; | 123 | static struct notifier_block *netlink_chain; |
108 | 124 | ||
125 | static u32 netlink_group_mask(u32 group) | ||
126 | { | ||
127 | return group ? 1 << (group - 1) : 0; | ||
128 | } | ||
129 | |||
109 | static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid) | 130 | static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid) |
110 | { | 131 | { |
111 | return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask]; | 132 | return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask]; |
@@ -122,6 +143,7 @@ static void netlink_sock_destruct(struct sock *sk) | |||
122 | BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); | 143 | BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); |
123 | BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); | 144 | BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); |
124 | BUG_TRAP(!nlk_sk(sk)->cb); | 145 | BUG_TRAP(!nlk_sk(sk)->cb); |
146 | BUG_TRAP(!nlk_sk(sk)->groups); | ||
125 | } | 147 | } |
126 | 148 | ||
127 | /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on SMP. | 149 | /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on SMP. |
@@ -317,7 +339,7 @@ static void netlink_remove(struct sock *sk) | |||
317 | netlink_table_grab(); | 339 | netlink_table_grab(); |
318 | if (sk_del_node_init(sk)) | 340 | if (sk_del_node_init(sk)) |
319 | nl_table[sk->sk_protocol].hash.entries--; | 341 | nl_table[sk->sk_protocol].hash.entries--; |
320 | if (nlk_sk(sk)->groups) | 342 | if (nlk_sk(sk)->subscriptions) |
321 | __sk_del_bind_node(sk); | 343 | __sk_del_bind_node(sk); |
322 | netlink_table_ungrab(); | 344 | netlink_table_ungrab(); |
323 | } | 345 | } |
@@ -328,19 +350,11 @@ static struct proto netlink_proto = { | |||
328 | .obj_size = sizeof(struct netlink_sock), | 350 | .obj_size = sizeof(struct netlink_sock), |
329 | }; | 351 | }; |
330 | 352 | ||
331 | static int netlink_create(struct socket *sock, int protocol) | 353 | static int __netlink_create(struct socket *sock, int protocol) |
332 | { | 354 | { |
333 | struct sock *sk; | 355 | struct sock *sk; |
334 | struct netlink_sock *nlk; | 356 | struct netlink_sock *nlk; |
335 | 357 | ||
336 | sock->state = SS_UNCONNECTED; | ||
337 | |||
338 | if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) | ||
339 | return -ESOCKTNOSUPPORT; | ||
340 | |||
341 | if (protocol<0 || protocol >= MAX_LINKS) | ||
342 | return -EPROTONOSUPPORT; | ||
343 | |||
344 | sock->ops = &netlink_ops; | 358 | sock->ops = &netlink_ops; |
345 | 359 | ||
346 | sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); | 360 | sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); |
@@ -350,15 +364,67 @@ static int netlink_create(struct socket *sock, int protocol) | |||
350 | sock_init_data(sock, sk); | 364 | sock_init_data(sock, sk); |
351 | 365 | ||
352 | nlk = nlk_sk(sk); | 366 | nlk = nlk_sk(sk); |
353 | |||
354 | spin_lock_init(&nlk->cb_lock); | 367 | spin_lock_init(&nlk->cb_lock); |
355 | init_waitqueue_head(&nlk->wait); | 368 | init_waitqueue_head(&nlk->wait); |
356 | sk->sk_destruct = netlink_sock_destruct; | ||
357 | 369 | ||
370 | sk->sk_destruct = netlink_sock_destruct; | ||
358 | sk->sk_protocol = protocol; | 371 | sk->sk_protocol = protocol; |
359 | return 0; | 372 | return 0; |
360 | } | 373 | } |
361 | 374 | ||
375 | static int netlink_create(struct socket *sock, int protocol) | ||
376 | { | ||
377 | struct module *module = NULL; | ||
378 | struct netlink_sock *nlk; | ||
379 | unsigned int groups; | ||
380 | int err = 0; | ||
381 | |||
382 | sock->state = SS_UNCONNECTED; | ||
383 | |||
384 | if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) | ||
385 | return -ESOCKTNOSUPPORT; | ||
386 | |||
387 | if (protocol<0 || protocol >= MAX_LINKS) | ||
388 | return -EPROTONOSUPPORT; | ||
389 | |||
390 | netlink_lock_table(); | ||
391 | #ifdef CONFIG_KMOD | ||
392 | if (!nl_table[protocol].registered) { | ||
393 | netlink_unlock_table(); | ||
394 | request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); | ||
395 | netlink_lock_table(); | ||
396 | } | ||
397 | #endif | ||
398 | if (nl_table[protocol].registered && | ||
399 | try_module_get(nl_table[protocol].module)) | ||
400 | module = nl_table[protocol].module; | ||
401 | else | ||
402 | err = -EPROTONOSUPPORT; | ||
403 | groups = nl_table[protocol].groups; | ||
404 | netlink_unlock_table(); | ||
405 | |||
406 | if (err || (err = __netlink_create(sock, protocol) < 0)) | ||
407 | goto out_module; | ||
408 | |||
409 | nlk = nlk_sk(sock->sk); | ||
410 | |||
411 | nlk->groups = kmalloc(NLGRPSZ(groups), GFP_KERNEL); | ||
412 | if (nlk->groups == NULL) { | ||
413 | err = -ENOMEM; | ||
414 | goto out_module; | ||
415 | } | ||
416 | memset(nlk->groups, 0, NLGRPSZ(groups)); | ||
417 | nlk->ngroups = groups; | ||
418 | |||
419 | nlk->module = module; | ||
420 | out: | ||
421 | return err; | ||
422 | |||
423 | out_module: | ||
424 | module_put(module); | ||
425 | goto out; | ||
426 | } | ||
427 | |||
362 | static int netlink_release(struct socket *sock) | 428 | static int netlink_release(struct socket *sock) |
363 | { | 429 | { |
364 | struct sock *sk = sock->sk; | 430 | struct sock *sk = sock->sk; |
@@ -387,14 +453,27 @@ static int netlink_release(struct socket *sock) | |||
387 | 453 | ||
388 | skb_queue_purge(&sk->sk_write_queue); | 454 | skb_queue_purge(&sk->sk_write_queue); |
389 | 455 | ||
390 | if (nlk->pid && !nlk->groups) { | 456 | if (nlk->pid && !nlk->subscriptions) { |
391 | struct netlink_notify n = { | 457 | struct netlink_notify n = { |
392 | .protocol = sk->sk_protocol, | 458 | .protocol = sk->sk_protocol, |
393 | .pid = nlk->pid, | 459 | .pid = nlk->pid, |
394 | }; | 460 | }; |
395 | notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); | 461 | notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); |
396 | } | 462 | } |
397 | 463 | ||
464 | if (nlk->module) | ||
465 | module_put(nlk->module); | ||
466 | |||
467 | if (nlk->flags & NETLINK_KERNEL_SOCKET) { | ||
468 | netlink_table_grab(); | ||
469 | nl_table[sk->sk_protocol].module = NULL; | ||
470 | nl_table[sk->sk_protocol].registered = 0; | ||
471 | netlink_table_ungrab(); | ||
472 | } | ||
473 | |||
474 | kfree(nlk->groups); | ||
475 | nlk->groups = NULL; | ||
476 | |||
398 | sock_put(sk); | 477 | sock_put(sk); |
399 | return 0; | 478 | return 0; |
400 | } | 479 | } |
@@ -443,6 +522,18 @@ static inline int netlink_capable(struct socket *sock, unsigned int flag) | |||
443 | capable(CAP_NET_ADMIN); | 522 | capable(CAP_NET_ADMIN); |
444 | } | 523 | } |
445 | 524 | ||
525 | static void | ||
526 | netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) | ||
527 | { | ||
528 | struct netlink_sock *nlk = nlk_sk(sk); | ||
529 | |||
530 | if (nlk->subscriptions && !subscriptions) | ||
531 | __sk_del_bind_node(sk); | ||
532 | else if (!nlk->subscriptions && subscriptions) | ||
533 | sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); | ||
534 | nlk->subscriptions = subscriptions; | ||
535 | } | ||
536 | |||
446 | static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) | 537 | static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) |
447 | { | 538 | { |
448 | struct sock *sk = sock->sk; | 539 | struct sock *sk = sock->sk; |
@@ -468,15 +559,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len | |||
468 | return err; | 559 | return err; |
469 | } | 560 | } |
470 | 561 | ||
471 | if (!nladdr->nl_groups && !nlk->groups) | 562 | if (!nladdr->nl_groups && !(u32)nlk->groups[0]) |
472 | return 0; | 563 | return 0; |
473 | 564 | ||
474 | netlink_table_grab(); | 565 | netlink_table_grab(); |
475 | if (nlk->groups && !nladdr->nl_groups) | 566 | netlink_update_subscriptions(sk, nlk->subscriptions + |
476 | __sk_del_bind_node(sk); | 567 | hweight32(nladdr->nl_groups) - |
477 | else if (!nlk->groups && nladdr->nl_groups) | 568 | hweight32(nlk->groups[0])); |
478 | sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); | 569 | nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; |
479 | nlk->groups = nladdr->nl_groups; | ||
480 | netlink_table_ungrab(); | 570 | netlink_table_ungrab(); |
481 | 571 | ||
482 | return 0; | 572 | return 0; |
@@ -493,7 +583,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, | |||
493 | if (addr->sa_family == AF_UNSPEC) { | 583 | if (addr->sa_family == AF_UNSPEC) { |
494 | sk->sk_state = NETLINK_UNCONNECTED; | 584 | sk->sk_state = NETLINK_UNCONNECTED; |
495 | nlk->dst_pid = 0; | 585 | nlk->dst_pid = 0; |
496 | nlk->dst_groups = 0; | 586 | nlk->dst_group = 0; |
497 | return 0; | 587 | return 0; |
498 | } | 588 | } |
499 | if (addr->sa_family != AF_NETLINK) | 589 | if (addr->sa_family != AF_NETLINK) |
@@ -509,7 +599,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, | |||
509 | if (err == 0) { | 599 | if (err == 0) { |
510 | sk->sk_state = NETLINK_CONNECTED; | 600 | sk->sk_state = NETLINK_CONNECTED; |
511 | nlk->dst_pid = nladdr->nl_pid; | 601 | nlk->dst_pid = nladdr->nl_pid; |
512 | nlk->dst_groups = nladdr->nl_groups; | 602 | nlk->dst_group = ffs(nladdr->nl_groups); |
513 | } | 603 | } |
514 | 604 | ||
515 | return err; | 605 | return err; |
@@ -527,10 +617,10 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr | |||
527 | 617 | ||
528 | if (peer) { | 618 | if (peer) { |
529 | nladdr->nl_pid = nlk->dst_pid; | 619 | nladdr->nl_pid = nlk->dst_pid; |
530 | nladdr->nl_groups = nlk->dst_groups; | 620 | nladdr->nl_groups = netlink_group_mask(nlk->dst_group); |
531 | } else { | 621 | } else { |
532 | nladdr->nl_pid = nlk->pid; | 622 | nladdr->nl_pid = nlk->pid; |
533 | nladdr->nl_groups = nlk->groups; | 623 | nladdr->nl_groups = nlk->groups[0]; |
534 | } | 624 | } |
535 | return 0; | 625 | return 0; |
536 | } | 626 | } |
@@ -731,7 +821,8 @@ static inline int do_one_broadcast(struct sock *sk, | |||
731 | if (p->exclude_sk == sk) | 821 | if (p->exclude_sk == sk) |
732 | goto out; | 822 | goto out; |
733 | 823 | ||
734 | if (nlk->pid == p->pid || !(nlk->groups & p->group)) | 824 | if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || |
825 | !test_bit(p->group - 1, nlk->groups)) | ||
735 | goto out; | 826 | goto out; |
736 | 827 | ||
737 | if (p->failure) { | 828 | if (p->failure) { |
@@ -770,7 +861,7 @@ out: | |||
770 | } | 861 | } |
771 | 862 | ||
772 | int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, | 863 | int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, |
773 | u32 group, int allocation) | 864 | u32 group, unsigned int __nocast allocation) |
774 | { | 865 | { |
775 | struct netlink_broadcast_data info; | 866 | struct netlink_broadcast_data info; |
776 | struct hlist_node *node; | 867 | struct hlist_node *node; |
@@ -827,7 +918,8 @@ static inline int do_one_set_err(struct sock *sk, | |||
827 | if (sk == p->exclude_sk) | 918 | if (sk == p->exclude_sk) |
828 | goto out; | 919 | goto out; |
829 | 920 | ||
830 | if (nlk->pid == p->pid || !(nlk->groups & p->group)) | 921 | if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || |
922 | !test_bit(p->group - 1, nlk->groups)) | ||
831 | goto out; | 923 | goto out; |
832 | 924 | ||
833 | sk->sk_err = p->code; | 925 | sk->sk_err = p->code; |
@@ -855,6 +947,94 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) | |||
855 | read_unlock(&nl_table_lock); | 947 | read_unlock(&nl_table_lock); |
856 | } | 948 | } |
857 | 949 | ||
950 | static int netlink_setsockopt(struct socket *sock, int level, int optname, | ||
951 | char __user *optval, int optlen) | ||
952 | { | ||
953 | struct sock *sk = sock->sk; | ||
954 | struct netlink_sock *nlk = nlk_sk(sk); | ||
955 | int val = 0, err; | ||
956 | |||
957 | if (level != SOL_NETLINK) | ||
958 | return -ENOPROTOOPT; | ||
959 | |||
960 | if (optlen >= sizeof(int) && | ||
961 | get_user(val, (int __user *)optval)) | ||
962 | return -EFAULT; | ||
963 | |||
964 | switch (optname) { | ||
965 | case NETLINK_PKTINFO: | ||
966 | if (val) | ||
967 | nlk->flags |= NETLINK_RECV_PKTINFO; | ||
968 | else | ||
969 | nlk->flags &= ~NETLINK_RECV_PKTINFO; | ||
970 | err = 0; | ||
971 | break; | ||
972 | case NETLINK_ADD_MEMBERSHIP: | ||
973 | case NETLINK_DROP_MEMBERSHIP: { | ||
974 | unsigned int subscriptions; | ||
975 | int old, new = optname == NETLINK_ADD_MEMBERSHIP ? 1 : 0; | ||
976 | |||
977 | if (!netlink_capable(sock, NL_NONROOT_RECV)) | ||
978 | return -EPERM; | ||
979 | if (!val || val - 1 >= nlk->ngroups) | ||
980 | return -EINVAL; | ||
981 | netlink_table_grab(); | ||
982 | old = test_bit(val - 1, nlk->groups); | ||
983 | subscriptions = nlk->subscriptions - old + new; | ||
984 | if (new) | ||
985 | __set_bit(val - 1, nlk->groups); | ||
986 | else | ||
987 | __clear_bit(val - 1, nlk->groups); | ||
988 | netlink_update_subscriptions(sk, subscriptions); | ||
989 | netlink_table_ungrab(); | ||
990 | err = 0; | ||
991 | break; | ||
992 | } | ||
993 | default: | ||
994 | err = -ENOPROTOOPT; | ||
995 | } | ||
996 | return err; | ||
997 | } | ||
998 | |||
999 | static int netlink_getsockopt(struct socket *sock, int level, int optname, | ||
1000 | char __user *optval, int __user *optlen) | ||
1001 | { | ||
1002 | struct sock *sk = sock->sk; | ||
1003 | struct netlink_sock *nlk = nlk_sk(sk); | ||
1004 | int len, val, err; | ||
1005 | |||
1006 | if (level != SOL_NETLINK) | ||
1007 | return -ENOPROTOOPT; | ||
1008 | |||
1009 | if (get_user(len, optlen)) | ||
1010 | return -EFAULT; | ||
1011 | if (len < 0) | ||
1012 | return -EINVAL; | ||
1013 | |||
1014 | switch (optname) { | ||
1015 | case NETLINK_PKTINFO: | ||
1016 | if (len < sizeof(int)) | ||
1017 | return -EINVAL; | ||
1018 | len = sizeof(int); | ||
1019 | val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0; | ||
1020 | put_user(len, optlen); | ||
1021 | put_user(val, optval); | ||
1022 | err = 0; | ||
1023 | break; | ||
1024 | default: | ||
1025 | err = -ENOPROTOOPT; | ||
1026 | } | ||
1027 | return err; | ||
1028 | } | ||
1029 | |||
1030 | static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) | ||
1031 | { | ||
1032 | struct nl_pktinfo info; | ||
1033 | |||
1034 | info.group = NETLINK_CB(skb).dst_group; | ||
1035 | put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); | ||
1036 | } | ||
1037 | |||
858 | static inline void netlink_rcv_wake(struct sock *sk) | 1038 | static inline void netlink_rcv_wake(struct sock *sk) |
859 | { | 1039 | { |
860 | struct netlink_sock *nlk = nlk_sk(sk); | 1040 | struct netlink_sock *nlk = nlk_sk(sk); |
@@ -873,7 +1053,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, | |||
873 | struct netlink_sock *nlk = nlk_sk(sk); | 1053 | struct netlink_sock *nlk = nlk_sk(sk); |
874 | struct sockaddr_nl *addr=msg->msg_name; | 1054 | struct sockaddr_nl *addr=msg->msg_name; |
875 | u32 dst_pid; | 1055 | u32 dst_pid; |
876 | u32 dst_groups; | 1056 | u32 dst_group; |
877 | struct sk_buff *skb; | 1057 | struct sk_buff *skb; |
878 | int err; | 1058 | int err; |
879 | struct scm_cookie scm; | 1059 | struct scm_cookie scm; |
@@ -891,12 +1071,12 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, | |||
891 | if (addr->nl_family != AF_NETLINK) | 1071 | if (addr->nl_family != AF_NETLINK) |
892 | return -EINVAL; | 1072 | return -EINVAL; |
893 | dst_pid = addr->nl_pid; | 1073 | dst_pid = addr->nl_pid; |
894 | dst_groups = addr->nl_groups; | 1074 | dst_group = ffs(addr->nl_groups); |
895 | if (dst_groups && !netlink_capable(sock, NL_NONROOT_SEND)) | 1075 | if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) |
896 | return -EPERM; | 1076 | return -EPERM; |
897 | } else { | 1077 | } else { |
898 | dst_pid = nlk->dst_pid; | 1078 | dst_pid = nlk->dst_pid; |
899 | dst_groups = nlk->dst_groups; | 1079 | dst_group = nlk->dst_group; |
900 | } | 1080 | } |
901 | 1081 | ||
902 | if (!nlk->pid) { | 1082 | if (!nlk->pid) { |
@@ -914,9 +1094,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, | |||
914 | goto out; | 1094 | goto out; |
915 | 1095 | ||
916 | NETLINK_CB(skb).pid = nlk->pid; | 1096 | NETLINK_CB(skb).pid = nlk->pid; |
917 | NETLINK_CB(skb).groups = nlk->groups; | ||
918 | NETLINK_CB(skb).dst_pid = dst_pid; | 1097 | NETLINK_CB(skb).dst_pid = dst_pid; |
919 | NETLINK_CB(skb).dst_groups = dst_groups; | 1098 | NETLINK_CB(skb).dst_group = dst_group; |
920 | NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); | 1099 | NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); |
921 | memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); | 1100 | memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); |
922 | 1101 | ||
@@ -938,9 +1117,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, | |||
938 | goto out; | 1117 | goto out; |
939 | } | 1118 | } |
940 | 1119 | ||
941 | if (dst_groups) { | 1120 | if (dst_group) { |
942 | atomic_inc(&skb->users); | 1121 | atomic_inc(&skb->users); |
943 | netlink_broadcast(sk, skb, dst_pid, dst_groups, GFP_KERNEL); | 1122 | netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); |
944 | } | 1123 | } |
945 | err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); | 1124 | err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); |
946 | 1125 | ||
@@ -986,7 +1165,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, | |||
986 | addr->nl_family = AF_NETLINK; | 1165 | addr->nl_family = AF_NETLINK; |
987 | addr->nl_pad = 0; | 1166 | addr->nl_pad = 0; |
988 | addr->nl_pid = NETLINK_CB(skb).pid; | 1167 | addr->nl_pid = NETLINK_CB(skb).pid; |
989 | addr->nl_groups = NETLINK_CB(skb).dst_groups; | 1168 | addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); |
990 | msg->msg_namelen = sizeof(*addr); | 1169 | msg->msg_namelen = sizeof(*addr); |
991 | } | 1170 | } |
992 | 1171 | ||
@@ -1001,6 +1180,8 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, | |||
1001 | netlink_dump(sk); | 1180 | netlink_dump(sk); |
1002 | 1181 | ||
1003 | scm_recv(sock, msg, siocb->scm, flags); | 1182 | scm_recv(sock, msg, siocb->scm, flags); |
1183 | if (nlk->flags & NETLINK_RECV_PKTINFO) | ||
1184 | netlink_cmsg_recv_pktinfo(msg, skb); | ||
1004 | 1185 | ||
1005 | out: | 1186 | out: |
1006 | netlink_rcv_wake(sk); | 1187 | netlink_rcv_wake(sk); |
@@ -1023,10 +1204,13 @@ static void netlink_data_ready(struct sock *sk, int len) | |||
1023 | */ | 1204 | */ |
1024 | 1205 | ||
1025 | struct sock * | 1206 | struct sock * |
1026 | netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) | 1207 | netlink_kernel_create(int unit, unsigned int groups, |
1208 | void (*input)(struct sock *sk, int len), | ||
1209 | struct module *module) | ||
1027 | { | 1210 | { |
1028 | struct socket *sock; | 1211 | struct socket *sock; |
1029 | struct sock *sk; | 1212 | struct sock *sk; |
1213 | struct netlink_sock *nlk; | ||
1030 | 1214 | ||
1031 | if (!nl_table) | 1215 | if (!nl_table) |
1032 | return NULL; | 1216 | return NULL; |
@@ -1037,20 +1221,31 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) | |||
1037 | if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) | 1221 | if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) |
1038 | return NULL; | 1222 | return NULL; |
1039 | 1223 | ||
1040 | if (netlink_create(sock, unit) < 0) { | 1224 | if (__netlink_create(sock, unit) < 0) |
1041 | sock_release(sock); | 1225 | goto out_sock_release; |
1042 | return NULL; | 1226 | |
1043 | } | ||
1044 | sk = sock->sk; | 1227 | sk = sock->sk; |
1045 | sk->sk_data_ready = netlink_data_ready; | 1228 | sk->sk_data_ready = netlink_data_ready; |
1046 | if (input) | 1229 | if (input) |
1047 | nlk_sk(sk)->data_ready = input; | 1230 | nlk_sk(sk)->data_ready = input; |
1048 | 1231 | ||
1049 | if (netlink_insert(sk, 0)) { | 1232 | if (netlink_insert(sk, 0)) |
1050 | sock_release(sock); | 1233 | goto out_sock_release; |
1051 | return NULL; | 1234 | |
1052 | } | 1235 | nlk = nlk_sk(sk); |
1236 | nlk->flags |= NETLINK_KERNEL_SOCKET; | ||
1237 | |||
1238 | netlink_table_grab(); | ||
1239 | nl_table[unit].groups = groups < 32 ? 32 : groups; | ||
1240 | nl_table[unit].module = module; | ||
1241 | nl_table[unit].registered = 1; | ||
1242 | netlink_table_ungrab(); | ||
1243 | |||
1053 | return sk; | 1244 | return sk; |
1245 | |||
1246 | out_sock_release: | ||
1247 | sock_release(sock); | ||
1248 | return NULL; | ||
1054 | } | 1249 | } |
1055 | 1250 | ||
1056 | void netlink_set_nonroot(int protocol, unsigned int flags) | 1251 | void netlink_set_nonroot(int protocol, unsigned int flags) |
@@ -1288,7 +1483,8 @@ static int netlink_seq_show(struct seq_file *seq, void *v) | |||
1288 | s, | 1483 | s, |
1289 | s->sk_protocol, | 1484 | s->sk_protocol, |
1290 | nlk->pid, | 1485 | nlk->pid, |
1291 | nlk->groups, | 1486 | nlk->flags & NETLINK_KERNEL_SOCKET ? |
1487 | 0 : (unsigned int)nlk->groups[0], | ||
1292 | atomic_read(&s->sk_rmem_alloc), | 1488 | atomic_read(&s->sk_rmem_alloc), |
1293 | atomic_read(&s->sk_wmem_alloc), | 1489 | atomic_read(&s->sk_wmem_alloc), |
1294 | nlk->cb, | 1490 | nlk->cb, |
@@ -1362,8 +1558,8 @@ static struct proto_ops netlink_ops = { | |||
1362 | .ioctl = sock_no_ioctl, | 1558 | .ioctl = sock_no_ioctl, |
1363 | .listen = sock_no_listen, | 1559 | .listen = sock_no_listen, |
1364 | .shutdown = sock_no_shutdown, | 1560 | .shutdown = sock_no_shutdown, |
1365 | .setsockopt = sock_no_setsockopt, | 1561 | .setsockopt = netlink_setsockopt, |
1366 | .getsockopt = sock_no_getsockopt, | 1562 | .getsockopt = netlink_getsockopt, |
1367 | .sendmsg = netlink_sendmsg, | 1563 | .sendmsg = netlink_sendmsg, |
1368 | .recvmsg = netlink_recvmsg, | 1564 | .recvmsg = netlink_recvmsg, |
1369 | .mmap = sock_no_mmap, | 1565 | .mmap = sock_no_mmap, |
@@ -1438,21 +1634,7 @@ out: | |||
1438 | return err; | 1634 | return err; |
1439 | } | 1635 | } |
1440 | 1636 | ||
1441 | static void __exit netlink_proto_exit(void) | ||
1442 | { | ||
1443 | sock_unregister(PF_NETLINK); | ||
1444 | proc_net_remove("netlink"); | ||
1445 | kfree(nl_table); | ||
1446 | nl_table = NULL; | ||
1447 | proto_unregister(&netlink_proto); | ||
1448 | } | ||
1449 | |||
1450 | core_initcall(netlink_proto_init); | 1637 | core_initcall(netlink_proto_init); |
1451 | module_exit(netlink_proto_exit); | ||
1452 | |||
1453 | MODULE_LICENSE("GPL"); | ||
1454 | |||
1455 | MODULE_ALIAS_NETPROTO(PF_NETLINK); | ||
1456 | 1638 | ||
1457 | EXPORT_SYMBOL(netlink_ack); | 1639 | EXPORT_SYMBOL(netlink_ack); |
1458 | EXPORT_SYMBOL(netlink_broadcast); | 1640 | EXPORT_SYMBOL(netlink_broadcast); |
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 162a85fed150..4b53de982114 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c | |||
@@ -39,7 +39,7 @@ | |||
39 | #include <linux/proc_fs.h> | 39 | #include <linux/proc_fs.h> |
40 | #include <linux/seq_file.h> | 40 | #include <linux/seq_file.h> |
41 | #include <net/ip.h> | 41 | #include <net/ip.h> |
42 | #include <net/tcp.h> | 42 | #include <net/tcp_states.h> |
43 | #include <net/arp.h> | 43 | #include <net/arp.h> |
44 | #include <linux/init.h> | 44 | #include <linux/init.h> |
45 | 45 | ||
@@ -858,17 +858,16 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) | |||
858 | frametype = skb->data[19] & 0x0F; | 858 | frametype = skb->data[19] & 0x0F; |
859 | flags = skb->data[19] & 0xF0; | 859 | flags = skb->data[19] & 0xF0; |
860 | 860 | ||
861 | #ifdef CONFIG_INET | ||
862 | /* | 861 | /* |
863 | * Check for an incoming IP over NET/ROM frame. | 862 | * Check for an incoming IP over NET/ROM frame. |
864 | */ | 863 | */ |
865 | if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { | 864 | if (frametype == NR_PROTOEXT && |
865 | circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { | ||
866 | skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); | 866 | skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); |
867 | skb->h.raw = skb->data; | 867 | skb->h.raw = skb->data; |
868 | 868 | ||
869 | return nr_rx_ip(skb, dev); | 869 | return nr_rx_ip(skb, dev); |
870 | } | 870 | } |
871 | #endif | ||
872 | 871 | ||
873 | /* | 872 | /* |
874 | * Find an existing socket connection, based on circuit ID, if it's | 873 | * Find an existing socket connection, based on circuit ID, if it's |
diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 220bf7494f71..263da4c26494 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c | |||
@@ -38,8 +38,6 @@ | |||
38 | #include <net/ax25.h> | 38 | #include <net/ax25.h> |
39 | #include <net/netrom.h> | 39 | #include <net/netrom.h> |
40 | 40 | ||
41 | #ifdef CONFIG_INET | ||
42 | |||
43 | /* | 41 | /* |
44 | * Only allow IP over NET/ROM frames through if the netrom device is up. | 42 | * Only allow IP over NET/ROM frames through if the netrom device is up. |
45 | */ | 43 | */ |
@@ -64,11 +62,12 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev) | |||
64 | skb->nh.raw = skb->data; | 62 | skb->nh.raw = skb->data; |
65 | skb->pkt_type = PACKET_HOST; | 63 | skb->pkt_type = PACKET_HOST; |
66 | 64 | ||
67 | ip_rcv(skb, skb->dev, NULL); | 65 | netif_rx(skb); |
68 | 66 | ||
69 | return 1; | 67 | return 1; |
70 | } | 68 | } |
71 | 69 | ||
70 | #ifdef CONFIG_INET | ||
72 | 71 | ||
73 | static int nr_rebuild_header(struct sk_buff *skb) | 72 | static int nr_rebuild_header(struct sk_buff *skb) |
74 | { | 73 | { |
diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index 9c44b3794126..64b81a796907 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c | |||
@@ -22,8 +22,7 @@ | |||
22 | #include <linux/netdevice.h> | 22 | #include <linux/netdevice.h> |
23 | #include <linux/skbuff.h> | 23 | #include <linux/skbuff.h> |
24 | #include <net/sock.h> | 24 | #include <net/sock.h> |
25 | #include <net/tcp.h> | 25 | #include <net/tcp_states.h> |
26 | #include <net/ip.h> /* For ip_rcv */ | ||
27 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
28 | #include <asm/system.h> | 27 | #include <asm/system.h> |
29 | #include <linux/fcntl.h> | 28 | #include <linux/fcntl.h> |
diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c index 0627347b14b8..587bed2674bf 100644 --- a/net/netrom/nr_subr.c +++ b/net/netrom/nr_subr.c | |||
@@ -21,7 +21,7 @@ | |||
21 | #include <linux/netdevice.h> | 21 | #include <linux/netdevice.h> |
22 | #include <linux/skbuff.h> | 22 | #include <linux/skbuff.h> |
23 | #include <net/sock.h> | 23 | #include <net/sock.h> |
24 | #include <net/tcp.h> | 24 | #include <net/tcp_states.h> |
25 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
26 | #include <asm/system.h> | 26 | #include <asm/system.h> |
27 | #include <linux/fcntl.h> | 27 | #include <linux/fcntl.h> |
@@ -77,7 +77,7 @@ void nr_requeue_frames(struct sock *sk) | |||
77 | if (skb_prev == NULL) | 77 | if (skb_prev == NULL) |
78 | skb_queue_head(&sk->sk_write_queue, skb); | 78 | skb_queue_head(&sk->sk_write_queue, skb); |
79 | else | 79 | else |
80 | skb_append(skb_prev, skb); | 80 | skb_append(skb_prev, skb, &sk->sk_write_queue); |
81 | skb_prev = skb; | 81 | skb_prev = skb; |
82 | } | 82 | } |
83 | } | 83 | } |
diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index faabda8088be..75b72d389ba9 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c | |||
@@ -22,7 +22,7 @@ | |||
22 | #include <linux/netdevice.h> | 22 | #include <linux/netdevice.h> |
23 | #include <linux/skbuff.h> | 23 | #include <linux/skbuff.h> |
24 | #include <net/sock.h> | 24 | #include <net/sock.h> |
25 | #include <net/tcp.h> | 25 | #include <net/tcp_states.h> |
26 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
27 | #include <asm/system.h> | 27 | #include <asm/system.h> |
28 | #include <linux/fcntl.h> | 28 | #include <linux/fcntl.h> |
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c9d5980aa4de..ba997095f08f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c | |||
@@ -241,7 +241,7 @@ static struct proto_ops packet_ops; | |||
241 | #ifdef CONFIG_SOCK_PACKET | 241 | #ifdef CONFIG_SOCK_PACKET |
242 | static struct proto_ops packet_ops_spkt; | 242 | static struct proto_ops packet_ops_spkt; |
243 | 243 | ||
244 | static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 244 | static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
245 | { | 245 | { |
246 | struct sock *sk; | 246 | struct sock *sk; |
247 | struct sockaddr_pkt *spkt; | 247 | struct sockaddr_pkt *spkt; |
@@ -441,7 +441,7 @@ static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned | |||
441 | we will not harm anyone. | 441 | we will not harm anyone. |
442 | */ | 442 | */ |
443 | 443 | ||
444 | static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 444 | static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
445 | { | 445 | { |
446 | struct sock *sk; | 446 | struct sock *sk; |
447 | struct sockaddr_ll *sll; | 447 | struct sockaddr_ll *sll; |
@@ -546,7 +546,7 @@ drop: | |||
546 | } | 546 | } |
547 | 547 | ||
548 | #ifdef CONFIG_PACKET_MMAP | 548 | #ifdef CONFIG_PACKET_MMAP |
549 | static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) | 549 | static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) |
550 | { | 550 | { |
551 | struct sock *sk; | 551 | struct sock *sk; |
552 | struct packet_sock *po; | 552 | struct packet_sock *po; |
@@ -635,12 +635,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct pack | |||
635 | h->tp_snaplen = snaplen; | 635 | h->tp_snaplen = snaplen; |
636 | h->tp_mac = macoff; | 636 | h->tp_mac = macoff; |
637 | h->tp_net = netoff; | 637 | h->tp_net = netoff; |
638 | if (skb->stamp.tv_sec == 0) { | 638 | if (skb->tstamp.off_sec == 0) { |
639 | do_gettimeofday(&skb->stamp); | 639 | __net_timestamp(skb); |
640 | sock_enable_timestamp(sk); | 640 | sock_enable_timestamp(sk); |
641 | } | 641 | } |
642 | h->tp_sec = skb->stamp.tv_sec; | 642 | h->tp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec; |
643 | h->tp_usec = skb->stamp.tv_usec; | 643 | h->tp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec; |
644 | 644 | ||
645 | sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h))); | 645 | sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h))); |
646 | sll->sll_halen = 0; | 646 | sll->sll_halen = 0; |
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 5480caf8ccc2..c6e59f84c3ae 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c | |||
@@ -41,7 +41,7 @@ | |||
41 | #include <net/rose.h> | 41 | #include <net/rose.h> |
42 | #include <linux/proc_fs.h> | 42 | #include <linux/proc_fs.h> |
43 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
44 | #include <net/tcp.h> | 44 | #include <net/tcp_states.h> |
45 | #include <net/ip.h> | 45 | #include <net/ip.h> |
46 | #include <net/arp.h> | 46 | #include <net/arp.h> |
47 | 47 | ||
diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index ef475a1bb1ba..8348d33f1efe 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c | |||
@@ -26,8 +26,7 @@ | |||
26 | #include <linux/netdevice.h> | 26 | #include <linux/netdevice.h> |
27 | #include <linux/skbuff.h> | 27 | #include <linux/skbuff.h> |
28 | #include <net/sock.h> | 28 | #include <net/sock.h> |
29 | #include <net/ip.h> /* For ip_rcv */ | 29 | #include <net/tcp_states.h> |
30 | #include <net/tcp.h> | ||
31 | #include <asm/system.h> | 30 | #include <asm/system.h> |
32 | #include <linux/fcntl.h> | 31 | #include <linux/fcntl.h> |
33 | #include <linux/mm.h> | 32 | #include <linux/mm.h> |
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 25da6f699fd0..4510cd7613ec 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c | |||
@@ -24,7 +24,7 @@ | |||
24 | #include <linux/if_arp.h> | 24 | #include <linux/if_arp.h> |
25 | #include <linux/skbuff.h> | 25 | #include <linux/skbuff.h> |
26 | #include <net/sock.h> | 26 | #include <net/sock.h> |
27 | #include <net/tcp.h> | 27 | #include <net/tcp_states.h> |
28 | #include <asm/system.h> | 28 | #include <asm/system.h> |
29 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
30 | #include <linux/fcntl.h> | 30 | #include <linux/fcntl.h> |
diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c index 7db7e1cedc3a..a29a3a960fd6 100644 --- a/net/rose/rose_subr.c +++ b/net/rose/rose_subr.c | |||
@@ -21,7 +21,7 @@ | |||
21 | #include <linux/netdevice.h> | 21 | #include <linux/netdevice.h> |
22 | #include <linux/skbuff.h> | 22 | #include <linux/skbuff.h> |
23 | #include <net/sock.h> | 23 | #include <net/sock.h> |
24 | #include <net/tcp.h> | 24 | #include <net/tcp_states.h> |
25 | #include <asm/system.h> | 25 | #include <asm/system.h> |
26 | #include <linux/fcntl.h> | 26 | #include <linux/fcntl.h> |
27 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
@@ -74,7 +74,7 @@ void rose_requeue_frames(struct sock *sk) | |||
74 | if (skb_prev == NULL) | 74 | if (skb_prev == NULL) |
75 | skb_queue_head(&sk->sk_write_queue, skb); | 75 | skb_queue_head(&sk->sk_write_queue, skb); |
76 | else | 76 | else |
77 | skb_append(skb_prev, skb); | 77 | skb_append(skb_prev, skb, &sk->sk_write_queue); |
78 | skb_prev = skb; | 78 | skb_prev = skb; |
79 | } | 79 | } |
80 | } | 80 | } |
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index 84dd4403f792..50ae0371dab8 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c | |||
@@ -22,7 +22,7 @@ | |||
22 | #include <linux/netdevice.h> | 22 | #include <linux/netdevice.h> |
23 | #include <linux/skbuff.h> | 23 | #include <linux/skbuff.h> |
24 | #include <net/sock.h> | 24 | #include <net/sock.h> |
25 | #include <net/tcp.h> | 25 | #include <net/tcp_states.h> |
26 | #include <asm/system.h> | 26 | #include <asm/system.h> |
27 | #include <linux/fcntl.h> | 27 | #include <linux/fcntl.h> |
28 | #include <linux/mm.h> | 28 | #include <linux/mm.h> |
diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index 9bce7794130a..122c086ee2db 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c | |||
@@ -330,7 +330,7 @@ static int rxrpc_incoming_msg(struct rxrpc_transport *trans, | |||
330 | 330 | ||
331 | msg->trans = trans; | 331 | msg->trans = trans; |
332 | msg->state = RXRPC_MSG_RECEIVED; | 332 | msg->state = RXRPC_MSG_RECEIVED; |
333 | msg->stamp = pkt->stamp; | 333 | skb_get_timestamp(pkt, &msg->stamp); |
334 | if (msg->stamp.tv_sec == 0) { | 334 | if (msg->stamp.tv_sec == 0) { |
335 | do_gettimeofday(&msg->stamp); | 335 | do_gettimeofday(&msg->stamp); |
336 | if (pkt->sk) | 336 | if (pkt->sk) |
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 59d3e71f8b85..45d3bc0812c8 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig | |||
@@ -491,6 +491,7 @@ config NET_EMATCH_TEXT | |||
491 | depends on NET_EMATCH | 491 | depends on NET_EMATCH |
492 | select TEXTSEARCH | 492 | select TEXTSEARCH |
493 | select TEXTSEARCH_KMP | 493 | select TEXTSEARCH_KMP |
494 | select TEXTSEARCH_BM | ||
494 | select TEXTSEARCH_FSM | 495 | select TEXTSEARCH_FSM |
495 | ---help--- | 496 | ---help--- |
496 | Say Y here if you want to be ablt to classify packets based on | 497 | Say Y here if you want to be ablt to classify packets based on |
diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 249c61936ea0..8aebe8f6d271 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c | |||
@@ -165,7 +165,7 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action *act, | |||
165 | while ((a = act) != NULL) { | 165 | while ((a = act) != NULL) { |
166 | repeat: | 166 | repeat: |
167 | if (a->ops && a->ops->act) { | 167 | if (a->ops && a->ops->act) { |
168 | ret = a->ops->act(&skb, a); | 168 | ret = a->ops->act(&skb, a, res); |
169 | if (TC_MUNGED & skb->tc_verd) { | 169 | if (TC_MUNGED & skb->tc_verd) { |
170 | /* copied already, allow trampling */ | 170 | /* copied already, allow trampling */ |
171 | skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); | 171 | skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); |
@@ -179,11 +179,6 @@ repeat: | |||
179 | act = a->next; | 179 | act = a->next; |
180 | } | 180 | } |
181 | exec_done: | 181 | exec_done: |
182 | if (skb->tc_classid > 0) { | ||
183 | res->classid = skb->tc_classid; | ||
184 | res->class = 0; | ||
185 | skb->tc_classid = 0; | ||
186 | } | ||
187 | return ret; | 182 | return ret; |
188 | } | 183 | } |
189 | 184 | ||
@@ -598,7 +593,7 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) | |||
598 | nlh->nlmsg_flags |= NLM_F_ROOT; | 593 | nlh->nlmsg_flags |= NLM_F_ROOT; |
599 | module_put(a->ops->owner); | 594 | module_put(a->ops->owner); |
600 | kfree(a); | 595 | kfree(a); |
601 | err = rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); | 596 | err = rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); |
602 | if (err > 0) | 597 | if (err > 0) |
603 | return 0; | 598 | return 0; |
604 | 599 | ||
@@ -661,7 +656,7 @@ tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event) | |||
661 | 656 | ||
662 | /* now do the delete */ | 657 | /* now do the delete */ |
663 | tcf_action_destroy(head, 0); | 658 | tcf_action_destroy(head, 0); |
664 | ret = rtnetlink_send(skb, pid, RTMGRP_TC, | 659 | ret = rtnetlink_send(skb, pid, RTNLGRP_TC, |
665 | n->nlmsg_flags&NLM_F_ECHO); | 660 | n->nlmsg_flags&NLM_F_ECHO); |
666 | if (ret > 0) | 661 | if (ret > 0) |
667 | return 0; | 662 | return 0; |
@@ -703,9 +698,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, | |||
703 | x->rta_len = skb->tail - (u8*)x; | 698 | x->rta_len = skb->tail - (u8*)x; |
704 | 699 | ||
705 | nlh->nlmsg_len = skb->tail - b; | 700 | nlh->nlmsg_len = skb->tail - b; |
706 | NETLINK_CB(skb).dst_groups = RTMGRP_TC; | 701 | NETLINK_CB(skb).dst_group = RTNLGRP_TC; |
707 | 702 | ||
708 | err = rtnetlink_send(skb, pid, RTMGRP_TC, flags&NLM_F_ECHO); | 703 | err = rtnetlink_send(skb, pid, RTNLGRP_TC, flags&NLM_F_ECHO); |
709 | if (err > 0) | 704 | if (err > 0) |
710 | err = 0; | 705 | err = 0; |
711 | return err; | 706 | return err; |
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 3b5714ef4d1a..b4d89fbb3782 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c | |||
@@ -367,7 +367,7 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, | |||
367 | return -EINVAL; | 367 | return -EINVAL; |
368 | } | 368 | } |
369 | 369 | ||
370 | return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); | 370 | return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); |
371 | } | 371 | } |
372 | 372 | ||
373 | struct tcf_dump_args | 373 | struct tcf_dump_args |
diff --git a/net/sched/gact.c b/net/sched/gact.c index a811c89fef7f..d1c6d542912a 100644 --- a/net/sched/gact.c +++ b/net/sched/gact.c | |||
@@ -135,7 +135,7 @@ tcf_gact_cleanup(struct tc_action *a, int bind) | |||
135 | } | 135 | } |
136 | 136 | ||
137 | static int | 137 | static int |
138 | tcf_gact(struct sk_buff **pskb, struct tc_action *a) | 138 | tcf_gact(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) |
139 | { | 139 | { |
140 | struct tcf_gact *p = PRIV(a, gact); | 140 | struct tcf_gact *p = PRIV(a, gact); |
141 | struct sk_buff *skb = *pskb; | 141 | struct sk_buff *skb = *pskb; |
diff --git a/net/sched/ipt.c b/net/sched/ipt.c index b114d994d523..f50136eed211 100644 --- a/net/sched/ipt.c +++ b/net/sched/ipt.c | |||
@@ -201,7 +201,7 @@ tcf_ipt_cleanup(struct tc_action *a, int bind) | |||
201 | } | 201 | } |
202 | 202 | ||
203 | static int | 203 | static int |
204 | tcf_ipt(struct sk_buff **pskb, struct tc_action *a) | 204 | tcf_ipt(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) |
205 | { | 205 | { |
206 | int ret = 0, result = 0; | 206 | int ret = 0, result = 0; |
207 | struct tcf_ipt *p = PRIV(a, ipt); | 207 | struct tcf_ipt *p = PRIV(a, ipt); |
diff --git a/net/sched/mirred.c b/net/sched/mirred.c index f309ce336803..20d06916dc0b 100644 --- a/net/sched/mirred.c +++ b/net/sched/mirred.c | |||
@@ -158,7 +158,7 @@ tcf_mirred_cleanup(struct tc_action *a, int bind) | |||
158 | } | 158 | } |
159 | 159 | ||
160 | static int | 160 | static int |
161 | tcf_mirred(struct sk_buff **pskb, struct tc_action *a) | 161 | tcf_mirred(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) |
162 | { | 162 | { |
163 | struct tcf_mirred *p = PRIV(a, mirred); | 163 | struct tcf_mirred *p = PRIV(a, mirred); |
164 | struct net_device *dev; | 164 | struct net_device *dev; |
diff --git a/net/sched/pedit.c b/net/sched/pedit.c index 678be6a645fb..767d24f4610e 100644 --- a/net/sched/pedit.c +++ b/net/sched/pedit.c | |||
@@ -130,7 +130,7 @@ tcf_pedit_cleanup(struct tc_action *a, int bind) | |||
130 | } | 130 | } |
131 | 131 | ||
132 | static int | 132 | static int |
133 | tcf_pedit(struct sk_buff **pskb, struct tc_action *a) | 133 | tcf_pedit(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) |
134 | { | 134 | { |
135 | struct tcf_pedit *p = PRIV(a, pedit); | 135 | struct tcf_pedit *p = PRIV(a, pedit); |
136 | struct sk_buff *skb = *pskb; | 136 | struct sk_buff *skb = *pskb; |
diff --git a/net/sched/police.c b/net/sched/police.c index c03545faf523..eb39fb2f39b6 100644 --- a/net/sched/police.c +++ b/net/sched/police.c | |||
@@ -284,7 +284,8 @@ static int tcf_act_police_cleanup(struct tc_action *a, int bind) | |||
284 | return 0; | 284 | return 0; |
285 | } | 285 | } |
286 | 286 | ||
287 | static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a) | 287 | static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a, |
288 | struct tcf_result *res) | ||
288 | { | 289 | { |
289 | psched_time_t now; | 290 | psched_time_t now; |
290 | struct sk_buff *skb = *pskb; | 291 | struct sk_buff *skb = *pskb; |
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b9a069af4a02..737681cb9a92 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c | |||
@@ -816,7 +816,7 @@ static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, | |||
816 | } | 816 | } |
817 | 817 | ||
818 | if (skb->len) | 818 | if (skb->len) |
819 | return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); | 819 | return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); |
820 | 820 | ||
821 | err_out: | 821 | err_out: |
822 | kfree_skb(skb); | 822 | kfree_skb(skb); |
@@ -1040,7 +1040,7 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, | |||
1040 | return -EINVAL; | 1040 | return -EINVAL; |
1041 | } | 1041 | } |
1042 | 1042 | ||
1043 | return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); | 1043 | return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); |
1044 | } | 1044 | } |
1045 | 1045 | ||
1046 | struct qdisc_dump_args | 1046 | struct qdisc_dump_args |
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 0d066c965342..99ceb91f0150 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c | |||
@@ -238,6 +238,20 @@ static void dev_watchdog_down(struct net_device *dev) | |||
238 | spin_unlock_bh(&dev->xmit_lock); | 238 | spin_unlock_bh(&dev->xmit_lock); |
239 | } | 239 | } |
240 | 240 | ||
241 | void netif_carrier_on(struct net_device *dev) | ||
242 | { | ||
243 | if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) | ||
244 | linkwatch_fire_event(dev); | ||
245 | if (netif_running(dev)) | ||
246 | __netdev_watchdog_up(dev); | ||
247 | } | ||
248 | |||
249 | void netif_carrier_off(struct net_device *dev) | ||
250 | { | ||
251 | if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) | ||
252 | linkwatch_fire_event(dev); | ||
253 | } | ||
254 | |||
241 | /* "NOOP" scheduler: the best scheduler, recommended for all interfaces | 255 | /* "NOOP" scheduler: the best scheduler, recommended for all interfaces |
242 | under all circumstances. It is difficult to invent anything faster or | 256 | under all circumstances. It is difficult to invent anything faster or |
243 | cheaper. | 257 | cheaper. |
@@ -600,6 +614,8 @@ void dev_shutdown(struct net_device *dev) | |||
600 | } | 614 | } |
601 | 615 | ||
602 | EXPORT_SYMBOL(__netdev_watchdog_up); | 616 | EXPORT_SYMBOL(__netdev_watchdog_up); |
617 | EXPORT_SYMBOL(netif_carrier_on); | ||
618 | EXPORT_SYMBOL(netif_carrier_off); | ||
603 | EXPORT_SYMBOL(noop_qdisc); | 619 | EXPORT_SYMBOL(noop_qdisc); |
604 | EXPORT_SYMBOL(noop_qdisc_ops); | 620 | EXPORT_SYMBOL(noop_qdisc_ops); |
605 | EXPORT_SYMBOL(qdisc_create_dflt); | 621 | EXPORT_SYMBOL(qdisc_create_dflt); |
diff --git a/net/sched/simple.c b/net/sched/simple.c index 3ab4c675ab5d..8a6ae4f491e8 100644 --- a/net/sched/simple.c +++ b/net/sched/simple.c | |||
@@ -44,7 +44,7 @@ static DEFINE_RWLOCK(simp_lock); | |||
44 | #include <net/pkt_act.h> | 44 | #include <net/pkt_act.h> |
45 | #include <net/act_generic.h> | 45 | #include <net/act_generic.h> |
46 | 46 | ||
47 | static int tcf_simp(struct sk_buff **pskb, struct tc_action *a) | 47 | static int tcf_simp(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) |
48 | { | 48 | { |
49 | struct sk_buff *skb = *pskb; | 49 | struct sk_buff *skb = *pskb; |
50 | struct tcf_defact *p = PRIV(a, defact); | 50 | struct tcf_defact *p = PRIV(a, defact); |
diff --git a/net/sctp/input.c b/net/sctp/input.c index 742be9171b7d..28f32243397f 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c | |||
@@ -236,8 +236,8 @@ int sctp_rcv(struct sk_buff *skb) | |||
236 | } | 236 | } |
237 | 237 | ||
238 | /* SCTP seems to always need a timestamp right now (FIXME) */ | 238 | /* SCTP seems to always need a timestamp right now (FIXME) */ |
239 | if (skb->stamp.tv_sec == 0) { | 239 | if (skb->tstamp.off_sec == 0) { |
240 | do_gettimeofday(&skb->stamp); | 240 | __net_timestamp(skb); |
241 | sock_enable_timestamp(sk); | 241 | sock_enable_timestamp(sk); |
242 | } | 242 | } |
243 | 243 | ||
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e9b2fd480d61..fa3be2b8fb5f 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c | |||
@@ -66,8 +66,8 @@ | |||
66 | #include <linux/seq_file.h> | 66 | #include <linux/seq_file.h> |
67 | 67 | ||
68 | #include <net/protocol.h> | 68 | #include <net/protocol.h> |
69 | #include <net/tcp.h> | ||
70 | #include <net/ndisc.h> | 69 | #include <net/ndisc.h> |
70 | #include <net/ip.h> | ||
71 | #include <net/ipv6.h> | 71 | #include <net/ipv6.h> |
72 | #include <net/transp_v6.h> | 72 | #include <net/transp_v6.h> |
73 | #include <net/addrconf.h> | 73 | #include <net/addrconf.h> |
@@ -641,10 +641,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, | |||
641 | else | 641 | else |
642 | newinet->pmtudisc = IP_PMTUDISC_WANT; | 642 | newinet->pmtudisc = IP_PMTUDISC_WANT; |
643 | 643 | ||
644 | #ifdef INET_REFCNT_DEBUG | 644 | sk_refcnt_debug_inc(newsk); |
645 | atomic_inc(&inet6_sock_nr); | ||
646 | atomic_inc(&inet_sock_nr); | ||
647 | #endif | ||
648 | 645 | ||
649 | if (newsk->sk_prot->init(newsk)) { | 646 | if (newsk->sk_prot->init(newsk)) { |
650 | sk_common_release(newsk); | 647 | sk_common_release(newsk); |
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index ce9245e71fca..e7025be77691 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c | |||
@@ -62,7 +62,7 @@ | |||
62 | /* Global data structures. */ | 62 | /* Global data structures. */ |
63 | struct sctp_globals sctp_globals; | 63 | struct sctp_globals sctp_globals; |
64 | struct proc_dir_entry *proc_net_sctp; | 64 | struct proc_dir_entry *proc_net_sctp; |
65 | DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics); | 65 | DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics) __read_mostly; |
66 | 66 | ||
67 | struct idr sctp_assocs_id; | 67 | struct idr sctp_assocs_id; |
68 | DEFINE_SPINLOCK(sctp_assocs_id_lock); | 68 | DEFINE_SPINLOCK(sctp_assocs_id_lock); |
@@ -78,8 +78,8 @@ static struct sctp_pf *sctp_pf_inet_specific; | |||
78 | static struct sctp_af *sctp_af_v4_specific; | 78 | static struct sctp_af *sctp_af_v4_specific; |
79 | static struct sctp_af *sctp_af_v6_specific; | 79 | static struct sctp_af *sctp_af_v6_specific; |
80 | 80 | ||
81 | kmem_cache_t *sctp_chunk_cachep; | 81 | kmem_cache_t *sctp_chunk_cachep __read_mostly; |
82 | kmem_cache_t *sctp_bucket_cachep; | 82 | kmem_cache_t *sctp_bucket_cachep __read_mostly; |
83 | 83 | ||
84 | extern int sctp_snmp_proc_init(void); | 84 | extern int sctp_snmp_proc_init(void); |
85 | extern int sctp_snmp_proc_exit(void); | 85 | extern int sctp_snmp_proc_exit(void); |
@@ -593,9 +593,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, | |||
593 | newinet->mc_index = 0; | 593 | newinet->mc_index = 0; |
594 | newinet->mc_list = NULL; | 594 | newinet->mc_list = NULL; |
595 | 595 | ||
596 | #ifdef INET_REFCNT_DEBUG | 596 | sk_refcnt_debug_inc(newsk); |
597 | atomic_inc(&inet_sock_nr); | ||
598 | #endif | ||
599 | 597 | ||
600 | if (newsk->sk_prot->init(newsk)) { | 598 | if (newsk->sk_prot->init(newsk)) { |
601 | sk_common_release(newsk); | 599 | sk_common_release(newsk); |
@@ -1244,6 +1242,10 @@ SCTP_STATIC __exit void sctp_exit(void) | |||
1244 | module_init(sctp_init); | 1242 | module_init(sctp_init); |
1245 | module_exit(sctp_exit); | 1243 | module_exit(sctp_exit); |
1246 | 1244 | ||
1245 | /* | ||
1246 | * __stringify doesn't likes enums, so use IPPROTO_SCTP value (132) directly. | ||
1247 | */ | ||
1248 | MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132"); | ||
1247 | MODULE_AUTHOR("Linux Kernel SCTP developers <lksctp-developers@lists.sourceforge.net>"); | 1249 | MODULE_AUTHOR("Linux Kernel SCTP developers <lksctp-developers@lists.sourceforge.net>"); |
1248 | MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); | 1250 | MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); |
1249 | MODULE_LICENSE("GPL"); | 1251 | MODULE_LICENSE("GPL"); |
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 00d32b7c8266..3868a8d70cc0 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c | |||
@@ -1362,6 +1362,7 @@ struct sctp_association *sctp_unpack_cookie( | |||
1362 | char *key; | 1362 | char *key; |
1363 | sctp_scope_t scope; | 1363 | sctp_scope_t scope; |
1364 | struct sk_buff *skb = chunk->skb; | 1364 | struct sk_buff *skb = chunk->skb; |
1365 | struct timeval tv; | ||
1365 | 1366 | ||
1366 | headersize = sizeof(sctp_chunkhdr_t) + SCTP_SECRET_SIZE; | 1367 | headersize = sizeof(sctp_chunkhdr_t) + SCTP_SECRET_SIZE; |
1367 | bodysize = ntohs(chunk->chunk_hdr->length) - headersize; | 1368 | bodysize = ntohs(chunk->chunk_hdr->length) - headersize; |
@@ -1434,7 +1435,8 @@ no_hmac: | |||
1434 | * an association, there is no need to check cookie's expiration | 1435 | * an association, there is no need to check cookie's expiration |
1435 | * for init collision case of lost COOKIE ACK. | 1436 | * for init collision case of lost COOKIE ACK. |
1436 | */ | 1437 | */ |
1437 | if (!asoc && tv_lt(bear_cookie->expiration, skb->stamp)) { | 1438 | skb_get_timestamp(skb, &tv); |
1439 | if (!asoc && tv_lt(bear_cookie->expiration, tv)) { | ||
1438 | __u16 len; | 1440 | __u16 len; |
1439 | /* | 1441 | /* |
1440 | * Section 3.3.10.3 Stale Cookie Error (3) | 1442 | * Section 3.3.10.3 Stale Cookie Error (3) |
@@ -1447,10 +1449,9 @@ no_hmac: | |||
1447 | len = ntohs(chunk->chunk_hdr->length); | 1449 | len = ntohs(chunk->chunk_hdr->length); |
1448 | *errp = sctp_make_op_error_space(asoc, chunk, len); | 1450 | *errp = sctp_make_op_error_space(asoc, chunk, len); |
1449 | if (*errp) { | 1451 | if (*errp) { |
1450 | suseconds_t usecs = (skb->stamp.tv_sec - | 1452 | suseconds_t usecs = (tv.tv_sec - |
1451 | bear_cookie->expiration.tv_sec) * 1000000L + | 1453 | bear_cookie->expiration.tv_sec) * 1000000L + |
1452 | skb->stamp.tv_usec - | 1454 | tv.tv_usec - bear_cookie->expiration.tv_usec; |
1453 | bear_cookie->expiration.tv_usec; | ||
1454 | 1455 | ||
1455 | usecs = htonl(usecs); | 1456 | usecs = htonl(usecs); |
1456 | sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE, | 1457 | sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE, |
diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 091a66f06a35..4454afe4727e 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c | |||
@@ -4892,7 +4892,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, | |||
4892 | sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { | 4892 | sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { |
4893 | event = sctp_skb2event(skb); | 4893 | event = sctp_skb2event(skb); |
4894 | if (event->asoc == assoc) { | 4894 | if (event->asoc == assoc) { |
4895 | __skb_unlink(skb, skb->list); | 4895 | __skb_unlink(skb, &oldsk->sk_receive_queue); |
4896 | __skb_queue_tail(&newsk->sk_receive_queue, skb); | 4896 | __skb_queue_tail(&newsk->sk_receive_queue, skb); |
4897 | } | 4897 | } |
4898 | } | 4898 | } |
@@ -4921,7 +4921,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, | |||
4921 | sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { | 4921 | sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { |
4922 | event = sctp_skb2event(skb); | 4922 | event = sctp_skb2event(skb); |
4923 | if (event->asoc == assoc) { | 4923 | if (event->asoc == assoc) { |
4924 | __skb_unlink(skb, skb->list); | 4924 | __skb_unlink(skb, &oldsp->pd_lobby); |
4925 | __skb_queue_tail(queue, skb); | 4925 | __skb_queue_tail(queue, skb); |
4926 | } | 4926 | } |
4927 | } | 4927 | } |
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 8bbc279d6c99..ec2c857eae7f 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c | |||
@@ -50,9 +50,9 @@ | |||
50 | 50 | ||
51 | /* Forward declarations for internal helpers. */ | 51 | /* Forward declarations for internal helpers. */ |
52 | static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq, | 52 | static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq, |
53 | struct sctp_ulpevent *); | 53 | struct sctp_ulpevent *); |
54 | static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *, | 54 | static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *, |
55 | struct sctp_ulpevent *); | 55 | struct sctp_ulpevent *); |
56 | 56 | ||
57 | /* 1st Level Abstractions */ | 57 | /* 1st Level Abstractions */ |
58 | 58 | ||
@@ -125,7 +125,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, | |||
125 | event = sctp_ulpq_order(ulpq, event); | 125 | event = sctp_ulpq_order(ulpq, event); |
126 | } | 126 | } |
127 | 127 | ||
128 | /* Send event to the ULP. */ | 128 | /* Send event to the ULP. 'event' is the sctp_ulpevent for |
129 | * very first SKB on the 'temp' list. | ||
130 | */ | ||
129 | if (event) | 131 | if (event) |
130 | sctp_ulpq_tail_event(ulpq, event); | 132 | sctp_ulpq_tail_event(ulpq, event); |
131 | 133 | ||
@@ -158,14 +160,18 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) | |||
158 | return sctp_clear_pd(ulpq->asoc->base.sk); | 160 | return sctp_clear_pd(ulpq->asoc->base.sk); |
159 | } | 161 | } |
160 | 162 | ||
161 | 163 | /* If the SKB of 'event' is on a list, it is the first such member | |
162 | 164 | * of that list. | |
165 | */ | ||
163 | int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) | 166 | int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) |
164 | { | 167 | { |
165 | struct sock *sk = ulpq->asoc->base.sk; | 168 | struct sock *sk = ulpq->asoc->base.sk; |
166 | struct sk_buff_head *queue; | 169 | struct sk_buff_head *queue, *skb_list; |
170 | struct sk_buff *skb = sctp_event2skb(event); | ||
167 | int clear_pd = 0; | 171 | int clear_pd = 0; |
168 | 172 | ||
173 | skb_list = (struct sk_buff_head *) skb->prev; | ||
174 | |||
169 | /* If the socket is just going to throw this away, do not | 175 | /* If the socket is just going to throw this away, do not |
170 | * even try to deliver it. | 176 | * even try to deliver it. |
171 | */ | 177 | */ |
@@ -197,10 +203,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) | |||
197 | /* If we are harvesting multiple skbs they will be | 203 | /* If we are harvesting multiple skbs they will be |
198 | * collected on a list. | 204 | * collected on a list. |
199 | */ | 205 | */ |
200 | if (sctp_event2skb(event)->list) | 206 | if (skb_list) |
201 | sctp_skb_list_tail(sctp_event2skb(event)->list, queue); | 207 | sctp_skb_list_tail(skb_list, queue); |
202 | else | 208 | else |
203 | __skb_queue_tail(queue, sctp_event2skb(event)); | 209 | __skb_queue_tail(queue, skb); |
204 | 210 | ||
205 | /* Did we just complete partial delivery and need to get | 211 | /* Did we just complete partial delivery and need to get |
206 | * rolling again? Move pending data to the receive | 212 | * rolling again? Move pending data to the receive |
@@ -214,10 +220,11 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) | |||
214 | return 1; | 220 | return 1; |
215 | 221 | ||
216 | out_free: | 222 | out_free: |
217 | if (sctp_event2skb(event)->list) | 223 | if (skb_list) |
218 | sctp_queue_purge_ulpevents(sctp_event2skb(event)->list); | 224 | sctp_queue_purge_ulpevents(skb_list); |
219 | else | 225 | else |
220 | sctp_ulpevent_free(event); | 226 | sctp_ulpevent_free(event); |
227 | |||
221 | return 0; | 228 | return 0; |
222 | } | 229 | } |
223 | 230 | ||
@@ -269,7 +276,7 @@ static inline void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq, | |||
269 | * payload was fragmented on the way and ip had to reassemble them. | 276 | * payload was fragmented on the way and ip had to reassemble them. |
270 | * We add the rest of skb's to the first skb's fraglist. | 277 | * We add the rest of skb's to the first skb's fraglist. |
271 | */ | 278 | */ |
272 | static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, struct sk_buff *l_frag) | 279 | static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag) |
273 | { | 280 | { |
274 | struct sk_buff *pos; | 281 | struct sk_buff *pos; |
275 | struct sctp_ulpevent *event; | 282 | struct sctp_ulpevent *event; |
@@ -294,7 +301,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, | |||
294 | skb_shinfo(f_frag)->frag_list = pos; | 301 | skb_shinfo(f_frag)->frag_list = pos; |
295 | 302 | ||
296 | /* Remove the first fragment from the reassembly queue. */ | 303 | /* Remove the first fragment from the reassembly queue. */ |
297 | __skb_unlink(f_frag, f_frag->list); | 304 | __skb_unlink(f_frag, queue); |
298 | while (pos) { | 305 | while (pos) { |
299 | 306 | ||
300 | pnext = pos->next; | 307 | pnext = pos->next; |
@@ -304,7 +311,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, | |||
304 | f_frag->data_len += pos->len; | 311 | f_frag->data_len += pos->len; |
305 | 312 | ||
306 | /* Remove the fragment from the reassembly queue. */ | 313 | /* Remove the fragment from the reassembly queue. */ |
307 | __skb_unlink(pos, pos->list); | 314 | __skb_unlink(pos, queue); |
308 | 315 | ||
309 | /* Break if we have reached the last fragment. */ | 316 | /* Break if we have reached the last fragment. */ |
310 | if (pos == l_frag) | 317 | if (pos == l_frag) |
@@ -375,7 +382,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_u | |||
375 | done: | 382 | done: |
376 | return retval; | 383 | return retval; |
377 | found: | 384 | found: |
378 | retval = sctp_make_reassembled_event(first_frag, pos); | 385 | retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, pos); |
379 | if (retval) | 386 | if (retval) |
380 | retval->msg_flags |= MSG_EOR; | 387 | retval->msg_flags |= MSG_EOR; |
381 | goto done; | 388 | goto done; |
@@ -435,7 +442,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq | |||
435 | * further. | 442 | * further. |
436 | */ | 443 | */ |
437 | done: | 444 | done: |
438 | retval = sctp_make_reassembled_event(first_frag, last_frag); | 445 | retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag); |
439 | if (retval && is_last) | 446 | if (retval && is_last) |
440 | retval->msg_flags |= MSG_EOR; | 447 | retval->msg_flags |= MSG_EOR; |
441 | 448 | ||
@@ -527,7 +534,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *u | |||
527 | * further. | 534 | * further. |
528 | */ | 535 | */ |
529 | done: | 536 | done: |
530 | retval = sctp_make_reassembled_event(first_frag, last_frag); | 537 | retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag); |
531 | return retval; | 538 | return retval; |
532 | } | 539 | } |
533 | 540 | ||
@@ -537,6 +544,7 @@ done: | |||
537 | static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, | 544 | static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, |
538 | struct sctp_ulpevent *event) | 545 | struct sctp_ulpevent *event) |
539 | { | 546 | { |
547 | struct sk_buff_head *event_list; | ||
540 | struct sk_buff *pos, *tmp; | 548 | struct sk_buff *pos, *tmp; |
541 | struct sctp_ulpevent *cevent; | 549 | struct sctp_ulpevent *cevent; |
542 | struct sctp_stream *in; | 550 | struct sctp_stream *in; |
@@ -547,6 +555,8 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, | |||
547 | ssn = event->ssn; | 555 | ssn = event->ssn; |
548 | in = &ulpq->asoc->ssnmap->in; | 556 | in = &ulpq->asoc->ssnmap->in; |
549 | 557 | ||
558 | event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev; | ||
559 | |||
550 | /* We are holding the chunks by stream, by SSN. */ | 560 | /* We are holding the chunks by stream, by SSN. */ |
551 | sctp_skb_for_each(pos, &ulpq->lobby, tmp) { | 561 | sctp_skb_for_each(pos, &ulpq->lobby, tmp) { |
552 | cevent = (struct sctp_ulpevent *) pos->cb; | 562 | cevent = (struct sctp_ulpevent *) pos->cb; |
@@ -567,10 +577,10 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, | |||
567 | /* Found it, so mark in the ssnmap. */ | 577 | /* Found it, so mark in the ssnmap. */ |
568 | sctp_ssn_next(in, sid); | 578 | sctp_ssn_next(in, sid); |
569 | 579 | ||
570 | __skb_unlink(pos, pos->list); | 580 | __skb_unlink(pos, &ulpq->lobby); |
571 | 581 | ||
572 | /* Attach all gathered skbs to the event. */ | 582 | /* Attach all gathered skbs to the event. */ |
573 | __skb_queue_tail(sctp_event2skb(event)->list, pos); | 583 | __skb_queue_tail(event_list, pos); |
574 | } | 584 | } |
575 | } | 585 | } |
576 | 586 | ||
@@ -626,7 +636,7 @@ static inline void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq, | |||
626 | } | 636 | } |
627 | 637 | ||
628 | static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, | 638 | static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, |
629 | struct sctp_ulpevent *event) | 639 | struct sctp_ulpevent *event) |
630 | { | 640 | { |
631 | __u16 sid, ssn; | 641 | __u16 sid, ssn; |
632 | struct sctp_stream *in; | 642 | struct sctp_stream *in; |
@@ -667,7 +677,7 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq) | |||
667 | { | 677 | { |
668 | struct sk_buff *pos, *tmp; | 678 | struct sk_buff *pos, *tmp; |
669 | struct sctp_ulpevent *cevent; | 679 | struct sctp_ulpevent *cevent; |
670 | struct sctp_ulpevent *event = NULL; | 680 | struct sctp_ulpevent *event; |
671 | struct sctp_stream *in; | 681 | struct sctp_stream *in; |
672 | struct sk_buff_head temp; | 682 | struct sk_buff_head temp; |
673 | __u16 csid, cssn; | 683 | __u16 csid, cssn; |
@@ -675,6 +685,8 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq) | |||
675 | in = &ulpq->asoc->ssnmap->in; | 685 | in = &ulpq->asoc->ssnmap->in; |
676 | 686 | ||
677 | /* We are holding the chunks by stream, by SSN. */ | 687 | /* We are holding the chunks by stream, by SSN. */ |
688 | skb_queue_head_init(&temp); | ||
689 | event = NULL; | ||
678 | sctp_skb_for_each(pos, &ulpq->lobby, tmp) { | 690 | sctp_skb_for_each(pos, &ulpq->lobby, tmp) { |
679 | cevent = (struct sctp_ulpevent *) pos->cb; | 691 | cevent = (struct sctp_ulpevent *) pos->cb; |
680 | csid = cevent->stream; | 692 | csid = cevent->stream; |
@@ -686,19 +698,20 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq) | |||
686 | /* Found it, so mark in the ssnmap. */ | 698 | /* Found it, so mark in the ssnmap. */ |
687 | sctp_ssn_next(in, csid); | 699 | sctp_ssn_next(in, csid); |
688 | 700 | ||
689 | __skb_unlink(pos, pos->list); | 701 | __skb_unlink(pos, &ulpq->lobby); |
690 | if (!event) { | 702 | if (!event) { |
691 | /* Create a temporary list to collect chunks on. */ | 703 | /* Create a temporary list to collect chunks on. */ |
692 | event = sctp_skb2event(pos); | 704 | event = sctp_skb2event(pos); |
693 | skb_queue_head_init(&temp); | ||
694 | __skb_queue_tail(&temp, sctp_event2skb(event)); | 705 | __skb_queue_tail(&temp, sctp_event2skb(event)); |
695 | } else { | 706 | } else { |
696 | /* Attach all gathered skbs to the event. */ | 707 | /* Attach all gathered skbs to the event. */ |
697 | __skb_queue_tail(sctp_event2skb(event)->list, pos); | 708 | __skb_queue_tail(&temp, pos); |
698 | } | 709 | } |
699 | } | 710 | } |
700 | 711 | ||
701 | /* Send event to the ULP. */ | 712 | /* Send event to the ULP. 'event' is the sctp_ulpevent for |
713 | * very first SKB on the 'temp' list. | ||
714 | */ | ||
702 | if (event) | 715 | if (event) |
703 | sctp_ulpq_tail_event(ulpq, event); | 716 | sctp_ulpq_tail_event(ulpq, event); |
704 | } | 717 | } |
diff --git a/net/socket.c b/net/socket.c index 6f2a17881972..94fe638b4d72 100644 --- a/net/socket.c +++ b/net/socket.c | |||
@@ -70,6 +70,8 @@ | |||
70 | #include <linux/seq_file.h> | 70 | #include <linux/seq_file.h> |
71 | #include <linux/wanrouter.h> | 71 | #include <linux/wanrouter.h> |
72 | #include <linux/if_bridge.h> | 72 | #include <linux/if_bridge.h> |
73 | #include <linux/if_frad.h> | ||
74 | #include <linux/if_vlan.h> | ||
73 | #include <linux/init.h> | 75 | #include <linux/init.h> |
74 | #include <linux/poll.h> | 76 | #include <linux/poll.h> |
75 | #include <linux/cache.h> | 77 | #include <linux/cache.h> |
@@ -272,7 +274,7 @@ int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ule | |||
272 | 274 | ||
273 | #define SOCKFS_MAGIC 0x534F434B | 275 | #define SOCKFS_MAGIC 0x534F434B |
274 | 276 | ||
275 | static kmem_cache_t * sock_inode_cachep; | 277 | static kmem_cache_t * sock_inode_cachep __read_mostly; |
276 | 278 | ||
277 | static struct inode *sock_alloc_inode(struct super_block *sb) | 279 | static struct inode *sock_alloc_inode(struct super_block *sb) |
278 | { | 280 | { |
@@ -331,7 +333,7 @@ static struct super_block *sockfs_get_sb(struct file_system_type *fs_type, | |||
331 | return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC); | 333 | return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC); |
332 | } | 334 | } |
333 | 335 | ||
334 | static struct vfsmount *sock_mnt; | 336 | static struct vfsmount *sock_mnt __read_mostly; |
335 | 337 | ||
336 | static struct file_system_type sock_fs_type = { | 338 | static struct file_system_type sock_fs_type = { |
337 | .name = "sockfs", | 339 | .name = "sockfs", |
@@ -404,6 +406,7 @@ int sock_map_fd(struct socket *sock) | |||
404 | file->f_mode = FMODE_READ | FMODE_WRITE; | 406 | file->f_mode = FMODE_READ | FMODE_WRITE; |
405 | file->f_flags = O_RDWR; | 407 | file->f_flags = O_RDWR; |
406 | file->f_pos = 0; | 408 | file->f_pos = 0; |
409 | file->private_data = sock; | ||
407 | fd_install(fd, file); | 410 | fd_install(fd, file); |
408 | } | 411 | } |
409 | 412 | ||
@@ -436,6 +439,9 @@ struct socket *sockfd_lookup(int fd, int *err) | |||
436 | return NULL; | 439 | return NULL; |
437 | } | 440 | } |
438 | 441 | ||
442 | if (file->f_op == &socket_file_ops) | ||
443 | return file->private_data; /* set in sock_map_fd */ | ||
444 | |||
439 | inode = file->f_dentry->d_inode; | 445 | inode = file->f_dentry->d_inode; |
440 | if (!S_ISSOCK(inode->i_mode)) { | 446 | if (!S_ISSOCK(inode->i_mode)) { |
441 | *err = -ENOTSOCK; | 447 | *err = -ENOTSOCK; |
@@ -720,8 +726,8 @@ static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, | |||
720 | return __sock_sendmsg(iocb, sock, &x->async_msg, size); | 726 | return __sock_sendmsg(iocb, sock, &x->async_msg, size); |
721 | } | 727 | } |
722 | 728 | ||
723 | ssize_t sock_sendpage(struct file *file, struct page *page, | 729 | static ssize_t sock_sendpage(struct file *file, struct page *page, |
724 | int offset, size_t size, loff_t *ppos, int more) | 730 | int offset, size_t size, loff_t *ppos, int more) |
725 | { | 731 | { |
726 | struct socket *sock; | 732 | struct socket *sock; |
727 | int flags; | 733 | int flags; |
@@ -944,7 +950,7 @@ static int sock_mmap(struct file * file, struct vm_area_struct * vma) | |||
944 | return sock->ops->mmap(file, sock, vma); | 950 | return sock->ops->mmap(file, sock, vma); |
945 | } | 951 | } |
946 | 952 | ||
947 | int sock_close(struct inode *inode, struct file *filp) | 953 | static int sock_close(struct inode *inode, struct file *filp) |
948 | { | 954 | { |
949 | /* | 955 | /* |
950 | * It was possible the inode is NULL we were | 956 | * It was possible the inode is NULL we were |
@@ -2023,9 +2029,6 @@ int sock_unregister(int family) | |||
2023 | return 0; | 2029 | return 0; |
2024 | } | 2030 | } |
2025 | 2031 | ||
2026 | |||
2027 | extern void sk_init(void); | ||
2028 | |||
2029 | void __init sock_init(void) | 2032 | void __init sock_init(void) |
2030 | { | 2033 | { |
2031 | /* | 2034 | /* |
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 554f224c0445..fe1a73ce6cff 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c | |||
@@ -28,13 +28,13 @@ | |||
28 | #include <linux/workqueue.h> | 28 | #include <linux/workqueue.h> |
29 | #include <linux/sunrpc/rpc_pipe_fs.h> | 29 | #include <linux/sunrpc/rpc_pipe_fs.h> |
30 | 30 | ||
31 | static struct vfsmount *rpc_mount; | 31 | static struct vfsmount *rpc_mount __read_mostly; |
32 | static int rpc_mount_count; | 32 | static int rpc_mount_count; |
33 | 33 | ||
34 | static struct file_system_type rpc_pipe_fs_type; | 34 | static struct file_system_type rpc_pipe_fs_type; |
35 | 35 | ||
36 | 36 | ||
37 | static kmem_cache_t *rpc_inode_cachep; | 37 | static kmem_cache_t *rpc_inode_cachep __read_mostly; |
38 | 38 | ||
39 | #define RPC_UPCALL_TIMEOUT (30*HZ) | 39 | #define RPC_UPCALL_TIMEOUT (30*HZ) |
40 | 40 | ||
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 2d9eb7fbd521..f3104035e35d 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c | |||
@@ -34,10 +34,10 @@ static int rpc_task_id; | |||
34 | #define RPC_BUFFER_MAXSIZE (2048) | 34 | #define RPC_BUFFER_MAXSIZE (2048) |
35 | #define RPC_BUFFER_POOLSIZE (8) | 35 | #define RPC_BUFFER_POOLSIZE (8) |
36 | #define RPC_TASK_POOLSIZE (8) | 36 | #define RPC_TASK_POOLSIZE (8) |
37 | static kmem_cache_t *rpc_task_slabp; | 37 | static kmem_cache_t *rpc_task_slabp __read_mostly; |
38 | static kmem_cache_t *rpc_buffer_slabp; | 38 | static kmem_cache_t *rpc_buffer_slabp __read_mostly; |
39 | static mempool_t *rpc_task_mempool; | 39 | static mempool_t *rpc_task_mempool __read_mostly; |
40 | static mempool_t *rpc_buffer_mempool; | 40 | static mempool_t *rpc_buffer_mempool __read_mostly; |
41 | 41 | ||
42 | static void __rpc_default_timer(struct rpc_task *task); | 42 | static void __rpc_default_timer(struct rpc_task *task); |
43 | static void rpciod_killall(void); | 43 | static void rpciod_killall(void); |
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d0c3120d0233..05fe2e735538 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c | |||
@@ -34,7 +34,7 @@ | |||
34 | #include <net/sock.h> | 34 | #include <net/sock.h> |
35 | #include <net/checksum.h> | 35 | #include <net/checksum.h> |
36 | #include <net/ip.h> | 36 | #include <net/ip.h> |
37 | #include <net/tcp.h> | 37 | #include <net/tcp_states.h> |
38 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
39 | #include <asm/ioctls.h> | 39 | #include <asm/ioctls.h> |
40 | 40 | ||
@@ -584,13 +584,16 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) | |||
584 | /* possibly an icmp error */ | 584 | /* possibly an icmp error */ |
585 | dprintk("svc: recvfrom returned error %d\n", -err); | 585 | dprintk("svc: recvfrom returned error %d\n", -err); |
586 | } | 586 | } |
587 | if (skb->stamp.tv_sec == 0) { | 587 | if (skb->tstamp.off_sec == 0) { |
588 | skb->stamp.tv_sec = xtime.tv_sec; | 588 | struct timeval tv; |
589 | skb->stamp.tv_usec = xtime.tv_nsec / NSEC_PER_USEC; | 589 | |
590 | tv.tv_sec = xtime.tv_sec; | ||
591 | tv.tv_usec = xtime.tv_nsec * 1000; | ||
592 | skb_set_timestamp(skb, &tv); | ||
590 | /* Don't enable netstamp, sunrpc doesn't | 593 | /* Don't enable netstamp, sunrpc doesn't |
591 | need that much accuracy */ | 594 | need that much accuracy */ |
592 | } | 595 | } |
593 | svsk->sk_sk->sk_stamp = skb->stamp; | 596 | skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp); |
594 | set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ | 597 | set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ |
595 | 598 | ||
596 | /* | 599 | /* |
diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 3f6e31069c54..c5241fcbb966 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c | |||
@@ -17,17 +17,15 @@ | |||
17 | #include <linux/sysctl.h> | 17 | #include <linux/sysctl.h> |
18 | 18 | ||
19 | #ifdef CONFIG_INET | 19 | #ifdef CONFIG_INET |
20 | extern struct ctl_table ipv4_table[]; | 20 | #include <net/ip.h> |
21 | #endif | 21 | #endif |
22 | 22 | ||
23 | extern struct ctl_table core_table[]; | ||
24 | |||
25 | #ifdef CONFIG_NET | 23 | #ifdef CONFIG_NET |
26 | extern struct ctl_table ether_table[]; | 24 | #include <linux/if_ether.h> |
27 | #endif | 25 | #endif |
28 | 26 | ||
29 | #ifdef CONFIG_TR | 27 | #ifdef CONFIG_TR |
30 | extern struct ctl_table tr_table[]; | 28 | #include <linux/if_tr.h> |
31 | #endif | 29 | #endif |
32 | 30 | ||
33 | struct ctl_table net_table[] = { | 31 | struct ctl_table net_table[] = { |
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d403e34088ad..41feca3bef86 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c | |||
@@ -105,7 +105,7 @@ | |||
105 | #include <linux/skbuff.h> | 105 | #include <linux/skbuff.h> |
106 | #include <linux/netdevice.h> | 106 | #include <linux/netdevice.h> |
107 | #include <net/sock.h> | 107 | #include <net/sock.h> |
108 | #include <linux/tcp.h> | 108 | #include <net/tcp_states.h> |
109 | #include <net/af_unix.h> | 109 | #include <net/af_unix.h> |
110 | #include <linux/proc_fs.h> | 110 | #include <linux/proc_fs.h> |
111 | #include <linux/seq_file.h> | 111 | #include <linux/seq_file.h> |
@@ -2026,14 +2026,6 @@ static struct net_proto_family unix_family_ops = { | |||
2026 | .owner = THIS_MODULE, | 2026 | .owner = THIS_MODULE, |
2027 | }; | 2027 | }; |
2028 | 2028 | ||
2029 | #ifdef CONFIG_SYSCTL | ||
2030 | extern void unix_sysctl_register(void); | ||
2031 | extern void unix_sysctl_unregister(void); | ||
2032 | #else | ||
2033 | static inline void unix_sysctl_register(void) {} | ||
2034 | static inline void unix_sysctl_unregister(void) {} | ||
2035 | #endif | ||
2036 | |||
2037 | static int __init af_unix_init(void) | 2029 | static int __init af_unix_init(void) |
2038 | { | 2030 | { |
2039 | int rc = -1; | 2031 | int rc = -1; |
diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 4bd95c8f5934..6ffc64e1712d 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c | |||
@@ -76,11 +76,11 @@ | |||
76 | #include <linux/netdevice.h> | 76 | #include <linux/netdevice.h> |
77 | #include <linux/file.h> | 77 | #include <linux/file.h> |
78 | #include <linux/proc_fs.h> | 78 | #include <linux/proc_fs.h> |
79 | #include <linux/tcp.h> | ||
80 | 79 | ||
81 | #include <net/sock.h> | 80 | #include <net/sock.h> |
82 | #include <net/af_unix.h> | 81 | #include <net/af_unix.h> |
83 | #include <net/scm.h> | 82 | #include <net/scm.h> |
83 | #include <net/tcp_states.h> | ||
84 | 84 | ||
85 | /* Internal data structures and random procedures: */ | 85 | /* Internal data structures and random procedures: */ |
86 | 86 | ||
@@ -286,16 +286,16 @@ void unix_gc(void) | |||
286 | skb = skb_peek(&s->sk_receive_queue); | 286 | skb = skb_peek(&s->sk_receive_queue); |
287 | while (skb && | 287 | while (skb && |
288 | skb != (struct sk_buff *)&s->sk_receive_queue) { | 288 | skb != (struct sk_buff *)&s->sk_receive_queue) { |
289 | nextsk=skb->next; | 289 | nextsk = skb->next; |
290 | /* | 290 | /* |
291 | * Do we have file descriptors ? | 291 | * Do we have file descriptors ? |
292 | */ | 292 | */ |
293 | if(UNIXCB(skb).fp) | 293 | if (UNIXCB(skb).fp) { |
294 | { | 294 | __skb_unlink(skb, |
295 | __skb_unlink(skb, skb->list); | 295 | &s->sk_receive_queue); |
296 | __skb_queue_tail(&hitlist,skb); | 296 | __skb_queue_tail(&hitlist, skb); |
297 | } | 297 | } |
298 | skb=nextsk; | 298 | skb = nextsk; |
299 | } | 299 | } |
300 | spin_unlock(&s->sk_receive_queue.lock); | 300 | spin_unlock(&s->sk_receive_queue.lock); |
301 | } | 301 | } |
diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index c974dac4580a..690ffa5d5bfb 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/sysctl.h> | 13 | #include <linux/sysctl.h> |
14 | 14 | ||
15 | extern int sysctl_unix_max_dgram_qlen; | 15 | #include <net/af_unix.h> |
16 | 16 | ||
17 | static ctl_table unix_table[] = { | 17 | static ctl_table unix_table[] = { |
18 | { | 18 | { |
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c index d93b19faaab7..596cb96e5f47 100644 --- a/net/wanrouter/af_wanpipe.c +++ b/net/wanrouter/af_wanpipe.c | |||
@@ -57,7 +57,7 @@ | |||
57 | #include <linux/wanpipe.h> | 57 | #include <linux/wanpipe.h> |
58 | #include <linux/if_wanpipe.h> | 58 | #include <linux/if_wanpipe.h> |
59 | #include <linux/pkt_sched.h> | 59 | #include <linux/pkt_sched.h> |
60 | #include <linux/tcp.h> | 60 | #include <linux/tcp_states.h> |
61 | #include <linux/if_wanpipe_common.h> | 61 | #include <linux/if_wanpipe_common.h> |
62 | #include <linux/sdla_x25.h> | 62 | #include <linux/sdla_x25.h> |
63 | 63 | ||
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 04bec047fa9a..020d73cc8414 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c | |||
@@ -47,7 +47,7 @@ | |||
47 | #include <linux/if_arp.h> | 47 | #include <linux/if_arp.h> |
48 | #include <linux/skbuff.h> | 48 | #include <linux/skbuff.h> |
49 | #include <net/sock.h> | 49 | #include <net/sock.h> |
50 | #include <net/tcp.h> | 50 | #include <net/tcp_states.h> |
51 | #include <asm/uaccess.h> | 51 | #include <asm/uaccess.h> |
52 | #include <linux/fcntl.h> | 52 | #include <linux/fcntl.h> |
53 | #include <linux/termios.h> /* For TIOCINQ/OUTQ */ | 53 | #include <linux/termios.h> /* For TIOCINQ/OUTQ */ |
diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 36fc3bf6d882..adfe7b8df355 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c | |||
@@ -81,7 +81,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb) | |||
81 | } | 81 | } |
82 | 82 | ||
83 | int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev, | 83 | int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev, |
84 | struct packet_type *ptype) | 84 | struct packet_type *ptype, struct net_device *orig_dev) |
85 | { | 85 | { |
86 | struct sk_buff *nskb; | 86 | struct sk_buff *nskb; |
87 | struct x25_neigh *nb; | 87 | struct x25_neigh *nb; |
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index b0197c70a9fc..26146874b839 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c | |||
@@ -28,7 +28,7 @@ | |||
28 | #include <linux/string.h> | 28 | #include <linux/string.h> |
29 | #include <linux/skbuff.h> | 29 | #include <linux/skbuff.h> |
30 | #include <net/sock.h> | 30 | #include <net/sock.h> |
31 | #include <net/tcp.h> | 31 | #include <net/tcp_states.h> |
32 | #include <net/x25.h> | 32 | #include <net/x25.h> |
33 | 33 | ||
34 | static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) | 34 | static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) |
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 7fd872ad0c20..8be9b8fbc24d 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c | |||
@@ -27,7 +27,7 @@ | |||
27 | #include <linux/string.h> | 27 | #include <linux/string.h> |
28 | #include <linux/skbuff.h> | 28 | #include <linux/skbuff.h> |
29 | #include <net/sock.h> | 29 | #include <net/sock.h> |
30 | #include <net/tcp.h> | 30 | #include <net/tcp_states.h> |
31 | #include <net/x25.h> | 31 | #include <net/x25.h> |
32 | 32 | ||
33 | /* | 33 | /* |
@@ -80,7 +80,7 @@ void x25_requeue_frames(struct sock *sk) | |||
80 | if (!skb_prev) | 80 | if (!skb_prev) |
81 | skb_queue_head(&sk->sk_write_queue, skb); | 81 | skb_queue_head(&sk->sk_write_queue, skb); |
82 | else | 82 | else |
83 | skb_append(skb_prev, skb); | 83 | skb_append(skb_prev, skb, &sk->sk_write_queue); |
84 | skb_prev = skb; | 84 | skb_prev = skb; |
85 | } | 85 | } |
86 | } | 86 | } |
diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index d6a21a3ad80e..0a92e1da3922 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c | |||
@@ -23,7 +23,7 @@ | |||
23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
24 | #include <linux/timer.h> | 24 | #include <linux/timer.h> |
25 | #include <net/sock.h> | 25 | #include <net/sock.h> |
26 | #include <net/tcp.h> | 26 | #include <net/tcp_states.h> |
27 | #include <net/x25.h> | 27 | #include <net/x25.h> |
28 | 28 | ||
29 | static void x25_heartbeat_expiry(unsigned long); | 29 | static void x25_heartbeat_expiry(unsigned long); |
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index c58a6f05a0b6..2407a7072327 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <net/ip.h> | 12 | #include <net/ip.h> |
13 | #include <net/xfrm.h> | 13 | #include <net/xfrm.h> |
14 | 14 | ||
15 | static kmem_cache_t *secpath_cachep; | 15 | static kmem_cache_t *secpath_cachep __read_mostly; |
16 | 16 | ||
17 | void __secpath_destroy(struct sec_path *sp) | 17 | void __secpath_destroy(struct sec_path *sp) |
18 | { | 18 | { |
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d65ed8684fc1..83c8135e1764 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c | |||
@@ -37,7 +37,7 @@ EXPORT_SYMBOL(xfrm_policy_list); | |||
37 | static DEFINE_RWLOCK(xfrm_policy_afinfo_lock); | 37 | static DEFINE_RWLOCK(xfrm_policy_afinfo_lock); |
38 | static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO]; | 38 | static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO]; |
39 | 39 | ||
40 | static kmem_cache_t *xfrm_dst_cache; | 40 | static kmem_cache_t *xfrm_dst_cache __read_mostly; |
41 | 41 | ||
42 | static struct work_struct xfrm_policy_gc_work; | 42 | static struct work_struct xfrm_policy_gc_work; |
43 | static struct list_head xfrm_policy_gc_list = | 43 | static struct list_head xfrm_policy_gc_list = |
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8da3e25b2c4c..c35336a0f71b 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c | |||
@@ -1125,9 +1125,8 @@ static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c) | |||
1125 | if (build_expire(skb, x, c->data.hard) < 0) | 1125 | if (build_expire(skb, x, c->data.hard) < 0) |
1126 | BUG(); | 1126 | BUG(); |
1127 | 1127 | ||
1128 | NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE; | 1128 | NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE; |
1129 | 1129 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC); | |
1130 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC); | ||
1131 | } | 1130 | } |
1132 | 1131 | ||
1133 | static int xfrm_notify_sa_flush(struct km_event *c) | 1132 | static int xfrm_notify_sa_flush(struct km_event *c) |
@@ -1152,7 +1151,8 @@ static int xfrm_notify_sa_flush(struct km_event *c) | |||
1152 | 1151 | ||
1153 | nlh->nlmsg_len = skb->tail - b; | 1152 | nlh->nlmsg_len = skb->tail - b; |
1154 | 1153 | ||
1155 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC); | 1154 | NETLINK_CB(skb).dst_group = XFRMNLGRP_SA; |
1155 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC); | ||
1156 | 1156 | ||
1157 | nlmsg_failure: | 1157 | nlmsg_failure: |
1158 | kfree_skb(skb); | 1158 | kfree_skb(skb); |
@@ -1226,7 +1226,8 @@ static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c) | |||
1226 | 1226 | ||
1227 | nlh->nlmsg_len = skb->tail - b; | 1227 | nlh->nlmsg_len = skb->tail - b; |
1228 | 1228 | ||
1229 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC); | 1229 | NETLINK_CB(skb).dst_group = XFRMNLGRP_SA; |
1230 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC); | ||
1230 | 1231 | ||
1231 | nlmsg_failure: | 1232 | nlmsg_failure: |
1232 | rtattr_failure: | 1233 | rtattr_failure: |
@@ -1304,9 +1305,8 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, | |||
1304 | if (build_acquire(skb, x, xt, xp, dir) < 0) | 1305 | if (build_acquire(skb, x, xt, xp, dir) < 0) |
1305 | BUG(); | 1306 | BUG(); |
1306 | 1307 | ||
1307 | NETLINK_CB(skb).dst_groups = XFRMGRP_ACQUIRE; | 1308 | NETLINK_CB(skb).dst_group = XFRMNLGRP_ACQUIRE; |
1308 | 1309 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_ACQUIRE, GFP_ATOMIC); | |
1309 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_ACQUIRE, GFP_ATOMIC); | ||
1310 | } | 1310 | } |
1311 | 1311 | ||
1312 | /* User gives us xfrm_user_policy_info followed by an array of 0 | 1312 | /* User gives us xfrm_user_policy_info followed by an array of 0 |
@@ -1405,9 +1405,8 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve | |||
1405 | if (build_polexpire(skb, xp, dir, c->data.hard) < 0) | 1405 | if (build_polexpire(skb, xp, dir, c->data.hard) < 0) |
1406 | BUG(); | 1406 | BUG(); |
1407 | 1407 | ||
1408 | NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE; | 1408 | NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE; |
1409 | 1409 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC); | |
1410 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC); | ||
1411 | } | 1410 | } |
1412 | 1411 | ||
1413 | static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c) | 1412 | static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c) |
@@ -1455,7 +1454,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event * | |||
1455 | 1454 | ||
1456 | nlh->nlmsg_len = skb->tail - b; | 1455 | nlh->nlmsg_len = skb->tail - b; |
1457 | 1456 | ||
1458 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC); | 1457 | NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY; |
1458 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC); | ||
1459 | 1459 | ||
1460 | nlmsg_failure: | 1460 | nlmsg_failure: |
1461 | rtattr_failure: | 1461 | rtattr_failure: |
@@ -1480,7 +1480,8 @@ static int xfrm_notify_policy_flush(struct km_event *c) | |||
1480 | 1480 | ||
1481 | nlh->nlmsg_len = skb->tail - b; | 1481 | nlh->nlmsg_len = skb->tail - b; |
1482 | 1482 | ||
1483 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC); | 1483 | NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY; |
1484 | return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC); | ||
1484 | 1485 | ||
1485 | nlmsg_failure: | 1486 | nlmsg_failure: |
1486 | kfree_skb(skb); | 1487 | kfree_skb(skb); |
@@ -1519,7 +1520,8 @@ static int __init xfrm_user_init(void) | |||
1519 | { | 1520 | { |
1520 | printk(KERN_INFO "Initializing IPsec netlink socket\n"); | 1521 | printk(KERN_INFO "Initializing IPsec netlink socket\n"); |
1521 | 1522 | ||
1522 | xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv); | 1523 | xfrm_nl = netlink_kernel_create(NETLINK_XFRM, XFRMNLGRP_MAX, |
1524 | xfrm_netlink_rcv, THIS_MODULE); | ||
1523 | if (xfrm_nl == NULL) | 1525 | if (xfrm_nl == NULL) |
1524 | return -ENOMEM; | 1526 | return -ENOMEM; |
1525 | 1527 | ||
@@ -1537,3 +1539,4 @@ static void __exit xfrm_user_exit(void) | |||
1537 | module_init(xfrm_user_init); | 1539 | module_init(xfrm_user_init); |
1538 | module_exit(xfrm_user_exit); | 1540 | module_exit(xfrm_user_exit); |
1539 | MODULE_LICENSE("GPL"); | 1541 | MODULE_LICENSE("GPL"); |
1542 | MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM); | ||