Diffstat (limited to 'net')
-rw-r--r--  net/bridge/br_netfilter.c | 2
-rw-r--r--  net/bridge/netfilter/ebt_log.c | 6
-rw-r--r--  net/core/dev.c | 5
-rw-r--r--  net/core/filter.c | 104
-rw-r--r--  net/core/neighbour.c | 6
-rw-r--r--  net/core/pktgen.c | 29
-rw-r--r--  net/core/rtnetlink.c | 2
-rw-r--r--  net/core/skbuff.c | 2
-rw-r--r--  net/core/wireless.c | 1
-rw-r--r--  net/decnet/dn_fib.c | 3
-rw-r--r--  net/ethernet/eth.c | 7
-rw-r--r--  net/ipv4/Kconfig | 3
-rw-r--r--  net/ipv4/af_inet.c | 11
-rw-r--r--  net/ipv4/fib_trie.c | 256
-rw-r--r--  net/ipv4/ip_input.c | 6
-rw-r--r--  net/ipv4/ip_output.c | 24
-rw-r--r--  net/ipv4/ipconfig.c | 4
-rw-r--r--  net/ipv4/ipmr.c | 10
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c | 25
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c | 8
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c | 4
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 7
-rw-r--r--  net/ipv4/route.c | 135
-rw-r--r--  net/ipv4/tcp.c | 44
-rw-r--r--  net/ipv4/tcp_input.c | 76
-rw-r--r--  net/ipv4/tcp_ipv4.c | 2
-rw-r--r--  net/ipv4/tcp_output.c | 544
-rw-r--r--  net/ipv6/addrconf.c | 19
-rw-r--r--  net/ipv6/af_inet6.c | 4
-rw-r--r--  net/ipv6/ip6_flowlabel.c | 1
-rw-r--r--  net/ipv6/ip6_output.c | 1
-rw-r--r--  net/ipv6/tcp_ipv6.c | 2
-rw-r--r--  net/netlink/af_netlink.c | 11
-rw-r--r--  net/sched/Makefile | 2
-rw-r--r--  net/sched/act_api.c | 10
-rw-r--r--  net/sched/cls_api.c | 2
-rw-r--r--  net/sched/cls_rsvp.h | 1
-rw-r--r--  net/sched/em_meta.c | 6
-rw-r--r--  net/sched/sch_api.c | 65
-rw-r--r--  net/sched/sch_blackhole.c | 54
-rw-r--r--  net/sched/sch_cbq.c | 3
-rw-r--r--  net/sched/sch_generic.c | 35
-rw-r--r--  net/sctp/endpointola.c | 13
-rw-r--r--  net/sctp/protocol.c | 5
-rw-r--r--  net/sctp/sysctl.c | 13
-rw-r--r--  net/sctp/transport.c | 1
46 files changed, 1060 insertions, 514 deletions
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 03ae4edddac3..2d52fee63a8c 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -844,7 +844,7 @@ static unsigned int ip_sabotage_out(unsigned int hook, struct sk_buff **pskb,
844 * doesn't use the bridge parent of the indev by using 844 * doesn't use the bridge parent of the indev by using
845 * the BRNF_DONT_TAKE_PARENT mask. */ 845 * the BRNF_DONT_TAKE_PARENT mask. */
846 if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) { 846 if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) {
847 nf_bridge->mask &= BRNF_DONT_TAKE_PARENT; 847 nf_bridge->mask |= BRNF_DONT_TAKE_PARENT;
848 nf_bridge->physindev = (struct net_device *)in; 848 nf_bridge->physindev = (struct net_device *)in;
849 } 849 }
850#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) 850#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index e4ae34b88925..662975be3d1d 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -61,8 +61,6 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
61{ 61{
62 struct ebt_log_info *info = (struct ebt_log_info *)data; 62 struct ebt_log_info *info = (struct ebt_log_info *)data;
63 char level_string[4] = "< >"; 63 char level_string[4] = "< >";
64 union {struct iphdr iph; struct tcpudphdr ports;
65 struct arphdr arph; struct arppayload arpp;} u;
66 64
67 level_string[1] = '0' + info->loglevel; 65 level_string[1] = '0' + info->loglevel;
68 spin_lock_bh(&ebt_log_lock); 66 spin_lock_bh(&ebt_log_lock);
@@ -88,7 +86,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
88 } 86 }
89 printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,", 87 printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,",
90 NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); 88 NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
91 printk(" IP tos=0x%02X, IP proto=%d", u.iph.tos, 89 printk(" IP tos=0x%02X, IP proto=%d", ih->tos,
92 ih->protocol); 90 ih->protocol);
93 if (ih->protocol == IPPROTO_TCP || 91 if (ih->protocol == IPPROTO_TCP ||
94 ih->protocol == IPPROTO_UDP) { 92 ih->protocol == IPPROTO_UDP) {
@@ -127,7 +125,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
127 ah->ar_pln == sizeof(uint32_t)) { 125 ah->ar_pln == sizeof(uint32_t)) {
128 struct arppayload _arpp, *ap; 126 struct arppayload _arpp, *ap;
129 127
130 ap = skb_header_pointer(skb, sizeof(u.arph), 128 ap = skb_header_pointer(skb, sizeof(_arph),
131 sizeof(_arpp), &_arpp); 129 sizeof(_arpp), &_arpp);
132 if (ap == NULL) { 130 if (ap == NULL) {
133 printk(" INCOMPLETE ARP payload"); 131 printk(" INCOMPLETE ARP payload");
diff --git a/net/core/dev.c b/net/core/dev.c
index 7016e0c36b3d..7f5f62c65115 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2089,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
2089{ 2089{
2090 unsigned short old_flags = dev->flags; 2090 unsigned short old_flags = dev->flags;
2091 2091
2092 dev->flags |= IFF_PROMISC;
2093 if ((dev->promiscuity += inc) == 0) 2092 if ((dev->promiscuity += inc) == 0)
2094 dev->flags &= ~IFF_PROMISC; 2093 dev->flags &= ~IFF_PROMISC;
2095 if (dev->flags ^ old_flags) { 2094 else
2095 dev->flags |= IFF_PROMISC;
2096 if (dev->flags != old_flags) {
2096 dev_mc_upload(dev); 2097 dev_mc_upload(dev);
2097 printk(KERN_INFO "device %s %s promiscuous mode\n", 2098 printk(KERN_INFO "device %s %s promiscuous mode\n",
2098 dev->name, (dev->flags & IFF_PROMISC) ? "entered" : 2099 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
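
The dev_set_promiscuity() hunk above reorders the logic so that IFF_PROMISC always mirrors the promiscuity reference count, and the multicast upload plus the printk only run when the visible flags really change. A minimal userspace sketch of that counted-flag pattern follows; struct fake_dev and set_promisc() are illustrative stand-ins, not kernel interfaces.

#include <stdio.h>

#define IFF_PROMISC 0x100

struct fake_dev {
    unsigned short flags;
    int promiscuity;        /* how many callers asked for promiscuous mode */
};

/* inc is +1 or -1; the flag tracks "promiscuity != 0" */
static void set_promisc(struct fake_dev *dev, int inc)
{
    unsigned short old_flags = dev->flags;

    if ((dev->promiscuity += inc) == 0)
        dev->flags &= ~IFF_PROMISC;
    else
        dev->flags |= IFF_PROMISC;

    if (dev->flags != old_flags)    /* act only on a real transition */
        printf("device %s promiscuous mode\n",
               (dev->flags & IFF_PROMISC) ? "entered" : "left");
}

int main(void)
{
    struct fake_dev dev = { 0, 0 };

    set_promisc(&dev, 1);    /* "entered" */
    set_promisc(&dev, 1);    /* silent: already promiscuous */
    set_promisc(&dev, -1);   /* silent: still one user left */
    set_promisc(&dev, -1);   /* "left" */
    return 0;
}
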
diff --git a/net/core/filter.c b/net/core/filter.c
index f3b88205ace2..cd91a24f9720 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -36,7 +36,7 @@
36#include <linux/filter.h> 36#include <linux/filter.h>
37 37
38/* No hurry in this branch */ 38/* No hurry in this branch */
39static u8 *load_pointer(struct sk_buff *skb, int k) 39static void *__load_pointer(struct sk_buff *skb, int k)
40{ 40{
41 u8 *ptr = NULL; 41 u8 *ptr = NULL;
42 42
@@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
50 return NULL; 50 return NULL;
51} 51}
52 52
53static inline void *load_pointer(struct sk_buff *skb, int k,
54 unsigned int size, void *buffer)
55{
56 if (k >= 0)
57 return skb_header_pointer(skb, k, size, buffer);
58 else {
59 if (k >= SKF_AD_OFF)
60 return NULL;
61 return __load_pointer(skb, k);
62 }
63}
64
53/** 65/**
54 * sk_run_filter - run a filter on a socket 66 * sk_run_filter - run a filter on a socket
55 * @skb: buffer to run the filter on 67 * @skb: buffer to run the filter on
@@ -64,15 +76,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
64 76
65int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) 77int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
66{ 78{
67 unsigned char *data = skb->data;
68 /* len is UNSIGNED. Byte wide insns relies only on implicit
69 type casts to prevent reading arbitrary memory locations.
70 */
71 unsigned int len = skb->len-skb->data_len;
72 struct sock_filter *fentry; /* We walk down these */ 79 struct sock_filter *fentry; /* We walk down these */
80 void *ptr;
73 u32 A = 0; /* Accumulator */ 81 u32 A = 0; /* Accumulator */
74 u32 X = 0; /* Index Register */ 82 u32 X = 0; /* Index Register */
75 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ 83 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
84 u32 tmp;
76 int k; 85 int k;
77 int pc; 86 int pc;
78 87
@@ -168,86 +177,35 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
168 case BPF_LD|BPF_W|BPF_ABS: 177 case BPF_LD|BPF_W|BPF_ABS:
169 k = fentry->k; 178 k = fentry->k;
170 load_w: 179 load_w:
171 if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) { 180 ptr = load_pointer(skb, k, 4, &tmp);
172 A = ntohl(*(u32*)&data[k]); 181 if (ptr != NULL) {
182 A = ntohl(*(u32 *)ptr);
173 continue; 183 continue;
174 } 184 }
175 if (k < 0) {
176 u8 *ptr;
177
178 if (k >= SKF_AD_OFF)
179 break;
180 ptr = load_pointer(skb, k);
181 if (ptr) {
182 A = ntohl(*(u32*)ptr);
183 continue;
184 }
185 } else {
186 u32 _tmp, *p;
187 p = skb_header_pointer(skb, k, 4, &_tmp);
188 if (p != NULL) {
189 A = ntohl(*p);
190 continue;
191 }
192 }
193 return 0; 185 return 0;
194 case BPF_LD|BPF_H|BPF_ABS: 186 case BPF_LD|BPF_H|BPF_ABS:
195 k = fentry->k; 187 k = fentry->k;
196 load_h: 188 load_h:
197 if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) { 189 ptr = load_pointer(skb, k, 2, &tmp);
198 A = ntohs(*(u16*)&data[k]); 190 if (ptr != NULL) {
191 A = ntohs(*(u16 *)ptr);
199 continue; 192 continue;
200 } 193 }
201 if (k < 0) {
202 u8 *ptr;
203
204 if (k >= SKF_AD_OFF)
205 break;
206 ptr = load_pointer(skb, k);
207 if (ptr) {
208 A = ntohs(*(u16*)ptr);
209 continue;
210 }
211 } else {
212 u16 _tmp, *p;
213 p = skb_header_pointer(skb, k, 2, &_tmp);
214 if (p != NULL) {
215 A = ntohs(*p);
216 continue;
217 }
218 }
219 return 0; 194 return 0;
220 case BPF_LD|BPF_B|BPF_ABS: 195 case BPF_LD|BPF_B|BPF_ABS:
221 k = fentry->k; 196 k = fentry->k;
222load_b: 197load_b:
223 if (k >= 0 && (unsigned int)k < len) { 198 ptr = load_pointer(skb, k, 1, &tmp);
224 A = data[k]; 199 if (ptr != NULL) {
200 A = *(u8 *)ptr;
225 continue; 201 continue;
226 } 202 }
227 if (k < 0) {
228 u8 *ptr;
229
230 if (k >= SKF_AD_OFF)
231 break;
232 ptr = load_pointer(skb, k);
233 if (ptr) {
234 A = *ptr;
235 continue;
236 }
237 } else {
238 u8 _tmp, *p;
239 p = skb_header_pointer(skb, k, 1, &_tmp);
240 if (p != NULL) {
241 A = *p;
242 continue;
243 }
244 }
245 return 0; 203 return 0;
246 case BPF_LD|BPF_W|BPF_LEN: 204 case BPF_LD|BPF_W|BPF_LEN:
247 A = len; 205 A = skb->len;
248 continue; 206 continue;
249 case BPF_LDX|BPF_W|BPF_LEN: 207 case BPF_LDX|BPF_W|BPF_LEN:
250 X = len; 208 X = skb->len;
251 continue; 209 continue;
252 case BPF_LD|BPF_W|BPF_IND: 210 case BPF_LD|BPF_W|BPF_IND:
253 k = X + fentry->k; 211 k = X + fentry->k;
@@ -259,10 +217,12 @@ load_b:
259 k = X + fentry->k; 217 k = X + fentry->k;
260 goto load_b; 218 goto load_b;
261 case BPF_LDX|BPF_B|BPF_MSH: 219 case BPF_LDX|BPF_B|BPF_MSH:
262 if (fentry->k >= len) 220 ptr = load_pointer(skb, fentry->k, 1, &tmp);
263 return 0; 221 if (ptr != NULL) {
264 X = (data[fentry->k] & 0xf) << 2; 222 X = (*(u8 *)ptr & 0xf) << 2;
265 continue; 223 continue;
224 }
225 return 0;
266 case BPF_LD|BPF_IMM: 226 case BPF_LD|BPF_IMM:
267 A = fentry->k; 227 A = fentry->k;
268 continue; 228 continue;
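
The filter.c rewrite above folds three nearly identical bounds-check-and-copy paths into a single load_pointer() helper built on skb_header_pointer(), so BPF loads also work on non-linear skbs. The sketch below shows only the positive-offset idea in plain userspace C; struct pkt and this local load_pointer() are assumptions made for the example, and the kernel's negative/ancillary-offset path is left out.

#include <stdio.h>
#include <string.h>

struct pkt {
    const unsigned char *data;
    size_t len;
};

/*
 * Hand back "size" bytes starting at offset k, or NULL if they are not
 * there.  This sketch always copies into the caller's scratch buffer;
 * the kernel helper returns a pointer straight into linear data when it
 * can and only copies for fragmented packets.
 */
static const void *load_pointer(const struct pkt *p, int k,
                                size_t size, void *buffer)
{
    if (k < 0 || (size_t)k + size > p->len)
        return NULL;        /* out of bounds: the filter must bail out */
    memcpy(buffer, p->data + k, size);
    return buffer;
}

int main(void)
{
    unsigned char frame[] = { 0x08, 0x00, 0x45, 0x00 };
    struct pkt p = { frame, sizeof(frame) };
    unsigned char tmp[4];
    const unsigned char *ptr;

    ptr = load_pointer(&p, 2, 2, tmp);
    if (ptr)
        printf("bytes at offset 2: %02x %02x\n", ptr[0], ptr[1]);

    if (!load_pointer(&p, 3, 4, tmp))
        printf("offset 3, length 4 runs past the end of the packet\n");
    return 0;
}
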
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 851eb927ed97..1beb782ac41b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1598,6 +1598,8 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
1598 1598
1599 read_lock_bh(&tbl->lock); 1599 read_lock_bh(&tbl->lock);
1600 ndtmsg->ndtm_family = tbl->family; 1600 ndtmsg->ndtm_family = tbl->family;
1601 ndtmsg->ndtm_pad1 = 0;
1602 ndtmsg->ndtm_pad2 = 0;
1601 1603
1602 RTA_PUT_STRING(skb, NDTA_NAME, tbl->id); 1604 RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
1603 RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval); 1605 RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
@@ -1683,6 +1685,8 @@ static int neightbl_fill_param_info(struct neigh_table *tbl,
1683 1685
1684 read_lock_bh(&tbl->lock); 1686 read_lock_bh(&tbl->lock);
1685 ndtmsg->ndtm_family = tbl->family; 1687 ndtmsg->ndtm_family = tbl->family;
1688 ndtmsg->ndtm_pad1 = 0;
1689 ndtmsg->ndtm_pad2 = 0;
1686 RTA_PUT_STRING(skb, NDTA_NAME, tbl->id); 1690 RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
1687 1691
1688 if (neightbl_fill_parms(skb, parms) < 0) 1692 if (neightbl_fill_parms(skb, parms) < 0)
@@ -1872,6 +1876,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
1872 struct ndmsg *ndm = NLMSG_DATA(nlh); 1876 struct ndmsg *ndm = NLMSG_DATA(nlh);
1873 1877
1874 ndm->ndm_family = n->ops->family; 1878 ndm->ndm_family = n->ops->family;
1879 ndm->ndm_pad1 = 0;
1880 ndm->ndm_pad2 = 0;
1875 ndm->ndm_flags = n->flags; 1881 ndm->ndm_flags = n->flags;
1876 ndm->ndm_type = n->type; 1882 ndm->ndm_type = n->type;
1877 ndm->ndm_ifindex = n->dev->ifindex; 1883 ndm->ndm_ifindex = n->dev->ifindex;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index c57b06bc79f3..975d651312dc 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -151,7 +151,7 @@
151#include <asm/timex.h> 151#include <asm/timex.h>
152 152
153 153
154#define VERSION "pktgen v2.61: Packet Generator for packet performance testing.\n" 154#define VERSION "pktgen v2.62: Packet Generator for packet performance testing.\n"
155 155
156/* #define PG_DEBUG(a) a */ 156/* #define PG_DEBUG(a) a */
157#define PG_DEBUG(a) 157#define PG_DEBUG(a)
@@ -1921,6 +1921,11 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
1921 struct iphdr *iph; 1921 struct iphdr *iph;
1922 struct pktgen_hdr *pgh = NULL; 1922 struct pktgen_hdr *pgh = NULL;
1923 1923
1924 /* Update any of the values, used when we're incrementing various
1925 * fields.
1926 */
1927 mod_cur_headers(pkt_dev);
1928
1924 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); 1929 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
1925 if (!skb) { 1930 if (!skb) {
1926 sprintf(pkt_dev->result, "No memory"); 1931 sprintf(pkt_dev->result, "No memory");
@@ -1934,11 +1939,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
1934 iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)); 1939 iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr));
1935 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); 1940 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
1936 1941
1937 /* Update any of the values, used when we're incrementing various
1938 * fields.
1939 */
1940 mod_cur_headers(pkt_dev);
1941
1942 memcpy(eth, pkt_dev->hh, 12); 1942 memcpy(eth, pkt_dev->hh, 12);
1943 *(u16*)&eth[12] = __constant_htons(ETH_P_IP); 1943 *(u16*)&eth[12] = __constant_htons(ETH_P_IP);
1944 1944
@@ -2192,7 +2192,12 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2192 int datalen; 2192 int datalen;
2193 struct ipv6hdr *iph; 2193 struct ipv6hdr *iph;
2194 struct pktgen_hdr *pgh = NULL; 2194 struct pktgen_hdr *pgh = NULL;
2195 2195
2196 /* Update any of the values, used when we're incrementing various
2197 * fields.
2198 */
2199 mod_cur_headers(pkt_dev);
2200
2196 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); 2201 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
2197 if (!skb) { 2202 if (!skb) {
2198 sprintf(pkt_dev->result, "No memory"); 2203 sprintf(pkt_dev->result, "No memory");
@@ -2206,17 +2211,9 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2206 iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr)); 2211 iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr));
2207 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); 2212 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
2208 2213
2209
2210 /* Update any of the values, used when we're incrementing various
2211 * fields.
2212 */
2213 mod_cur_headers(pkt_dev);
2214
2215
2216 memcpy(eth, pkt_dev->hh, 12); 2214 memcpy(eth, pkt_dev->hh, 12);
2217 *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6); 2215 *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6);
2218 2216
2219
2220 datalen = pkt_dev->cur_pkt_size-14- 2217 datalen = pkt_dev->cur_pkt_size-14-
2221 sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */ 2218 sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */
2222 2219
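
The pktgen hunks move mod_cur_headers() in front of alloc_skb(), presumably so the size and header values picked for this packet are the ones the allocation is based on. Below is a userspace sketch of that ordering rule only; pkt_state, mod_cur_headers() and build_packet() are invented names, not the pktgen code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pkt_state {
    size_t min_size, max_size, cur_size;
};

/* pick the size for *this* packet (here: a simple round robin) */
static void mod_cur_headers(struct pkt_state *st)
{
    st->cur_size++;
    if (st->cur_size > st->max_size)
        st->cur_size = st->min_size;
}

static unsigned char *build_packet(struct pkt_state *st)
{
    unsigned char *buf;

    mod_cur_headers(st);                /* decide sizes/fields first */
    buf = malloc(st->cur_size + 16);    /* then allocate to match */
    if (buf)
        memset(buf, 0, st->cur_size + 16);
    return buf;
}

int main(void)
{
    struct pkt_state st = { 60, 64, 60 };
    int i;

    for (i = 0; i < 3; i++) {
        unsigned char *p = build_packet(&st);

        if (p)
            printf("built packet of %zu bytes\n", st.cur_size);
        free(p);
    }
    return 0;
}
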
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e013d836a7ab..4b1bb30e6381 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -126,6 +126,7 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data
126 rta->rta_type = attrtype; 126 rta->rta_type = attrtype;
127 rta->rta_len = size; 127 rta->rta_len = size;
128 memcpy(RTA_DATA(rta), data, attrlen); 128 memcpy(RTA_DATA(rta), data, attrlen);
129 memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
129} 130}
130 131
131size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size) 132size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
@@ -188,6 +189,7 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
188 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags); 189 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags);
189 r = NLMSG_DATA(nlh); 190 r = NLMSG_DATA(nlh);
190 r->ifi_family = AF_UNSPEC; 191 r->ifi_family = AF_UNSPEC;
192 r->__ifi_pad = 0;
191 r->ifi_type = dev->type; 193 r->ifi_type = dev->type;
192 r->ifi_index = dev->ifindex; 194 r->ifi_index = dev->ifindex;
193 r->ifi_flags = dev_get_flags(dev); 195 r->ifi_flags = dev_get_flags(dev);
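
The __rta_fill() and rtnetlink_fill_ifinfo() changes above zero the alignment padding behind a netlink attribute and the __ifi_pad field, so stale kernel memory never reaches userspace. Below is a small userspace sketch of padding-aware attribute packing; the 4-byte toy header and put_attr() are invented for the example and do not match the real netlink encoding.

#include <stdio.h>
#include <string.h>

#define ALIGNTO     4
#define ALIGN4(len) (((len) + ALIGNTO - 1) & ~(ALIGNTO - 1))

/* append one attribute and clear whatever padding the alignment adds */
static size_t put_attr(unsigned char *buf, int type,
                       const void *data, size_t len)
{
    size_t size = 4 + len;          /* toy header: 2-byte len, 2-byte type */
    size_t aligned = ALIGN4(size);

    buf[0] = size & 0xff;
    buf[1] = (size >> 8) & 0xff;
    buf[2] = type & 0xff;
    buf[3] = (type >> 8) & 0xff;
    memcpy(buf + 4, data, len);
    memset(buf + size, 0, aligned - size);  /* clear alignment padding */
    return aligned;
}

int main(void)
{
    unsigned char msg[32];
    size_t used;

    memset(msg, 0xAA, sizeof(msg));     /* simulate stale buffer contents */
    used = put_attr(msg, 1, "eth0", 5); /* 9 bytes of header+payload -> 12 */
    printf("attribute occupies %zu bytes, last pad byte = %02x\n",
           used, msg[used - 1]);        /* 00, not aa */
    return 0;
}
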
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bb73b2190ec7..733deee24b9f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -357,7 +357,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
357 C(ip_summed); 357 C(ip_summed);
358 C(priority); 358 C(priority);
359 C(protocol); 359 C(protocol);
360 C(security);
361 n->destructor = NULL; 360 n->destructor = NULL;
362#ifdef CONFIG_NETFILTER 361#ifdef CONFIG_NETFILTER
363 C(nfmark); 362 C(nfmark);
@@ -422,7 +421,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
422 new->pkt_type = old->pkt_type; 421 new->pkt_type = old->pkt_type;
423 new->stamp = old->stamp; 422 new->stamp = old->stamp;
424 new->destructor = NULL; 423 new->destructor = NULL;
425 new->security = old->security;
426#ifdef CONFIG_NETFILTER 424#ifdef CONFIG_NETFILTER
427 new->nfmark = old->nfmark; 425 new->nfmark = old->nfmark;
428 new->nfcache = old->nfcache; 426 new->nfcache = old->nfcache;
diff --git a/net/core/wireless.c b/net/core/wireless.c
index b2fe378dfbf8..3ff5639c0b78 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -1102,6 +1102,7 @@ static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb,
1102 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r)); 1102 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r));
1103 r = NLMSG_DATA(nlh); 1103 r = NLMSG_DATA(nlh);
1104 r->ifi_family = AF_UNSPEC; 1104 r->ifi_family = AF_UNSPEC;
1105 r->__ifi_pad = 0;
1105 r->ifi_type = dev->type; 1106 r->ifi_type = dev->type;
1106 r->ifi_index = dev->ifindex; 1107 r->ifi_index = dev->ifindex;
1107 r->ifi_flags = dev->flags; 1108 r->ifi_flags = dev->flags;
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 9934b25720e4..99bc061759c3 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
551 if (t < s_t) 551 if (t < s_t)
552 continue; 552 continue;
553 if (t > s_t) 553 if (t > s_t)
554 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); 554 memset(&cb->args[1], 0,
555 sizeof(cb->args) - sizeof(cb->args[0]));
555 tb = dn_fib_get_table(t, 0); 556 tb = dn_fib_get_table(t, 0);
556 if (tb == NULL) 557 if (tb == NULL)
557 continue; 558 continue;
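
The dn_fib_dump() fix above derives the memset length from sizeof(cb->args[0]) instead of a hard-coded sizeof(int); the elements of cb->args[] are longs, so on a 64-bit build the old expression cleared four bytes past the end of the array. A generic illustration of the idiom, with struct callback standing in for the real netlink callback:

#include <stdio.h>
#include <string.h>

struct callback {
    long args[6];
};

/* clear args[1]..args[5]; the length comes from the element type itself */
static void reset_tail(struct callback *cb)
{
    memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
}

int main(void)
{
    struct callback cb = { { 1, 2, 3, 4, 5, 6 } };

    reset_tail(&cb);
    printf("args[0]=%ld args[1]=%ld args[5]=%ld\n",
           cb.args[0], cb.args[1], cb.args[5]);    /* 1 0 0 */
    return 0;
}
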
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 6617ea47d365..ab60ea63688e 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -92,10 +92,9 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
92 * Set the source hardware address. 92 * Set the source hardware address.
93 */ 93 */
94 94
95 if(saddr) 95 if(!saddr)
96 memcpy(eth->h_source,saddr,dev->addr_len); 96 saddr = dev->dev_addr;
97 else 97 memcpy(eth->h_source,saddr,dev->addr_len);
98 memcpy(eth->h_source,dev->dev_addr,dev->addr_len);
99 98
100 /* 99 /*
101 * Anyway, the loopback-device should never use this function... 100 * Anyway, the loopback-device should never use this function...
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 347083433120..3e63123f7bbd 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -448,7 +448,6 @@ config IP_TCPDIAG_IPV6
448config TCP_CONG_ADVANCED 448config TCP_CONG_ADVANCED
449 bool "TCP: advanced congestion control" 449 bool "TCP: advanced congestion control"
450 depends on INET 450 depends on INET
451 default y
452 ---help--- 451 ---help---
453 Support for selection of various TCP congestion control 452 Support for selection of various TCP congestion control
454 modules. 453 modules.
@@ -549,7 +548,7 @@ config TCP_CONG_SCALABLE
549endmenu 548endmenu
550 549
551config TCP_CONG_BIC 550config TCP_CONG_BIC
552 boolean 551 tristate
553 depends on !TCP_CONG_ADVANCED 552 depends on !TCP_CONG_ADVANCED
554 default y 553 default y
555 554
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 658e7977924d..ef7468376ae6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void)
1009static int ipv4_proc_init(void); 1009static int ipv4_proc_init(void);
1010extern void ipfrag_init(void); 1010extern void ipfrag_init(void);
1011 1011
1012/*
1013 * IP protocol layer initialiser
1014 */
1015
1016static struct packet_type ip_packet_type = {
1017 .type = __constant_htons(ETH_P_IP),
1018 .func = ip_rcv,
1019};
1020
1012static int __init inet_init(void) 1021static int __init inet_init(void)
1013{ 1022{
1014 struct sk_buff *dummy_skb; 1023 struct sk_buff *dummy_skb;
@@ -1102,6 +1111,8 @@ static int __init inet_init(void)
1102 1111
1103 ipfrag_init(); 1112 ipfrag_init();
1104 1113
1114 dev_add_pack(&ip_packet_type);
1115
1105 rc = 0; 1116 rc = 0;
1106out: 1117out:
1107 return rc; 1118 return rc;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0671569ee6f0..4be234c7d8c3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
43 * 2 of the License, or (at your option) any later version. 43 * 2 of the License, or (at your option) any later version.
44 */ 44 */
45 45
46#define VERSION "0.323" 46#define VERSION "0.325"
47 47
48#include <linux/config.h> 48#include <linux/config.h>
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
@@ -136,6 +136,7 @@ struct trie_use_stats {
136 unsigned int semantic_match_passed; 136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss; 137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit; 138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
139}; 140};
140#endif 141#endif
141 142
@@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
164static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); 165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
165static int tnode_child_length(struct tnode *tn); 166static int tnode_child_length(struct tnode *tn);
166static struct node *resize(struct trie *t, struct tnode *tn); 167static struct node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn); 168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
168static struct tnode *halve(struct trie *t, struct tnode *tn); 169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
169static void tnode_free(struct tnode *tn); 170static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t); 171static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); 172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
@@ -341,8 +342,10 @@ static struct leaf *leaf_new(void)
341static struct leaf_info *leaf_info_new(int plen) 342static struct leaf_info *leaf_info_new(int plen)
342{ 343{
343 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); 344 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
344 li->plen = plen; 345 if(li) {
345 INIT_LIST_HEAD(&li->falh); 346 li->plen = plen;
347 INIT_LIST_HEAD(&li->falh);
348 }
346 return li; 349 return li;
347} 350}
348 351
@@ -356,11 +359,32 @@ static inline void free_leaf_info(struct leaf_info *li)
356 kfree(li); 359 kfree(li);
357} 360}
358 361
362static struct tnode *tnode_alloc(unsigned int size)
363{
364 if (size <= PAGE_SIZE) {
365 return kmalloc(size, GFP_KERNEL);
366 } else {
367 return (struct tnode *)
368 __get_free_pages(GFP_KERNEL, get_order(size));
369 }
370}
371
372static void __tnode_free(struct tnode *tn)
373{
374 unsigned int size = sizeof(struct tnode) +
375 (1<<tn->bits) * sizeof(struct node *);
376
377 if (size <= PAGE_SIZE)
378 kfree(tn);
379 else
380 free_pages((unsigned long)tn, get_order(size));
381}
382
359static struct tnode* tnode_new(t_key key, int pos, int bits) 383static struct tnode* tnode_new(t_key key, int pos, int bits)
360{ 384{
361 int nchildren = 1<<bits; 385 int nchildren = 1<<bits;
362 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); 386 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
363 struct tnode *tn = kmalloc(sz, GFP_KERNEL); 387 struct tnode *tn = tnode_alloc(sz);
364 388
365 if(tn) { 389 if(tn) {
366 memset(tn, 0, sz); 390 memset(tn, 0, sz);
@@ -388,7 +412,7 @@ static void tnode_free(struct tnode *tn)
388 printk("FL %p \n", tn); 412 printk("FL %p \n", tn);
389 } 413 }
390 else if(IS_TNODE(tn)) { 414 else if(IS_TNODE(tn)) {
391 kfree(tn); 415 __tnode_free(tn);
392 if(trie_debug > 0 ) 416 if(trie_debug > 0 )
393 printk("FT %p \n", tn); 417 printk("FT %p \n", tn);
394 } 418 }
@@ -458,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
458static struct node *resize(struct trie *t, struct tnode *tn) 482static struct node *resize(struct trie *t, struct tnode *tn)
459{ 483{
460 int i; 484 int i;
485 int err = 0;
461 486
462 if (!tn) 487 if (!tn)
463 return NULL; 488 return NULL;
@@ -554,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn)
554 */ 579 */
555 580
556 check_tnode(tn); 581 check_tnode(tn);
557 582
583 err = 0;
558 while ((tn->full_children > 0 && 584 while ((tn->full_children > 0 &&
559 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= 585 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
560 inflate_threshold * tnode_child_length(tn))) { 586 inflate_threshold * tnode_child_length(tn))) {
561 587
562 tn = inflate(t, tn); 588 tn = inflate(t, tn, &err);
589
590 if(err) {
591#ifdef CONFIG_IP_FIB_TRIE_STATS
592 t->stats.resize_node_skipped++;
593#endif
594 break;
595 }
563 } 596 }
564 597
565 check_tnode(tn); 598 check_tnode(tn);
@@ -568,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn)
568 * Halve as long as the number of empty children in this 601 * Halve as long as the number of empty children in this
569 * node is above threshold. 602 * node is above threshold.
570 */ 603 */
604
605 err = 0;
571 while (tn->bits > 1 && 606 while (tn->bits > 1 &&
572 100 * (tnode_child_length(tn) - tn->empty_children) < 607 100 * (tnode_child_length(tn) - tn->empty_children) <
573 halve_threshold * tnode_child_length(tn)) 608 halve_threshold * tnode_child_length(tn)) {
609
610 tn = halve(t, tn, &err);
611
612 if(err) {
613#ifdef CONFIG_IP_FIB_TRIE_STATS
614 t->stats.resize_node_skipped++;
615#endif
616 break;
617 }
618 }
574 619
575 tn = halve(t, tn);
576 620
577 /* Only one child remains */ 621 /* Only one child remains */
578 622
@@ -597,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
597 return (struct node *) tn; 641 return (struct node *) tn;
598} 642}
599 643
600static struct tnode *inflate(struct trie *t, struct tnode *tn) 644static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
601{ 645{
602 struct tnode *inode; 646 struct tnode *inode;
603 struct tnode *oldtnode = tn; 647 struct tnode *oldtnode = tn;
@@ -609,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
609 653
610 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); 654 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
611 655
612 if (!tn) 656 if (!tn) {
613 trie_bug("tnode_new failed"); 657 *err = -ENOMEM;
658 return oldtnode;
659 }
660
661 /*
662 * Preallocate and store tnodes before the actual work so we
663 * don't get into an inconsistent state if memory allocation
664 * fails. In case of failure we return the oldnode and inflate
665 * of tnode is ignored.
666 */
667
668 for(i = 0; i < olen; i++) {
669 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
670
671 if (inode &&
672 IS_TNODE(inode) &&
673 inode->pos == oldtnode->pos + oldtnode->bits &&
674 inode->bits > 1) {
675 struct tnode *left, *right;
676
677 t_key m = TKEY_GET_MASK(inode->pos, 1);
678
679 left = tnode_new(inode->key&(~m), inode->pos + 1,
680 inode->bits - 1);
681
682 if(!left) {
683 *err = -ENOMEM;
684 break;
685 }
686
687 right = tnode_new(inode->key|m, inode->pos + 1,
688 inode->bits - 1);
689
690 if(!right) {
691 *err = -ENOMEM;
692 break;
693 }
694
695 put_child(t, tn, 2*i, (struct node *) left);
696 put_child(t, tn, 2*i+1, (struct node *) right);
697 }
698 }
699
700 if(*err) {
701 int size = tnode_child_length(tn);
702 int j;
703
704 for(j = 0; j < size; j++)
705 if( tn->child[j])
706 tnode_free((struct tnode *)tn->child[j]);
707
708 tnode_free(tn);
709
710 *err = -ENOMEM;
711 return oldtnode;
712 }
614 713
615 for(i = 0; i < olen; i++) { 714 for(i = 0; i < olen; i++) {
616 struct node *node = tnode_get_child(oldtnode, i); 715 struct node *node = tnode_get_child(oldtnode, i);
@@ -623,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
623 722
624 if(IS_LEAF(node) || ((struct tnode *) node)->pos > 723 if(IS_LEAF(node) || ((struct tnode *) node)->pos >
625 tn->pos + tn->bits - 1) { 724 tn->pos + tn->bits - 1) {
626 if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1, 725 if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
627 1) == 0) 726 1) == 0)
628 put_child(t, tn, 2*i, node); 727 put_child(t, tn, 2*i, node);
629 else 728 else
@@ -663,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
663 * the position (inode->pos) 762 * the position (inode->pos)
664 */ 763 */
665 764
666 t_key m = TKEY_GET_MASK(inode->pos, 1);
667
668 /* Use the old key, but set the new significant 765 /* Use the old key, but set the new significant
669 * bit to zero. 766 * bit to zero.
670 */ 767 */
671 left = tnode_new(inode->key&(~m), inode->pos + 1,
672 inode->bits - 1);
673 768
674 if(!left) 769 left = (struct tnode *) tnode_get_child(tn, 2*i);
675 trie_bug("tnode_new failed"); 770 put_child(t, tn, 2*i, NULL);
676 771
677 772 if(!left)
678 /* Use the old key, but set the new significant 773 BUG();
679 * bit to one. 774
680 */ 775 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
681 right = tnode_new(inode->key|m, inode->pos + 1, 776 put_child(t, tn, 2*i+1, NULL);
682 inode->bits - 1); 777
778 if(!right)
779 BUG();
683 780
684 if(!right)
685 trie_bug("tnode_new failed");
686
687 size = tnode_child_length(left); 781 size = tnode_child_length(left);
688 for(j = 0; j < size; j++) { 782 for(j = 0; j < size; j++) {
689 put_child(t, left, j, inode->child[j]); 783 put_child(t, left, j, inode->child[j]);
@@ -699,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
699 return tn; 793 return tn;
700} 794}
701 795
702static struct tnode *halve(struct trie *t, struct tnode *tn) 796static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
703{ 797{
704 struct tnode *oldtnode = tn; 798 struct tnode *oldtnode = tn;
705 struct node *left, *right; 799 struct node *left, *right;
@@ -710,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
710 804
711 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); 805 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
712 806
713 if(!tn) 807 if (!tn) {
714 trie_bug("tnode_new failed"); 808 *err = -ENOMEM;
809 return oldtnode;
810 }
811
812 /*
813 * Preallocate and store tnodes before the actual work so we
814 * don't get into an inconsistent state if memory allocation
815 * fails. In case of failure we return the oldnode and halve
816 * of tnode is ignored.
817 */
818
819 for(i = 0; i < olen; i += 2) {
820 left = tnode_get_child(oldtnode, i);
821 right = tnode_get_child(oldtnode, i+1);
822
823 /* Two nonempty children */
824 if( left && right) {
825 struct tnode *newBinNode =
826 tnode_new(left->key, tn->pos + tn->bits, 1);
827
828 if(!newBinNode) {
829 *err = -ENOMEM;
830 break;
831 }
832 put_child(t, tn, i/2, (struct node *)newBinNode);
833 }
834 }
835
836 if(*err) {
837 int size = tnode_child_length(tn);
838 int j;
839
840 for(j = 0; j < size; j++)
841 if( tn->child[j])
842 tnode_free((struct tnode *)tn->child[j]);
843
844 tnode_free(tn);
845
846 *err = -ENOMEM;
847 return oldtnode;
848 }
715 849
716 for(i = 0; i < olen; i += 2) { 850 for(i = 0; i < olen; i += 2) {
717 left = tnode_get_child(oldtnode, i); 851 left = tnode_get_child(oldtnode, i);
@@ -728,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
728 /* Two nonempty children */ 862 /* Two nonempty children */
729 else { 863 else {
730 struct tnode *newBinNode = 864 struct tnode *newBinNode =
731 tnode_new(left->key, tn->pos + tn->bits, 1); 865 (struct tnode *) tnode_get_child(tn, i/2);
866 put_child(t, tn, i/2, NULL);
732 867
733 if(!newBinNode) 868 if(!newBinNode)
734 trie_bug("tnode_new failed"); 869 BUG();
735 870
736 put_child(t, newBinNode, 0, left); 871 put_child(t, newBinNode, 0, left);
737 put_child(t, newBinNode, 1, right); 872 put_child(t, newBinNode, 1, right);
@@ -879,8 +1014,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
879 return (struct node*) tn; 1014 return (struct node*) tn;
880} 1015}
881 1016
882static struct list_head * 1017static struct list_head *
883fib_insert_node(struct trie *t, u32 key, int plen) 1018fib_insert_node(struct trie *t, int *err, u32 key, int plen)
884{ 1019{
885 int pos, newpos; 1020 int pos, newpos;
886 struct tnode *tp = NULL, *tn = NULL; 1021 struct tnode *tp = NULL, *tn = NULL;
@@ -940,7 +1075,6 @@ fib_insert_node(struct trie *t, u32 key, int plen)
940 if(tp && IS_LEAF(tp)) 1075 if(tp && IS_LEAF(tp))
941 BUG(); 1076 BUG();
942 1077
943 t->revision++;
944 1078
945 /* Case 1: n is a leaf. Compare prefixes */ 1079 /* Case 1: n is a leaf. Compare prefixes */
946 1080
@@ -949,8 +1083,10 @@ fib_insert_node(struct trie *t, u32 key, int plen)
949 1083
950 li = leaf_info_new(plen); 1084 li = leaf_info_new(plen);
951 1085
952 if(! li) 1086 if(! li) {
953 BUG(); 1087 *err = -ENOMEM;
1088 goto err;
1089 }
954 1090
955 fa_head = &li->falh; 1091 fa_head = &li->falh;
956 insert_leaf_info(&l->list, li); 1092 insert_leaf_info(&l->list, li);
@@ -959,14 +1095,19 @@ fib_insert_node(struct trie *t, u32 key, int plen)
959 t->size++; 1095 t->size++;
960 l = leaf_new(); 1096 l = leaf_new();
961 1097
962 if(! l) 1098 if(! l) {
963 BUG(); 1099 *err = -ENOMEM;
1100 goto err;
1101 }
964 1102
965 l->key = key; 1103 l->key = key;
966 li = leaf_info_new(plen); 1104 li = leaf_info_new(plen);
967 1105
968 if(! li) 1106 if(! li) {
969 BUG(); 1107 tnode_free((struct tnode *) l);
1108 *err = -ENOMEM;
1109 goto err;
1110 }
970 1111
971 fa_head = &li->falh; 1112 fa_head = &li->falh;
972 insert_leaf_info(&l->list, li); 1113 insert_leaf_info(&l->list, li);
@@ -1003,9 +1144,14 @@ fib_insert_node(struct trie *t, u32 key, int plen)
1003 newpos = 0; 1144 newpos = 0;
1004 tn = tnode_new(key, newpos, 1); /* First tnode */ 1145 tn = tnode_new(key, newpos, 1); /* First tnode */
1005 } 1146 }
1006 if(!tn)
1007 trie_bug("tnode_pfx_new failed");
1008 1147
1148 if(!tn) {
1149 free_leaf_info(li);
1150 tnode_free((struct tnode *) l);
1151 *err = -ENOMEM;
1152 goto err;
1153 }
1154
1009 NODE_SET_PARENT(tn, tp); 1155 NODE_SET_PARENT(tn, tp);
1010 1156
1011 missbit=tkey_extract_bits(key, newpos, 1); 1157 missbit=tkey_extract_bits(key, newpos, 1);
@@ -1027,7 +1173,9 @@ fib_insert_node(struct trie *t, u32 key, int plen)
1027 } 1173 }
1028 /* Rebalance the trie */ 1174 /* Rebalance the trie */
1029 t->trie = trie_rebalance(t, tp); 1175 t->trie = trie_rebalance(t, tp);
1030done:; 1176done:
1177 t->revision++;
1178err:;
1031 return fa_head; 1179 return fa_head;
1032} 1180}
1033 1181
@@ -1156,8 +1304,12 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1156 * Insert new entry to the list. 1304 * Insert new entry to the list.
1157 */ 1305 */
1158 1306
1159 if(!fa_head) 1307 if(!fa_head) {
1160 fa_head = fib_insert_node(t, key, plen); 1308 fa_head = fib_insert_node(t, &err, key, plen);
1309 err = 0;
1310 if(err)
1311 goto out_free_new_fa;
1312 }
1161 1313
1162 write_lock_bh(&fib_lock); 1314 write_lock_bh(&fib_lock);
1163 1315
@@ -1170,6 +1322,9 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1170 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); 1322 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1171succeeded: 1323succeeded:
1172 return 0; 1324 return 0;
1325
1326out_free_new_fa:
1327 kmem_cache_free(fn_alias_kmem, new_fa);
1173out: 1328out:
1174 fib_release_info(fi); 1329 fib_release_info(fi);
1175err:; 1330err:;
@@ -2279,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2279 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); 2434 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2280 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); 2435 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2281 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); 2436 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2437 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2282#ifdef CLEAR_STATS 2438#ifdef CLEAR_STATS
2283 memset(&(t->stats), 0, sizeof(t->stats)); 2439 memset(&(t->stats), 0, sizeof(t->stats));
2284#endif 2440#endif
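
The fib_trie resize/inflate/halve rework above preallocates every tnode it will need and, if any allocation fails, frees the partial set and hands back the old node with -ENOMEM instead of hitting BUG(), so the trie is never left half-rebuilt. A generic sketch of that allocate-everything-then-commit pattern; grow_table() is an invented example, not trie code.

#include <stdio.h>
#include <stdlib.h>

/* grow a table of heap-allocated ints without ever corrupting the old one */
static int grow_table(int ***table, int old_n, int new_n)
{
    int **fresh;
    int i;

    /* phase 1: allocate everything that will be needed */
    fresh = calloc(new_n, sizeof(*fresh));
    if (!fresh)
        return -1;
    for (i = 0; i < new_n; i++) {
        fresh[i] = malloc(sizeof(int));
        if (!fresh[i])
            goto undo;          /* roll back, old table untouched */
        *fresh[i] = 0;
    }

    /* phase 2: commit -- move the old entries over, release the old table */
    for (i = 0; i < old_n; i++) {
        *fresh[i] = *(*table)[i];
        free((*table)[i]);
    }
    free(*table);
    *table = fresh;
    return 0;

undo:
    while (--i >= 0)
        free(fresh[i]);
    free(fresh);
    return -1;
}

int main(void)
{
    int **t = calloc(2, sizeof(*t));
    int i, n = 2;

    for (i = 0; i < 2; i++) {
        t[i] = malloc(sizeof(int));
        *t[i] = i + 1;
    }
    if (grow_table(&t, 2, 4) == 0)
        n = 4;
    for (i = 0; i < n; i++)
        printf("%d ", *t[i]);
    printf("\n");
    for (i = 0; i < n; i++)
        free(t[i]);
    free(t);
    return 0;
}
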
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index af2ec88bbb2f..c703528e0bcd 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -283,14 +283,18 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
283{ 283{
284 struct net_device *dev = skb->dev; 284 struct net_device *dev = skb->dev;
285 struct iphdr *iph = skb->nh.iph; 285 struct iphdr *iph = skb->nh.iph;
286 int err;
286 287
287 /* 288 /*
288 * Initialise the virtual path cache for the packet. It describes 289 * Initialise the virtual path cache for the packet. It describes
289 * how the packet travels inside Linux networking. 290 * how the packet travels inside Linux networking.
290 */ 291 */
291 if (skb->dst == NULL) { 292 if (skb->dst == NULL) {
292 if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) 293 if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
294 if (err == -EHOSTUNREACH)
295 IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
293 goto drop; 296 goto drop;
297 }
294 } 298 }
295 299
296#ifdef CONFIG_NET_CLS_ROUTE 300#ifdef CONFIG_NET_CLS_ROUTE
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ee07aec215a0..9de83e6e0f1d 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -188,7 +188,13 @@ static inline int ip_finish_output2(struct sk_buff *skb)
188 skb = skb2; 188 skb = skb2;
189 } 189 }
190 190
191 nf_reset(skb); 191#ifdef CONFIG_BRIDGE_NETFILTER
192 /* bridge-netfilter defers calling some IP hooks to the bridge layer
193 * and still needs the conntrack reference.
194 */
195 if (skb->nf_bridge == NULL)
196#endif
197 nf_reset(skb);
192 198
193 if (hh) { 199 if (hh) {
194 int hh_alen; 200 int hh_alen;
@@ -383,7 +389,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
383 to->pkt_type = from->pkt_type; 389 to->pkt_type = from->pkt_type;
384 to->priority = from->priority; 390 to->priority = from->priority;
385 to->protocol = from->protocol; 391 to->protocol = from->protocol;
386 to->security = from->security;
387 dst_release(to->dst); 392 dst_release(to->dst);
388 to->dst = dst_clone(from->dst); 393 to->dst = dst_clone(from->dst);
389 to->dev = from->dev; 394 to->dev = from->dev;
@@ -1323,23 +1328,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1323 ip_rt_put(rt); 1328 ip_rt_put(rt);
1324} 1329}
1325 1330
1326/*
1327 * IP protocol layer initialiser
1328 */
1329
1330static struct packet_type ip_packet_type = {
1331 .type = __constant_htons(ETH_P_IP),
1332 .func = ip_rcv,
1333};
1334
1335/*
1336 * IP registers the packet type and then calls the subprotocol initialisers
1337 */
1338
1339void __init ip_init(void) 1331void __init ip_init(void)
1340{ 1332{
1341 dev_add_pack(&ip_packet_type);
1342
1343 ip_rt_init(); 1333 ip_rt_init();
1344 inet_initpeers(); 1334 inet_initpeers();
1345 1335
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f2509034ce72..d2bf8e1930a3 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1149,8 +1149,10 @@ static int __init ic_dynamic(void)
1149 ic_rarp_cleanup(); 1149 ic_rarp_cleanup();
1150#endif 1150#endif
1151 1151
1152 if (!ic_got_reply) 1152 if (!ic_got_reply) {
1153 ic_myaddr = INADDR_NONE;
1153 return -1; 1154 return -1;
1155 }
1154 1156
1155 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ", 1157 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
1156 ((ic_got_reply & IC_RARP) ? "RARP" 1158 ((ic_got_reply & IC_RARP) ? "RARP"
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e4f809a93f47..7833d920bdba 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -297,6 +297,7 @@ static int vif_delete(int vifi)
297static void ipmr_destroy_unres(struct mfc_cache *c) 297static void ipmr_destroy_unres(struct mfc_cache *c)
298{ 298{
299 struct sk_buff *skb; 299 struct sk_buff *skb;
300 struct nlmsgerr *e;
300 301
301 atomic_dec(&cache_resolve_queue_len); 302 atomic_dec(&cache_resolve_queue_len);
302 303
@@ -306,7 +307,9 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
306 nlh->nlmsg_type = NLMSG_ERROR; 307 nlh->nlmsg_type = NLMSG_ERROR;
307 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 308 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
308 skb_trim(skb, nlh->nlmsg_len); 309 skb_trim(skb, nlh->nlmsg_len);
309 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT; 310 e = NLMSG_DATA(nlh);
311 e->error = -ETIMEDOUT;
312 memset(&e->msg, 0, sizeof(e->msg));
310 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); 313 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
311 } else 314 } else
312 kfree_skb(skb); 315 kfree_skb(skb);
@@ -499,6 +502,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void)
499static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) 502static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
500{ 503{
501 struct sk_buff *skb; 504 struct sk_buff *skb;
505 struct nlmsgerr *e;
502 506
503 /* 507 /*
504 * Play the pending entries through our router 508 * Play the pending entries through our router
@@ -515,7 +519,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
515 nlh->nlmsg_type = NLMSG_ERROR; 519 nlh->nlmsg_type = NLMSG_ERROR;
516 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 520 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
517 skb_trim(skb, nlh->nlmsg_len); 521 skb_trim(skb, nlh->nlmsg_len);
518 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; 522 e = NLMSG_DATA(nlh);
523 e->error = -EMSGSIZE;
524 memset(&e->msg, 0, sizeof(e->msg));
519 } 525 }
520 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); 526 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
521 } else 527 } else
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index fd6feb5499fe..9f16ab309106 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -548,7 +548,6 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
548{ 548{
549 if (del_timer(&cp->timer)) 549 if (del_timer(&cp->timer))
550 mod_timer(&cp->timer, jiffies); 550 mod_timer(&cp->timer, jiffies);
551 __ip_vs_conn_put(cp);
552} 551}
553 552
554 553
@@ -764,7 +763,6 @@ void ip_vs_random_dropentry(void)
764{ 763{
765 int idx; 764 int idx;
766 struct ip_vs_conn *cp; 765 struct ip_vs_conn *cp;
767 struct ip_vs_conn *ct;
768 766
769 /* 767 /*
770 * Randomly scan 1/32 of the whole table every second 768 * Randomly scan 1/32 of the whole table every second
@@ -801,21 +799,12 @@ void ip_vs_random_dropentry(void)
801 continue; 799 continue;
802 } 800 }
803 801
804 /*
805 * Drop the entry, and drop its ct if not referenced
806 */
807 atomic_inc(&cp->refcnt);
808 ct_write_unlock(hash);
809
810 if ((ct = cp->control))
811 atomic_inc(&ct->refcnt);
812 IP_VS_DBG(4, "del connection\n"); 802 IP_VS_DBG(4, "del connection\n");
813 ip_vs_conn_expire_now(cp); 803 ip_vs_conn_expire_now(cp);
814 if (ct) { 804 if (cp->control) {
815 IP_VS_DBG(4, "del conn template\n"); 805 IP_VS_DBG(4, "del conn template\n");
816 ip_vs_conn_expire_now(ct); 806 ip_vs_conn_expire_now(cp->control);
817 } 807 }
818 ct_write_lock(hash);
819 } 808 }
820 ct_write_unlock(hash); 809 ct_write_unlock(hash);
821 } 810 }
@@ -829,7 +818,6 @@ static void ip_vs_conn_flush(void)
829{ 818{
830 int idx; 819 int idx;
831 struct ip_vs_conn *cp; 820 struct ip_vs_conn *cp;
832 struct ip_vs_conn *ct;
833 821
834 flush_again: 822 flush_again:
835 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { 823 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
@@ -839,18 +827,13 @@ static void ip_vs_conn_flush(void)
839 ct_write_lock_bh(idx); 827 ct_write_lock_bh(idx);
840 828
841 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 829 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
842 atomic_inc(&cp->refcnt);
843 ct_write_unlock(idx);
844 830
845 if ((ct = cp->control))
846 atomic_inc(&ct->refcnt);
847 IP_VS_DBG(4, "del connection\n"); 831 IP_VS_DBG(4, "del connection\n");
848 ip_vs_conn_expire_now(cp); 832 ip_vs_conn_expire_now(cp);
849 if (ct) { 833 if (cp->control) {
850 IP_VS_DBG(4, "del conn template\n"); 834 IP_VS_DBG(4, "del conn template\n");
851 ip_vs_conn_expire_now(ct); 835 ip_vs_conn_expire_now(cp->control);
852 } 836 }
853 ct_write_lock(idx);
854 } 837 }
855 ct_write_unlock_bh(idx); 838 ct_write_unlock_bh(idx);
856 } 839 }
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 218d9701036e..12a82e91d22a 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -2059,7 +2059,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2059 dst->addr = src->addr; 2059 dst->addr = src->addr;
2060 dst->port = src->port; 2060 dst->port = src->port;
2061 dst->fwmark = src->fwmark; 2061 dst->fwmark = src->fwmark;
2062 strcpy(dst->sched_name, src->scheduler->name); 2062 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2063 dst->flags = src->flags; 2063 dst->flags = src->flags;
2064 dst->timeout = src->timeout / HZ; 2064 dst->timeout = src->timeout / HZ;
2065 dst->netmask = src->netmask; 2065 dst->netmask = src->netmask;
@@ -2080,6 +2080,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081 if (count >= get->num_services) 2081 if (count >= get->num_services)
2082 goto out; 2082 goto out;
2083 memset(&entry, 0, sizeof(entry));
2083 ip_vs_copy_service(&entry, svc); 2084 ip_vs_copy_service(&entry, svc);
2084 if (copy_to_user(&uptr->entrytable[count], 2085 if (copy_to_user(&uptr->entrytable[count],
2085 &entry, sizeof(entry))) { 2086 &entry, sizeof(entry))) {
@@ -2094,6 +2095,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2094 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 2095 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2095 if (count >= get->num_services) 2096 if (count >= get->num_services)
2096 goto out; 2097 goto out;
2098 memset(&entry, 0, sizeof(entry));
2097 ip_vs_copy_service(&entry, svc); 2099 ip_vs_copy_service(&entry, svc);
2098 if (copy_to_user(&uptr->entrytable[count], 2100 if (copy_to_user(&uptr->entrytable[count],
2099 &entry, sizeof(entry))) { 2101 &entry, sizeof(entry))) {
@@ -2304,12 +2306,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2304 memset(&d, 0, sizeof(d)); 2306 memset(&d, 0, sizeof(d));
2305 if (ip_vs_sync_state & IP_VS_STATE_MASTER) { 2307 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2306 d[0].state = IP_VS_STATE_MASTER; 2308 d[0].state = IP_VS_STATE_MASTER;
2307 strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn); 2309 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2308 d[0].syncid = ip_vs_master_syncid; 2310 d[0].syncid = ip_vs_master_syncid;
2309 } 2311 }
2310 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { 2312 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2311 d[1].state = IP_VS_STATE_BACKUP; 2313 d[1].state = IP_VS_STATE_BACKUP;
2312 strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn); 2314 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2313 d[1].syncid = ip_vs_backup_syncid; 2315 d[1].syncid = ip_vs_backup_syncid;
2314 } 2316 }
2315 if (copy_to_user(user, &d, sizeof(d)) != 0) 2317 if (copy_to_user(user, &d, sizeof(d)) != 0)
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 25c479550a32..574d1f509b46 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -839,10 +839,10 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
839 839
840 ip_vs_sync_state |= state; 840 ip_vs_sync_state |= state;
841 if (state == IP_VS_STATE_MASTER) { 841 if (state == IP_VS_STATE_MASTER) {
842 strcpy(ip_vs_master_mcast_ifn, mcast_ifn); 842 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
843 ip_vs_master_syncid = syncid; 843 ip_vs_master_syncid = syncid;
844 } else { 844 } else {
845 strcpy(ip_vs_backup_mcast_ifn, mcast_ifn); 845 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
846 ip_vs_backup_syncid = syncid; 846 ip_vs_backup_syncid = syncid;
847 } 847 }
848 848
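
The ip_vs_sync.c hunk above, like the ip_vs_ctl.c ones, swaps strcpy() for strlcpy() so a scheduler or interface name can never overrun the fixed-size field it is copied into. strlcpy() is not available in every userspace C library, so the sketch below carries a minimal local version with the usual semantics (always NUL-terminate, return the source length); sync_daemon and its 16-byte field are illustrative only.

#include <stdio.h>
#include <string.h>

/* bounded copy: truncate to size-1 bytes, always NUL-terminate */
static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
    size_t len = strlen(src);

    if (size) {
        size_t n = (len >= size) ? size - 1 : len;

        memcpy(dst, src, n);
        dst[n] = '\0';
    }
    return len;     /* a return value >= size means the copy was truncated */
}

struct sync_daemon {
    char mcast_ifn[16];     /* fixed-size field, as in the IPVS structs */
};

int main(void)
{
    struct sync_daemon d;
    const char *name = "a-very-long-interface-name";

    if (my_strlcpy(d.mcast_ifn, name, sizeof(d.mcast_ifn)) >=
        sizeof(d.mcast_ifn))
        printf("name truncated to \"%s\"\n", d.mcast_ifn);
    return 0;
}
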
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 9cde8c61f525..6706d3a1bc4f 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -30,7 +30,7 @@
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> 30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h> 31#include <linux/netfilter_ipv4/ip_conntrack.h>
32 32
33#define CLUSTERIP_VERSION "0.6" 33#define CLUSTERIP_VERSION "0.7"
34 34
35#define DEBUG_CLUSTERIP 35#define DEBUG_CLUSTERIP
36 36
@@ -524,8 +524,9 @@ arp_mangle(unsigned int hook,
524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) 524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
525 return NF_ACCEPT; 525 return NF_ACCEPT;
526 526
527 /* we only want to mangle arp replies */ 527 /* we only want to mangle arp requests and replies */
528 if (arp->ar_op != htons(ARPOP_REPLY)) 528 if (arp->ar_op != htons(ARPOP_REPLY)
529 && arp->ar_op != htons(ARPOP_REQUEST))
529 return NF_ACCEPT; 530 return NF_ACCEPT;
530 531
531 payload = (void *)(arp+1); 532 payload = (void *)(arp+1);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 80cf633d9f4a..726ea5e8180a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
54 * Marc Boucher : routing by fwmark 54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics 55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file 56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * 58 *
58 * This program is free software; you can redistribute it and/or 59 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License 60 * modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
70#include <linux/kernel.h> 71#include <linux/kernel.h>
71#include <linux/sched.h> 72#include <linux/sched.h>
72#include <linux/mm.h> 73#include <linux/mm.h>
74#include <linux/bootmem.h>
73#include <linux/string.h> 75#include <linux/string.h>
74#include <linux/socket.h> 76#include <linux/socket.h>
75#include <linux/sockios.h> 77#include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
201 203
202struct rt_hash_bucket { 204struct rt_hash_bucket {
203 struct rtable *chain; 205 struct rtable *chain;
204 spinlock_t lock; 206};
205} __attribute__((__aligned__(8))); 207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208/*
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
211 */
212#if NR_CPUS >= 32
213#define RT_HASH_LOCK_SZ 4096
214#elif NR_CPUS >= 16
215#define RT_HASH_LOCK_SZ 2048
216#elif NR_CPUS >= 8
217#define RT_HASH_LOCK_SZ 1024
218#elif NR_CPUS >= 4
219#define RT_HASH_LOCK_SZ 512
220#else
221#define RT_HASH_LOCK_SZ 256
222#endif
223
224static spinlock_t *rt_hash_locks;
225# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226# define rt_hash_lock_init() { \
227 int i; \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
232 }
233#else
234# define rt_hash_lock_addr(slot) NULL
235# define rt_hash_lock_init()
236#endif
206 237
207static struct rt_hash_bucket *rt_hash_table; 238static struct rt_hash_bucket *rt_hash_table;
208static unsigned rt_hash_mask; 239static unsigned rt_hash_mask;
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
575/* This runs via a timer and thus is always in BH context. */ 606/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy) 607static void rt_check_expire(unsigned long dummy)
577{ 608{
578 static int rover; 609 static unsigned int rover;
579 int i = rover, t; 610 unsigned int i = rover, goal;
580 struct rtable *rth, **rthp; 611 struct rtable *rth, **rthp;
581 unsigned long now = jiffies; 612 unsigned long now = jiffies;
582 613 u64 mult;
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; 614
584 t -= ip_rt_gc_timeout) { 615 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
585 unsigned long tmo = ip_rt_gc_timeout; 621 unsigned long tmo = ip_rt_gc_timeout;
586 622
587 i = (i + 1) & rt_hash_mask; 623 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain; 624 rthp = &rt_hash_table[i].chain;
589 625
590 spin_lock(&rt_hash_table[i].lock); 626 if (*rthp == 0)
627 continue;
628 spin_lock(rt_hash_lock_addr(i));
591 while ((rth = *rthp) != NULL) { 629 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) { 630 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */ 631 /* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
620 rt_free(rth); 658 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 659#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 } 660 }
623 spin_unlock(&rt_hash_table[i].lock); 661 spin_unlock(rt_hash_lock_addr(i));
624 662
625 /* Fallback loop breaker. */ 663 /* Fallback loop breaker. */
626 if (time_after(jiffies, now)) 664 if (time_after(jiffies, now))
627 break; 665 break;
628 } 666 }
629 rover = i; 667 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); 668 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
631} 669}
632 670
633/* This can run from both BH and non-BH contexts, the latter 671/* This can run from both BH and non-BH contexts, the latter
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
643 get_random_bytes(&rt_hash_rnd, 4); 681 get_random_bytes(&rt_hash_rnd, 4);
644 682
645 for (i = rt_hash_mask; i >= 0; i--) { 683 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock); 684 spin_lock_bh(rt_hash_lock_addr(i));
647 rth = rt_hash_table[i].chain; 685 rth = rt_hash_table[i].chain;
648 if (rth) 686 if (rth)
649 rt_hash_table[i].chain = NULL; 687 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock); 688 spin_unlock_bh(rt_hash_lock_addr(i));
651 689
652 for (; rth; rth = next) { 690 for (; rth; rth = next) {
653 next = rth->u.rt_next; 691 next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
780 818
781 k = (k + 1) & rt_hash_mask; 819 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain; 820 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock); 821 spin_lock_bh(rt_hash_lock_addr(k));
784 while ((rth = *rthp) != NULL) { 822 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) { 823 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1; 824 tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
812 goal--; 850 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 851#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 } 852 }
815 spin_unlock_bh(&rt_hash_table[k].lock); 853 spin_unlock_bh(rt_hash_lock_addr(k));
816 if (goal <= 0) 854 if (goal <= 0)
817 break; 855 break;
818 } 856 }
@@ -882,7 +920,7 @@ restart:
882 920
883 rthp = &rt_hash_table[hash].chain; 921 rthp = &rt_hash_table[hash].chain;
884 922
885 spin_lock_bh(&rt_hash_table[hash].lock); 923 spin_lock_bh(rt_hash_lock_addr(hash));
886 while ((rth = *rthp) != NULL) { 924 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 925#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) && 926 if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
908 rth->u.dst.__use++; 946 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst); 947 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now; 948 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock); 949 spin_unlock_bh(rt_hash_lock_addr(hash));
912 950
913 rt_drop(rt); 951 rt_drop(rt);
914 *rp = rth; 952 *rp = rth;
@@ -949,7 +987,7 @@ restart:
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst); 988 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) { 989 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock); 990 spin_unlock_bh(rt_hash_lock_addr(hash));
953 991
954 if (err != -ENOBUFS) { 992 if (err != -ENOBUFS) {
955 rt_drop(rt); 993 rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
990 } 1028 }
991#endif 1029#endif
992 rt_hash_table[hash].chain = rt; 1030 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock); 1031 spin_unlock_bh(rt_hash_lock_addr(hash));
994 *rp = rt; 1032 *rp = rt;
995 return 0; 1033 return 0;
996} 1034}
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1058{ 1096{
1059 struct rtable **rthp; 1097 struct rtable **rthp;
1060 1098
1061 spin_lock_bh(&rt_hash_table[hash].lock); 1099 spin_lock_bh(rt_hash_lock_addr(hash));
1062 ip_rt_put(rt); 1100 ip_rt_put(rt);
1063 for (rthp = &rt_hash_table[hash].chain; *rthp; 1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1064 rthp = &(*rthp)->u.rt_next) 1102 rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1067 rt_free(rt); 1105 rt_free(rt);
1068 break; 1106 break;
1069 } 1107 }
1070 spin_unlock_bh(&rt_hash_table[hash].lock); 1108 spin_unlock_bh(rt_hash_lock_addr(hash));
1071} 1109}
1072 1110
1073void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, 1111void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -1909,7 +1947,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1909 */ 1947 */
1910 if ((err = fib_lookup(&fl, &res)) != 0) { 1948 if ((err = fib_lookup(&fl, &res)) != 0) {
1911 if (!IN_DEV_FORWARD(in_dev)) 1949 if (!IN_DEV_FORWARD(in_dev))
1912 goto e_inval; 1950 goto e_hostunreach;
1913 goto no_route; 1951 goto no_route;
1914 } 1952 }
1915 free_res = 1; 1953 free_res = 1;
@@ -1933,7 +1971,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1933 } 1971 }
1934 1972
1935 if (!IN_DEV_FORWARD(in_dev)) 1973 if (!IN_DEV_FORWARD(in_dev))
1936 goto e_inval; 1974 goto e_hostunreach;
1937 if (res.type != RTN_UNICAST) 1975 if (res.type != RTN_UNICAST)
1938 goto martian_destination; 1976 goto martian_destination;
1939 1977
@@ -2025,6 +2063,11 @@ martian_destination:
2025 "%u.%u.%u.%u, dev %s\n", 2063 "%u.%u.%u.%u, dev %s\n",
2026 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 2064 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027#endif 2065#endif
2066
2067e_hostunreach:
2068 err = -EHOSTUNREACH;
2069 goto done;
2070
2028e_inval: 2071e_inval:
2029 err = -EINVAL; 2072 err = -EINVAL;
2030 goto done; 2073 goto done;
@@ -3068,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
3068 3111
3069int __init ip_rt_init(void) 3112int __init ip_rt_init(void)
3070{ 3113{
3071 int i, order, goal, rc = 0; 3114 int rc = 0;
3072 3115
3073 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ 3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3074 (jiffies ^ (jiffies >> 7))); 3117 (jiffies ^ (jiffies >> 7)));
3075 3118
3076#ifdef CONFIG_NET_CLS_ROUTE 3119#ifdef CONFIG_NET_CLS_ROUTE
3120 {
3121 int order;
3077 for (order = 0; 3122 for (order = 0;
3078 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) 3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3079 /* NOTHING */; 3124 /* NOTHING */;
@@ -3081,6 +3126,7 @@ int __init ip_rt_init(void)
3081 if (!ip_rt_acct) 3126 if (!ip_rt_acct)
3082 panic("IP: failed to allocate ip_rt_acct\n"); 3127 panic("IP: failed to allocate ip_rt_acct\n");
3083 memset(ip_rt_acct, 0, PAGE_SIZE << order); 3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 }
3084#endif 3130#endif
3085 3131
3086 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", 3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3091,36 +3137,19 @@ int __init ip_rt_init(void)
3091 if (!ipv4_dst_ops.kmem_cachep) 3137 if (!ipv4_dst_ops.kmem_cachep)
3092 panic("IP: failed to allocate ip_dst_cache\n"); 3138 panic("IP: failed to allocate ip_dst_cache\n");
3093 3139
3094 goal = num_physpages >> (26 - PAGE_SHIFT); 3140 rt_hash_table = (struct rt_hash_bucket *)
3095 if (rhash_entries) 3141 alloc_large_system_hash("IP route cache",
3096 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; 3142 sizeof(struct rt_hash_bucket),
3097 for (order = 0; (1UL << order) < goal; order++) 3143 rhash_entries,
3098 /* NOTHING */; 3144 (num_physpages >= 128 * 1024) ?
3099 3145 (27 - PAGE_SHIFT) :
3100 do { 3146 (29 - PAGE_SHIFT),
3101 rt_hash_mask = (1UL << order) * PAGE_SIZE / 3147 HASH_HIGHMEM,
3102 sizeof(struct rt_hash_bucket); 3148 &rt_hash_log,
3103 while (rt_hash_mask & (rt_hash_mask - 1)) 3149 &rt_hash_mask,
3104 rt_hash_mask--; 3150 0);
3105 rt_hash_table = (struct rt_hash_bucket *) 3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3106 __get_free_pages(GFP_ATOMIC, order); 3152 rt_hash_lock_init();
3107 } while (rt_hash_table == NULL && --order > 0);
3108
3109 if (!rt_hash_table)
3110 panic("Failed to allocate IP route cache hash table\n");
3111
3112 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3113 rt_hash_mask,
3114 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3115
3116 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3117 /* NOTHING */;
3118
3119 rt_hash_mask--;
3120 for (i = 0; i <= rt_hash_mask; i++) {
3121 spin_lock_init(&rt_hash_table[i].lock);
3122 rt_hash_table[i].chain = NULL;
3123 }
3124 3153
3125 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3126 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
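
The per-tick scan goal that replaces the old countdown loop in rt_check_expire() works out to (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout buckets, now computed with 64-bit arithmetic and clamped to the table size, and empty buckets are skipped without taking the per-bucket lock. A minimal user-space model of that computation follows; the parameter values are illustrative, not taken from a running kernel.

/* Model of the scan-goal computation added to rt_check_expire() above:
 * each timer tick walks only enough hash buckets that the whole table
 * is covered roughly once per ip_rt_gc_timeout.
 */
#include <stdio.h>
#include <stdint.h>

static unsigned int scan_goal(uint64_t gc_interval, unsigned int hash_log,
			      uint64_t gc_timeout, unsigned int hash_mask)
{
	uint64_t mult = gc_interval << hash_log;	/* interval * table size */

	if (gc_timeout > 1)
		mult /= gc_timeout;			/* buckets per tick */
	if (mult > hash_mask)
		mult = hash_mask + 1;			/* never more than the table */
	return (unsigned int)mult;
}

int main(void)
{
	/* e.g. 60 s interval, 300 s timeout, 2^17 buckets (illustrative) */
	unsigned int hash_log = 17, hash_mask = (1u << hash_log) - 1;

	printf("buckets scanned per tick: %u\n",
	       scan_goal(60, hash_log, 300, hash_mask));
	return 0;
}
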
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 882436da9a3a..29894c749163 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
615 size_t psize, int flags) 615 size_t psize, int flags)
616{ 616{
617 struct tcp_sock *tp = tcp_sk(sk); 617 struct tcp_sock *tp = tcp_sk(sk);
618 int mss_now; 618 int mss_now, size_goal;
619 int err; 619 int err;
620 ssize_t copied; 620 ssize_t copied;
621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629 629
630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 size_goal = tp->xmit_size_goal;
631 copied = 0; 632 copied = 0;
632 633
633 err = -EPIPE; 634 err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
641 int offset = poffset % PAGE_SIZE; 642 int offset = poffset % PAGE_SIZE;
642 int size = min_t(size_t, psize, PAGE_SIZE - offset); 643 int size = min_t(size_t, psize, PAGE_SIZE - offset);
643 644
644 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { 645 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
645new_segment: 646new_segment:
646 if (!sk_stream_memory_free(sk)) 647 if (!sk_stream_memory_free(sk))
647 goto wait_for_sndbuf; 648 goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
652 goto wait_for_memory; 653 goto wait_for_memory;
653 654
654 skb_entail(sk, tp, skb); 655 skb_entail(sk, tp, skb);
655 copy = mss_now; 656 copy = size_goal;
656 } 657 }
657 658
658 if (copy > size) 659 if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
693 if (!(psize -= copy)) 694 if (!(psize -= copy))
694 goto out; 695 goto out;
695 696
696 if (skb->len != mss_now || (flags & MSG_OOB)) 697 if (skb->len < mss_now || (flags & MSG_OOB))
697 continue; 698 continue;
698 699
699 if (forced_push(tp)) { 700 if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
713 goto do_error; 714 goto do_error;
714 715
715 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 716 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717 size_goal = tp->xmit_size_goal;
716 } 718 }
717 719
718out: 720out:
@@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
754 756
755static inline int select_size(struct sock *sk, struct tcp_sock *tp) 757static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756{ 758{
757 int tmp = tp->mss_cache_std; 759 int tmp = tp->mss_cache;
758 760
759 if (sk->sk_route_caps & NETIF_F_SG) { 761 if (sk->sk_route_caps & NETIF_F_SG) {
760 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 762 if (sk->sk_route_caps & NETIF_F_TSO)
763 tmp = 0;
764 else {
765 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
761 766
762 if (tmp >= pgbreak && 767 if (tmp >= pgbreak &&
763 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) 768 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
764 tmp = pgbreak; 769 tmp = pgbreak;
770 }
765 } 771 }
772
766 return tmp; 773 return tmp;
767} 774}
768 775
@@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
773 struct tcp_sock *tp = tcp_sk(sk); 780 struct tcp_sock *tp = tcp_sk(sk);
774 struct sk_buff *skb; 781 struct sk_buff *skb;
775 int iovlen, flags; 782 int iovlen, flags;
776 int mss_now; 783 int mss_now, size_goal;
777 int err, copied; 784 int err, copied;
778 long timeo; 785 long timeo;
779 786
@@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
792 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 799 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
793 800
794 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 801 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802 size_goal = tp->xmit_size_goal;
795 803
796 /* Ok commence sending. */ 804 /* Ok commence sending. */
797 iovlen = msg->msg_iovlen; 805 iovlen = msg->msg_iovlen;
@@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
814 skb = sk->sk_write_queue.prev; 822 skb = sk->sk_write_queue.prev;
815 823
816 if (!sk->sk_send_head || 824 if (!sk->sk_send_head ||
817 (copy = mss_now - skb->len) <= 0) { 825 (copy = size_goal - skb->len) <= 0) {
818 826
819new_segment: 827new_segment:
820 /* Allocate new segment. If the interface is SG, 828 /* Allocate new segment. If the interface is SG,
@@ -837,7 +845,7 @@ new_segment:
837 skb->ip_summed = CHECKSUM_HW; 845 skb->ip_summed = CHECKSUM_HW;
838 846
839 skb_entail(sk, tp, skb); 847 skb_entail(sk, tp, skb);
840 copy = mss_now; 848 copy = size_goal;
841 } 849 }
842 850
843 /* Try to append data to the end of skb. */ 851 /* Try to append data to the end of skb. */
@@ -872,11 +880,6 @@ new_segment:
872 tcp_mark_push(tp, skb); 880 tcp_mark_push(tp, skb);
873 goto new_segment; 881 goto new_segment;
874 } else if (page) { 882 } else if (page) {
875 /* If page is cached, align
876 * offset to L1 cache boundary
877 */
878 off = (off + L1_CACHE_BYTES - 1) &
879 ~(L1_CACHE_BYTES - 1);
880 if (off == PAGE_SIZE) { 883 if (off == PAGE_SIZE) {
881 put_page(page); 884 put_page(page);
882 TCP_PAGE(sk) = page = NULL; 885 TCP_PAGE(sk) = page = NULL;
@@ -937,7 +940,7 @@ new_segment:
937 if ((seglen -= copy) == 0 && iovlen == 0) 940 if ((seglen -= copy) == 0 && iovlen == 0)
938 goto out; 941 goto out;
939 942
940 if (skb->len != mss_now || (flags & MSG_OOB)) 943 if (skb->len < mss_now || (flags & MSG_OOB))
941 continue; 944 continue;
942 945
943 if (forced_push(tp)) { 946 if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
957 goto do_error; 960 goto do_error;
958 961
959 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 962 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963 size_goal = tp->xmit_size_goal;
960 } 964 }
961 } 965 }
962 966
@@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2128 2132
2129 info->tcpi_rto = jiffies_to_usecs(tp->rto); 2133 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2130 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); 2134 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2131 info->tcpi_snd_mss = tp->mss_cache_std; 2135 info->tcpi_snd_mss = tp->mss_cache;
2132 info->tcpi_rcv_mss = tp->ack.rcv_mss; 2136 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2133 2137
2134 info->tcpi_unacked = tp->packets_out; 2138 info->tcpi_unacked = tp->packets_out;
@@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2178 2182
2179 switch (optname) { 2183 switch (optname) {
2180 case TCP_MAXSEG: 2184 case TCP_MAXSEG:
2181 val = tp->mss_cache_std; 2185 val = tp->mss_cache;
2182 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2186 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2183 val = tp->rx_opt.user_mss; 2187 val = tp->rx_opt.user_mss;
2184 break; 2188 break;
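
With tp->xmit_size_goal in play, do_tcp_sendpages() and tcp_sendmsg() above fill the tail skb up to size_goal rather than a single MSS, and only consider pushing once the tail holds at least one full MSS (the skb->len < mss_now test). A toy model of that fill-then-push loop follows; the sizes are purely illustrative and none of the real socket bookkeeping is modelled.

/* Toy model of the sendmsg fill loop after the change above: data is
 * appended to the tail skb until it reaches size_goal (a multiple of
 * the MSS when TSO is on), and a push is considered only once the
 * tail holds at least one full MSS.
 */
#include <stdio.h>

int main(void)
{
	const int mss_now = 1448, size_goal = 4 * 1448;	/* illustrative */
	int to_send = 10000, tail_len = 0, full_segs = 0;

	while (to_send > 0) {
		int copy = size_goal - tail_len;	/* room left in tail skb */

		if (copy <= 0) {			/* tail full: start a new skb */
			full_segs++;
			tail_len = 0;
			copy = size_goal;
		}
		if (copy > to_send)
			copy = to_send;
		tail_len += copy;
		to_send -= copy;

		if (tail_len < mss_now)			/* sub-MSS tail: keep filling */
			continue;
		/* here the real code may push (forced_push / nonagle) */
	}
	printf("queued %d full super-segment(s) plus a %d-byte tail\n",
	       full_segs, tail_len);
	return 0;
}
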
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7bbbbc33eb4b..8de2f1071c2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
741 741
742 if (!cwnd) { 742 if (!cwnd) {
743 if (tp->mss_cache_std > 1460) 743 if (tp->mss_cache > 1460)
744 cwnd = 2; 744 cwnd = 2;
745 else 745 else
746 cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; 746 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
747 } 747 }
748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
749} 749}
@@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
914 if (sk->sk_route_caps & NETIF_F_TSO) { 914 if (sk->sk_route_caps & NETIF_F_TSO) {
915 sk->sk_route_caps &= ~NETIF_F_TSO; 915 sk->sk_route_caps &= ~NETIF_F_TSO;
916 sock_set_flag(sk, SOCK_NO_LARGESEND); 916 sock_set_flag(sk, SOCK_NO_LARGESEND);
917 tp->mss_cache = tp->mss_cache_std; 917 tp->mss_cache = tp->mss_cache;
918 } 918 }
919 919
920 if (!tp->sacked_out) 920 if (!tp->sacked_out)
@@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1077 (IsFack(tp) || 1077 (IsFack(tp) ||
1078 !before(lost_retrans, 1078 !before(lost_retrans,
1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering * 1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering *
1080 tp->mss_cache_std))) { 1080 tp->mss_cache))) {
1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1082 tp->retrans_out -= tcp_skb_pcount(skb); 1082 tp->retrans_out -= tcp_skb_pcount(skb);
1083 1083
@@ -1957,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
1957 } 1957 }
1958} 1958}
1959 1959
1960/* There is one downside to this scheme. Although we keep the
1961 * ACK clock ticking, adjusting packet counters and advancing
1962 * congestion window, we do not liberate socket send buffer
1963 * space.
1964 *
1965 * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
1966 * then making a write space wakeup callback is a possible
1967 * future enhancement. WARNING: it is not trivial to make.
1968 */
1969static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, 1960static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
1970 __u32 now, __s32 *seq_rtt) 1961 __u32 now, __s32 *seq_rtt)
1971{ 1962{
@@ -2047,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2047 * the other end. 2038 * the other end.
2048 */ 2039 */
2049 if (after(scb->end_seq, tp->snd_una)) { 2040 if (after(scb->end_seq, tp->snd_una)) {
2050 if (tcp_skb_pcount(skb) > 1) 2041 if (tcp_skb_pcount(skb) > 1 &&
2042 after(tp->snd_una, scb->seq))
2051 acked |= tcp_tso_acked(sk, skb, 2043 acked |= tcp_tso_acked(sk, skb,
2052 now, &seq_rtt); 2044 now, &seq_rtt);
2053 break; 2045 break;
@@ -3308,6 +3300,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
3308 tp->snd_cwnd_stamp = tcp_time_stamp; 3300 tp->snd_cwnd_stamp = tcp_time_stamp;
3309} 3301}
3310 3302
3303static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
3304{
3305 /* If the user specified a specific send buffer setting, do
3306 * not modify it.
3307 */
3308 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
3309 return 0;
3310
3311 /* If we are under global TCP memory pressure, do not expand. */
3312 if (tcp_memory_pressure)
3313 return 0;
3314
3315 /* If we are under soft global TCP memory pressure, do not expand. */
3316 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
3317 return 0;
3318
3319 /* If we filled the congestion window, do not expand. */
3320 if (tp->packets_out >= tp->snd_cwnd)
3321 return 0;
3322
3323 return 1;
3324}
3311 3325
3312/* When incoming ACK allowed to free some skb from write_queue, 3326/* When incoming ACK allowed to free some skb from write_queue,
3313 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket 3327 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
@@ -3319,11 +3333,8 @@ static void tcp_new_space(struct sock *sk)
3319{ 3333{
3320 struct tcp_sock *tp = tcp_sk(sk); 3334 struct tcp_sock *tp = tcp_sk(sk);
3321 3335
3322 if (tp->packets_out < tp->snd_cwnd && 3336 if (tcp_should_expand_sndbuf(sk, tp)) {
3323 !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && 3337 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
3324 !tcp_memory_pressure &&
3325 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
3326 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
3327 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), 3338 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3328 demanded = max_t(unsigned int, tp->snd_cwnd, 3339 demanded = max_t(unsigned int, tp->snd_cwnd,
3329 tp->reordering + 1); 3340 tp->reordering + 1);
@@ -3346,22 +3357,9 @@ static inline void tcp_check_space(struct sock *sk)
3346 } 3357 }
3347} 3358}
3348 3359
3349static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) 3360static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
3350{
3351 struct tcp_sock *tp = tcp_sk(sk);
3352
3353 if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
3354 tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
3355 tcp_write_xmit(sk, tp->nonagle))
3356 tcp_check_probe_timer(sk, tp);
3357}
3358
3359static __inline__ void tcp_data_snd_check(struct sock *sk)
3360{ 3361{
3361 struct sk_buff *skb = sk->sk_send_head; 3362 tcp_push_pending_frames(sk, tp);
3362
3363 if (skb != NULL)
3364 __tcp_data_snd_check(sk, skb);
3365 tcp_check_space(sk); 3363 tcp_check_space(sk);
3366} 3364}
3367 3365
@@ -3655,7 +3653,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3655 */ 3653 */
3656 tcp_ack(sk, skb, 0); 3654 tcp_ack(sk, skb, 0);
3657 __kfree_skb(skb); 3655 __kfree_skb(skb);
3658 tcp_data_snd_check(sk); 3656 tcp_data_snd_check(sk, tp);
3659 return 0; 3657 return 0;
3660 } else { /* Header too small */ 3658 } else { /* Header too small */
3661 TCP_INC_STATS_BH(TCP_MIB_INERRS); 3659 TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -3721,7 +3719,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3721 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 3719 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
3722 /* Well, only one small jumplet in fast path... */ 3720 /* Well, only one small jumplet in fast path... */
3723 tcp_ack(sk, skb, FLAG_DATA); 3721 tcp_ack(sk, skb, FLAG_DATA);
3724 tcp_data_snd_check(sk); 3722 tcp_data_snd_check(sk, tp);
3725 if (!tcp_ack_scheduled(tp)) 3723 if (!tcp_ack_scheduled(tp))
3726 goto no_ack; 3724 goto no_ack;
3727 } 3725 }
@@ -3799,7 +3797,7 @@ step5:
3799 /* step 7: process the segment text */ 3797 /* step 7: process the segment text */
3800 tcp_data_queue(sk, skb); 3798 tcp_data_queue(sk, skb);
3801 3799
3802 tcp_data_snd_check(sk); 3800 tcp_data_snd_check(sk, tp);
3803 tcp_ack_snd_check(sk); 3801 tcp_ack_snd_check(sk);
3804 return 0; 3802 return 0;
3805 3803
@@ -4109,7 +4107,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4109 /* Do step6 onward by hand. */ 4107 /* Do step6 onward by hand. */
4110 tcp_urg(sk, skb, th); 4108 tcp_urg(sk, skb, th);
4111 __kfree_skb(skb); 4109 __kfree_skb(skb);
4112 tcp_data_snd_check(sk); 4110 tcp_data_snd_check(sk, tp);
4113 return 0; 4111 return 0;
4114 } 4112 }
4115 4113
@@ -4300,7 +4298,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4300 4298
4301 /* tcp_data could move socket to TIME-WAIT */ 4299 /* tcp_data could move socket to TIME-WAIT */
4302 if (sk->sk_state != TCP_CLOSE) { 4300 if (sk->sk_state != TCP_CLOSE) {
4303 tcp_data_snd_check(sk); 4301 tcp_data_snd_check(sk, tp);
4304 tcp_ack_snd_check(sk); 4302 tcp_ack_snd_check(sk);
4305 } 4303 }
4306 4304
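
tcp_should_expand_sndbuf() above only gates whether growth is allowed; the sizing itself (in tcp_new_space()) still reserves room for one maximally sized skb per packet the window could need. A rough user-space model of that calculation follows, with stand-in constants for MAX_TCP_HEADER, sizeof(struct sk_buff) and sysctl_tcp_wmem[2].

/* Rough model of the send-buffer growth done by tcp_new_space() once
 * tcp_should_expand_sndbuf() says growth is allowed.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int mss_clamp = 1460, mss_cache = 1448;	/* illustrative */
	const unsigned int max_tcp_header = 192, skbuff_sz = 240;
	const unsigned int snd_cwnd = 10, reordering = 3;
	const unsigned int wmem_max = 4 * 1024 * 1024;

	unsigned int mss = mss_clamp > mss_cache ? mss_clamp : mss_cache;
	unsigned int sndmem = mss + max_tcp_header + 16 + skbuff_sz;
	unsigned int demanded = snd_cwnd > reordering + 1 ? snd_cwnd
							  : reordering + 1;
	unsigned int sndbuf = sndmem * 2 * demanded;

	if (sndbuf > wmem_max)
		sndbuf = wmem_max;	/* clamped as sysctl_tcp_wmem[2] would */
	printf("proposed sk_sndbuf: %u bytes\n", sndbuf);
	return 0;
}
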
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ebf112347a97..62f62bb05c2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk)
2045 */ 2045 */
2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */ 2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2047 tp->snd_cwnd_clamp = ~0; 2047 tp->snd_cwnd_clamp = ~0;
2048 tp->mss_cache_std = tp->mss_cache = 536; 2048 tp->mss_cache = 536;
2049 2049
2050 tp->reordering = sysctl_tcp_reordering; 2050 tp->reordering = sysctl_tcp_reordering;
2051 tp->ca_ops = &tcp_init_congestion_ops; 2051 tp->ca_ops = &tcp_init_congestion_ops;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0e17c244875c..e041d057ec86 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
49 * will allow a single TSO frame to consume. Building TSO frames 49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty. 50 * which are too large can cause TCP streams to be bursty.
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 8; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
@@ -140,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
140 tp->ack.pingpong = 1; 140 tp->ack.pingpong = 1;
141} 141}
142 142
143static __inline__ void tcp_event_ack_sent(struct sock *sk) 143static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
144{ 144{
145 struct tcp_sock *tp = tcp_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk);
146 146
147 tcp_dec_quickack_mode(tp); 147 tcp_dec_quickack_mode(tp, pkts);
148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK); 148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
149} 149}
150 150
@@ -355,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
355 tp->af_specific->send_check(sk, th, skb->len, skb); 355 tp->af_specific->send_check(sk, th, skb->len, skb);
356 356
357 if (tcb->flags & TCPCB_FLAG_ACK) 357 if (tcb->flags & TCPCB_FLAG_ACK)
358 tcp_event_ack_sent(sk); 358 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
359 359
360 if (skb->len != tcp_header_size) 360 if (skb->len != tcp_header_size)
361 tcp_event_data_sent(tp, skb, sk); 361 tcp_event_data_sent(tp, skb, sk);
@@ -403,42 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
403 sk->sk_send_head = skb; 403 sk->sk_send_head = skb;
404} 404}
405 405
406static inline void tcp_tso_set_push(struct sk_buff *skb) 406static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
407{
408 /* Force push to be on for any TSO frames to workaround
409 * problems with busted implementations like Mac OS-X that
410 * hold off socket receive wakeups until push is seen.
411 */
412 if (tcp_skb_pcount(skb) > 1)
413 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
414}
415
416/* Send _single_ skb sitting at the send head. This function requires
417 * true push pending frames to setup probe timer etc.
418 */
419void tcp_push_one(struct sock *sk, unsigned cur_mss)
420{ 407{
421 struct tcp_sock *tp = tcp_sk(sk); 408 struct tcp_sock *tp = tcp_sk(sk);
422 struct sk_buff *skb = sk->sk_send_head;
423 409
424 if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { 410 if (skb->len <= tp->mss_cache ||
425 /* Send it out now. */
426 TCP_SKB_CB(skb)->when = tcp_time_stamp;
427 tcp_tso_set_push(skb);
428 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
429 sk->sk_send_head = NULL;
430 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
431 tcp_packets_out_inc(sk, tp, skb);
432 return;
433 }
434 }
435}
436
437void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
438{
439 struct tcp_sock *tp = tcp_sk(sk);
440
441 if (skb->len <= tp->mss_cache_std ||
442 !(sk->sk_route_caps & NETIF_F_TSO)) { 411 !(sk->sk_route_caps & NETIF_F_TSO)) {
443 /* Avoid the costly divide in the normal 412 /* Avoid the costly divide in the normal
444 * non-TSO case. 413 * non-TSO case.
@@ -448,10 +417,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
448 } else { 417 } else {
449 unsigned int factor; 418 unsigned int factor;
450 419
451 factor = skb->len + (tp->mss_cache_std - 1); 420 factor = skb->len + (tp->mss_cache - 1);
452 factor /= tp->mss_cache_std; 421 factor /= tp->mss_cache;
453 skb_shinfo(skb)->tso_segs = factor; 422 skb_shinfo(skb)->tso_segs = factor;
454 skb_shinfo(skb)->tso_size = tp->mss_cache_std; 423 skb_shinfo(skb)->tso_size = tp->mss_cache;
455 } 424 }
456} 425}
457 426
@@ -537,6 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
537 } 506 }
538 507
539 /* Link BUFF into the send queue. */ 508 /* Link BUFF into the send queue. */
509 skb_header_release(buff);
540 __skb_append(skb, buff); 510 __skb_append(skb, buff);
541 511
542 return 0; 512 return 0;
@@ -657,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
657 627
658 /* And store cached results */ 628 /* And store cached results */
659 tp->pmtu_cookie = pmtu; 629 tp->pmtu_cookie = pmtu;
660 tp->mss_cache = tp->mss_cache_std = mss_now; 630 tp->mss_cache = mss_now;
661 631
662 return mss_now; 632 return mss_now;
663} 633}
@@ -669,57 +639,316 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
669 * cannot be large. However, taking into account rare use of URG, this 639 * cannot be large. However, taking into account rare use of URG, this
670 * is not a big flaw. 640 * is not a big flaw.
671 */ 641 */
672 642unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
673unsigned int tcp_current_mss(struct sock *sk, int large)
674{ 643{
675 struct tcp_sock *tp = tcp_sk(sk); 644 struct tcp_sock *tp = tcp_sk(sk);
676 struct dst_entry *dst = __sk_dst_get(sk); 645 struct dst_entry *dst = __sk_dst_get(sk);
677 unsigned int do_large, mss_now; 646 u32 mss_now;
647 u16 xmit_size_goal;
648 int doing_tso = 0;
649
650 mss_now = tp->mss_cache;
651
652 if (large_allowed &&
653 (sk->sk_route_caps & NETIF_F_TSO) &&
654 !tp->urg_mode)
655 doing_tso = 1;
678 656
679 mss_now = tp->mss_cache_std;
680 if (dst) { 657 if (dst) {
681 u32 mtu = dst_mtu(dst); 658 u32 mtu = dst_mtu(dst);
682 if (mtu != tp->pmtu_cookie) 659 if (mtu != tp->pmtu_cookie)
683 mss_now = tcp_sync_mss(sk, mtu); 660 mss_now = tcp_sync_mss(sk, mtu);
684 } 661 }
685 662
686 do_large = (large && 663 if (tp->rx_opt.eff_sacks)
687 (sk->sk_route_caps & NETIF_F_TSO) && 664 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
688 !tp->urg_mode); 665 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
689 666
690 if (do_large) { 667 xmit_size_goal = mss_now;
691 unsigned int large_mss, factor, limit;
692 668
693 large_mss = 65535 - tp->af_specific->net_header_len - 669 if (doing_tso) {
670 xmit_size_goal = 65535 -
671 tp->af_specific->net_header_len -
694 tp->ext_header_len - tp->tcp_header_len; 672 tp->ext_header_len - tp->tcp_header_len;
695 673
696 if (tp->max_window && large_mss > (tp->max_window>>1)) 674 if (tp->max_window &&
697 large_mss = max((tp->max_window>>1), 675 (xmit_size_goal > (tp->max_window >> 1)))
698 68U - tp->tcp_header_len); 676 xmit_size_goal = max((tp->max_window >> 1),
677 68U - tp->tcp_header_len);
678
679 xmit_size_goal -= (xmit_size_goal % mss_now);
680 }
681 tp->xmit_size_goal = xmit_size_goal;
699 682
700 factor = large_mss / mss_now; 683 return mss_now;
684}
701 685
702 /* Always keep large mss multiple of real mss, but 686/* Congestion window validation. (RFC2861) */
703 * do not exceed 1/tso_win_divisor of the congestion window
704 * so we can keep the ACK clock ticking and minimize
705 * bursting.
706 */
707 limit = tp->snd_cwnd;
708 if (sysctl_tcp_tso_win_divisor)
709 limit /= sysctl_tcp_tso_win_divisor;
710 limit = max(1U, limit);
711 if (factor > limit)
712 factor = limit;
713 687
714 tp->mss_cache = mss_now * factor; 688static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
689{
690 __u32 packets_out = tp->packets_out;
691
692 if (packets_out >= tp->snd_cwnd) {
693 /* Network is feed fully. */
694 tp->snd_cwnd_used = 0;
695 tp->snd_cwnd_stamp = tcp_time_stamp;
696 } else {
697 /* Network starves. */
698 if (tp->packets_out > tp->snd_cwnd_used)
699 tp->snd_cwnd_used = tp->packets_out;
715 700
716 mss_now = tp->mss_cache; 701 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
702 tcp_cwnd_application_limited(sk);
717 } 703 }
704}
718 705
719 if (tp->rx_opt.eff_sacks) 706static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
720 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + 707{
721 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 708 u32 window, cwnd_len;
722 return mss_now; 709
710 window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
711 cwnd_len = mss_now * cwnd;
712 return min(window, cwnd_len);
713}
714
715/* Can at least one segment of SKB be sent right now, according to the
716 * congestion window rules? If so, return how many segments are allowed.
717 */
718static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
719{
720 u32 in_flight, cwnd;
721
722 /* Don't be strict about the congestion window for the final FIN. */
723 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
724 return 1;
725
726 in_flight = tcp_packets_in_flight(tp);
727 cwnd = tp->snd_cwnd;
728 if (in_flight < cwnd)
729 return (cwnd - in_flight);
730
731 return 0;
732}
733
734/* This must be invoked the first time we consider transmitting
735 * SKB onto the wire.
736 */
737static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
738{
739 int tso_segs = tcp_skb_pcount(skb);
740
741 if (!tso_segs) {
742 tcp_set_skb_tso_segs(sk, skb);
743 tso_segs = tcp_skb_pcount(skb);
744 }
745 return tso_segs;
746}
747
748static inline int tcp_minshall_check(const struct tcp_sock *tp)
749{
750 return after(tp->snd_sml,tp->snd_una) &&
751 !after(tp->snd_sml, tp->snd_nxt);
752}
753
754/* Return 0, if packet can be sent now without violation Nagle's rules:
755 * 1. It is full sized.
756 * 2. Or it contains FIN. (already checked by caller)
757 * 3. Or TCP_NODELAY was set.
758 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
759 * With Minshall's modification: all sent small packets are ACKed.
760 */
761
762static inline int tcp_nagle_check(const struct tcp_sock *tp,
763 const struct sk_buff *skb,
764 unsigned mss_now, int nonagle)
765{
766 return (skb->len < mss_now &&
767 ((nonagle&TCP_NAGLE_CORK) ||
768 (!nonagle &&
769 tp->packets_out &&
770 tcp_minshall_check(tp))));
771}
772
773/* Return non-zero if the Nagle test allows this packet to be
774 * sent now.
775 */
776static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
777 unsigned int cur_mss, int nonagle)
778{
779 /* Nagle rule does not apply to frames, which sit in the middle of the
780 * write_queue (they have no chances to get new data).
781 *
782 * This is implemented in the callers, where they modify the 'nonagle'
783 * argument based upon the location of SKB in the send queue.
784 */
785 if (nonagle & TCP_NAGLE_PUSH)
786 return 1;
787
788 /* Don't use the nagle rule for urgent data (or for the final FIN). */
789 if (tp->urg_mode ||
790 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
791 return 1;
792
793 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
794 return 1;
795
796 return 0;
797}
798
799/* Does at least the first segment of SKB fit into the send window? */
800static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
801{
802 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
803
804 if (skb->len > cur_mss)
805 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
806
807 return !after(end_seq, tp->snd_una + tp->snd_wnd);
808}
809
810/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
811 * should be put on the wire right now. If so, it returns the number of
812 * packets allowed by the congestion window.
813 */
814static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
815 unsigned int cur_mss, int nonagle)
816{
817 struct tcp_sock *tp = tcp_sk(sk);
818 unsigned int cwnd_quota;
819
820 tcp_init_tso_segs(sk, skb);
821
822 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
823 return 0;
824
825 cwnd_quota = tcp_cwnd_test(tp, skb);
826 if (cwnd_quota &&
827 !tcp_snd_wnd_test(tp, skb, cur_mss))
828 cwnd_quota = 0;
829
830 return cwnd_quota;
831}
832
833static inline int tcp_skb_is_last(const struct sock *sk,
834 const struct sk_buff *skb)
835{
836 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
837}
838
839int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
840{
841 struct sk_buff *skb = sk->sk_send_head;
842
843 return (skb &&
844 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
845 (tcp_skb_is_last(sk, skb) ?
846 TCP_NAGLE_PUSH :
847 tp->nonagle)));
848}
849
850/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
851 * which is put after SKB on the list. It is very much like
852 * tcp_fragment() except that it may make several kinds of assumptions
853 * in order to speed up the splitting operation. In particular, we
854 * know that all the data is in scatter-gather pages, and that the
855 * packet has never been sent out before (and thus is not cloned).
856 */
857static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
858{
859 struct sk_buff *buff;
860 int nlen = skb->len - len;
861 u16 flags;
862
863 /* All of a TSO frame must be composed of paged data. */
864 BUG_ON(skb->len != skb->data_len);
865
866 buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
867 if (unlikely(buff == NULL))
868 return -ENOMEM;
869
870 buff->truesize = nlen;
871 skb->truesize -= nlen;
872
873 /* Correct the sequence numbers. */
874 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
875 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
876 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
877
878 /* PSH and FIN should only be set in the second packet. */
879 flags = TCP_SKB_CB(skb)->flags;
880 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
881 TCP_SKB_CB(buff)->flags = flags;
882
883 /* This packet was never sent out yet, so no SACK bits. */
884 TCP_SKB_CB(buff)->sacked = 0;
885
886 buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
887 skb_split(skb, buff, len);
888
889 /* Fix up tso_factor for both original and new SKB. */
890 tcp_set_skb_tso_segs(sk, skb);
891 tcp_set_skb_tso_segs(sk, buff);
892
893 /* Link BUFF into the send queue. */
894 skb_header_release(buff);
895 __skb_append(skb, buff);
896
897 return 0;
898}
899
900/* Try to defer sending, if possible, in order to minimize the amount
901 * of TSO splitting we do. View it as a kind of TSO Nagle test.
902 *
903 * This algorithm is from John Heffner.
904 */
905static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
906{
907 u32 send_win, cong_win, limit, in_flight;
908
909 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
910 return 0;
911
912 if (tp->ca_state != TCP_CA_Open)
913 return 0;
914
915 in_flight = tcp_packets_in_flight(tp);
916
917 BUG_ON(tcp_skb_pcount(skb) <= 1 ||
918 (tp->snd_cwnd <= in_flight));
919
920 send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
921
922 /* From in_flight test above, we know that cwnd > in_flight. */
923 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
924
925 limit = min(send_win, cong_win);
926
927 /* If sk_send_head can be sent fully now, just do it. */
928 if (skb->len <= limit)
929 return 0;
930
931 if (sysctl_tcp_tso_win_divisor) {
932 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
933
934 /* If at least some fraction of a window is available,
935 * just use it.
936 */
937 chunk /= sysctl_tcp_tso_win_divisor;
938 if (limit >= chunk)
939 return 0;
940 } else {
941 /* Different approach, try not to defer past a single
942 * ACK. Receiver should ACK every other full sized
943 * frame, so if we have space for more than 3 frames
944 * then send now.
945 */
946 if (limit > tcp_max_burst(tp) * tp->mss_cache)
947 return 0;
948 }
949
950 /* Ok, it looks like it is advisable to defer. */
951 return 1;
723} 952}
724 953
725/* This routine writes packets to the network. It advances the 954/* This routine writes packets to the network. It advances the
@@ -729,57 +958,158 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
729 * Returns 1, if no segments are in flight and we have queued segments, but 958 * Returns 1, if no segments are in flight and we have queued segments, but
730 * cannot send anything now because of SWS or another problem. 959 * cannot send anything now because of SWS or another problem.
731 */ 960 */
732int tcp_write_xmit(struct sock *sk, int nonagle) 961static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
733{ 962{
734 struct tcp_sock *tp = tcp_sk(sk); 963 struct tcp_sock *tp = tcp_sk(sk);
735 unsigned int mss_now; 964 struct sk_buff *skb;
965 unsigned int tso_segs, sent_pkts;
966 int cwnd_quota;
736 967
737 /* If we are closed, the bytes will have to remain here. 968 /* If we are closed, the bytes will have to remain here.
738 * In time closedown will finish, we empty the write queue and all 969 * In time closedown will finish, we empty the write queue and all
739 * will be happy. 970 * will be happy.
740 */ 971 */
741 if (sk->sk_state != TCP_CLOSE) { 972 if (unlikely(sk->sk_state == TCP_CLOSE))
742 struct sk_buff *skb; 973 return 0;
743 int sent_pkts = 0; 974
975 skb = sk->sk_send_head;
976 if (unlikely(!skb))
977 return 0;
978
979 tso_segs = tcp_init_tso_segs(sk, skb);
980 cwnd_quota = tcp_cwnd_test(tp, skb);
981 if (unlikely(!cwnd_quota))
982 goto out;
983
984 sent_pkts = 0;
985 while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
986 BUG_ON(!tso_segs);
987
988 if (tso_segs == 1) {
989 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
990 (tcp_skb_is_last(sk, skb) ?
991 nonagle : TCP_NAGLE_PUSH))))
992 break;
993 } else {
994 if (tcp_tso_should_defer(sk, tp, skb))
995 break;
996 }
744 997
745 /* Account for SACKS, we may need to fragment due to this. 998 if (tso_segs > 1) {
746 * It is just like the real MSS changing on us midstream. 999 u32 limit = tcp_window_allows(tp, skb,
747 * We also handle things correctly when the user adds some 1000 mss_now, cwnd_quota);
748 * IP options mid-stream. Silly to do, but cover it. 1001
749 */ 1002 if (skb->len < limit) {
750 mss_now = tcp_current_mss(sk, 1); 1003 unsigned int trim = skb->len % mss_now;
751 1004
752 while ((skb = sk->sk_send_head) && 1005 if (trim)
753 tcp_snd_test(sk, skb, mss_now, 1006 limit = skb->len - trim;
754 tcp_skb_is_last(sk, skb) ? nonagle : 1007 }
755 TCP_NAGLE_PUSH)) { 1008 if (skb->len > limit) {
756 if (skb->len > mss_now) { 1009 if (tso_fragment(sk, skb, limit))
757 if (tcp_fragment(sk, skb, mss_now))
758 break; 1010 break;
759 } 1011 }
760 1012 } else if (unlikely(skb->len > mss_now)) {
761 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1013 if (unlikely(tcp_fragment(sk, skb, mss_now)))
762 tcp_tso_set_push(skb);
763 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
764 break; 1014 break;
1015 }
765 1016
766 /* Advance the send_head. This one is sent out. 1017 TCP_SKB_CB(skb)->when = tcp_time_stamp;
767 * This call will increment packets_out. 1018
768 */ 1019 if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
769 update_send_head(sk, tp, skb); 1020 break;
1021
1022 /* Advance the send_head. This one is sent out.
1023 * This call will increment packets_out.
1024 */
1025 update_send_head(sk, tp, skb);
1026
1027 tcp_minshall_update(tp, mss_now, skb);
1028 sent_pkts++;
1029
1030 /* Do not optimize this to use tso_segs. If we chopped up
1031 * the packet above, tso_segs will no longer be valid.
1032 */
1033 cwnd_quota -= tcp_skb_pcount(skb);
1034
1035 BUG_ON(cwnd_quota < 0);
1036 if (!cwnd_quota)
1037 break;
1038
1039 skb = sk->sk_send_head;
1040 if (!skb)
1041 break;
1042 tso_segs = tcp_init_tso_segs(sk, skb);
1043 }
1044
1045 if (likely(sent_pkts)) {
1046 tcp_cwnd_validate(sk, tp);
1047 return 0;
1048 }
1049out:
1050 return !tp->packets_out && sk->sk_send_head;
1051}
1052
1053/* Push out any pending frames which were held back due to
1054 * TCP_CORK or attempt at coalescing tiny packets.
1055 * The socket must be locked by the caller.
1056 */
1057void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
1058 unsigned int cur_mss, int nonagle)
1059{
1060 struct sk_buff *skb = sk->sk_send_head;
770 1061
771 tcp_minshall_update(tp, mss_now, skb); 1062 if (skb) {
772 sent_pkts = 1; 1063 if (tcp_write_xmit(sk, cur_mss, nonagle))
1064 tcp_check_probe_timer(sk, tp);
1065 }
1066}
1067
1068/* Send _single_ skb sitting at the send head. This function requires
1069 * true push pending frames to setup probe timer etc.
1070 */
1071void tcp_push_one(struct sock *sk, unsigned int mss_now)
1072{
1073 struct tcp_sock *tp = tcp_sk(sk);
1074 struct sk_buff *skb = sk->sk_send_head;
1075 unsigned int tso_segs, cwnd_quota;
1076
1077 BUG_ON(!skb || skb->len < mss_now);
1078
1079 tso_segs = tcp_init_tso_segs(sk, skb);
1080 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1081
1082 if (likely(cwnd_quota)) {
1083 BUG_ON(!tso_segs);
1084
1085 if (tso_segs > 1) {
1086 u32 limit = tcp_window_allows(tp, skb,
1087 mss_now, cwnd_quota);
1088
1089 if (skb->len < limit) {
1090 unsigned int trim = skb->len % mss_now;
1091
1092 if (trim)
1093 limit = skb->len - trim;
1094 }
1095 if (skb->len > limit) {
1096 if (unlikely(tso_fragment(sk, skb, limit)))
1097 return;
1098 }
1099 } else if (unlikely(skb->len > mss_now)) {
1100 if (unlikely(tcp_fragment(sk, skb, mss_now)))
1101 return;
773 } 1102 }
774 1103
775 if (sent_pkts) { 1104 /* Send it out now. */
1105 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1106
1107 if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
1108 update_send_head(sk, tp, skb);
776 tcp_cwnd_validate(sk, tp); 1109 tcp_cwnd_validate(sk, tp);
777 return 0; 1110 return;
778 } 1111 }
779
780 return !tp->packets_out && sk->sk_send_head;
781 } 1112 }
782 return 0;
783} 1113}
784 1114
785/* This function returns the amount that we can raise the 1115/* This function returns the amount that we can raise the
@@ -1039,7 +1369,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1039 if (sk->sk_route_caps & NETIF_F_TSO) { 1369 if (sk->sk_route_caps & NETIF_F_TSO) {
1040 sk->sk_route_caps &= ~NETIF_F_TSO; 1370 sk->sk_route_caps &= ~NETIF_F_TSO;
1041 sock_set_flag(sk, SOCK_NO_LARGESEND); 1371 sock_set_flag(sk, SOCK_NO_LARGESEND);
1042 tp->mss_cache = tp->mss_cache_std;
1043 } 1372 }
1044 1373
1045 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 1374 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1101,7 +1430,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1101 * is still in somebody's hands, else make a clone. 1430 * is still in somebody's hands, else make a clone.
1102 */ 1431 */
1103 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1104 tcp_tso_set_push(skb);
1105 1433
1106 err = tcp_transmit_skb(sk, (skb_cloned(skb) ? 1434 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1107 pskb_copy(skb, GFP_ATOMIC): 1435 pskb_copy(skb, GFP_ATOMIC):
@@ -1670,14 +1998,12 @@ int tcp_write_wakeup(struct sock *sk)
1670 if (sk->sk_route_caps & NETIF_F_TSO) { 1998 if (sk->sk_route_caps & NETIF_F_TSO) {
1671 sock_set_flag(sk, SOCK_NO_LARGESEND); 1999 sock_set_flag(sk, SOCK_NO_LARGESEND);
1672 sk->sk_route_caps &= ~NETIF_F_TSO; 2000 sk->sk_route_caps &= ~NETIF_F_TSO;
1673 tp->mss_cache = tp->mss_cache_std;
1674 } 2001 }
1675 } else if (!tcp_skb_pcount(skb)) 2002 } else if (!tcp_skb_pcount(skb))
1676 tcp_set_skb_tso_segs(sk, skb); 2003 tcp_set_skb_tso_segs(sk, skb);
1677 2004
1678 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2005 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1679 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2006 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1680 tcp_tso_set_push(skb);
1681 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); 2007 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1682 if (!err) { 2008 if (!err) {
1683 update_send_head(sk, tp, skb); 2009 update_send_head(sk, tp, skb);
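
The heart of the new TSO path above is tcp_tso_should_defer(): a super-segment is held back unless it already fits in the usable window, or at least 1/tcp_tso_win_divisor of the window (or, with the divisor set to zero, roughly tcp_max_burst full frames) is usable right now. A stand-alone sketch of that decision follows; all values are illustrative.

/* Sketch of the deferral test tcp_tso_should_defer() adds above
 * (John Heffner's heuristic).
 */
#include <stdio.h>

static int tso_should_defer(unsigned int skb_len, unsigned int send_win,
			    unsigned int cong_win, unsigned int snd_wnd,
			    unsigned int cwnd_bytes, unsigned int mss,
			    unsigned int win_divisor, unsigned int max_burst)
{
	unsigned int limit = send_win < cong_win ? send_win : cong_win;

	if (skb_len <= limit)		/* fits entirely: send now */
		return 0;

	if (win_divisor) {
		unsigned int chunk = snd_wnd < cwnd_bytes ? snd_wnd : cwnd_bytes;

		chunk /= win_divisor;
		if (limit >= chunk)	/* a useful fraction is usable */
			return 0;
	} else if (limit > max_burst * mss) {
		return 0;		/* don't defer past a single ACK */
	}
	return 1;			/* wait for more window/ACKs */
}

int main(void)
{
	/* 64 KB queued, only ~2 MSS usable now, default divisor of 3 */
	printf("defer? %d\n",
	       tso_should_defer(65536, 2896, 20000, 65535, 14480, 1448, 3, 3));
	return 0;
}
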
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a54d4ef3fd35..77004b9456c0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2777,7 +2777,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2777 read_lock_bh(&idev->lock); 2777 read_lock_bh(&idev->lock);
2778 switch (type) { 2778 switch (type) {
2779 case UNICAST_ADDR: 2779 case UNICAST_ADDR:
2780 /* unicast address */ 2780 /* unicast address incl. temp addr */
2781 for (ifa = idev->addr_list; ifa; 2781 for (ifa = idev->addr_list; ifa;
2782 ifa = ifa->if_next, ip_idx++) { 2782 ifa = ifa->if_next, ip_idx++) {
2783 if (ip_idx < s_ip_idx) 2783 if (ip_idx < s_ip_idx)
@@ -2788,19 +2788,6 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2788 NLM_F_MULTI)) <= 0) 2788 NLM_F_MULTI)) <= 0)
2789 goto done; 2789 goto done;
2790 } 2790 }
2791 /* temp addr */
2792#ifdef CONFIG_IPV6_PRIVACY
2793 for (ifa = idev->tempaddr_list; ifa;
2794 ifa = ifa->tmp_next, ip_idx++) {
2795 if (ip_idx < s_ip_idx)
2796 continue;
2797 if ((err = inet6_fill_ifaddr(skb, ifa,
2798 NETLINK_CB(cb->skb).pid,
2799 cb->nlh->nlmsg_seq, RTM_NEWADDR,
2800 NLM_F_MULTI)) <= 0)
2801 goto done;
2802 }
2803#endif
2804 break; 2791 break;
2805 case MULTICAST_ADDR: 2792 case MULTICAST_ADDR:
2806 /* multicast address */ 2793 /* multicast address */
@@ -2923,6 +2910,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
2923 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); 2910 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2924 r = NLMSG_DATA(nlh); 2911 r = NLMSG_DATA(nlh);
2925 r->ifi_family = AF_INET6; 2912 r->ifi_family = AF_INET6;
2913 r->__ifi_pad = 0;
2926 r->ifi_type = dev->type; 2914 r->ifi_type = dev->type;
2927 r->ifi_index = dev->ifindex; 2915 r->ifi_index = dev->ifindex;
2928 r->ifi_flags = dev_get_flags(dev); 2916 r->ifi_flags = dev_get_flags(dev);
@@ -3030,9 +3018,12 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
3030 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags); 3018 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags);
3031 pmsg = NLMSG_DATA(nlh); 3019 pmsg = NLMSG_DATA(nlh);
3032 pmsg->prefix_family = AF_INET6; 3020 pmsg->prefix_family = AF_INET6;
3021 pmsg->prefix_pad1 = 0;
3022 pmsg->prefix_pad2 = 0;
3033 pmsg->prefix_ifindex = idev->dev->ifindex; 3023 pmsg->prefix_ifindex = idev->dev->ifindex;
3034 pmsg->prefix_len = pinfo->prefix_len; 3024 pmsg->prefix_len = pinfo->prefix_len;
3035 pmsg->prefix_type = pinfo->type; 3025 pmsg->prefix_type = pinfo->type;
3026 pmsg->prefix_pad3 = 0;
3036 3027
3037 pmsg->prefix_flags = 0; 3028 pmsg->prefix_flags = 0;
3038 if (pinfo->onlink) 3029 if (pinfo->onlink)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2b193e3df49a..28d9bcab0970 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -774,7 +774,6 @@ static int __init inet6_init(void)
774 if (if6_proc_init()) 774 if (if6_proc_init())
775 goto proc_if6_fail; 775 goto proc_if6_fail;
776#endif 776#endif
777 ipv6_packet_init();
778 ip6_route_init(); 777 ip6_route_init();
779 ip6_flowlabel_init(); 778 ip6_flowlabel_init();
780 err = addrconf_init(); 779 err = addrconf_init();
@@ -791,6 +790,8 @@ static int __init inet6_init(void)
791 /* Init v6 transport protocols. */ 790 /* Init v6 transport protocols. */
792 udpv6_init(); 791 udpv6_init();
793 tcpv6_init(); 792 tcpv6_init();
793
794 ipv6_packet_init();
794 err = 0; 795 err = 0;
795out: 796out:
796 return err; 797 return err;
@@ -798,7 +799,6 @@ out:
798addrconf_fail: 799addrconf_fail:
799 ip6_flowlabel_cleanup(); 800 ip6_flowlabel_cleanup();
800 ip6_route_cleanup(); 801 ip6_route_cleanup();
801 ipv6_packet_cleanup();
802#ifdef CONFIG_PROC_FS 802#ifdef CONFIG_PROC_FS
803 if6_proc_exit(); 803 if6_proc_exit();
804proc_if6_fail: 804proc_if6_fail:
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 0e5f7499debb..b6c73da5ff35 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -244,7 +244,6 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
244 opt_space->opt_nflen = 0; 244 opt_space->opt_nflen = 0;
245 } 245 }
246 opt_space->dst1opt = fopt->dst1opt; 246 opt_space->dst1opt = fopt->dst1opt;
247 opt_space->auth = fopt->auth;
248 opt_space->opt_flen = fopt->opt_flen; 247 opt_space->opt_flen = fopt->opt_flen;
249 return opt_space; 248 return opt_space;
250} 249}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 06e7cdaeedc5..1f2c2f9e353f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
465 to->pkt_type = from->pkt_type; 465 to->pkt_type = from->pkt_type;
466 to->priority = from->priority; 466 to->priority = from->priority;
467 to->protocol = from->protocol; 467 to->protocol = from->protocol;
468 to->security = from->security;
469 dst_release(to->dst); 468 dst_release(to->dst);
470 to->dst = dst_clone(from->dst); 469 to->dst = dst_clone(from->dst);
471 to->dev = from->dev; 470 to->dev = from->dev;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9dac7fdf4726..f6e288dc116e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2018,7 +2018,7 @@ static int tcp_v6_init_sock(struct sock *sk)
2018 */ 2018 */
2019 tp->snd_ssthresh = 0x7fffffff; 2019 tp->snd_ssthresh = 0x7fffffff;
2020 tp->snd_cwnd_clamp = ~0; 2020 tp->snd_cwnd_clamp = ~0;
2021 tp->mss_cache_std = tp->mss_cache = 536; 2021 tp->mss_cache = 536;
2022 2022
2023 tp->reordering = sysctl_tcp_reordering; 2023 tp->reordering = sysctl_tcp_reordering;
2024 2024
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 70bcd4744d93..fc456a7aaec3 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -315,8 +315,8 @@ err:
315static void netlink_remove(struct sock *sk) 315static void netlink_remove(struct sock *sk)
316{ 316{
317 netlink_table_grab(); 317 netlink_table_grab();
318 nl_table[sk->sk_protocol].hash.entries--; 318 if (sk_del_node_init(sk))
319 sk_del_node_init(sk); 319 nl_table[sk->sk_protocol].hash.entries--;
320 if (nlk_sk(sk)->groups) 320 if (nlk_sk(sk)->groups)
321 __sk_del_bind_node(sk); 321 __sk_del_bind_node(sk);
322 netlink_table_ungrab(); 322 netlink_table_ungrab();
@@ -429,7 +429,12 @@ retry:
429 err = netlink_insert(sk, pid); 429 err = netlink_insert(sk, pid);
430 if (err == -EADDRINUSE) 430 if (err == -EADDRINUSE)
431 goto retry; 431 goto retry;
432 return 0; 432
433 /* If 2 threads race to autobind, that is fine. */
434 if (err == -EBUSY)
435 err = 0;
436
437 return err;
433} 438}
434 439
435static inline int netlink_capable(struct socket *sock, unsigned int flag) 440static inline int netlink_capable(struct socket *sock, unsigned int flag)
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8f58cecd6266..e48d0d456b3e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-y := sch_generic.o 5obj-y := sch_generic.o
6 6
7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o 7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o
8obj-$(CONFIG_NET_CLS) += cls_api.o 8obj-$(CONFIG_NET_CLS) += cls_api.o
9obj-$(CONFIG_NET_CLS_ACT) += act_api.o 9obj-$(CONFIG_NET_CLS_ACT) += act_api.o
10obj-$(CONFIG_NET_ACT_POLICE) += police.o 10obj-$(CONFIG_NET_ACT_POLICE) += police.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9594206e6035..249c61936ea0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -439,6 +439,8 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
439 439
440 t = NLMSG_DATA(nlh); 440 t = NLMSG_DATA(nlh);
441 t->tca_family = AF_UNSPEC; 441 t->tca_family = AF_UNSPEC;
442 t->tca__pad1 = 0;
443 t->tca__pad2 = 0;
442 444
443 x = (struct rtattr*) skb->tail; 445 x = (struct rtattr*) skb->tail;
444 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 446 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -580,6 +582,8 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
580 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); 582 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
581 t = NLMSG_DATA(nlh); 583 t = NLMSG_DATA(nlh);
582 t->tca_family = AF_UNSPEC; 584 t->tca_family = AF_UNSPEC;
585 t->tca__pad1 = 0;
586 t->tca__pad2 = 0;
583 587
584 x = (struct rtattr *) skb->tail; 588 x = (struct rtattr *) skb->tail;
585 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 589 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -687,7 +691,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
687 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); 691 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
688 t = NLMSG_DATA(nlh); 692 t = NLMSG_DATA(nlh);
689 t->tca_family = AF_UNSPEC; 693 t->tca_family = AF_UNSPEC;
690 694 t->tca__pad1 = 0;
695 t->tca__pad2 = 0;
696
691 x = (struct rtattr*) skb->tail; 697 x = (struct rtattr*) skb->tail;
692 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 698 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
693 699
@@ -842,6 +848,8 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
842 cb->nlh->nlmsg_type, sizeof(*t)); 848 cb->nlh->nlmsg_type, sizeof(*t));
843 t = NLMSG_DATA(nlh); 849 t = NLMSG_DATA(nlh);
844 t->tca_family = AF_UNSPEC; 850 t->tca_family = AF_UNSPEC;
851 t->tca__pad1 = 0;
852 t->tca__pad2 = 0;
845 853
846 x = (struct rtattr *) skb->tail; 854 x = (struct rtattr *) skb->tail;
847 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 855 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
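
The tca__pad1/tca__pad2 stores added above (and the matching tcm__pad, prefix_pad and pinfo.pad initializations elsewhere in this diff) exist because the message header is copied to userspace verbatim, so any field left untouched leaks uninitialized kernel memory. A small user-space analogue follows, using a simplified stand-in for struct tcamsg.

/* Analogue of the fix above: zero the whole header (or every pad
 * field, as the patch does) before filling it, so no stack garbage
 * escapes in the unused bytes.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

struct tcamsg_like {		/* simplified stand-in for struct tcamsg */
	unsigned char  tca_family;
	unsigned char  tca__pad1;
	unsigned short tca__pad2;
};

static void fill_header(struct tcamsg_like *t)
{
	memset(t, 0, sizeof(*t));	/* no uninitialized bytes escape */
	t->tca_family = AF_UNSPEC;
}

int main(void)
{
	struct tcamsg_like t;

	fill_header(&t);
	printf("pad1=%u pad2=%u\n", t.tca__pad1, t.tca__pad2);
	return 0;
}
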
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1616bf5c9627..3b5714ef4d1a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -331,6 +331,8 @@ tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh,
331 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 331 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
332 tcm = NLMSG_DATA(nlh); 332 tcm = NLMSG_DATA(nlh);
333 tcm->tcm_family = AF_UNSPEC; 333 tcm->tcm_family = AF_UNSPEC;
334 tcm->tcm__pad1 = 0;
 335 tcm->tcm__pad2 = 0;
334 tcm->tcm_ifindex = tp->q->dev->ifindex; 336 tcm->tcm_ifindex = tp->q->dev->ifindex;
335 tcm->tcm_parent = tp->classid; 337 tcm->tcm_parent = tp->classid;
336 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); 338 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 232fb9196810..006168d69376 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -618,6 +618,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
618 pinfo.protocol = s->protocol; 618 pinfo.protocol = s->protocol;
619 pinfo.tunnelid = s->tunnelid; 619 pinfo.tunnelid = s->tunnelid;
620 pinfo.tunnelhdr = f->tunnelhdr; 620 pinfo.tunnelhdr = f->tunnelhdr;
621 pinfo.pad = 0;
621 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); 622 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
622 if (f->res.classid) 623 if (f->res.classid)
623 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); 624 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 48bb23c2a35a..53d98f8d3d80 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -205,11 +205,6 @@ META_COLLECTOR(int_protocol)
205 dst->value = skb->protocol; 205 dst->value = skb->protocol;
206} 206}
207 207
208META_COLLECTOR(int_security)
209{
210 dst->value = skb->security;
211}
212
213META_COLLECTOR(int_pkttype) 208META_COLLECTOR(int_pkttype)
214{ 209{
215 dst->value = skb->pkt_type; 210 dst->value = skb->pkt_type;
@@ -524,7 +519,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
524 [META_ID(REALDEV)] = META_FUNC(int_realdev), 519 [META_ID(REALDEV)] = META_FUNC(int_realdev),
525 [META_ID(PRIORITY)] = META_FUNC(int_priority), 520 [META_ID(PRIORITY)] = META_FUNC(int_priority),
526 [META_ID(PROTOCOL)] = META_FUNC(int_protocol), 521 [META_ID(PROTOCOL)] = META_FUNC(int_protocol),
527 [META_ID(SECURITY)] = META_FUNC(int_security),
528 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), 522 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
529 [META_ID(PKTLEN)] = META_FUNC(int_pktlen), 523 [META_ID(PKTLEN)] = META_FUNC(int_pktlen),
530 [META_ID(DATALEN)] = META_FUNC(int_datalen), 524 [META_ID(DATALEN)] = META_FUNC(int_datalen),
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 97c1c75d5c78..b9a069af4a02 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399{ 399{
400 int err; 400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1]; 401 struct rtattr *kind = tca[TCA_KIND-1];
402 void *p = NULL;
403 struct Qdisc *sch; 402 struct Qdisc *sch;
404 struct Qdisc_ops *ops; 403 struct Qdisc_ops *ops;
405 int size;
406 404
407 ops = qdisc_lookup_ops(kind); 405 ops = qdisc_lookup_ops(kind);
408#ifdef CONFIG_KMOD 406#ifdef CONFIG_KMOD
@@ -437,64 +435,55 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
437 if (ops == NULL) 435 if (ops == NULL)
438 goto err_out; 436 goto err_out;
439 437
440 /* ensure that the Qdisc and the private data are 32-byte aligned */ 438 sch = qdisc_alloc(dev, ops);
441 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 439 if (IS_ERR(sch)) {
442 size += ops->priv_size + QDISC_ALIGN_CONST; 440 err = PTR_ERR(sch);
443
444 p = kmalloc(size, GFP_KERNEL);
445 err = -ENOBUFS;
446 if (!p)
447 goto err_out2; 441 goto err_out2;
448 memset(p, 0, size); 442 }
449 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
450 & ~QDISC_ALIGN_CONST);
451 sch->padded = (char *)sch - (char *)p;
452
453 INIT_LIST_HEAD(&sch->list);
454 skb_queue_head_init(&sch->q);
455 443
456 if (handle == TC_H_INGRESS) 444 if (handle == TC_H_INGRESS) {
457 sch->flags |= TCQ_F_INGRESS; 445 sch->flags |= TCQ_F_INGRESS;
458 446 handle = TC_H_MAKE(TC_H_INGRESS, 0);
459 sch->ops = ops; 447 } else if (handle == 0) {
460 sch->enqueue = ops->enqueue;
461 sch->dequeue = ops->dequeue;
462 sch->dev = dev;
463 dev_hold(dev);
464 atomic_set(&sch->refcnt, 1);
465 sch->stats_lock = &dev->queue_lock;
466 if (handle == 0) {
467 handle = qdisc_alloc_handle(dev); 448 handle = qdisc_alloc_handle(dev);
468 err = -ENOMEM; 449 err = -ENOMEM;
469 if (handle == 0) 450 if (handle == 0)
470 goto err_out3; 451 goto err_out3;
471 } 452 }
472 453
473 if (handle == TC_H_INGRESS) 454 sch->handle = handle;
474 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
475 else
476 sch->handle = handle;
477 455
478 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { 456 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
457#ifdef CONFIG_NET_ESTIMATOR
458 if (tca[TCA_RATE-1]) {
459 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
460 sch->stats_lock,
461 tca[TCA_RATE-1]);
462 if (err) {
463 /*
464 * Any broken qdiscs that would require
465 * a ops->reset() here? The qdisc was never
466 * in action so it shouldn't be necessary.
467 */
468 if (ops->destroy)
469 ops->destroy(sch);
470 goto err_out3;
471 }
472 }
473#endif
479 qdisc_lock_tree(dev); 474 qdisc_lock_tree(dev);
480 list_add_tail(&sch->list, &dev->qdisc_list); 475 list_add_tail(&sch->list, &dev->qdisc_list);
481 qdisc_unlock_tree(dev); 476 qdisc_unlock_tree(dev);
482 477
483#ifdef CONFIG_NET_ESTIMATOR
484 if (tca[TCA_RATE-1])
485 gen_new_estimator(&sch->bstats, &sch->rate_est,
486 sch->stats_lock, tca[TCA_RATE-1]);
487#endif
488 return sch; 478 return sch;
489 } 479 }
490err_out3: 480err_out3:
491 dev_put(dev); 481 dev_put(dev);
482 kfree((char *) sch - sch->padded);
492err_out2: 483err_out2:
493 module_put(ops->owner); 484 module_put(ops->owner);
494err_out: 485err_out:
495 *errp = err; 486 *errp = err;
496 if (p)
497 kfree(p);
498 return NULL; 487 return NULL;
499} 488}
500 489
@@ -770,6 +759,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
770 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 759 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
771 tcm = NLMSG_DATA(nlh); 760 tcm = NLMSG_DATA(nlh);
772 tcm->tcm_family = AF_UNSPEC; 761 tcm->tcm_family = AF_UNSPEC;
762 tcm->tcm__pad1 = 0;
763 tcm->tcm__pad2 = 0;
773 tcm->tcm_ifindex = q->dev->ifindex; 764 tcm->tcm_ifindex = q->dev->ifindex;
774 tcm->tcm_parent = clid; 765 tcm->tcm_parent = clid;
775 tcm->tcm_handle = q->handle; 766 tcm->tcm_handle = q->handle;
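
With the allocation moved into qdisc_alloc(), qdisc_create() now tells success from failure by the pointer value itself rather than by NULL plus a separately tracked buffer. A small user-space mock-up of the ERR_PTR()/IS_ERR()/PTR_ERR() convention the new code relies on (the helpers imitate include/linux/err.h; alloc_qdisc() and its fail flag are invented for the example):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_ERRNO 4095

    /* User-space imitations of the kernel helpers in include/linux/err.h. */
    static void *ERR_PTR(long error)     { return (void *)error; }
    static long  PTR_ERR(const void *p)  { return (long)p; }
    static int   IS_ERR(const void *p)
    {
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
    }

    struct qdisc { int dummy; };

    /* Invented allocator following the same convention as qdisc_alloc(). */
    static struct qdisc *alloc_qdisc(int fail)
    {
        if (fail)
            return ERR_PTR(-ENOBUFS);
        return calloc(1, sizeof(struct qdisc));
    }

    int main(void)
    {
        struct qdisc *q = alloc_qdisc(1);

        if (IS_ERR(q))  /* the errno travels inside the pointer */
            printf("allocation failed: %ld\n", PTR_ERR(q));
        else
            free(q);
        return 0;
    }

The other visible change in this hunk is ordering: the rate estimator is created before the qdisc is linked into dev->qdisc_list, so a gen_new_estimator() failure can simply destroy the never-activated qdisc instead of leaving a half-registered one behind.
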
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
new file mode 100644
index 000000000000..81f0b8346d17
--- /dev/null
+++ b/net/sched/sch_blackhole.c
@@ -0,0 +1,54 @@
1/*
2 * net/sched/sch_blackhole.c Black hole queue
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 *
11 * Note: Quantum tunneling is not supported.
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/netdevice.h>
19#include <linux/skbuff.h>
20#include <net/pkt_sched.h>
21
22static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
23{
24 qdisc_drop(skb, sch);
25 return NET_XMIT_SUCCESS;
26}
27
28static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
29{
30 return NULL;
31}
32
33static struct Qdisc_ops blackhole_qdisc_ops = {
34 .id = "blackhole",
35 .priv_size = 0,
36 .enqueue = blackhole_enqueue,
37 .dequeue = blackhole_dequeue,
38 .owner = THIS_MODULE,
39};
40
41static int __init blackhole_module_init(void)
42{
43 return register_qdisc(&blackhole_qdisc_ops);
44}
45
46static void __exit blackhole_module_exit(void)
47{
48 unregister_qdisc(&blackhole_qdisc_ops);
49}
50
51module_init(blackhole_module_init)
52module_exit(blackhole_module_exit)
53
54MODULE_LICENSE("GPL");
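
sch_blackhole is deliberately trivial: enqueue discards every packet through qdisc_drop(), so the qdisc's drop counter still ticks, and dequeue never produces anything. Once the module is built it is attached like any other root qdisc, e.g. tc qdisc add dev eth0 root blackhole with a sufficiently recent iproute2 (the interface name is only an example).
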
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index d43e3b8cbf6a..09453f997d8c 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1528,6 +1528,7 @@ static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1528 1528
1529 opt.strategy = cl->ovl_strategy; 1529 opt.strategy = cl->ovl_strategy;
1530 opt.priority2 = cl->priority2+1; 1530 opt.priority2 = cl->priority2+1;
1531 opt.pad = 0;
1531 opt.penalty = (cl->penalty*1000)/HZ; 1532 opt.penalty = (cl->penalty*1000)/HZ;
1532 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); 1533 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1533 return skb->len; 1534 return skb->len;
@@ -1563,6 +1564,8 @@ static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1563 1564
1564 if (cl->police) { 1565 if (cl->police) {
1565 opt.police = cl->police; 1566 opt.police = cl->police;
1567 opt.__res1 = 0;
1568 opt.__res2 = 0;
1566 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); 1569 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1567 } 1570 }
1568 return skb->len; 1571 return skb->len;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7683b34dc6a9..73e218e646ac 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -395,24 +395,23 @@ static struct Qdisc_ops pfifo_fast_ops = {
395 .owner = THIS_MODULE, 395 .owner = THIS_MODULE,
396}; 396};
397 397
398struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) 398struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
399{ 399{
400 void *p; 400 void *p;
401 struct Qdisc *sch; 401 struct Qdisc *sch;
402 int size; 402 unsigned int size;
403 int err = -ENOBUFS;
403 404
404 /* ensure that the Qdisc and the private data are 32-byte aligned */ 405 /* ensure that the Qdisc and the private data are 32-byte aligned */
405 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 406 size = QDISC_ALIGN(sizeof(*sch));
406 size += ops->priv_size + QDISC_ALIGN_CONST; 407 size += ops->priv_size + (QDISC_ALIGNTO - 1);
407 408
408 p = kmalloc(size, GFP_KERNEL); 409 p = kmalloc(size, GFP_KERNEL);
409 if (!p) 410 if (!p)
410 return NULL; 411 goto errout;
411 memset(p, 0, size); 412 memset(p, 0, size);
412 413 sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
413 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) 414 sch->padded = (char *) sch - (char *) p;
414 & ~QDISC_ALIGN_CONST);
415 sch->padded = (char *)sch - (char *)p;
416 415
417 INIT_LIST_HEAD(&sch->list); 416 INIT_LIST_HEAD(&sch->list);
418 skb_queue_head_init(&sch->q); 417 skb_queue_head_init(&sch->q);
@@ -423,11 +422,24 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
423 dev_hold(dev); 422 dev_hold(dev);
424 sch->stats_lock = &dev->queue_lock; 423 sch->stats_lock = &dev->queue_lock;
425 atomic_set(&sch->refcnt, 1); 424 atomic_set(&sch->refcnt, 1);
425
426 return sch;
427errout:
428	return ERR_PTR(err);
429}
430
431struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
432{
433 struct Qdisc *sch;
434
435 sch = qdisc_alloc(dev, ops);
436 if (IS_ERR(sch))
437 goto errout;
438
426 if (!ops->init || ops->init(sch, NULL) == 0) 439 if (!ops->init || ops->init(sch, NULL) == 0)
427 return sch; 440 return sch;
428 441
429 dev_put(dev); 442errout:
430 kfree(p);
431 return NULL; 443 return NULL;
432} 444}
433 445
@@ -591,6 +603,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up);
591EXPORT_SYMBOL(noop_qdisc); 603EXPORT_SYMBOL(noop_qdisc);
592EXPORT_SYMBOL(noop_qdisc_ops); 604EXPORT_SYMBOL(noop_qdisc_ops);
593EXPORT_SYMBOL(qdisc_create_dflt); 605EXPORT_SYMBOL(qdisc_create_dflt);
606EXPORT_SYMBOL(qdisc_alloc);
594EXPORT_SYMBOL(qdisc_destroy); 607EXPORT_SYMBOL(qdisc_destroy);
595EXPORT_SYMBOL(qdisc_reset); 608EXPORT_SYMBOL(qdisc_reset);
596EXPORT_SYMBOL(qdisc_restart); 609EXPORT_SYMBOL(qdisc_restart);
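
qdisc_alloc() keeps the old guarantee that both the Qdisc and its private area start on a 32-byte boundary, but expresses it through QDISC_ALIGN()/QDISC_ALIGNTO instead of the open-coded QDISC_ALIGN_CONST arithmetic. A stand-alone sketch of the round-up idiom, assuming the macro mirrors the kernel definition (the struct and priv sizes are made-up numbers):

    #include <stdio.h>

    #define QDISC_ALIGNTO     32
    #define QDISC_ALIGN(len)  (((len) + QDISC_ALIGNTO - 1) & ~(QDISC_ALIGNTO - 1))

    int main(void)
    {
        unsigned int struct_size = 180;  /* stand-in for sizeof(struct Qdisc) */
        unsigned int priv_size   = 24;   /* stand-in for ops->priv_size */

        /* Room for the rounded-up struct, the private data, and the worst
         * case misalignment of the pointer the allocator hands back. */
        unsigned int size = QDISC_ALIGN(struct_size)
                            + priv_size + (QDISC_ALIGNTO - 1);

        printf("aligned struct: %u bytes, total allocation: %u bytes\n",
               QDISC_ALIGN(struct_size), size);
        return 0;
    }
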
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 2ec0320fac3b..c44bf4165c6e 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -102,9 +102,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
102 /* Set up the base timeout information. */ 102 /* Set up the base timeout information. */
103 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; 103 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
104 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = 104 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
105 SCTP_DEFAULT_TIMEOUT_T1_COOKIE; 105 msecs_to_jiffies(sp->rtoinfo.srto_initial);
106 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = 106 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
107 SCTP_DEFAULT_TIMEOUT_T1_INIT; 107 msecs_to_jiffies(sp->rtoinfo.srto_initial);
108 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = 108 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] =
109 msecs_to_jiffies(sp->rtoinfo.srto_initial); 109 msecs_to_jiffies(sp->rtoinfo.srto_initial);
110 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; 110 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
@@ -117,12 +117,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
117 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] 117 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
118 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); 118 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max);
119 119
120 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 120 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
121 SCTP_DEFAULT_TIMEOUT_HEARTBEAT; 121 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout;
122 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = 122 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
123 SCTP_DEFAULT_TIMEOUT_SACK;
124 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
125 sp->autoclose * HZ;
126 123
127 /* Use SCTP specific send buffer space queues. */ 124 /* Use SCTP specific send buffer space queues. */
128 ep->sndbuf_policy = sctp_sndbuf_policy; 125 ep->sndbuf_policy = sctp_sndbuf_policy;
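
The endpoint now derives its T1-INIT and T1-COOKIE timeouts from the per-socket srto_initial value, which is kept in milliseconds, so the conversion to jiffies happens once at endpoint creation instead of relying on compile-time SCTP_DEFAULT_TIMEOUT_* constants. A user-space illustration of that conversion (HZ=250 and the 3000 ms figure are assumed example values, not taken from the patch):

    #include <stdio.h>

    /* Illustration only: the real msecs_to_jiffies() is a kernel helper whose
     * result depends on the compile-time HZ; 250 is just an assumed value. */
    #define HZ 250

    static unsigned long msecs_to_jiffies_demo(unsigned int ms)
    {
        return (ms * (unsigned long)HZ + 999) / 1000;  /* round up */
    }

    int main(void)
    {
        unsigned int srto_initial = 3000;  /* ms; RFC 2960 suggests 3 s */

        printf("T1-INIT/T1-COOKIE: %lu jiffies at HZ=%d\n",
               msecs_to_jiffies_demo(srto_initial), HZ);
        return 0;
    }
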
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5135e1a25d25..e7f37faba7c0 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1050,7 +1050,10 @@ SCTP_STATIC __init int sctp_init(void)
1050 sctp_sndbuf_policy = 0; 1050 sctp_sndbuf_policy = 0;
1051 1051
1052 /* HB.interval - 30 seconds */ 1052 /* HB.interval - 30 seconds */
1053 sctp_hb_interval = 30 * HZ; 1053 sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
1054
1055 /* delayed SACK timeout */
1056 sctp_sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK;
1054 1057
1055 /* Implementation specific variables. */ 1058 /* Implementation specific variables. */
1056 1059
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 7fc31849312b..dc4893474f18 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -47,6 +47,8 @@
47static ctl_handler sctp_sysctl_jiffies_ms; 47static ctl_handler sctp_sysctl_jiffies_ms;
48static long rto_timer_min = 1; 48static long rto_timer_min = 1;
49static long rto_timer_max = 86400000; /* One day */ 49static long rto_timer_max = 86400000; /* One day */
50static long sack_timer_min = 1;
51static long sack_timer_max = 500;
50 52
51static ctl_table sctp_table[] = { 53static ctl_table sctp_table[] = {
52 { 54 {
@@ -187,6 +189,17 @@ static ctl_table sctp_table[] = {
187 .mode = 0644, 189 .mode = 0644,
188 .proc_handler = &proc_dointvec 190 .proc_handler = &proc_dointvec
189 }, 191 },
192 {
193 .ctl_name = NET_SCTP_SACK_TIMEOUT,
194 .procname = "sack_timeout",
195 .data = &sctp_sack_timeout,
196 .maxlen = sizeof(long),
197 .mode = 0644,
198 .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
199 .strategy = &sctp_sysctl_jiffies_ms,
200 .extra1 = &sack_timer_min,
201 .extra2 = &sack_timer_max,
202 },
190 { .ctl_name = 0 } 203 { .ctl_name = 0 }
191}; 204};
192 205
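
The new table entry makes the delayed-SACK timer tunable at run time, clamped to the 1-500 ms window given by sack_timer_min/sack_timer_max and exposed in milliseconds through the jiffies-ms strategy handler. A small sketch that reads the value back, assuming the usual /proc/sys mount point and a kernel carrying this change (the exact path is inferred from the net.sctp table, not spelled out in the hunk):

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/net/sctp/sack_timeout", "r");
        long ms;

        if (!f) {
            perror("sack_timeout");
            return 1;
        }
        if (fscanf(f, "%ld", &ms) == 1)
            printf("delayed SACK timeout: %ld ms\n", ms);
        fclose(f);
        return 0;
    }
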
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 0ec0fde6e6c5..a63b69179607 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -103,7 +103,6 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
103 103
104 /* Set up the heartbeat timer. */ 104 /* Set up the heartbeat timer. */
105 init_timer(&peer->hb_timer); 105 init_timer(&peer->hb_timer);
106 peer->hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
107 peer->hb_timer.function = sctp_generate_heartbeat_event; 106 peer->hb_timer.function = sctp_generate_heartbeat_event;
108 peer->hb_timer.data = (unsigned long)peer; 107 peer->hb_timer.data = (unsigned long)peer;
109 108