Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                                |  16
-rw-r--r--  net/ipv4/af_inet.c                              |  12
-rw-r--r--  net/ipv4/arp.c                                  |   2
-rw-r--r--  net/ipv4/devinet.c                              |  17
-rw-r--r--  net/ipv4/esp4.c                                 |   2
-rw-r--r--  net/ipv4/fib_rules.c                            |  25
-rw-r--r--  net/ipv4/fib_trie.c                             |   5
-rw-r--r--  net/ipv4/igmp.c                                 |  80
-rw-r--r--  net/ipv4/ip_gre.c                               |   6
-rw-r--r--  net/ipv4/ip_input.c                             |   8
-rw-r--r--  net/ipv4/ip_output.c                            |   8
-rw-r--r--  net/ipv4/ip_tunnel.c                            |  71
-rw-r--r--  net/ipv4/ip_tunnel_core.c                       |  14
-rw-r--r--  net/ipv4/ip_vti.c                               | 528
-rw-r--r--  net/ipv4/ipip.c                                 |   8
-rw-r--r--  net/ipv4/ipmr.c                                 |  18
-rw-r--r--  net/ipv4/netfilter/Kconfig                      |  13
-rw-r--r--  net/ipv4/netfilter/Makefile                     |   1
-rw-r--r--  net/ipv4/netfilter/arptable_filter.c            |   2
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c             |   2
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c                 |  21
-rw-r--r--  net/ipv4/netfilter/ipt_SYNPROXY.c               | 476
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c             |   2
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c             |   2
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c                |   2
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c                |   2
-rw-r--r--  net/ipv4/netfilter/iptable_security.c           |   2
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c  |   7
-rw-r--r--  net/ipv4/ping.c                                 |   2
-rw-r--r--  net/ipv4/proc.c                                 |   9
-rw-r--r--  net/ipv4/raw.c                                  |   5
-rw-r--r--  net/ipv4/route.c                                |  24
-rw-r--r--  net/ipv4/syncookies.c                           |  29
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c                      |  17
-rw-r--r--  net/ipv4/tcp.c                                  |  53
-rw-r--r--  net/ipv4/tcp_cubic.c                            |  12
-rw-r--r--  net/ipv4/tcp_fastopen.c                         |  13
-rw-r--r--  net/ipv4/tcp_input.c                            | 210
-rw-r--r--  net/ipv4/tcp_ipv4.c                             |  32
-rw-r--r--  net/ipv4/tcp_memcontrol.c                       |  12
-rw-r--r--  net/ipv4/tcp_metrics.c                          |  42
-rw-r--r--  net/ipv4/tcp_minisocks.c                        |   8
-rw-r--r--  net/ipv4/tcp_output.c                           |   9
-rw-r--r--  net/ipv4/tcp_probe.c                            |  87
-rw-r--r--  net/ipv4/udp.c                                  |  18
-rw-r--r--  net/ipv4/xfrm4_output.c                         |  16
-rw-r--r--  net/ipv4/xfrm4_state.c                          |   1
47 files changed, 1125 insertions, 826 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 37cf1a6ea3ad..05c57f0fcabe 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -259,22 +259,6 @@ config IP_PIMSM_V2
 	  gated-5). This routing protocol is not used widely, so say N unless
 	  you want to play with it.
 
-config ARPD
-	bool "IP: ARP daemon support"
-	---help---
-	  The kernel maintains an internal cache which maps IP addresses to
-	  hardware addresses on the local network, so that Ethernet
-	  frames are sent to the proper address on the physical networking
-	  layer. Normally, kernel uses the ARP protocol to resolve these
-	  mappings.
-
-	  Saying Y here adds support to have an user space daemon to do this
-	  resolution instead. This is useful for implementing an alternate
-	  address resolution protocol (e.g. NHRP on mGRE tunnels) and also for
-	  testing purposes.
-
-	  If unsure, say N.
-
 config SYN_COOKIES
 	bool "IP: TCP syncookie support"
 	---help---
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b4d0be2b7ce9..7a1874b7b8fd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1532,18 +1532,6 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
 }
 EXPORT_SYMBOL_GPL(snmp_mib_init);
 
-void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ])
-{
-	int i;
-
-	BUG_ON(ptr == NULL);
-	for (i = 0; i < SNMP_ARRAY_SZ; i++) {
-		free_percpu(ptr[i]);
-		ptr[i] = NULL;
-	}
-}
-EXPORT_SYMBOL_GPL(snmp_mib_free);
-
 #ifdef CONFIG_IP_MULTICAST
 static const struct net_protocol igmp_protocol = {
 	.handler =	igmp_rcv,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 4429b013f269..7808093cede6 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -368,9 +368,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	} else {
 		probes -= neigh->parms->app_probes;
 		if (probes < 0) {
-#ifdef CONFIG_ARPD
 			neigh_app_ns(neigh);
-#endif
 			return;
 		}
 	}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 34ca6d5a3a4b..a1b5bcbd04ae 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -73,6 +73,8 @@ static struct ipv4_devconf ipv4_devconf = {
 		[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
 		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
 		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+		[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+		[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
 	},
 };
 
@@ -83,6 +85,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = {
 		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
 		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
 		[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+		[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+		[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
 	},
 };
 
@@ -1126,10 +1130,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
 		if (len < (int) sizeof(ifr))
 			break;
 		memset(&ifr, 0, sizeof(struct ifreq));
-		if (ifa->ifa_label)
-			strcpy(ifr.ifr_name, ifa->ifa_label);
-		else
-			strcpy(ifr.ifr_name, dev->name);
+		strcpy(ifr.ifr_name, ifa->ifa_label);
 
 		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
 		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
@@ -2097,11 +2098,15 @@ static struct devinet_sysctl_table {
 	DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
 	DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
 	DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+	DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
+				"force_igmp_version"),
+	DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL,
+				"igmpv2_unsolicited_report_interval"),
+	DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
+				"igmpv3_unsolicited_report_interval"),
 
 	DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
 	DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
-	DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
-				      "force_igmp_version"),
 	DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
 				      "promote_secondaries"),
 	DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
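
[Editor's example] The two new devinet entries above surface as ordinary per-device sysctls. A minimal userspace sketch of tuning one, assuming the usual /proc/sys/net/ipv4/conf/<dev>/ layout; the device name "eth0" and the 500 ms value are illustrative placeholders, not values from the patch:

#include <stdio.h>

int main(void)
{
	/* per-device knob added by the hunk above; interval is in ms */
	const char *path =
		"/proc/sys/net/ipv4/conf/eth0/igmpv3_unsolicited_report_interval";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%d\n", 500);	/* hypothetical 500 ms interval */
	fclose(f);
	return 0;
}
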
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index ab3d814bc80a..109ee89f123e 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -477,7 +477,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 	}
 
 	return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
-		 net_adj) & ~(align - 1)) + (net_adj - 2);
+		 net_adj) & ~(align - 1)) + net_adj - 2;
 }
 
 static void esp4_err(struct sk_buff *skb, u32 info)
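
[Editor's example] A worked instance of the esp4_get_mtu() expression above, with assumed sizes (all inputs below are placeholders, not values from this patch): the usable payload is rounded down to the cipher block size, then the network-header adjustment minus the 2 ESP trailer bytes (pad length + next header) is added back.

#include <stdio.h>

int main(void)
{
	int mtu = 1500;		/* assumed link MTU */
	int header_len = 36;	/* assumed outer IP + ESP header + IV */
	int authsize = 12;	/* assumed ICV size */
	int net_adj = 20;	/* assumed network-header adjustment */
	int align = 16;		/* assumed cipher block size */

	/* same shape as the kernel expression in the hunk above */
	int max_payload = ((mtu - header_len - authsize - net_adj) &
			   ~(align - 1)) + net_adj - 2;

	/* 1500-36-12-20 = 1432, rounded down to 1424, +20-2 = 1442 */
	printf("max ESP payload: %d\n", max_payload);
	return 0;
}
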
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 26aa65d1fce4..523be38e37de 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -101,6 +101,30 @@ errout:
 	return err;
 }
 
+static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+{
+	struct fib_result *result = (struct fib_result *) arg->result;
+	struct net_device *dev = result->fi->fib_dev;
+
+	/* do not accept result if the route does
+	 * not meet the required prefix length
+	 */
+	if (result->prefixlen <= rule->suppress_prefixlen)
+		goto suppress_route;
+
+	/* do not accept result if the route uses a device
+	 * belonging to a forbidden interface group
+	 */
+	if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+		goto suppress_route;
+
+	return false;
+
+suppress_route:
+	if (!(arg->flags & FIB_LOOKUP_NOREF))
+		fib_info_put(result->fi);
+	return true;
+}
 
 static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 {
@@ -267,6 +291,7 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
 	.rule_size	= sizeof(struct fib4_rule),
 	.addr_size	= sizeof(u32),
 	.action		= fib4_rule_action,
+	.suppress	= fib4_rule_suppress,
 	.match		= fib4_rule_match,
 	.configure	= fib4_rule_configure,
 	.delete		= fib4_rule_delete,
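
[Editor's example] A standalone restatement of the suppress logic registered above, with plain ints standing in for the kernel structures: a looked-up result is suppressed when its prefix is no longer than the rule's threshold, or when its egress device sits in a forbidden interface group. A sketch only; the real predicate operates on struct fib_rule and struct fib_result.

#include <stdbool.h>
#include <stdio.h>

static bool route_suppressed(int prefixlen, int suppress_prefixlen,
			     int dev_group, int suppress_ifgroup)
{
	if (prefixlen <= suppress_prefixlen)
		return true;	/* route not specific enough */
	if (suppress_ifgroup != -1 && dev_group == suppress_ifgroup)
		return true;	/* egress device in forbidden group */
	return false;
}

int main(void)
{
	/* a default route (prefixlen 0) is suppressed by a threshold of 0,
	 * so the lookup falls through to later rules
	 */
	printf("%d\n", route_suppressed(0, 0, 0, -1));	/* 1: suppressed */
	printf("%d\n", route_suppressed(24, 0, 0, -1));	/* 0: accepted */
	return 0;
}
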
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 108a1e9c9eac..3df6d3edb2a1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -71,7 +71,6 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/slab.h>
-#include <linux/prefetch.h>
 #include <linux/export.h>
 #include <net/net_namespace.h>
 #include <net/ip.h>
@@ -1761,10 +1760,8 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
 		if (!c)
 			continue;
 
-		if (IS_LEAF(c)) {
-			prefetch(rcu_dereference_rtnl(p->child[idx]));
+		if (IS_LEAF(c))
 			return (struct leaf *) c;
-		}
 
 		/* Rescan start scanning in new node */
 		p = (struct tnode *) c;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index cd71190d2962..d6c0e64ec97f 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -88,6 +88,7 @@
 #include <linux/if_arp.h>
 #include <linux/rtnetlink.h>
 #include <linux/times.h>
+#include <linux/pkt_sched.h>
 
 #include <net/net_namespace.h>
 #include <net/arp.h>
@@ -113,7 +114,8 @@
 
 #define IGMP_V1_Router_Present_Timeout		(400*HZ)
 #define IGMP_V2_Router_Present_Timeout		(400*HZ)
-#define IGMP_Unsolicited_Report_Interval	(10*HZ)
+#define IGMP_V2_Unsolicited_Report_Interval	(10*HZ)
+#define IGMP_V3_Unsolicited_Report_Interval	(1*HZ)
 #define IGMP_Query_Response_Interval		(10*HZ)
 #define IGMP_Unsolicited_Report_Count		2
 
@@ -138,6 +140,29 @@
 	((in_dev)->mr_v2_seen && \
 	 time_before(jiffies, (in_dev)->mr_v2_seen)))
 
+static int unsolicited_report_interval(struct in_device *in_dev)
+{
+	int interval_ms, interval_jiffies;
+
+	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+		interval_ms = IN_DEV_CONF_GET(
+			in_dev,
+			IGMPV2_UNSOLICITED_REPORT_INTERVAL);
+	else /* v3 */
+		interval_ms = IN_DEV_CONF_GET(
+			in_dev,
+			IGMPV3_UNSOLICITED_REPORT_INTERVAL);
+
+	interval_jiffies = msecs_to_jiffies(interval_ms);
+
+	/* _timer functions can't handle a delay of 0 jiffies so ensure
+	 *  we always return a positive value.
+	 */
+	if (interval_jiffies <= 0)
+		interval_jiffies = 1;
+	return interval_jiffies;
+}
+
 static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
 static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
 static void igmpv3_clear_delrec(struct in_device *in_dev);
@@ -315,6 +340,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 		if (size < 256)
 			return NULL;
 	}
+	skb->priority = TC_PRIO_CONTROL;
 	igmp_skb_size(skb) = size;
 
 	rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
@@ -670,6 +696,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 		ip_rt_put(rt);
 		return -1;
 	}
+	skb->priority = TC_PRIO_CONTROL;
 
 	skb_dst_set(skb, &rt->dst);
 
@@ -719,7 +746,8 @@ static void igmp_ifc_timer_expire(unsigned long data)
 	igmpv3_send_cr(in_dev);
 	if (in_dev->mr_ifc_count) {
 		in_dev->mr_ifc_count--;
-		igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
+		igmp_ifc_start_timer(in_dev,
+				     unsolicited_report_interval(in_dev));
 	}
 	__in_dev_put(in_dev);
 }
@@ -744,7 +772,7 @@ static void igmp_timer_expire(unsigned long data)
 
 	if (im->unsolicit_count) {
 		im->unsolicit_count--;
-		igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+		igmp_start_timer(im, unsolicited_report_interval(in_dev));
 	}
 	im->reporter = 1;
 	spin_unlock(&im->lock);
@@ -1323,16 +1351,17 @@ out:
 EXPORT_SYMBOL(ip_mc_inc_group);
 
 /*
- *	Resend IGMP JOIN report; used for bonding.
- *	Called with rcu_read_lock()
+ *	Resend IGMP JOIN report; used by netdev notifier.
  */
-void ip_mc_rejoin_groups(struct in_device *in_dev)
+static void ip_mc_rejoin_groups(struct in_device *in_dev)
 {
 #ifdef CONFIG_IP_MULTICAST
 	struct ip_mc_list *im;
 	int type;
 
-	for_each_pmc_rcu(in_dev, im) {
+	ASSERT_RTNL();
+
+	for_each_pmc_rtnl(in_dev, im) {
 		if (im->multiaddr == IGMP_ALL_HOSTS)
 			continue;
 
@@ -1349,7 +1378,6 @@ void ip_mc_rejoin_groups(struct in_device *in_dev)
 	}
 #endif
 }
-EXPORT_SYMBOL(ip_mc_rejoin_groups);
 
 /*
  *	A socket has left a multicast group on device dev
@@ -2735,8 +2763,42 @@ static struct pernet_operations igmp_net_ops = {
 	.exit = igmp_net_exit,
 };
 
+static int igmp_netdev_event(struct notifier_block *this,
+			     unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct in_device *in_dev;
+
+	switch (event) {
+	case NETDEV_RESEND_IGMP:
+		in_dev = __in_dev_get_rtnl(dev);
+		if (in_dev)
+			ip_mc_rejoin_groups(in_dev);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block igmp_notifier = {
+	.notifier_call = igmp_netdev_event,
+};
+
 int __init igmp_mc_proc_init(void)
 {
-	return register_pernet_subsys(&igmp_net_ops);
+	int err;
+
+	err = register_pernet_subsys(&igmp_net_ops);
+	if (err)
+		return err;
+	err = register_netdevice_notifier(&igmp_notifier);
+	if (err)
+		goto reg_notif_fail;
+	return 0;
+
+reg_notif_fail:
+	unregister_pernet_subsys(&igmp_net_ops);
+	return err;
 }
 #endif
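
[Editor's example] The clamp in unsolicited_report_interval() above matters because msecs_to_jiffies() can round a small configured interval down to zero, and the kernel timer helpers cannot take a zero delay. A portable sketch of the same idea, with an assumed HZ of 100 and a crude stand-in for msecs_to_jiffies(); values are illustrative, not from the patch.

#include <stdio.h>

#define HZ 100	/* assumed tick rate: one jiffy = 10 ms */

static int msecs_to_jiffies_clamped(int ms)
{
	int j = ms * HZ / 1000;	/* crude msecs_to_jiffies stand-in */

	return j <= 0 ? 1 : j;	/* never hand a 0-jiffy delay to a timer */
}

int main(void)
{
	printf("%d\n", msecs_to_jiffies_clamped(1000));	/* 100 jiffies */
	printf("%d\n", msecs_to_jiffies_clamped(3));	/* clamped to 1 */
	return 0;
}
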
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 1f6eab66f7ce..d7aea4c5b940 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -383,7 +383,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
 	if (daddr)
 		memcpy(&iph->daddr, daddr, 4);
 	if (iph->daddr)
-		return t->hlen;
+		return t->hlen + sizeof(*iph);
 
 	return -(t->hlen + sizeof(*iph));
 }
@@ -534,7 +534,7 @@ static int __net_init ipgre_init_net(struct net *net)
 static void __net_exit ipgre_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
-	ip_tunnel_delete_net(itn);
+	ip_tunnel_delete_net(itn, &ipgre_link_ops);
 }
 
 static struct pernet_operations ipgre_net_ops = {
@@ -767,7 +767,7 @@ static int __net_init ipgre_tap_init_net(struct net *net)
 static void __net_exit ipgre_tap_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
-	ip_tunnel_delete_net(itn);
+	ip_tunnel_delete_net(itn, &ipgre_tap_ops);
 }
 
 static struct pernet_operations ipgre_tap_net_ops = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 15e3e683adec..054a3e97d822 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -141,6 +141,7 @@
 #include <net/icmp.h>
 #include <net/raw.h>
 #include <net/checksum.h>
+#include <net/inet_ecn.h>
 #include <linux/netfilter_ipv4.h>
 #include <net/xfrm.h>
 #include <linux/mroute.h>
@@ -410,6 +411,13 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	if (iph->ihl < 5 || iph->version != 4)
 		goto inhdr_error;
 
+	BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
+	BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
+	BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
+	IP_ADD_STATS_BH(dev_net(dev),
+			IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
+			max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+
 	if (!pskb_may_pull(skb, iph->ihl*4))
 		goto inhdr_error;
 
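
[Editor's example] The BUILD_BUG_ON lines above pin the MIB layout so that the two ECN bits of the TOS byte index four consecutive counters starting at the Not-ECT slot. A sketch of that indexing with the RFC 3168 codepoint values; the enum and macro names here are illustrative, not the kernel's.

#include <stdio.h>

enum {
	NOT_ECT = 0,	/* 00: not ECN-capable */
	ECT_1   = 1,	/* 01: ECN-capable transport (1) */
	ECT_0   = 2,	/* 10: ECN-capable transport (0) */
	CE      = 3,	/* 11: congestion experienced */
};

#define ECN_MASK 3	/* low two bits of the TOS byte */

int main(void)
{
	unsigned char tos = 0x1a;	/* sample TOS byte, low bits 10 */
	int slot = tos & ECN_MASK;	/* offset from the Not-ECT counter */

	printf("counter slot offset: %d (ECT_0)\n", slot);
	return 0;
}
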
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4bcabf3ab4ca..9ee17e3d11c3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -211,14 +211,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 		return -EINVAL;
 }
 
-static inline int ip_skb_dst_mtu(struct sk_buff *skb)
-{
-	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
-
-	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
-	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
-}
-
 static int ip_finish_output(struct sk_buff *skb)
 {
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index ca1cb2d5f6e2..ac9fabe0300f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -350,7 +350,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 	struct flowi4 fl4;
 	struct rtable *rt;
 
-	rt = ip_route_output_tunnel(dev_net(dev), &fl4,
+	rt = ip_route_output_tunnel(tunnel->net, &fl4,
 				    tunnel->parms.iph.protocol,
 				    iph->daddr, iph->saddr,
 				    tunnel->parms.o_key,
@@ -365,7 +365,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 	}
 
 	if (!tdev && tunnel->parms.link)
-		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
 
 	if (tdev) {
 		hlen = tdev->hard_header_len + tdev->needed_headroom;
@@ -454,15 +454,15 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 	tstats->rx_bytes += skb->len;
 	u64_stats_update_end(&tstats->syncp);
 
-	if (tunnel->net != dev_net(tunnel->dev))
-		skb_scrub_packet(skb);
-
 	if (tunnel->dev->type == ARPHRD_ETHER) {
 		skb->protocol = eth_type_trans(skb, tunnel->dev);
 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 	} else {
 		skb->dev = tunnel->dev;
 	}
+
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
+
 	gro_cells_receive(&tunnel->gro_cells, skb);
 	return 0;
 
@@ -613,9 +613,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto tx_error;
 	}
 
-	if (tunnel->net != dev_net(dev))
-		skb_scrub_packet(skb);
-
 	if (tunnel->err_count > 0) {
 		if (time_before(jiffies,
 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
@@ -653,9 +650,9 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	err = iptunnel_xmit(dev_net(dev), rt, skb,
-			    fl4.saddr, fl4.daddr, protocol,
-			    ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df);
+	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
+			    ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df,
+			    !net_eq(tunnel->net, dev_net(dev)));
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 
 	return;
@@ -820,11 +817,10 @@ static void ip_tunnel_dev_free(struct net_device *dev)
 
 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 {
-	struct net *net = dev_net(dev);
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct ip_tunnel_net *itn;
 
-	itn = net_generic(net, tunnel->ip_tnl_net_id);
+	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
 
 	if (itn->fb_tunnel_dev != dev) {
 		ip_tunnel_del(netdev_priv(dev));
@@ -838,56 +834,68 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 {
 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
 	struct ip_tunnel_parm parms;
+	unsigned int i;
 
-	itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL);
-	if (!itn->tunnels)
-		return -ENOMEM;
+	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&itn->tunnels[i]);
 
 	if (!ops) {
 		itn->fb_tunnel_dev = NULL;
 		return 0;
 	}
+
 	memset(&parms, 0, sizeof(parms));
 	if (devname)
 		strlcpy(parms.name, devname, IFNAMSIZ);
 
 	rtnl_lock();
 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+	/* FB netdevice is special: we have one, and only one per netns.
+	 * Allowing to move it to another netns is clearly unsafe.
+	 */
+	if (!IS_ERR(itn->fb_tunnel_dev))
+		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
 	rtnl_unlock();
-	if (IS_ERR(itn->fb_tunnel_dev)) {
-		kfree(itn->tunnels);
-		return PTR_ERR(itn->fb_tunnel_dev);
-	}
 
-	return 0;
+	return PTR_RET(itn->fb_tunnel_dev);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
 
-static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head)
+static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
+			      struct rtnl_link_ops *ops)
 {
+	struct net *net = dev_net(itn->fb_tunnel_dev);
+	struct net_device *dev, *aux;
 	int h;
 
+	for_each_netdev_safe(net, dev, aux)
+		if (dev->rtnl_link_ops == ops)
+			unregister_netdevice_queue(dev, head);
+
 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
 		struct ip_tunnel *t;
 		struct hlist_node *n;
 		struct hlist_head *thead = &itn->tunnels[h];
 
 		hlist_for_each_entry_safe(t, n, thead, hash_node)
-			unregister_netdevice_queue(t->dev, head);
+			/* If dev is in the same netns, it has already
+			 * been added to the list by the previous loop.
+			 */
+			if (!net_eq(dev_net(t->dev), net))
+				unregister_netdevice_queue(t->dev, head);
 	}
 	if (itn->fb_tunnel_dev)
 		unregister_netdevice_queue(itn->fb_tunnel_dev, head);
 }
 
-void ip_tunnel_delete_net(struct ip_tunnel_net *itn)
+void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
 {
 	LIST_HEAD(list);
 
 	rtnl_lock();
-	ip_tunnel_destroy(itn, &list);
+	ip_tunnel_destroy(itn, &list, ops);
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
-	kfree(itn->tunnels);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
 
@@ -929,23 +937,21 @@ EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 			 struct ip_tunnel_parm *p)
 {
-	struct ip_tunnel *t, *nt;
-	struct net *net = dev_net(dev);
+	struct ip_tunnel *t;
 	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct net *net = tunnel->net;
 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
 
 	if (dev == itn->fb_tunnel_dev)
 		return -EINVAL;
 
-	nt = netdev_priv(dev);
-
 	t = ip_tunnel_find(itn, p, dev->type);
 
 	if (t) {
 		if (t->dev != dev)
 			return -EEXIST;
 	} else {
-		t = nt;
+		t = tunnel;
 
 		if (dev->type != ARPHRD_ETHER) {
 			unsigned int nflags = 0;
@@ -984,6 +990,7 @@ int ip_tunnel_init(struct net_device *dev)
 	}
 
 	tunnel->dev = dev;
+	tunnel->net = dev_net(dev);
 	strcpy(tunnel->parms.name, dev->name);
 	iph->version = 4;
 	iph->ihl = 5;
@@ -994,8 +1001,8 @@ EXPORT_SYMBOL_GPL(ip_tunnel_init);
 
 void ip_tunnel_uninit(struct net_device *dev)
 {
-	struct net *net = dev_net(dev);
 	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct net *net = tunnel->net;
 	struct ip_tunnel_net *itn;
 
 	itn = net_generic(net, tunnel->ip_tnl_net_id);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 7167b08977df..d6c856b17fd4 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -46,19 +46,17 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 
-int iptunnel_xmit(struct net *net, struct rtable *rt,
-		  struct sk_buff *skb,
+int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,
 		  __be32 src, __be32 dst, __u8 proto,
-		  __u8 tos, __u8 ttl, __be16 df)
+		  __u8 tos, __u8 ttl, __be16 df, bool xnet)
 {
 	int pkt_len = skb->len;
 	struct iphdr *iph;
 	int err;
 
-	nf_reset(skb);
-	secpath_reset(skb);
+	skb_scrub_packet(skb, xnet);
+
 	skb->rxhash = 0;
-	skb_dst_drop(skb);
 	skb_dst_set(skb, &rt->dst);
 	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
 
@@ -76,9 +74,7 @@ int iptunnel_xmit(struct net *net, struct rtable *rt,
 	iph->daddr	=	dst;
 	iph->saddr	=	src;
 	iph->ttl	=	ttl;
-	tunnel_ip_select_ident(skb,
-			       (const struct iphdr *)skb_inner_network_header(skb),
-			       &rt->dst);
+	__ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 
 	err = ip_local_out(skb);
 	if (unlikely(net_xmit_eval(err)))
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 17cc0ffa8c0d..e805e7b3030e 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -44,176 +44,10 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
-#define HASH_SIZE  16
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1))
-
 static struct rtnl_link_ops vti_link_ops __read_mostly;
 
 static int vti_net_id __read_mostly;
-struct vti_net {
-	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
-	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
-	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
-	struct ip_tunnel __rcu *tunnels_wc[1];
-	struct ip_tunnel __rcu **tunnels[4];
-
-	struct net_device *fb_tunnel_dev;
-};
-
-static int vti_fb_tunnel_init(struct net_device *dev);
 static int vti_tunnel_init(struct net_device *dev);
-static void vti_tunnel_setup(struct net_device *dev);
-static void vti_dev_free(struct net_device *dev);
-static int vti_tunnel_bind_dev(struct net_device *dev);
-
-#define VTI_XMIT(stats1, stats2) do {				\
-	int err;						\
-	int pkt_len = skb->len;					\
-	err = dst_output(skb);					\
-	if (net_xmit_eval(err) == 0) {				\
-		u64_stats_update_begin(&(stats1)->syncp);	\
-		(stats1)->tx_bytes += pkt_len;			\
-		(stats1)->tx_packets++;				\
-		u64_stats_update_end(&(stats1)->syncp);		\
-	} else {						\
-		(stats2)->tx_errors++;				\
-		(stats2)->tx_aborted_errors++;			\
-	}							\
-} while (0)
-
-
-static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
-					   __be32 remote, __be32 local)
-{
-	unsigned h0 = HASH(remote);
-	unsigned h1 = HASH(local);
-	struct ip_tunnel *t;
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
-		if (local == t->parms.iph.saddr &&
-		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
-			return t;
-	for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
-		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
-			return t;
-
-	for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
-		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
-			return t;
-
-	for_each_ip_tunnel_rcu(t, ipn->tunnels_wc[0])
-		if (t && (t->dev->flags&IFF_UP))
-			return t;
-	return NULL;
-}
-
-static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn,
-					     struct ip_tunnel_parm *parms)
-{
-	__be32 remote = parms->iph.daddr;
-	__be32 local = parms->iph.saddr;
-	unsigned h = 0;
-	int prio = 0;
-
-	if (remote) {
-		prio |= 2;
-		h ^= HASH(remote);
-	}
-	if (local) {
-		prio |= 1;
-		h ^= HASH(local);
-	}
-	return &ipn->tunnels[prio][h];
-}
-
-static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn,
-						  struct ip_tunnel *t)
-{
-	return __vti_bucket(ipn, &t->parms);
-}
-
-static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t)
-{
-	struct ip_tunnel __rcu **tp;
-	struct ip_tunnel *iter;
-
-	for (tp = vti_bucket(ipn, t);
-	     (iter = rtnl_dereference(*tp)) != NULL;
-	     tp = &iter->next) {
-		if (t == iter) {
-			rcu_assign_pointer(*tp, t->next);
-			break;
-		}
-	}
-}
-
-static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t)
-{
-	struct ip_tunnel __rcu **tp = vti_bucket(ipn, t);
-
-	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
-	rcu_assign_pointer(*tp, t);
-}
-
-static struct ip_tunnel *vti_tunnel_locate(struct net *net,
-					   struct ip_tunnel_parm *parms,
-					   int create)
-{
-	__be32 remote = parms->iph.daddr;
-	__be32 local = parms->iph.saddr;
-	struct ip_tunnel *t, *nt;
-	struct ip_tunnel __rcu **tp;
-	struct net_device *dev;
-	char name[IFNAMSIZ];
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	for (tp = __vti_bucket(ipn, parms);
-	     (t = rtnl_dereference(*tp)) != NULL;
-	     tp = &t->next) {
-		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
-			return t;
-	}
-	if (!create)
-		return NULL;
-
-	if (parms->name[0])
-		strlcpy(name, parms->name, IFNAMSIZ);
-	else
-		strcpy(name, "vti%d");
-
-	dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup);
-	if (dev == NULL)
-		return NULL;
-
-	dev_net_set(dev, net);
-
-	nt = netdev_priv(dev);
-	nt->parms = *parms;
-	dev->rtnl_link_ops = &vti_link_ops;
-
-	vti_tunnel_bind_dev(dev);
-
-	if (register_netdevice(dev) < 0)
-		goto failed_free;
-
-	dev_hold(dev);
-	vti_tunnel_link(ipn, nt);
-	return nt;
-
-failed_free:
-	free_netdev(dev);
-	return NULL;
-}
-
-static void vti_tunnel_uninit(struct net_device *dev)
-{
-	struct net *net = dev_net(dev);
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	vti_tunnel_unlink(ipn, netdev_priv(dev));
-	dev_put(dev);
-}
 
 static int vti_err(struct sk_buff *skb, u32 info)
 {
@@ -222,6 +56,8 @@ static int vti_err(struct sk_buff *skb, u32 info)
 	 * 8 bytes of packet payload. It means, that precise relaying of
 	 * ICMP in the real Internet is absolutely infeasible.
 	 */
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
 	struct iphdr *iph = (struct iphdr *)skb->data;
 	const int type = icmp_hdr(skb)->type;
 	const int code = icmp_hdr(skb)->code;
@@ -252,7 +88,8 @@ static int vti_err(struct sk_buff *skb, u32 info)
 
 	err = -ENOENT;
 
-	t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
+	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+			     iph->daddr, iph->saddr, 0);
 	if (t == NULL)
 		goto out;
 
@@ -281,8 +118,11 @@ static int vti_rcv(struct sk_buff *skb)
 {
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph = ip_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
 
-	tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+				  iph->saddr, iph->daddr, 0);
 	if (tunnel != NULL) {
 		struct pcpu_tstats *tstats;
 
@@ -311,7 +151,6 @@ static int vti_rcv(struct sk_buff *skb)
 static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct pcpu_tstats *tstats;
 	struct iphdr  *tiph = &tunnel->parms.iph;
 	u8     tos;
 	struct rtable *rt;		/* Route to the other host */
@@ -319,6 +158,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct iphdr  *old_iph = ip_hdr(skb);
 	__be32 dst = tiph->daddr;
 	struct flowi4 fl4;
+	int err;
 
 	if (skb->protocol != htons(ETH_P_IP))
 		goto tx_error;
@@ -367,8 +207,10 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	nf_reset(skb);
 	skb->dev = skb_dst(skb)->dev;
 
-	tstats = this_cpu_ptr(dev->tstats);
-	VTI_XMIT(tstats, &dev->stats);
+	err = dst_output(skb);
+	if (net_xmit_eval(err) == 0)
+		err = skb->len;
+	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 	return NETDEV_TX_OK;
 
 tx_error_icmp:
@@ -379,198 +221,57 @@ tx_error:
 	return NETDEV_TX_OK;
 }
 
-static int vti_tunnel_bind_dev(struct net_device *dev)
-{
-	struct net_device *tdev = NULL;
-	struct ip_tunnel *tunnel;
-	struct iphdr *iph;
-
-	tunnel = netdev_priv(dev);
-	iph = &tunnel->parms.iph;
-
-	if (iph->daddr) {
-		struct rtable *rt;
-		struct flowi4 fl4;
-		memset(&fl4, 0, sizeof(fl4));
-		flowi4_init_output(&fl4, tunnel->parms.link,
-				   be32_to_cpu(tunnel->parms.i_key),
-				   RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
-				   IPPROTO_IPIP, 0,
-				   iph->daddr, iph->saddr, 0, 0);
-		rt = ip_route_output_key(dev_net(dev), &fl4);
-		if (!IS_ERR(rt)) {
-			tdev = rt->dst.dev;
-			ip_rt_put(rt);
-		}
-		dev->flags |= IFF_POINTOPOINT;
-	}
-
-	if (!tdev && tunnel->parms.link)
-		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
-
-	if (tdev) {
-		dev->hard_header_len = tdev->hard_header_len +
-				       sizeof(struct iphdr);
-		dev->mtu = tdev->mtu;
-	}
-	dev->iflink = tunnel->parms.link;
-	return dev->mtu;
-}
-
 static int
 vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
 	int err = 0;
 	struct ip_tunnel_parm p;
-	struct ip_tunnel *t;
-	struct net *net = dev_net(dev);
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	switch (cmd) {
-	case SIOCGETTUNNEL:
-		t = NULL;
-		if (dev == ipn->fb_tunnel_dev) {
-			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
-					   sizeof(p))) {
-				err = -EFAULT;
-				break;
-			}
-			t = vti_tunnel_locate(net, &p, 0);
-		}
-		if (t == NULL)
-			t = netdev_priv(dev);
-		memcpy(&p, &t->parms, sizeof(p));
-		p.i_flags |= GRE_KEY | VTI_ISVTI;
-		p.o_flags |= GRE_KEY;
-		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
-			err = -EFAULT;
-		break;
-
-	case SIOCADDTUNNEL:
-	case SIOCCHGTUNNEL:
-		err = -EPERM;
-		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
-			goto done;
 
-		err = -EFAULT;
-		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
-			goto done;
+	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+		return -EFAULT;
 
-		err = -EINVAL;
+	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
 		    p.iph.ihl != 5)
-			goto done;
-
-		t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
-
-		if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
-			if (t != NULL) {
-				if (t->dev != dev) {
-					err = -EEXIST;
-					break;
-				}
-			} else {
-				if (((dev->flags&IFF_POINTOPOINT) &&
-				    !p.iph.daddr) ||
-				    (!(dev->flags&IFF_POINTOPOINT) &&
-				    p.iph.daddr)) {
-					err = -EINVAL;
-					break;
-				}
-				t = netdev_priv(dev);
-				vti_tunnel_unlink(ipn, t);
-				synchronize_net();
-				t->parms.iph.saddr = p.iph.saddr;
-				t->parms.iph.daddr = p.iph.daddr;
-				t->parms.i_key = p.i_key;
-				t->parms.o_key = p.o_key;
-				t->parms.iph.protocol = IPPROTO_IPIP;
-				memcpy(dev->dev_addr, &p.iph.saddr, 4);
-				memcpy(dev->broadcast, &p.iph.daddr, 4);
-				vti_tunnel_link(ipn, t);
-				netdev_state_change(dev);
-			}
-		}
-
-		if (t) {
-			err = 0;
-			if (cmd == SIOCCHGTUNNEL) {
-				t->parms.i_key = p.i_key;
-				t->parms.o_key = p.o_key;
-				if (t->parms.link != p.link) {
-					t->parms.link = p.link;
-					vti_tunnel_bind_dev(dev);
-					netdev_state_change(dev);
-				}
-			}
-			p.i_flags |= GRE_KEY | VTI_ISVTI;
-			p.o_flags |= GRE_KEY;
-			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms,
-					 sizeof(p)))
-				err = -EFAULT;
-		} else
-			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
-		break;
+			return -EINVAL;
+	}
 
-	case SIOCDELTUNNEL:
-		err = -EPERM;
-		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
-			goto done;
-
-		if (dev == ipn->fb_tunnel_dev) {
-			err = -EFAULT;
-			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
-					   sizeof(p)))
-				goto done;
-			err = -ENOENT;
-
-			t = vti_tunnel_locate(net, &p, 0);
-			if (t == NULL)
-				goto done;
-			err = -EPERM;
-			if (t->dev == ipn->fb_tunnel_dev)
-				goto done;
-			dev = t->dev;
-		}
-		unregister_netdevice(dev);
-		err = 0;
-		break;
+	err = ip_tunnel_ioctl(dev, &p, cmd);
+	if (err)
+		return err;
 
-	default:
-		err = -EINVAL;
+	if (cmd != SIOCDELTUNNEL) {
+		p.i_flags |= GRE_KEY | VTI_ISVTI;
+		p.o_flags |= GRE_KEY;
 	}
 
-done:
-	return err;
-}
-
-static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
-	if (new_mtu < 68 || new_mtu > 0xFFF8)
-		return -EINVAL;
-	dev->mtu = new_mtu;
+	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+		return -EFAULT;
 	return 0;
 }
 
 static const struct net_device_ops vti_netdev_ops = {
 	.ndo_init	= vti_tunnel_init,
-	.ndo_uninit	= vti_tunnel_uninit,
+	.ndo_uninit	= ip_tunnel_uninit,
 	.ndo_start_xmit	= vti_tunnel_xmit,
 	.ndo_do_ioctl	= vti_tunnel_ioctl,
-	.ndo_change_mtu	= vti_tunnel_change_mtu,
+	.ndo_change_mtu	= ip_tunnel_change_mtu,
 	.ndo_get_stats64 = ip_tunnel_get_stats64,
 };
 
-static void vti_dev_free(struct net_device *dev)
+static void vti_tunnel_setup(struct net_device *dev)
 {
-	free_percpu(dev->tstats);
-	free_netdev(dev);
+	dev->netdev_ops		= &vti_netdev_ops;
+	ip_tunnel_setup(dev, vti_net_id);
 }
 
-static void vti_tunnel_setup(struct net_device *dev)
+static int vti_tunnel_init(struct net_device *dev)
 {
-	dev->netdev_ops		= &vti_netdev_ops;
-	dev->destructor		= vti_dev_free;
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct iphdr *iph = &tunnel->parms.iph;
+
+	memcpy(dev->dev_addr, &iph->saddr, 4);
+	memcpy(dev->broadcast, &iph->daddr, 4);
 
 	dev->type		= ARPHRD_TUNNEL;
 	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr);
@@ -581,38 +282,18 @@ static void vti_tunnel_setup(struct net_device *dev)
 	dev->features		|= NETIF_F_NETNS_LOCAL;
 	dev->features		|= NETIF_F_LLTX;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
-}
 
-static int vti_tunnel_init(struct net_device *dev)
-{
-	struct ip_tunnel *tunnel = netdev_priv(dev);
-
-	tunnel->dev = dev;
-	strcpy(tunnel->parms.name, dev->name);
-
-	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
-	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
-
-	dev->tstats = alloc_percpu(struct pcpu_tstats);
-	if (!dev->tstats)
-		return -ENOMEM;
-
-	return 0;
+	return ip_tunnel_init(dev);
 }
 
-static int __net_init vti_fb_tunnel_init(struct net_device *dev)
+static void __net_init vti_fb_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
-	struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id);
 
 	iph->version		= 4;
 	iph->protocol		= IPPROTO_IPIP;
 	iph->ihl		= 5;
-
-	dev_hold(dev);
-	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
-	return 0;
 }
 
 static struct xfrm_tunnel vti_handler __read_mostly = {
@@ -621,76 +302,30 @@ static struct xfrm_tunnel vti_handler __read_mostly = {
 	.priority	=	1,
 };
 
-static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head)
-{
-	int prio;
-
-	for (prio = 1; prio < 4; prio++) {
-		int h;
-		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t;
-
-			t = rtnl_dereference(ipn->tunnels[prio][h]);
-			while (t != NULL) {
-				unregister_netdevice_queue(t->dev, head);
-				t = rtnl_dereference(t->next);
-			}
-		}
-	}
-}
-
 static int __net_init vti_init_net(struct net *net)
 {
 	int err;
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	ipn->tunnels[0] = ipn->tunnels_wc;
-	ipn->tunnels[1] = ipn->tunnels_l;
-	ipn->tunnels[2] = ipn->tunnels_r;
-	ipn->tunnels[3] = ipn->tunnels_r_l;
-
-	ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
-					  "ip_vti0",
-					  vti_tunnel_setup);
-	if (!ipn->fb_tunnel_dev) {
-		err = -ENOMEM;
-		goto err_alloc_dev;
-	}
-	dev_net_set(ipn->fb_tunnel_dev, net);
-
-	err = vti_fb_tunnel_init(ipn->fb_tunnel_dev);
-	if (err)
-		goto err_reg_dev;
-	ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops;
+	struct ip_tunnel_net *itn;
 
-	err = register_netdev(ipn->fb_tunnel_dev);
+	err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
 	if (err)
-		goto err_reg_dev;
+		return err;
+	itn = net_generic(net, vti_net_id);
+	vti_fb_tunnel_init(itn->fb_tunnel_dev);
 	return 0;
-
-err_reg_dev:
-	vti_dev_free(ipn->fb_tunnel_dev);
-err_alloc_dev:
-	/* nothing */
-	return err;
 }
 
 static void __net_exit vti_exit_net(struct net *net)
 {
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-	LIST_HEAD(list);
-
-	rtnl_lock();
-	vti_destroy_tunnels(ipn, &list);
-	unregister_netdevice_many(&list);
-	rtnl_unlock();
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+	ip_tunnel_delete_net(itn, &vti_link_ops);
 }
 
 static struct pernet_operations vti_net_ops = {
 	.init = vti_init_net,
 	.exit = vti_exit_net,
 	.id   = &vti_net_id,
-	.size = sizeof(struct vti_net),
+	.size = sizeof(struct ip_tunnel_net),
 };
 
 static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -728,78 +363,19 @@ static void vti_netlink_parms(struct nlattr *data[],
 static int vti_newlink(struct net *src_net, struct net_device *dev,
 		       struct nlattr *tb[], struct nlattr *data[])
 {
-	struct ip_tunnel *nt;
-	struct net *net = dev_net(dev);
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-	int mtu;
-	int err;
-
-	nt = netdev_priv(dev);
-	vti_netlink_parms(data, &nt->parms);
-
-	if (vti_tunnel_locate(net, &nt->parms, 0))
-		return -EEXIST;
+	struct ip_tunnel_parm parms;
 
-	mtu = vti_tunnel_bind_dev(dev);
-	if (!tb[IFLA_MTU])
-		dev->mtu = mtu;
-
-	err = register_netdevice(dev);
-	if (err)
-		goto out;
-
-	dev_hold(dev);
-	vti_tunnel_link(ipn, nt);
-
-out:
-	return err;
+	vti_netlink_parms(data, &parms);
+	return ip_tunnel_newlink(dev, tb, &parms);
 }
 
 static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
 			  struct nlattr *data[])
 {
-	struct ip_tunnel *t, *nt;
-	struct net *net = dev_net(dev);
-	struct vti_net *ipn = net_generic(net, vti_net_id);
 	struct ip_tunnel_parm p;
-	int mtu;
-
-	if (dev == ipn->fb_tunnel_dev)
-		return -EINVAL;
 
-	nt = netdev_priv(dev);
 	vti_netlink_parms(data, &p);
-
-	t = vti_tunnel_locate(net, &p, 0);
-
-	if (t) {
-		if (t->dev != dev)
-			return -EEXIST;
-	} else {
-		t = nt;
-
-		vti_tunnel_unlink(ipn, t);
-		t->parms.iph.saddr = p.iph.saddr;
-		t->parms.iph.daddr = p.iph.daddr;
-		t->parms.i_key = p.i_key;
-		t->parms.o_key = p.o_key;
-		if (dev->type != ARPHRD_ETHER) {
-			memcpy(dev->dev_addr, &p.iph.saddr, 4);
-			memcpy(dev->broadcast, &p.iph.daddr, 4);
-		}
-		vti_tunnel_link(ipn, t);
-		netdev_state_change(dev);
-	}
-
-	if (t->parms.link != p.link) {
-		t->parms.link = p.link;
-		mtu = vti_tunnel_bind_dev(dev);
-		if (!tb[IFLA_MTU])
-			dev->mtu = mtu;
-		netdev_state_change(dev);
-	}
-
-	return 0;
+	return ip_tunnel_changelink(dev, tb, &p);
 }
 
 static size_t vti_get_size(const struct net_device *dev)
@@ -865,7 +441,7 @@ static int __init vti_init(void)
 	err = xfrm4_mode_tunnel_input_register(&vti_handler);
 	if (err < 0) {
 		unregister_pernet_device(&vti_net_ops);
-		pr_info(KERN_INFO "vti init: can't register tunnel\n");
+		pr_info("vti init: can't register tunnel\n");
 	}
 
 	err = rtnl_link_register(&vti_link_ops);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 51fc2a1dcdd3..7f80fb4b82d3 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -190,15 +190,14 @@ static int ipip_rcv(struct sk_buff *skb)
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph;
 
-	if (iptunnel_pull_header(skb, 0, tpi.proto))
-		goto drop;
-
 	iph = ip_hdr(skb);
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 			iph->saddr, iph->daddr, 0);
 	if (tunnel) {
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 			goto drop;
+		if (iptunnel_pull_header(skb, 0, tpi.proto))
+			goto drop;
 		return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
 	}
 
@@ -286,7 +285,6 @@ static void ipip_tunnel_setup(struct net_device *dev)
 	dev->flags		= IFF_NOARP;
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
-	dev->features		|= NETIF_F_NETNS_LOCAL;
 	dev->features		|= NETIF_F_LLTX;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 
@@ -437,7 +435,7 @@ static int __net_init ipip_init_net(struct net *net)
 static void __net_exit ipip_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
-	ip_tunnel_delete_net(itn);
+	ip_tunnel_delete_net(itn, &ipip_link_ops);
 }
 
 static struct pernet_operations ipip_net_ops = {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 132a09664704..9ae54b09254f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -127,9 +127,9 @@ static struct kmem_cache *mrt_cachep __read_mostly;
 static struct mr_table *ipmr_new_table(struct net *net, u32 id);
 static void ipmr_free_table(struct mr_table *mrt);
 
-static int ip_mr_forward(struct net *net, struct mr_table *mrt,
-			 struct sk_buff *skb, struct mfc_cache *cache,
-			 int local);
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+			  struct sk_buff *skb, struct mfc_cache *cache,
+			  int local);
 static int ipmr_cache_report(struct mr_table *mrt,
 			     struct sk_buff *pkt, vifi_t vifi, int assert);
 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
@@ -1795,9 +1795,9 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
 
 /* "local" means that we should preserve one skb (for local delivery) */
 
-static int ip_mr_forward(struct net *net, struct mr_table *mrt,
-			 struct sk_buff *skb, struct mfc_cache *cache,
-			 int local)
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+			  struct sk_buff *skb, struct mfc_cache *cache,
+			  int local)
 {
 	int psend = -1;
 	int vif, ct;
@@ -1903,14 +1903,13 @@ last_forward:
 			ipmr_queue_xmit(net, mrt, skb2, cache, psend);
 		} else {
 			ipmr_queue_xmit(net, mrt, skb, cache, psend);
-			return 0;
+			return;
 		}
 	}
 
 dont_forward:
 	if (!local)
 		kfree_skb(skb);
-	return 0;
 }
 
 static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
@@ -2068,9 +2067,8 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
2068 skb_reset_network_header(skb); 2067 skb_reset_network_header(skb);
2069 skb->protocol = htons(ETH_P_IP); 2068 skb->protocol = htons(ETH_P_IP);
2070 skb->ip_summed = CHECKSUM_NONE; 2069 skb->ip_summed = CHECKSUM_NONE;
2071 skb->pkt_type = PACKET_HOST;
2072 2070
2073 skb_tunnel_rx(skb, reg_dev); 2071 skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
2074 2072
2075 netif_rx(skb); 2073 netif_rx(skb);
2076 2074
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 4e9028017428..1657e39b291f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -110,6 +110,19 @@ config IP_NF_TARGET_REJECT
110 110
111 To compile it as a module, choose M here. If unsure, say N. 111 To compile it as a module, choose M here. If unsure, say N.
112 112
113config IP_NF_TARGET_SYNPROXY
114 tristate "SYNPROXY target support"
115 depends on NF_CONNTRACK && NETFILTER_ADVANCED
116 select NETFILTER_SYNPROXY
117 select SYN_COOKIES
118 help
119 The SYNPROXY target allows you to intercept TCP connections and
120 establish them using syncookies before they are passed on to the
121 server. This avoids conntrack and server resource usage
122 during SYN-flood attacks.
123
124 To compile it as a module, choose M here. If unsure, say N.
125
113config IP_NF_TARGET_ULOG 126config IP_NF_TARGET_ULOG
114 tristate "ULOG target support (obsolete)" 127 tristate "ULOG target support (obsolete)"
115 default m if NETFILTER_ADVANCED=n 128 default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 007b128eecc9..3622b248b6dd 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
46obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o 46obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
47obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o 47obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
48obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o 48obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
49obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
49obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o 50obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
50 51
51# generic ARP tables 52# generic ARP tables
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index eadab1ed6500..a865f6f94013 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -48,7 +48,7 @@ static int __net_init arptable_filter_net_init(struct net *net)
48 net->ipv4.arptable_filter = 48 net->ipv4.arptable_filter =
49 arpt_register_table(net, &packet_filter, repl); 49 arpt_register_table(net, &packet_filter, repl);
50 kfree(repl); 50 kfree(repl);
51 return PTR_RET(net->ipv4.arptable_filter); 51 return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter);
52} 52}
53 53
54static void __net_exit arptable_filter_net_exit(struct net *net) 54static void __net_exit arptable_filter_net_exit(struct net *net)
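The PTR_RET() call sites here (and the identical ones in the iptable_* init paths further down) are switched to the clearer name PTR_ERR_OR_ZERO(). For readers outside the tree, a minimal user-space rendition of the helper's semantics, assuming the usual top-4095-addresses errno encoding; the names below are local stand-ins, not kernel API:

#include <stdio.h>

#define MAX_ERRNO	4095

static int is_err_ptr(const void *ptr)
{
	/* error "pointers" live in the top MAX_ERRNO addresses */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static long ptr_err_or_zero(const void *ptr)
{
	/* encoded -errno if the pointer is an error cookie, else 0 */
	return is_err_ptr(ptr) ? (long)ptr : 0;
}

int main(void)
{
	void *table = (void *)(long)-22;	/* as if ERR_PTR(-EINVAL) */

	printf("%ld\n", ptr_err_or_zero(table));	/* -22 */
	printf("%ld\n", ptr_err_or_zero(&table));	/* 0   */
	return 0;
}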
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 30e4de940567..00352ce0f0de 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -118,7 +118,7 @@ static int masq_device_event(struct notifier_block *this,
118 NF_CT_ASSERT(dev->ifindex != 0); 118 NF_CT_ASSERT(dev->ifindex != 0);
119 119
120 nf_ct_iterate_cleanup(net, device_cmp, 120 nf_ct_iterate_cleanup(net, device_cmp,
121 (void *)(long)dev->ifindex); 121 (void *)(long)dev->ifindex, 0, 0);
122 } 122 }
123 123
124 return NOTIFY_DONE; 124 return NOTIFY_DONE;
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 04b18c1ac345..b969131ad1c1 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -119,7 +119,26 @@ static void send_reset(struct sk_buff *oldskb, int hook)
119 119
120 nf_ct_attach(nskb, oldskb); 120 nf_ct_attach(nskb, oldskb);
121 121
122 ip_local_out(nskb); 122#ifdef CONFIG_BRIDGE_NETFILTER
123 /* If we use ip_local_out for bridged traffic, the MAC source on
124 * the RST will be ours, instead of the destination's. This confuses
125 * some routers/firewalls, and they drop the packet. So we need to
126 * build the eth header using the original destination's MAC as the
127 * source, and send the RST packet directly.
128 */
129 if (oldskb->nf_bridge) {
130 struct ethhdr *oeth = eth_hdr(oldskb);
131 nskb->dev = oldskb->nf_bridge->physindev;
132 niph->tot_len = htons(nskb->len);
133 ip_send_check(niph);
134 if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
135 oeth->h_source, oeth->h_dest, nskb->len) < 0)
136 goto free_nskb;
137 dev_queue_xmit(nskb);
138 } else
139#endif
140 ip_local_out(nskb);
141
123 return; 142 return;
124 143
125 free_nskb: 144 free_nskb:
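The comment above is the whole story: for bridged traffic the RST must be crafted with the original destination's MAC as its source and handed straight to dev_queue_xmit(). A user-space sketch of just the address swap, with made-up addresses and a simplified header struct (not the kernel's struct ethhdr):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct eth { uint8_t h_dest[6], h_source[6]; uint16_t h_proto; };

/* answer as the original target: its MAC becomes our source address */
static void build_rst_eth(const struct eth *orig, struct eth *rst)
{
	memcpy(rst->h_source, orig->h_dest, 6);
	memcpy(rst->h_dest, orig->h_source, 6);
	rst->h_proto = orig->h_proto;
}

int main(void)
{
	struct eth in = {
		.h_dest   = { 0x02, 0, 0, 0, 0, 0x01 },	/* bridged target */
		.h_source = { 0x02, 0, 0, 0, 0, 0x02 },	/* offending sender */
		.h_proto  = 8,					/* stand-in value */
	};
	struct eth out;

	build_rst_eth(&in, &out);
	printf("RST %02x:...:%02x -> %02x:...:%02x\n",
	       out.h_source[0], out.h_source[5],
	       out.h_dest[0], out.h_dest[5]);
	return 0;
}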
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
new file mode 100644
index 000000000000..67e17dcda65e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -0,0 +1,476 @@
1/*
2 * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <net/tcp.h>
12
13#include <linux/netfilter_ipv4/ip_tables.h>
14#include <linux/netfilter/x_tables.h>
15#include <linux/netfilter/xt_SYNPROXY.h>
16#include <net/netfilter/nf_conntrack.h>
17#include <net/netfilter/nf_conntrack_seqadj.h>
18#include <net/netfilter/nf_conntrack_synproxy.h>
19
20static struct iphdr *
21synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
22{
23 struct iphdr *iph;
24
25 skb_reset_network_header(skb);
26 iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
27 iph->version = 4;
28 iph->ihl = sizeof(*iph) / 4;
29 iph->tos = 0;
30 iph->id = 0;
31 iph->frag_off = htons(IP_DF);
32 iph->ttl = sysctl_ip_default_ttl;
33 iph->protocol = IPPROTO_TCP;
34 iph->check = 0;
35 iph->saddr = saddr;
36 iph->daddr = daddr;
37
38 return iph;
39}
40
41static void
42synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
43 struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
44 struct iphdr *niph, struct tcphdr *nth,
45 unsigned int tcp_hdr_size)
46{
47 nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
48 nskb->ip_summed = CHECKSUM_PARTIAL;
49 nskb->csum_start = (unsigned char *)nth - nskb->head;
50 nskb->csum_offset = offsetof(struct tcphdr, check);
51
52 skb_dst_set_noref(nskb, skb_dst(skb));
53 nskb->protocol = htons(ETH_P_IP);
54 if (ip_route_me_harder(nskb, RTN_UNSPEC))
55 goto free_nskb;
56
57 if (nfct) {
58 nskb->nfct = nfct;
59 nskb->nfctinfo = ctinfo;
60 nf_conntrack_get(nfct);
61 }
62
63 ip_local_out(nskb);
64 return;
65
66free_nskb:
67 kfree_skb(nskb);
68}
69
70static void
71synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
72 const struct synproxy_options *opts)
73{
74 struct sk_buff *nskb;
75 struct iphdr *iph, *niph;
76 struct tcphdr *nth;
77 unsigned int tcp_hdr_size;
78 u16 mss = opts->mss;
79
80 iph = ip_hdr(skb);
81
82 tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
83 nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
84 GFP_ATOMIC);
85 if (nskb == NULL)
86 return;
87 skb_reserve(nskb, MAX_TCP_HEADER);
88
89 niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
90
91 skb_reset_transport_header(nskb);
92 nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
93 nth->source = th->dest;
94 nth->dest = th->source;
95 nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
96 nth->ack_seq = htonl(ntohl(th->seq) + 1);
97 tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
98 if (opts->options & XT_SYNPROXY_OPT_ECN)
99 tcp_flag_word(nth) |= TCP_FLAG_ECE;
100 nth->doff = tcp_hdr_size / 4;
101 nth->window = 0;
102 nth->check = 0;
103 nth->urg_ptr = 0;
104
105 synproxy_build_options(nth, opts);
106
107 synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
108 niph, nth, tcp_hdr_size);
109}
110
111static void
112synproxy_send_server_syn(const struct synproxy_net *snet,
113 const struct sk_buff *skb, const struct tcphdr *th,
114 const struct synproxy_options *opts, u32 recv_seq)
115{
116 struct sk_buff *nskb;
117 struct iphdr *iph, *niph;
118 struct tcphdr *nth;
119 unsigned int tcp_hdr_size;
120
121 iph = ip_hdr(skb);
122
123 tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
124 nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
125 GFP_ATOMIC);
126 if (nskb == NULL)
127 return;
128 skb_reserve(nskb, MAX_TCP_HEADER);
129
130 niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
131
132 skb_reset_transport_header(nskb);
133 nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
134 nth->source = th->source;
135 nth->dest = th->dest;
136 nth->seq = htonl(recv_seq - 1);
137 /* ack_seq is used to relay our ISN to the synproxy hook to initialize
138 * sequence number translation once a connection tracking entry exists.
139 */
140 nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
141 tcp_flag_word(nth) = TCP_FLAG_SYN;
142 if (opts->options & XT_SYNPROXY_OPT_ECN)
143 tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
144 nth->doff = tcp_hdr_size / 4;
145 nth->window = th->window;
146 nth->check = 0;
147 nth->urg_ptr = 0;
148
149 synproxy_build_options(nth, opts);
150
151 synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
152 niph, nth, tcp_hdr_size);
153}
154
155static void
156synproxy_send_server_ack(const struct synproxy_net *snet,
157 const struct ip_ct_tcp *state,
158 const struct sk_buff *skb, const struct tcphdr *th,
159 const struct synproxy_options *opts)
160{
161 struct sk_buff *nskb;
162 struct iphdr *iph, *niph;
163 struct tcphdr *nth;
164 unsigned int tcp_hdr_size;
165
166 iph = ip_hdr(skb);
167
168 tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
169 nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
170 GFP_ATOMIC);
171 if (nskb == NULL)
172 return;
173 skb_reserve(nskb, MAX_TCP_HEADER);
174
175 niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
176
177 skb_reset_transport_header(nskb);
178 nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
179 nth->source = th->dest;
180 nth->dest = th->source;
181 nth->seq = htonl(ntohl(th->ack_seq));
182 nth->ack_seq = htonl(ntohl(th->seq) + 1);
183 tcp_flag_word(nth) = TCP_FLAG_ACK;
184 nth->doff = tcp_hdr_size / 4;
185 nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
186 nth->check = 0;
187 nth->urg_ptr = 0;
188
189 synproxy_build_options(nth, opts);
190
191 synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
192}
193
194static void
195synproxy_send_client_ack(const struct synproxy_net *snet,
196 const struct sk_buff *skb, const struct tcphdr *th,
197 const struct synproxy_options *opts)
198{
199 struct sk_buff *nskb;
200 struct iphdr *iph, *niph;
201 struct tcphdr *nth;
202 unsigned int tcp_hdr_size;
203
204 iph = ip_hdr(skb);
205
206 tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
207 nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
208 GFP_ATOMIC);
209 if (nskb == NULL)
210 return;
211 skb_reserve(nskb, MAX_TCP_HEADER);
212
213 niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
214
215 skb_reset_transport_header(nskb);
216 nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
217 nth->source = th->source;
218 nth->dest = th->dest;
219 nth->seq = htonl(ntohl(th->seq) + 1);
220 nth->ack_seq = th->ack_seq;
221 tcp_flag_word(nth) = TCP_FLAG_ACK;
222 nth->doff = tcp_hdr_size / 4;
223 nth->window = ntohs(htons(th->window) >> opts->wscale);
224 nth->check = 0;
225 nth->urg_ptr = 0;
226
227 synproxy_build_options(nth, opts);
228
229 synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
230}
231
232static bool
233synproxy_recv_client_ack(const struct synproxy_net *snet,
234 const struct sk_buff *skb, const struct tcphdr *th,
235 struct synproxy_options *opts, u32 recv_seq)
236{
237 int mss;
238
239 mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
240 if (mss == 0) {
241 this_cpu_inc(snet->stats->cookie_invalid);
242 return false;
243 }
244
245 this_cpu_inc(snet->stats->cookie_valid);
246 opts->mss = mss;
247
248 if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
249 synproxy_check_timestamp_cookie(opts);
250
251 synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
252 return true;
253}
254
255static unsigned int
256synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
257{
258 const struct xt_synproxy_info *info = par->targinfo;
259 struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
260 struct synproxy_options opts = {};
261 struct tcphdr *th, _th;
262
263 if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
264 return NF_DROP;
265
266 th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
267 if (th == NULL)
268 return NF_DROP;
269
270 synproxy_parse_options(skb, par->thoff, th, &opts);
271
272 if (th->syn && !(th->ack || th->fin || th->rst)) {
273 /* Initial SYN from client */
274 this_cpu_inc(snet->stats->syn_received);
275
276 if (th->ece && th->cwr)
277 opts.options |= XT_SYNPROXY_OPT_ECN;
278
279 opts.options &= info->options;
280 if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
281 synproxy_init_timestamp_cookie(info, &opts);
282 else
283 opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
284 XT_SYNPROXY_OPT_SACK_PERM |
285 XT_SYNPROXY_OPT_ECN);
286
287 synproxy_send_client_synack(skb, th, &opts);
288 return NF_DROP;
289
290 } else if (th->ack && !(th->fin || th->rst || th->syn)) {
291 /* ACK from client */
292 synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
293 return NF_DROP;
294 }
295
296 return XT_CONTINUE;
297}
298
299static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
300 struct sk_buff *skb,
301 const struct net_device *in,
302 const struct net_device *out,
303 int (*okfn)(struct sk_buff *))
304{
305 struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out));
306 enum ip_conntrack_info ctinfo;
307 struct nf_conn *ct;
308 struct nf_conn_synproxy *synproxy;
309 struct synproxy_options opts = {};
310 const struct ip_ct_tcp *state;
311 struct tcphdr *th, _th;
312 unsigned int thoff;
313
314 ct = nf_ct_get(skb, &ctinfo);
315 if (ct == NULL)
316 return NF_ACCEPT;
317
318 synproxy = nfct_synproxy(ct);
319 if (synproxy == NULL)
320 return NF_ACCEPT;
321
322 if (nf_is_loopback_packet(skb))
323 return NF_ACCEPT;
324
325 thoff = ip_hdrlen(skb);
326 th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
327 if (th == NULL)
328 return NF_DROP;
329
330 state = &ct->proto.tcp;
331 switch (state->state) {
332 case TCP_CONNTRACK_CLOSE:
333 if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
334 nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
335 ntohl(th->seq) + 1);
336 break;
337 }
338
339 if (!th->syn || th->ack ||
340 CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
341 break;
342
343 /* Reopened connection - reset the sequence number and timestamp
344 * adjustments, they will get initialized once the connection is
345 * reestablished.
346 */
347 nf_ct_seqadj_init(ct, ctinfo, 0);
348 synproxy->tsoff = 0;
349 this_cpu_inc(snet->stats->conn_reopened);
350
351 /* fall through */
352 case TCP_CONNTRACK_SYN_SENT:
353 synproxy_parse_options(skb, thoff, th, &opts);
354
355 if (!th->syn && th->ack &&
356 CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
357 /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
358 * therefore we need to add 1 to make the SYN sequence
359 * number match the one of the first SYN.
360 */
361 if (synproxy_recv_client_ack(snet, skb, th, &opts,
362 ntohl(th->seq) + 1))
363 this_cpu_inc(snet->stats->cookie_retrans);
364
365 return NF_DROP;
366 }
367
368 synproxy->isn = ntohl(th->ack_seq);
369 if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
370 synproxy->its = opts.tsecr;
371 break;
372 case TCP_CONNTRACK_SYN_RECV:
373 if (!th->syn || !th->ack)
374 break;
375
376 synproxy_parse_options(skb, thoff, th, &opts);
377 if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
378 synproxy->tsoff = opts.tsval - synproxy->its;
379
380 opts.options &= ~(XT_SYNPROXY_OPT_MSS |
381 XT_SYNPROXY_OPT_WSCALE |
382 XT_SYNPROXY_OPT_SACK_PERM);
383
384 swap(opts.tsval, opts.tsecr);
385 synproxy_send_server_ack(snet, state, skb, th, &opts);
386
387 nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
388
389 swap(opts.tsval, opts.tsecr);
390 synproxy_send_client_ack(snet, skb, th, &opts);
391
392 consume_skb(skb);
393 return NF_STOLEN;
394 default:
395 break;
396 }
397
398 synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
399 return NF_ACCEPT;
400}
401
402static int synproxy_tg4_check(const struct xt_tgchk_param *par)
403{
404 const struct ipt_entry *e = par->entryinfo;
405
406 if (e->ip.proto != IPPROTO_TCP ||
407 e->ip.invflags & XT_INV_PROTO)
408 return -EINVAL;
409
410 return nf_ct_l3proto_try_module_get(par->family);
411}
412
413static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
414{
415 nf_ct_l3proto_module_put(par->family);
416}
417
418static struct xt_target synproxy_tg4_reg __read_mostly = {
419 .name = "SYNPROXY",
420 .family = NFPROTO_IPV4,
421 .target = synproxy_tg4,
422 .targetsize = sizeof(struct xt_synproxy_info),
423 .checkentry = synproxy_tg4_check,
424 .destroy = synproxy_tg4_destroy,
425 .me = THIS_MODULE,
426};
427
428static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
429 {
430 .hook = ipv4_synproxy_hook,
431 .owner = THIS_MODULE,
432 .pf = NFPROTO_IPV4,
433 .hooknum = NF_INET_LOCAL_IN,
434 .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
435 },
436 {
437 .hook = ipv4_synproxy_hook,
438 .owner = THIS_MODULE,
439 .pf = NFPROTO_IPV4,
440 .hooknum = NF_INET_POST_ROUTING,
441 .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
442 },
443};
444
445static int __init synproxy_tg4_init(void)
446{
447 int err;
448
449 err = nf_register_hooks(ipv4_synproxy_ops,
450 ARRAY_SIZE(ipv4_synproxy_ops));
451 if (err < 0)
452 goto err1;
453
454 err = xt_register_target(&synproxy_tg4_reg);
455 if (err < 0)
456 goto err2;
457
458 return 0;
459
460err2:
461 nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
462err1:
463 return err;
464}
465
466static void __exit synproxy_tg4_exit(void)
467{
468 xt_unregister_target(&synproxy_tg4_reg);
469 nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
470}
471
472module_init(synproxy_tg4_init);
473module_exit(synproxy_tg4_exit);
474
475MODULE_LICENSE("GPL");
476MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
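The ISN relayed through ack_seq (see the comment in synproxy_send_server_syn() above) feeds nf_ct_seqadj_init(): once the real server has chosen its own ISN, every server-to-client sequence number is shifted by the difference so the client keeps seeing the cookie-based numbering. A small stand-alone illustration with invented ISNs:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t cookie_isn = 0x9ab41000;	/* proxy's SYN-ACK seq to client */
	uint32_t server_isn = 0x1c0de400;	/* real server's SYN-ACK seq */
	uint32_t seqoff = cookie_isn - server_isn; /* seqadj offset */
	uint32_t server_seq = server_isn + 1;	/* server's first data byte */

	/* all arithmetic is mod 2^32, matching TCP sequence space */
	printf("client sees seq %#x\n", (unsigned)(server_seq + seqoff));
	return 0;
}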
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 6b3da5cf54e9..50af5b45c050 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -69,7 +69,7 @@ static int __net_init iptable_filter_net_init(struct net *net)
69 net->ipv4.iptable_filter = 69 net->ipv4.iptable_filter =
70 ipt_register_table(net, &packet_filter, repl); 70 ipt_register_table(net, &packet_filter, repl);
71 kfree(repl); 71 kfree(repl);
72 return PTR_RET(net->ipv4.iptable_filter); 72 return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
73} 73}
74 74
75static void __net_exit iptable_filter_net_exit(struct net *net) 75static void __net_exit iptable_filter_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index cba5658ec82c..0d8cd82e0fad 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -107,7 +107,7 @@ static int __net_init iptable_mangle_net_init(struct net *net)
107 net->ipv4.iptable_mangle = 107 net->ipv4.iptable_mangle =
108 ipt_register_table(net, &packet_mangler, repl); 108 ipt_register_table(net, &packet_mangler, repl);
109 kfree(repl); 109 kfree(repl);
110 return PTR_RET(net->ipv4.iptable_mangle); 110 return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle);
111} 111}
112 112
113static void __net_exit iptable_mangle_net_exit(struct net *net) 113static void __net_exit iptable_mangle_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 6383273d54e1..683bfaffed65 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -292,7 +292,7 @@ static int __net_init iptable_nat_net_init(struct net *net)
292 return -ENOMEM; 292 return -ENOMEM;
293 net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl); 293 net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
294 kfree(repl); 294 kfree(repl);
295 return PTR_RET(net->ipv4.nat_table); 295 return PTR_ERR_OR_ZERO(net->ipv4.nat_table);
296} 296}
297 297
298static void __net_exit iptable_nat_net_exit(struct net *net) 298static void __net_exit iptable_nat_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 03d9696d3c6e..1f82aea11df6 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -48,7 +48,7 @@ static int __net_init iptable_raw_net_init(struct net *net)
48 net->ipv4.iptable_raw = 48 net->ipv4.iptable_raw =
49 ipt_register_table(net, &packet_raw, repl); 49 ipt_register_table(net, &packet_raw, repl);
50 kfree(repl); 50 kfree(repl);
51 return PTR_RET(net->ipv4.iptable_raw); 51 return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw);
52} 52}
53 53
54static void __net_exit iptable_raw_net_exit(struct net *net) 54static void __net_exit iptable_raw_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index b283d8e2601a..f867a8d38bf7 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -66,7 +66,7 @@ static int __net_init iptable_security_net_init(struct net *net)
66 net->ipv4.iptable_security = 66 net->ipv4.iptable_security =
67 ipt_register_table(net, &security_table, repl); 67 ipt_register_table(net, &security_table, repl);
68 kfree(repl); 68 kfree(repl);
69 return PTR_RET(net->ipv4.iptable_security); 69 return PTR_ERR_OR_ZERO(net->ipv4.iptable_security);
70} 70}
71 71
72static void __net_exit iptable_security_net_exit(struct net *net) 72static void __net_exit iptable_security_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 0a2e0e3e95ba..86f5b34a4ed1 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -25,6 +25,7 @@
25#include <net/netfilter/nf_conntrack_l3proto.h> 25#include <net/netfilter/nf_conntrack_l3proto.h>
26#include <net/netfilter/nf_conntrack_zones.h> 26#include <net/netfilter/nf_conntrack_zones.h>
27#include <net/netfilter/nf_conntrack_core.h> 27#include <net/netfilter/nf_conntrack_core.h>
28#include <net/netfilter/nf_conntrack_seqadj.h>
28#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> 29#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
29#include <net/netfilter/nf_nat_helper.h> 30#include <net/netfilter/nf_nat_helper.h>
30#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 31#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
@@ -136,11 +137,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
136 /* adjust seqs for loopback traffic only in outgoing direction */ 137 /* adjust seqs for loopback traffic only in outgoing direction */
137 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 138 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
138 !nf_is_loopback_packet(skb)) { 139 !nf_is_loopback_packet(skb)) {
139 typeof(nf_nat_seq_adjust_hook) seq_adjust; 140 if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
140
141 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
142 if (!seq_adjust ||
143 !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
144 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 141 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
145 return NF_DROP; 142 return NF_DROP;
146 } 143 }
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 746427c9e719..d7d9882d4cae 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1082,7 +1082,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
1082 __u16 srcp = ntohs(inet->inet_sport); 1082 __u16 srcp = ntohs(inet->inet_sport);
1083 1083
1084 seq_printf(f, "%5d: %08X:%04X %08X:%04X" 1084 seq_printf(f, "%5d: %08X:%04X %08X:%04X"
1085 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", 1085 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
1086 bucket, src, srcp, dest, destp, sp->sk_state, 1086 bucket, src, srcp, dest, destp, sp->sk_state,
1087 sk_wmem_alloc_get(sp), 1087 sk_wmem_alloc_get(sp),
1088 sk_rmem_alloc_get(sp), 1088 sk_rmem_alloc_get(sp),
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 6577a1149a47..4a0335854b89 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -111,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
111 SNMP_MIB_SENTINEL 111 SNMP_MIB_SENTINEL
112}; 112};
113 113
114/* Following RFC4293 items are displayed in /proc/net/netstat */ 114/* Following items are displayed in /proc/net/netstat */
115static const struct snmp_mib snmp4_ipextstats_list[] = { 115static const struct snmp_mib snmp4_ipextstats_list[] = {
116 SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), 116 SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
117 SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), 117 SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
@@ -125,7 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
125 SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), 125 SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
126 SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), 126 SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
127 SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), 127 SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
128 /* Non RFC4293 fields */
128 SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), 129 SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
130 SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
131 SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
132 SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
133 SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
129 SNMP_MIB_SENTINEL 134 SNMP_MIB_SENTINEL
130}; 135};
131 136
@@ -273,7 +278,7 @@ static const struct snmp_mib snmp4_net_list[] = {
273 SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), 278 SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
274 SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), 279 SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
275 SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), 280 SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
276 SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS), 281 SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
277 SNMP_MIB_SENTINEL 282 SNMP_MIB_SENTINEL
278}; 283};
279 284
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index dd44e0ab600c..a86c7ae71881 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -571,7 +571,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
571 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, 571 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
572 RT_SCOPE_UNIVERSE, 572 RT_SCOPE_UNIVERSE,
573 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, 573 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
574 inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP, 574 inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP |
575 (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
575 daddr, saddr, 0, 0); 576 daddr, saddr, 0, 0);
576 577
577 if (!inet->hdrincl) { 578 if (!inet->hdrincl) {
@@ -987,7 +988,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
987 srcp = inet->inet_num; 988 srcp = inet->inet_num;
988 989
989 seq_printf(seq, "%4d: %08X:%04X %08X:%04X" 990 seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
990 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", 991 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
991 i, src, srcp, dest, destp, sp->sk_state, 992 i, src, srcp, dest, destp, sp->sk_state,
992 sk_wmem_alloc_get(sp), 993 sk_wmem_alloc_get(sp),
993 sk_rmem_alloc_get(sp), 994 sk_rmem_alloc_get(sp),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a9a54a236832..727f4365bcdf 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,7 +112,8 @@
112#define RT_FL_TOS(oldflp4) \ 112#define RT_FL_TOS(oldflp4) \
113 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) 113 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114 114
115#define IP_MAX_MTU 0xFFF0 115/* IPv4 datagram length is stored into 16bit field (tot_len) */
116#define IP_MAX_MTU 0xFFFF
116 117
117#define RT_GC_TIMEOUT (300*HZ) 118#define RT_GC_TIMEOUT (300*HZ)
118 119
@@ -435,12 +436,12 @@ static inline int ip_rt_proc_init(void)
435 436
436static inline bool rt_is_expired(const struct rtable *rth) 437static inline bool rt_is_expired(const struct rtable *rth)
437{ 438{
438 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); 439 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
439} 440}
440 441
441void rt_cache_flush(struct net *net) 442void rt_cache_flush(struct net *net)
442{ 443{
443 rt_genid_bump(net); 444 rt_genid_bump_ipv4(net);
444} 445}
445 446
446static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, 447static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -1227,10 +1228,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1227 mtu = 576; 1228 mtu = 576;
1228 } 1229 }
1229 1230
1230 if (mtu > IP_MAX_MTU) 1231 return min_t(unsigned int, mtu, IP_MAX_MTU);
1231 mtu = IP_MAX_MTU;
1232
1233 return mtu;
1234} 1232}
1235 1233
1236static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) 1234static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
@@ -1458,7 +1456,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1458#endif 1456#endif
1459 rth->dst.output = ip_rt_bug; 1457 rth->dst.output = ip_rt_bug;
1460 1458
1461 rth->rt_genid = rt_genid(dev_net(dev)); 1459 rth->rt_genid = rt_genid_ipv4(dev_net(dev));
1462 rth->rt_flags = RTCF_MULTICAST; 1460 rth->rt_flags = RTCF_MULTICAST;
1463 rth->rt_type = RTN_MULTICAST; 1461 rth->rt_type = RTN_MULTICAST;
1464 rth->rt_is_input= 1; 1462 rth->rt_is_input= 1;
@@ -1589,7 +1587,7 @@ static int __mkroute_input(struct sk_buff *skb,
1589 goto cleanup; 1587 goto cleanup;
1590 } 1588 }
1591 1589
1592 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 1590 rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1593 rth->rt_flags = flags; 1591 rth->rt_flags = flags;
1594 rth->rt_type = res->type; 1592 rth->rt_type = res->type;
1595 rth->rt_is_input = 1; 1593 rth->rt_is_input = 1;
@@ -1760,7 +1758,7 @@ local_input:
1760 rth->dst.tclassid = itag; 1758 rth->dst.tclassid = itag;
1761#endif 1759#endif
1762 1760
1763 rth->rt_genid = rt_genid(net); 1761 rth->rt_genid = rt_genid_ipv4(net);
1764 rth->rt_flags = flags|RTCF_LOCAL; 1762 rth->rt_flags = flags|RTCF_LOCAL;
1765 rth->rt_type = res.type; 1763 rth->rt_type = res.type;
1766 rth->rt_is_input = 1; 1764 rth->rt_is_input = 1;
@@ -1945,7 +1943,7 @@ add:
1945 1943
1946 rth->dst.output = ip_output; 1944 rth->dst.output = ip_output;
1947 1945
1948 rth->rt_genid = rt_genid(dev_net(dev_out)); 1946 rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1949 rth->rt_flags = flags; 1947 rth->rt_flags = flags;
1950 rth->rt_type = type; 1948 rth->rt_type = type;
1951 rth->rt_is_input = 0; 1949 rth->rt_is_input = 0;
@@ -2227,7 +2225,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
2227 rt->rt_iif = ort->rt_iif; 2225 rt->rt_iif = ort->rt_iif;
2228 rt->rt_pmtu = ort->rt_pmtu; 2226 rt->rt_pmtu = ort->rt_pmtu;
2229 2227
2230 rt->rt_genid = rt_genid(net); 2228 rt->rt_genid = rt_genid_ipv4(net);
2231 rt->rt_flags = ort->rt_flags; 2229 rt->rt_flags = ort->rt_flags;
2232 rt->rt_type = ort->rt_type; 2230 rt->rt_type = ort->rt_type;
2233 rt->rt_gateway = ort->rt_gateway; 2231 rt->rt_gateway = ort->rt_gateway;
@@ -2665,7 +2663,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
2665 2663
2666static __net_init int rt_genid_init(struct net *net) 2664static __net_init int rt_genid_init(struct net *net)
2667{ 2665{
2668 atomic_set(&net->rt_genid, 0); 2666 atomic_set(&net->ipv4.rt_genid, 0);
2669 atomic_set(&net->fnhe_genid, 0); 2667 atomic_set(&net->fnhe_genid, 0);
2670 get_random_bytes(&net->ipv4.dev_addr_genid, 2668 get_random_bytes(&net->ipv4.dev_addr_genid,
2671 sizeof(net->ipv4.dev_addr_genid)); 2669 sizeof(net->ipv4.dev_addr_genid));
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b05c96e7af8b..14a15c49129d 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -160,26 +160,33 @@ static __u16 const msstab[] = {
160 * Generate a syncookie. mssp points to the mss, which is returned 160 * Generate a syncookie. mssp points to the mss, which is returned
161 * rounded down to the value encoded in the cookie. 161 * rounded down to the value encoded in the cookie.
162 */ 162 */
163__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) 163u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
164 u16 *mssp)
164{ 165{
165 const struct iphdr *iph = ip_hdr(skb);
166 const struct tcphdr *th = tcp_hdr(skb);
167 int mssind; 166 int mssind;
168 const __u16 mss = *mssp; 167 const __u16 mss = *mssp;
169 168
170 tcp_synq_overflow(sk);
171
172 for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) 169 for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
173 if (mss >= msstab[mssind]) 170 if (mss >= msstab[mssind])
174 break; 171 break;
175 *mssp = msstab[mssind]; 172 *mssp = msstab[mssind];
176 173
177 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
178
179 return secure_tcp_syn_cookie(iph->saddr, iph->daddr, 174 return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
180 th->source, th->dest, ntohl(th->seq), 175 th->source, th->dest, ntohl(th->seq),
181 jiffies / (HZ * 60), mssind); 176 jiffies / (HZ * 60), mssind);
182} 177}
178EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
179
180__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
181{
182 const struct iphdr *iph = ip_hdr(skb);
183 const struct tcphdr *th = tcp_hdr(skb);
184
185 tcp_synq_overflow(sk);
186 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
187
188 return __cookie_v4_init_sequence(iph, th, mssp);
189}
183 190
184/* 191/*
185 * This (misnamed) value is the age of syncookie which is permitted. 192 * This (misnamed) value is the age of syncookie which is permitted.
@@ -192,10 +199,9 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
192 * Check if an ack sequence number is a valid syncookie. 199 * Check if an ack sequence number is a valid syncookie.
193 * Return the decoded mss if it is, or 0 if not. 200 * Return the decoded mss if it is, or 0 if not.
194 */ 201 */
195static inline int cookie_check(struct sk_buff *skb, __u32 cookie) 202int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
203 u32 cookie)
196{ 204{
197 const struct iphdr *iph = ip_hdr(skb);
198 const struct tcphdr *th = tcp_hdr(skb);
199 __u32 seq = ntohl(th->seq) - 1; 205 __u32 seq = ntohl(th->seq) - 1;
200 __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, 206 __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
201 th->source, th->dest, seq, 207 th->source, th->dest, seq,
@@ -204,6 +210,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
204 210
205 return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; 211 return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
206} 212}
213EXPORT_SYMBOL_GPL(__cookie_v4_check);
207 214
208static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, 215static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
209 struct request_sock *req, 216 struct request_sock *req,
@@ -284,7 +291,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
284 goto out; 291 goto out;
285 292
286 if (tcp_synq_no_recent_overflow(sk) || 293 if (tcp_synq_no_recent_overflow(sk) ||
287 (mss = cookie_check(skb, cookie)) == 0) { 294 (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) {
288 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); 295 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
289 goto out; 296 goto out;
290 } 297 }
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 610e324348d1..540279f4c531 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
29static int zero; 29static int zero;
30static int one = 1; 30static int one = 1;
31static int four = 4; 31static int four = 4;
32static int gso_max_segs = GSO_MAX_SEGS;
32static int tcp_retr1_max = 255; 33static int tcp_retr1_max = 255;
33static int ip_local_port_range_min[] = { 1, 1 }; 34static int ip_local_port_range_min[] = { 1, 1 };
34static int ip_local_port_range_max[] = { 65535, 65535 }; 35static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -559,6 +560,13 @@ static struct ctl_table ipv4_table[] = {
559 .extra1 = &one, 560 .extra1 = &one,
560 }, 561 },
561 { 562 {
563 .procname = "tcp_notsent_lowat",
564 .data = &sysctl_tcp_notsent_lowat,
565 .maxlen = sizeof(sysctl_tcp_notsent_lowat),
566 .mode = 0644,
567 .proc_handler = proc_dointvec,
568 },
569 {
562 .procname = "tcp_rmem", 570 .procname = "tcp_rmem",
563 .data = &sysctl_tcp_rmem, 571 .data = &sysctl_tcp_rmem,
564 .maxlen = sizeof(sysctl_tcp_rmem), 572 .maxlen = sizeof(sysctl_tcp_rmem),
@@ -754,6 +762,15 @@ static struct ctl_table ipv4_table[] = {
754 .extra2 = &four, 762 .extra2 = &four,
755 }, 763 },
756 { 764 {
765 .procname = "tcp_min_tso_segs",
766 .data = &sysctl_tcp_min_tso_segs,
767 .maxlen = sizeof(int),
768 .mode = 0644,
769 .proc_handler = proc_dointvec_minmax,
770 .extra1 = &zero,
771 .extra2 = &gso_max_segs,
772 },
773 {
757 .procname = "udp_mem", 774 .procname = "udp_mem",
758 .data = &sysctl_udp_mem, 775 .data = &sysctl_udp_mem,
759 .maxlen = sizeof(sysctl_udp_mem), 776 .maxlen = sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5423223e93c2..6e5617b9f9db 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,6 +283,8 @@
283 283
284int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; 284int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
285 285
286int sysctl_tcp_min_tso_segs __read_mostly = 2;
287
286struct percpu_counter tcp_orphan_count; 288struct percpu_counter tcp_orphan_count;
287EXPORT_SYMBOL_GPL(tcp_orphan_count); 289EXPORT_SYMBOL_GPL(tcp_orphan_count);
288 290
@@ -410,10 +412,6 @@ void tcp_init_sock(struct sock *sk)
410 412
411 icsk->icsk_sync_mss = tcp_sync_mss; 413 icsk->icsk_sync_mss = tcp_sync_mss;
412 414
413 /* Presumed zeroed, in order of appearance:
414 * cookie_in_always, cookie_out_never,
415 * s_data_constant, s_data_in, s_data_out
416 */
417 sk->sk_sndbuf = sysctl_tcp_wmem[1]; 415 sk->sk_sndbuf = sysctl_tcp_wmem[1];
418 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 416 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
419 417
@@ -499,7 +497,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
499 mask |= POLLIN | POLLRDNORM; 497 mask |= POLLIN | POLLRDNORM;
500 498
501 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { 499 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
502 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { 500 if (sk_stream_is_writeable(sk)) {
503 mask |= POLLOUT | POLLWRNORM; 501 mask |= POLLOUT | POLLWRNORM;
504 } else { /* send SIGIO later */ 502 } else { /* send SIGIO later */
505 set_bit(SOCK_ASYNC_NOSPACE, 503 set_bit(SOCK_ASYNC_NOSPACE,
@@ -510,7 +508,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
510 * wspace test but before the flags are set, 508 * wspace test but before the flags are set,
511 * IO signal will be lost. 509 * IO signal will be lost.
512 */ 510 */
513 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) 511 if (sk_stream_is_writeable(sk))
514 mask |= POLLOUT | POLLWRNORM; 512 mask |= POLLOUT | POLLWRNORM;
515 } 513 }
516 } else 514 } else
@@ -789,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
789 xmit_size_goal = mss_now; 787 xmit_size_goal = mss_now;
790 788
791 if (large_allowed && sk_can_gso(sk)) { 789 if (large_allowed && sk_can_gso(sk)) {
792 xmit_size_goal = ((sk->sk_gso_max_size - 1) - 790 u32 gso_size, hlen;
793 inet_csk(sk)->icsk_af_ops->net_header_len - 791
794 inet_csk(sk)->icsk_ext_hdr_len - 792 /* Maybe we should/could use sk->sk_prot->max_header here? */
795 tp->tcp_header_len); 793 hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
794 inet_csk(sk)->icsk_ext_hdr_len +
795 tp->tcp_header_len;
796
797 /* Goal is to send at least one packet per ms,
798 * not one big TSO packet every 100 ms.
799 * This preserves ACK clocking and is consistent
800 * with the tcp_tso_should_defer() heuristic.
801 */
802 gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
803 gso_size = max_t(u32, gso_size,
804 sysctl_tcp_min_tso_segs * mss_now);
805
806 xmit_size_goal = min_t(u32, gso_size,
807 sk->sk_gso_max_size - 1 - hlen);
796 808
797 /* TSQ: try to have two TSO segments in flight 809 /* TSQ: try to have at least two segments in flight
810 * (one in NIC TX ring, another in Qdisc)
811 */
798 xmit_size_goal = min_t(u32, xmit_size_goal, 812 xmit_size_goal = min_t(u32, xmit_size_goal,
799 sysctl_tcp_limit_output_bytes >> 1); 813 sysctl_tcp_limit_output_bytes >> 1);
800 814
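The new size-goal arithmetic is worth a worked example: with the 2 * MSEC_PER_SEC divisor, the goal is roughly the number of bytes the pacing rate covers in half a millisecond, floored at tcp_min_tso_segs full-sized segments. A user-space restatement; the sample numbers are invented:

#include <stdio.h>

#define MSEC_PER_SEC 1000UL

int main(void)
{
	unsigned long pacing_rate = 12500000;	/* 12.5 MB/s, i.e. 100 Mbit/s */
	unsigned int mss_now = 1448;
	unsigned int min_tso_segs = 2;		/* sysctl_tcp_min_tso_segs */
	unsigned long gso_size;

	gso_size = pacing_rate / (2 * MSEC_PER_SEC);	/* bytes per 0.5 ms */
	if (gso_size < (unsigned long)min_tso_segs * mss_now)
		gso_size = (unsigned long)min_tso_segs * mss_now;

	printf("size goal ~%lu bytes (~%lu segments)\n",
	       gso_size, gso_size / mss_now);	/* ~6250 bytes, ~4 segments */
	return 0;
}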
@@ -1121,6 +1135,13 @@ new_segment:
1121 goto wait_for_memory; 1135 goto wait_for_memory;
1122 1136
1123 /* 1137 /*
1138 * All packets are restored as if they have
1139 * already been sent.
1140 */
1141 if (tp->repair)
1142 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1143
1144 /*
1124 * Check whether we can use HW checksum. 1145 * Check whether we can use HW checksum.
1125 */ 1146 */
1126 if (sk->sk_route_caps & NETIF_F_ALL_CSUM) 1147 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
@@ -2447,10 +2468,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2447 case TCP_THIN_DUPACK: 2468 case TCP_THIN_DUPACK:
2448 if (val < 0 || val > 1) 2469 if (val < 0 || val > 1)
2449 err = -EINVAL; 2470 err = -EINVAL;
2450 else 2471 else {
2451 tp->thin_dupack = val; 2472 tp->thin_dupack = val;
2452 if (tp->thin_dupack) 2473 if (tp->thin_dupack)
2453 tcp_disable_early_retrans(tp); 2474 tcp_disable_early_retrans(tp);
2475 }
2454 break; 2476 break;
2455 2477
2456 case TCP_REPAIR: 2478 case TCP_REPAIR:
@@ -2631,6 +2653,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2631 else 2653 else
2632 tp->tsoffset = val - tcp_time_stamp; 2654 tp->tsoffset = val - tcp_time_stamp;
2633 break; 2655 break;
2656 case TCP_NOTSENT_LOWAT:
2657 tp->notsent_lowat = val;
2658 sk->sk_write_space(sk);
2659 break;
2634 default: 2660 default:
2635 err = -ENOPROTOOPT; 2661 err = -ENOPROTOOPT;
2636 break; 2662 break;
@@ -2847,6 +2873,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2847 case TCP_TIMESTAMP: 2873 case TCP_TIMESTAMP:
2848 val = tcp_time_stamp + tp->tsoffset; 2874 val = tcp_time_stamp + tp->tsoffset;
2849 break; 2875 break;
2876 case TCP_NOTSENT_LOWAT:
2877 val = tp->notsent_lowat;
2878 break;
2850 default: 2879 default:
2851 return -ENOPROTOOPT; 2880 return -ENOPROTOOPT;
2852 } 2881 }
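TCP_NOTSENT_LOWAT (together with the tcp_notsent_lowat sysctl added earlier in this series) bounds how much not-yet-sent data a socket may buffer before it stops reporting itself writeable. A minimal usage sketch; the 128 KB value is arbitrary, and the fallback define mirrors the kernel uapi constant in case older libc headers lack it:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCP_NOTSENT_LOWAT
#define TCP_NOTSENT_LOWAT 25	/* value from the kernel uapi */
#endif

/* Cap unsent data buffered in the write queue at ~128 KB; beyond
 * that, poll() stops signalling POLLOUT until the queue drains. */
int set_notsent_lowat(int fd)
{
	int lowat = 128 * 1024;

	return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
			  &lowat, sizeof(lowat));
}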
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index a9077f441cb2..b6ae92a51f58 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -206,8 +206,8 @@ static u32 cubic_root(u64 a)
206 */ 206 */
207static inline void bictcp_update(struct bictcp *ca, u32 cwnd) 207static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
208{ 208{
209 u64 offs; 209 u32 delta, bic_target, max_cnt;
210 u32 delta, t, bic_target, max_cnt; 210 u64 offs, t;
211 211
212 ca->ack_cnt++; /* count the number of ACKs */ 212 ca->ack_cnt++; /* count the number of ACKs */
213 213
@@ -250,9 +250,11 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
250 * if the cwnd < 1 million packets !!! 250 * if the cwnd < 1 million packets !!!
251 */ 251 */
252 252
253 t = (s32)(tcp_time_stamp - ca->epoch_start);
254 t += msecs_to_jiffies(ca->delay_min >> 3);
253 /* change the unit from HZ to bictcp_HZ */ 255 /* change the unit from HZ to bictcp_HZ */
254 t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) 256 t <<= BICTCP_HZ;
255 - ca->epoch_start) << BICTCP_HZ) / HZ; 257 do_div(t, HZ);
256 258
257 if (t < ca->bic_K) /* t - K */ 259 if (t < ca->bic_K) /* t - K */
258 offs = ca->bic_K - t; 260 offs = ca->bic_K - t;
@@ -414,7 +416,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
414 return; 416 return;
415 417
416 /* Discard delay samples right after fast recovery */ 418 /* Discard delay samples right after fast recovery */
417 if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) 419 if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
418 return; 420 return;
419 421
420 delay = (rtt_us << 3) / USEC_PER_MSEC; 422 delay = (rtt_us << 3) / USEC_PER_MSEC;
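Two fixes land in tcp_cubic.c: the epoch_start check keeps a zeroed epoch from discarding valid delay samples, and the cubic time t is widened to 64 bits before the HZ-to-BICTCP_HZ conversion, since (t << 10) overflows 32 bits after roughly 70 minutes of jiffies at HZ=1000. A stand-alone rendition of the conversion with an assumed epoch age:

#include <stdio.h>

#define HZ		1000
#define BICTCP_HZ	10	/* BIC time unit: 2^10 = 1024 Hz */

int main(void)
{
	unsigned long long t = 5000000;	/* ~83 min in jiffies at HZ=1000 */

	t <<= BICTCP_HZ;	/* 5.12e9: already past 2^32, hence u64 */
	t /= HZ;		/* stands in for do_div(t, HZ) */

	printf("t = %llu (1/1024 s units)\n", t);
	return 0;
}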
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 8f7ef0ad80e5..ab7bd35bb312 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -58,23 +58,22 @@ error: kfree(ctx);
58 return err; 58 return err;
59} 59}
60 60
61/* Computes the fastopen cookie for the peer. 61/* Computes the fastopen cookie for the IP path.
62 * The peer address is a 128 bits long (pad with zeros for IPv4). 62 * The path is 128 bits long (padded with zeros for IPv4).
63 * 63 *
64 * The caller must check foc->len to determine if a valid cookie 64 * The caller must check foc->len to determine if a valid cookie
65 * has been generated successfully. 65 * has been generated successfully.
66*/ 66*/
67void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc) 67void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
68 struct tcp_fastopen_cookie *foc)
68{ 69{
69 __be32 peer_addr[4] = { addr, 0, 0, 0 }; 70 __be32 path[4] = { src, dst, 0, 0 };
70 struct tcp_fastopen_context *ctx; 71 struct tcp_fastopen_context *ctx;
71 72
72 rcu_read_lock(); 73 rcu_read_lock();
73 ctx = rcu_dereference(tcp_fastopen_ctx); 74 ctx = rcu_dereference(tcp_fastopen_ctx);
74 if (ctx) { 75 if (ctx) {
75 crypto_cipher_encrypt_one(ctx->tfm, 76 crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path);
76 foc->val,
77 (__u8 *)peer_addr);
78 foc->len = TCP_FASTOPEN_COOKIE_SIZE; 77 foc->len = TCP_FASTOPEN_COOKIE_SIZE;
79 } 78 }
80 rcu_read_unlock(); 79 rcu_read_unlock();
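The Fast Open cookie is now bound to the full IP path rather than only the peer: the block handed to the cipher (AES in practice) becomes { src, dst, 0, 0 }. A sketch of just the block layout, using documentation addresses:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t path[4];

	path[0] = inet_addr("192.0.2.1");	/* source */
	path[1] = inet_addr("198.51.100.2");	/* destination (new) */
	path[2] = path[3] = 0;			/* IPv4: upper 64 bits padded */

	printf("cipher input: %08x %08x %08x %08x\n",
	       (unsigned)path[0], (unsigned)path[1],
	       (unsigned)path[2], (unsigned)path[3]);
	return 0;
}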
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 28af45abe062..1969e16d936d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
688 } 688 }
689} 689}
690 690
691/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
692 * Note: TCP stack does not yet implement pacing.
693 * FQ packet scheduler can be used to implement cheap but effective
694 * TCP pacing, to smooth the burst on large writes when packets
695 * in flight is significantly lower than cwnd (or rwin)
696 */
697static void tcp_update_pacing_rate(struct sock *sk)
698{
699 const struct tcp_sock *tp = tcp_sk(sk);
700 u64 rate;
701
702 /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
703 rate = (u64)tp->mss_cache * 2 * (HZ << 3);
704
705 rate *= max(tp->snd_cwnd, tp->packets_out);
706
707 /* Correction for small srtt: minimum srtt being 8 (1 jiffy << 3),
708 * be conservative and assume srtt = 1 (125 us instead of 1.25 ms).
709 * We probably need usec resolution in the future.
710 * Note: this also takes care of the possible srtt=0 case,
711 * when tcp_rtt_estimator() has not yet been called.
712 */
713 if (tp->srtt > 8 + 2)
714 do_div(rate, tp->srtt);
715
716 sk->sk_pacing_rate = min_t(u64, rate, ~0U);
717}
718
691/* Calculate rto without backoff. This is the second half of Van Jacobson's 719/* Calculate rto without backoff. This is the second half of Van Jacobson's
692 * routine referred to above. 720 * routine referred to above.
693 */ 721 */
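Plugging numbers into the pacing formula helps: rate = 2 * mss * max(cwnd, packets_out) / srtt, with srtt kept in jiffies << 3 as in the kernel. The sketch below assumes HZ=1000 and invented socket values:

#include <stdio.h>

#define HZ 1000

int main(void)
{
	unsigned long long rate;
	unsigned int mss_cache = 1448;
	unsigned int snd_cwnd = 10;
	unsigned int srtt = (HZ / 10) << 3;	/* 100 ms in srtt units */

	rate = (unsigned long long)mss_cache * 2 * (HZ << 3);
	rate *= snd_cwnd;			/* max(cwnd, packets_out) */
	if (srtt > 8 + 2)
		rate /= srtt;			/* do_div() in the kernel */

	printf("pacing rate ~%llu bytes/sec\n", rate);	/* ~289600 */
	return 0;
}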
@@ -1048,6 +1076,7 @@ struct tcp_sacktag_state {
1048 int reord; 1076 int reord;
1049 int fack_count; 1077 int fack_count;
1050 int flag; 1078 int flag;
1079 s32 rtt; /* RTT measured by SACKing never-retransmitted data */
1051}; 1080};
1052 1081
1053/* Check if skb is fully within the SACK block. In presence of GSO skbs, 1082/* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1108,7 +1137,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1108static u8 tcp_sacktag_one(struct sock *sk, 1137static u8 tcp_sacktag_one(struct sock *sk,
1109 struct tcp_sacktag_state *state, u8 sacked, 1138 struct tcp_sacktag_state *state, u8 sacked,
1110 u32 start_seq, u32 end_seq, 1139 u32 start_seq, u32 end_seq,
1111 bool dup_sack, int pcount) 1140 int dup_sack, int pcount, u32 xmit_time)
1112{ 1141{
1113 struct tcp_sock *tp = tcp_sk(sk); 1142 struct tcp_sock *tp = tcp_sk(sk);
1114 int fack_count = state->fack_count; 1143 int fack_count = state->fack_count;
@@ -1148,6 +1177,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
1148 state->reord); 1177 state->reord);
1149 if (!after(end_seq, tp->high_seq)) 1178 if (!after(end_seq, tp->high_seq))
1150 state->flag |= FLAG_ORIG_SACK_ACKED; 1179 state->flag |= FLAG_ORIG_SACK_ACKED;
1180 /* Pick the earliest sequence sacked for RTT */
1181 if (state->rtt < 0)
1182 state->rtt = tcp_time_stamp - xmit_time;
1151 } 1183 }
1152 1184
1153 if (sacked & TCPCB_LOST) { 1185 if (sacked & TCPCB_LOST) {
@@ -1205,7 +1237,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1205 * tcp_highest_sack_seq() when skb is highest_sack. 1237 * tcp_highest_sack_seq() when skb is highest_sack.
1206 */ 1238 */
1207 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, 1239 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1208 start_seq, end_seq, dup_sack, pcount); 1240 start_seq, end_seq, dup_sack, pcount,
1241 TCP_SKB_CB(skb)->when);
1209 1242
1210 if (skb == tp->lost_skb_hint) 1243 if (skb == tp->lost_skb_hint)
1211 tp->lost_cnt_hint += pcount; 1244 tp->lost_cnt_hint += pcount;
@@ -1479,7 +1512,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1479 TCP_SKB_CB(skb)->seq, 1512 TCP_SKB_CB(skb)->seq,
1480 TCP_SKB_CB(skb)->end_seq, 1513 TCP_SKB_CB(skb)->end_seq,
1481 dup_sack, 1514 dup_sack,
1482 tcp_skb_pcount(skb)); 1515 tcp_skb_pcount(skb),
1516 TCP_SKB_CB(skb)->when);
1483 1517
1484 if (!before(TCP_SKB_CB(skb)->seq, 1518 if (!before(TCP_SKB_CB(skb)->seq,
1485 tcp_highest_sack_seq(tp))) 1519 tcp_highest_sack_seq(tp)))
@@ -1536,7 +1570,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
1536 1570
1537static int 1571static int
1538tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, 1572tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1539 u32 prior_snd_una) 1573 u32 prior_snd_una, s32 *sack_rtt)
1540{ 1574{
1541 struct tcp_sock *tp = tcp_sk(sk); 1575 struct tcp_sock *tp = tcp_sk(sk);
1542 const unsigned char *ptr = (skb_transport_header(ack_skb) + 1576 const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1554,6 +1588,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1554 1588
1555 state.flag = 0; 1589 state.flag = 0;
1556 state.reord = tp->packets_out; 1590 state.reord = tp->packets_out;
1591 state.rtt = -1;
1557 1592
1558 if (!tp->sacked_out) { 1593 if (!tp->sacked_out) {
1559 if (WARN_ON(tp->fackets_out)) 1594 if (WARN_ON(tp->fackets_out))
@@ -1737,6 +1772,7 @@ out:
1737 WARN_ON((int)tp->retrans_out < 0); 1772 WARN_ON((int)tp->retrans_out < 0);
1738 WARN_ON((int)tcp_packets_in_flight(tp) < 0); 1773 WARN_ON((int)tcp_packets_in_flight(tp) < 0);
1739#endif 1774#endif
1775 *sack_rtt = state.rtt;
1740 return state.flag; 1776 return state.flag;
1741} 1777}
1742 1778
@@ -1869,8 +1905,13 @@ void tcp_enter_loss(struct sock *sk, int how)
1869 } 1905 }
1870 tcp_verify_left_out(tp); 1906 tcp_verify_left_out(tp);
1871 1907
1872 tp->reordering = min_t(unsigned int, tp->reordering, 1908 /* A timeout in disordered state after receiving substantial DUPACKs
1873 sysctl_tcp_reordering); 1909 * suggests that the degree of reordering is over-estimated.
1910 */
1911 if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
1912 tp->sacked_out >= sysctl_tcp_reordering)
1913 tp->reordering = min_t(unsigned int, tp->reordering,
1914 sysctl_tcp_reordering);
1874 tcp_set_ca_state(sk, TCP_CA_Loss); 1915 tcp_set_ca_state(sk, TCP_CA_Loss);
1875 tp->high_seq = tp->snd_nxt; 1916 tp->high_seq = tp->snd_nxt;
1876 TCP_ECN_queue_cwr(tp); 1917 TCP_ECN_queue_cwr(tp);
@@ -2472,8 +2513,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
2472 2513
2473 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { 2514 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2474 tcp_try_keep_open(sk); 2515 tcp_try_keep_open(sk);
2475 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
2476 tcp_moderate_cwnd(tp);
2477 } else { 2516 } else {
2478 tcp_cwnd_reduction(sk, prior_unsacked, 0); 2517 tcp_cwnd_reduction(sk, prior_unsacked, 0);
2479 } 2518 }
@@ -2792,65 +2831,51 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2792 tcp_xmit_retransmit_queue(sk); 2831 tcp_xmit_retransmit_queue(sk);
2793} 2832}
2794 2833
2795void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) 2834static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2835 s32 seq_rtt, s32 sack_rtt)
2796{ 2836{
2797 tcp_rtt_estimator(sk, seq_rtt); 2837 const struct tcp_sock *tp = tcp_sk(sk);
2798 tcp_set_rto(sk); 2838
2799 inet_csk(sk)->icsk_backoff = 0; 2839 /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
2800} 2840 * broken middle-boxes or peers may corrupt TS-ECR fields. But
2801EXPORT_SYMBOL(tcp_valid_rtt_meas); 2841 * Karn's algorithm forbids taking RTT if some retransmitted data
2842 * is acked (RFC6298).
2843 */
2844 if (flag & FLAG_RETRANS_DATA_ACKED)
2845 seq_rtt = -1;
2846
2847 if (seq_rtt < 0)
2848 seq_rtt = sack_rtt;
2802 2849
2803/* Read draft-ietf-tcplw-high-performance before mucking
2804 * with this code. (Supersedes RFC1323)
2805 */
2806static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
2807{
2808 /* RTTM Rule: A TSecr value received in a segment is used to 2850 /* RTTM Rule: A TSecr value received in a segment is used to
2809 * update the averaged RTT measurement only if the segment 2851 * update the averaged RTT measurement only if the segment
2810 * acknowledges some new data, i.e., only if it advances the 2852 * acknowledges some new data, i.e., only if it advances the
2811 * left edge of the send window. 2853 * left edge of the send window.
2812 *
2813 * See draft-ietf-tcplw-high-performance-00, section 3.3. 2854 * See draft-ietf-tcplw-high-performance-00, section 3.3.
2814 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
2815 *
2816 * Changed: reset backoff as soon as we see the first valid sample.
2817 * If we do not, we get strongly overestimated rto. With timestamps
2818 * samples are accepted even from very old segments: f.e., when rtt=1
2819 * increases to 8, we retransmit 5 times and after 8 seconds delayed
2820 * answer arrives rto becomes 120 seconds! If at least one of segments
2821 * in window is lost... Voila. --ANK (010210)
2822 */ 2855 */
2823 struct tcp_sock *tp = tcp_sk(sk); 2856 if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2824 2857 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
2825 tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
2826}
2827 2858
2828static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) 2859 if (seq_rtt < 0)
2829{ 2860 return false;
2830 /* We don't have a timestamp. Can only use
2831 * packets that are not retransmitted to determine
2832 * rtt estimates. Also, we must not reset the
2833 * backoff for rto until we get a non-retransmitted
2834 * packet. This allows us to deal with a situation
2835 * where the network delay has increased suddenly.
2836 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
2837 */
2838 2861
2839 if (flag & FLAG_RETRANS_DATA_ACKED) 2862 tcp_rtt_estimator(sk, seq_rtt);
2840 return; 2863 tcp_set_rto(sk);
2841 2864
2842 tcp_valid_rtt_meas(sk, seq_rtt); 2865 /* RFC6298: only reset backoff on valid RTT measurement. */
2866 inet_csk(sk)->icsk_backoff = 0;
2867 return true;
2843} 2868}
2844 2869
2845static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, 2870/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
2846 const s32 seq_rtt) 2871static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
2847{ 2872{
2848 const struct tcp_sock *tp = tcp_sk(sk); 2873 struct tcp_sock *tp = tcp_sk(sk);
2849 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 2874 s32 seq_rtt = -1;
2850 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 2875
2851 tcp_ack_saw_tstamp(sk, flag); 2876 if (tp->lsndtime && !tp->total_retrans)
2852 else if (seq_rtt >= 0) 2877 seq_rtt = tcp_time_stamp - tp->lsndtime;
2853 tcp_ack_no_tstamp(sk, seq_rtt, flag); 2878 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
2854} 2879}
2855 2880
2856static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) 2881static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
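[Editor's note] The new tcp_ack_update_rtt() collapses the old timestamp/no-timestamp paths into one ordered preference: RTT from the ACK's own timing first, the SACK-derived RTT second, TS-ECR last, with Karn's rule vetoing any sample that covers retransmitted data. A minimal user-space sketch of that selection order (the flag value mirrors tcp_input.c; this is an illustration, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	#define FLAG_RETRANS_DATA_ACKED 0x08	/* value mirrors tcp_input.c */

	/* Returns true and stores the chosen sample when a valid RTT exists. */
	static bool pick_rtt(int flag, long seq_rtt, long sack_rtt,
			     long tsecr_rtt, long *sample)
	{
		if (flag & FLAG_RETRANS_DATA_ACKED)
			seq_rtt = -1;		/* Karn: retransmit makes timing ambiguous */
		if (seq_rtt < 0)
			seq_rtt = sack_rtt;	/* prefer SACK timing over TS-ECR */
		if (seq_rtt < 0)
			seq_rtt = tsecr_rtt;	/* last resort: timestamp echo */
		if (seq_rtt < 0)
			return false;		/* no usable sample: keep backoff */
		*sample = seq_rtt;
		return true;
	}

	int main(void)
	{
		long rtt;

		/* A retransmission was ACKed, but SACK still timed a valid sample. */
		if (pick_rtt(FLAG_RETRANS_DATA_ACKED, 40, 55, 70, &rtt))
			printf("RTT sample: %ld\n", rtt);	/* prints 55 */
		return 0;
	}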
@@ -2939,7 +2964,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
2939 * arrived at the other end. 2964 * arrived at the other end.
2940 */ 2965 */
2941static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, 2966static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
2942 u32 prior_snd_una) 2967 u32 prior_snd_una, s32 sack_rtt)
2943{ 2968{
2944 struct tcp_sock *tp = tcp_sk(sk); 2969 struct tcp_sock *tp = tcp_sk(sk);
2945 const struct inet_connection_sock *icsk = inet_csk(sk); 2970 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2978,8 +3003,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
2978 if (sacked & TCPCB_SACKED_RETRANS) 3003 if (sacked & TCPCB_SACKED_RETRANS)
2979 tp->retrans_out -= acked_pcount; 3004 tp->retrans_out -= acked_pcount;
2980 flag |= FLAG_RETRANS_DATA_ACKED; 3005 flag |= FLAG_RETRANS_DATA_ACKED;
2981 ca_seq_rtt = -1;
2982 seq_rtt = -1;
2983 } else { 3006 } else {
2984 ca_seq_rtt = now - scb->when; 3007 ca_seq_rtt = now - scb->when;
2985 last_ackt = skb->tstamp; 3008 last_ackt = skb->tstamp;
@@ -3031,6 +3054,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3031 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 3054 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3032 flag |= FLAG_SACK_RENEGING; 3055 flag |= FLAG_SACK_RENEGING;
3033 3056
3057 if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
3058 (flag & FLAG_ACKED))
3059 tcp_rearm_rto(sk);
3060
3034 if (flag & FLAG_ACKED) { 3061 if (flag & FLAG_ACKED) {
3035 const struct tcp_congestion_ops *ca_ops 3062 const struct tcp_congestion_ops *ca_ops
3036 = inet_csk(sk)->icsk_ca_ops; 3063 = inet_csk(sk)->icsk_ca_ops;
@@ -3040,9 +3067,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3040 tcp_mtup_probe_success(sk); 3067 tcp_mtup_probe_success(sk);
3041 } 3068 }
3042 3069
3043 tcp_ack_update_rtt(sk, flag, seq_rtt);
3044 tcp_rearm_rto(sk);
3045
3046 if (tcp_is_reno(tp)) { 3070 if (tcp_is_reno(tp)) {
3047 tcp_remove_reno_sacks(sk, pkts_acked); 3071 tcp_remove_reno_sacks(sk, pkts_acked);
3048 } else { 3072 } else {
@@ -3130,11 +3154,24 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
3130 inet_csk(sk)->icsk_ca_state != TCP_CA_Open; 3154 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3131} 3155}
3132 3156
 3157/* Decide whether to run the increase function of congestion control. */
3133static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) 3158static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3134{ 3159{
3135 const struct tcp_sock *tp = tcp_sk(sk); 3160 if (tcp_in_cwnd_reduction(sk))
3136 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && 3161 return false;
3137 !tcp_in_cwnd_reduction(sk); 3162
3163 /* If reordering is high then always grow cwnd whenever data is
3164 * delivered regardless of its ordering. Otherwise stay conservative
3165 * and only grow cwnd on in-order delivery in Open state, and retain
3166 * cwnd in Disordered state (RFC5681). A stretched ACK with
3167 * new SACK or ECE mark may first advance cwnd here and later reduce
3168 * cwnd in tcp_fastretrans_alert() based on more states.
3169 */
3170 if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
3171 return flag & FLAG_FORWARD_PROGRESS;
3172
3173 return inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
3174 flag & FLAG_DATA_ACKED;
3138} 3175}
3139 3176
3140/* Check that window update is acceptable. 3177/* Check that window update is acceptable.
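[Editor's note] The rewritten tcp_may_raise_cwnd() reads as a three-step decision: never grow during cwnd reduction, grow on any forward progress when reordering is heavy, otherwise grow only on in-order delivery in Open state. A stand-alone sketch (the flag bit values here are illustrative, not the kernel's):

	#include <stdbool.h>

	#define FLAG_DATA_ACKED		0x01	/* illustrative bit values */
	#define FLAG_FORWARD_PROGRESS	0x02	/* snd_una advanced or data SACKed */

	enum ca_state { CA_OPEN, CA_DISORDER, CA_CWR, CA_RECOVERY, CA_LOSS };

	static bool may_raise_cwnd(bool in_cwnd_reduction, enum ca_state state,
				   unsigned int reordering,
				   unsigned int sysctl_reordering, int flag)
	{
		if (in_cwnd_reduction)			/* CWR/Recovery: never grow */
			return false;
		if (reordering > sysctl_reordering)	/* heavy reordering: grow on */
			return flag & FLAG_FORWARD_PROGRESS; /* any forward progress */
		return state == CA_OPEN && (flag & FLAG_DATA_ACKED);
	}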
@@ -3269,11 +3306,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3269 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3306 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3270 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3307 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3271 bool is_dupack = false; 3308 bool is_dupack = false;
3272 u32 prior_in_flight; 3309 u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
3273 u32 prior_fackets; 3310 u32 prior_fackets;
3274 int prior_packets = tp->packets_out; 3311 int prior_packets = tp->packets_out;
3275 const int prior_unsacked = tp->packets_out - tp->sacked_out; 3312 const int prior_unsacked = tp->packets_out - tp->sacked_out;
3276 int acked = 0; /* Number of packets newly acked */ 3313 int acked = 0; /* Number of packets newly acked */
3314 s32 sack_rtt = -1;
3277 3315
3278 /* If the ack is older than previous acks 3316 /* If the ack is older than previous acks
3279 * then we can probably ignore it. 3317 * then we can probably ignore it.
@@ -3330,7 +3368,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3330 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); 3368 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
3331 3369
3332 if (TCP_SKB_CB(skb)->sacked) 3370 if (TCP_SKB_CB(skb)->sacked)
3333 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3371 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3372 &sack_rtt);
3334 3373
3335 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) 3374 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
3336 flag |= FLAG_ECE; 3375 flag |= FLAG_ECE;
@@ -3349,21 +3388,18 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3349 3388
3350 /* See if we can take anything off of the retransmit queue. */ 3389 /* See if we can take anything off of the retransmit queue. */
3351 acked = tp->packets_out; 3390 acked = tp->packets_out;
3352 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); 3391 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
3353 acked -= tp->packets_out; 3392 acked -= tp->packets_out;
3354 3393
3394 /* Advance cwnd if state allows */
3395 if (tcp_may_raise_cwnd(sk, flag))
3396 tcp_cong_avoid(sk, ack, prior_in_flight);
3397
3355 if (tcp_ack_is_dubious(sk, flag)) { 3398 if (tcp_ack_is_dubious(sk, flag)) {
3356 /* Advance CWND, if state allows this. */
3357 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
3358 tcp_cong_avoid(sk, ack, prior_in_flight);
3359 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3399 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3360 tcp_fastretrans_alert(sk, acked, prior_unsacked, 3400 tcp_fastretrans_alert(sk, acked, prior_unsacked,
3361 is_dupack, flag); 3401 is_dupack, flag);
3362 } else {
3363 if (flag & FLAG_DATA_ACKED)
3364 tcp_cong_avoid(sk, ack, prior_in_flight);
3365 } 3402 }
3366
3367 if (tp->tlp_high_seq) 3403 if (tp->tlp_high_seq)
3368 tcp_process_tlp_ack(sk, ack, flag); 3404 tcp_process_tlp_ack(sk, ack, flag);
3369 3405
@@ -3375,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3375 3411
3376 if (icsk->icsk_pending == ICSK_TIME_RETRANS) 3412 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3377 tcp_schedule_loss_probe(sk); 3413 tcp_schedule_loss_probe(sk);
3414 if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
3415 tcp_update_pacing_rate(sk);
3378 return 1; 3416 return 1;
3379 3417
3380no_queue: 3418no_queue:
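[Editor's note] tcp_update_pacing_rate() itself is outside this hunk; the guard above merely avoids recomputing the rate unless srtt or cwnd actually moved. Conceptually the rate is one cwnd of data per smoothed RTT, scaled up to leave headroom; a toy computation under that assumption (the real helper works in kernel time units and applies its own scaling):

	#include <stdint.h>
	#include <stdio.h>

	/* Toy pacing rate: send one cwnd per smoothed RTT, with 2x headroom
	 * so pacing does not itself throttle cwnd growth. Assumed shape only.
	 */
	static uint64_t pacing_rate_bps(uint32_t mss, uint32_t cwnd, double srtt_sec)
	{
		if (srtt_sec <= 0.0)
			return 0;
		return (uint64_t)(2.0 * 8.0 * mss * cwnd / srtt_sec);
	}

	int main(void)
	{
		/* 1448-byte MSS, cwnd of 10, 50 ms smoothed RTT. */
		printf("%llu bit/s\n",
		       (unsigned long long)pacing_rate_bps(1448, 10, 0.050));
		return 0;
	}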
@@ -3402,7 +3440,8 @@ old_ack:
3402 * If data was DSACKed, see if we can undo a cwnd reduction. 3440 * If data was DSACKed, see if we can undo a cwnd reduction.
3403 */ 3441 */
3404 if (TCP_SKB_CB(skb)->sacked) { 3442 if (TCP_SKB_CB(skb)->sacked) {
3405 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3443 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3444 &sack_rtt);
3406 tcp_fastretrans_alert(sk, acked, prior_unsacked, 3445 tcp_fastretrans_alert(sk, acked, prior_unsacked,
3407 is_dupack, flag); 3446 is_dupack, flag);
3408 } 3447 }
@@ -3535,7 +3574,10 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
3535 ++ptr; 3574 ++ptr;
3536 tp->rx_opt.rcv_tsval = ntohl(*ptr); 3575 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3537 ++ptr; 3576 ++ptr;
3538 tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; 3577 if (*ptr)
3578 tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
3579 else
3580 tp->rx_opt.rcv_tsecr = 0;
3539 return true; 3581 return true;
3540 } 3582 }
3541 return false; 3583 return false;
@@ -3560,7 +3602,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
3560 } 3602 }
3561 3603
3562 tcp_parse_options(skb, &tp->rx_opt, 1, NULL); 3604 tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3563 if (tp->rx_opt.saw_tstamp) 3605 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3564 tp->rx_opt.rcv_tsecr -= tp->tsoffset; 3606 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3565 3607
3566 return true; 3608 return true;
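[Editor's note] Both parsing paths now treat a zero TSecr as "no echo": RFC 1323 allows a peer to echo zero, and subtracting tsoffset from it would otherwise manufacture a bogus value that the new tcp_ack_update_rtt() would consume. A stand-alone sketch of the guard:

	#include <stdint.h>

	struct rx_opt_sketch {
		int	 saw_tstamp;
		uint32_t rcv_tsecr;
	};

	/* Apply the per-connection timestamp offset only to a real echo;
	 * a zero TSecr means the peer echoed nothing usable (RFC 1323).
	 */
	static void adjust_tsecr(struct rx_opt_sketch *rx, uint32_t tsoffset)
	{
		if (rx->saw_tstamp && rx->rcv_tsecr)
			rx->rcv_tsecr -= tsoffset;
	}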
@@ -5010,8 +5052,8 @@ discard:
5010 * the rest is checked inline. Fast processing is turned on in 5052 * the rest is checked inline. Fast processing is turned on in
5011 * tcp_data_queue when everything is OK. 5053 * tcp_data_queue when everything is OK.
5012 */ 5054 */
5013int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, 5055void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5014 const struct tcphdr *th, unsigned int len) 5056 const struct tcphdr *th, unsigned int len)
5015{ 5057{
5016 struct tcp_sock *tp = tcp_sk(sk); 5058 struct tcp_sock *tp = tcp_sk(sk);
5017 5059
@@ -5088,7 +5130,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5088 tcp_ack(sk, skb, 0); 5130 tcp_ack(sk, skb, 0);
5089 __kfree_skb(skb); 5131 __kfree_skb(skb);
5090 tcp_data_snd_check(sk); 5132 tcp_data_snd_check(sk);
5091 return 0; 5133 return;
5092 } else { /* Header too small */ 5134 } else { /* Header too small */
5093 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 5135 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5094 goto discard; 5136 goto discard;
@@ -5181,7 +5223,7 @@ no_ack:
5181 if (eaten) 5223 if (eaten)
5182 kfree_skb_partial(skb, fragstolen); 5224 kfree_skb_partial(skb, fragstolen);
5183 sk->sk_data_ready(sk, 0); 5225 sk->sk_data_ready(sk, 0);
5184 return 0; 5226 return;
5185 } 5227 }
5186 } 5228 }
5187 5229
@@ -5197,7 +5239,7 @@ slow_path:
5197 */ 5239 */
5198 5240
5199 if (!tcp_validate_incoming(sk, skb, th, 1)) 5241 if (!tcp_validate_incoming(sk, skb, th, 1))
5200 return 0; 5242 return;
5201 5243
5202step5: 5244step5:
5203 if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) 5245 if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
@@ -5213,7 +5255,7 @@ step5:
5213 5255
5214 tcp_data_snd_check(sk); 5256 tcp_data_snd_check(sk);
5215 tcp_ack_snd_check(sk); 5257 tcp_ack_snd_check(sk);
5216 return 0; 5258 return;
5217 5259
5218csum_error: 5260csum_error:
5219 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); 5261 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
@@ -5221,7 +5263,6 @@ csum_error:
5221 5263
5222discard: 5264discard:
5223 __kfree_skb(skb); 5265 __kfree_skb(skb);
5224 return 0;
5225} 5266}
5226EXPORT_SYMBOL(tcp_rcv_established); 5267EXPORT_SYMBOL(tcp_rcv_established);
5227 5268
@@ -5316,7 +5357,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5316 int saved_clamp = tp->rx_opt.mss_clamp; 5357 int saved_clamp = tp->rx_opt.mss_clamp;
5317 5358
5318 tcp_parse_options(skb, &tp->rx_opt, 0, &foc); 5359 tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
5319 if (tp->rx_opt.saw_tstamp) 5360 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
5320 tp->rx_opt.rcv_tsecr -= tp->tsoffset; 5361 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
5321 5362
5322 if (th->ack) { 5363 if (th->ack) {
@@ -5624,9 +5665,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5624 * so release it. 5665 * so release it.
5625 */ 5666 */
5626 if (req) { 5667 if (req) {
5627 tcp_synack_rtt_meas(sk, req);
5628 tp->total_retrans = req->num_retrans; 5668 tp->total_retrans = req->num_retrans;
5629
5630 reqsk_fastopen_remove(sk, req, false); 5669 reqsk_fastopen_remove(sk, req, false);
5631 } else { 5670 } else {
5632 /* Make sure socket is routed, for correct metrics. */ 5671 /* Make sure socket is routed, for correct metrics. */
@@ -5651,6 +5690,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5651 tp->snd_una = TCP_SKB_CB(skb)->ack_seq; 5690 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
5652 tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; 5691 tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
5653 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 5692 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5693 tcp_synack_rtt_meas(sk, req);
5654 5694
5655 if (tp->rx_opt.tstamp_ok) 5695 if (tp->rx_opt.tstamp_ok)
5656 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5696 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
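[Editor's note] With snt_synack now carried into the child socket as lsndtime (see the tcp_minisocks.c hunk further down), the SYNACK RTT is taken only after the connection state is fully initialized, and tcp_synack_rtt_meas() refuses the sample when the SYNACK was retransmitted, again per Karn. A compact model of that rule:

	#include <stdint.h>

	/* Returns the SYNACK RTT sample in ticks, or -1 when none is valid:
	 * no send timestamp was recorded, or the SYNACK was retransmitted.
	 */
	static int32_t synack_rtt(uint32_t now, uint32_t lsndtime,
				  uint32_t total_retrans)
	{
		if (!lsndtime || total_retrans)
			return -1;
		return (int32_t)(now - lsndtime);
	}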
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b299da5ff499..b14266bb91eb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -821,8 +821,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
821 */ 821 */
822static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, 822static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
823 struct request_sock *req, 823 struct request_sock *req,
824 u16 queue_mapping, 824 u16 queue_mapping)
825 bool nocache)
826{ 825{
827 const struct inet_request_sock *ireq = inet_rsk(req); 826 const struct inet_request_sock *ireq = inet_rsk(req);
828 struct flowi4 fl4; 827 struct flowi4 fl4;
@@ -852,7 +851,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
852 851
853static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) 852static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
854{ 853{
855 int res = tcp_v4_send_synack(sk, NULL, req, 0, false); 854 int res = tcp_v4_send_synack(sk, NULL, req, 0);
856 855
857 if (!res) 856 if (!res)
858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 857 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
@@ -890,7 +889,7 @@ bool tcp_syn_flood_action(struct sock *sk,
890 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); 889 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
891 890
892 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; 891 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
893 if (!lopt->synflood_warned) { 892 if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
894 lopt->synflood_warned = 1; 893 lopt->synflood_warned = 1;
895 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", 894 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
896 proto, ntohs(tcp_hdr(skb)->dest), msg); 895 proto, ntohs(tcp_hdr(skb)->dest), msg);
@@ -1316,9 +1315,11 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1316 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 1315 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1317 return true; 1316 return true;
1318 } 1317 }
1318
1319 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { 1319 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1320 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { 1320 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1321 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); 1321 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1322 ip_hdr(skb)->daddr, valid_foc);
1322 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || 1323 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1323 memcmp(&foc->val[0], &valid_foc->val[0], 1324 memcmp(&foc->val[0], &valid_foc->val[0],
1324 TCP_FASTOPEN_COOKIE_SIZE) != 0) 1325 TCP_FASTOPEN_COOKIE_SIZE) != 0)
@@ -1329,14 +1330,16 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1329 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 1330 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1330 return true; 1331 return true;
1331 } else if (foc->len == 0) { /* Client requesting a cookie */ 1332 } else if (foc->len == 0) { /* Client requesting a cookie */
1332 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); 1333 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1334 ip_hdr(skb)->daddr, valid_foc);
1333 NET_INC_STATS_BH(sock_net(sk), 1335 NET_INC_STATS_BH(sock_net(sk),
1334 LINUX_MIB_TCPFASTOPENCOOKIEREQD); 1336 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1335 } else { 1337 } else {
1336 /* Client sent a cookie with wrong size. Treat it 1338 /* Client sent a cookie with wrong size. Treat it
1337 * the same as invalid and return a valid one. 1339 * the same as invalid and return a valid one.
1338 */ 1340 */
1339 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); 1341 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1342 ip_hdr(skb)->daddr, valid_foc);
1340 } 1343 }
1341 return false; 1344 return false;
1342} 1345}
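[Editor's note] tcp_fastopen_cookie_gen() now takes both the source and destination address, binding the cookie to the full address pair rather than the client address alone. A toy stand-in that mixes both addresses under a server secret (the kernel derives the real cookie with a block cipher; this FNV-style hash is purely illustrative):

	#include <stdint.h>
	#include <stdio.h>

	/* Toy 64-bit cookie over (saddr, daddr, secret); illustration only. */
	static uint64_t toy_fastopen_cookie(uint32_t saddr, uint32_t daddr,
					    uint64_t secret)
	{
		uint64_t h = secret ^ 0x9e3779b97f4a7c15ULL;

		h ^= saddr; h *= 0x100000001b3ULL;	/* FNV-style mixing */
		h ^= daddr; h *= 0x100000001b3ULL;
		return h ^ (h >> 29);
	}

	int main(void)
	{
		printf("%016llx\n", (unsigned long long)
		       toy_fastopen_cookie(0xc0a80001, 0x08080808, 42));
		return 0;
	}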
@@ -1462,7 +1465,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1462 * limitations, they conserve resources and peer is 1465 * limitations, they conserve resources and peer is
1463 * evidently real one. 1466 * evidently real one.
1464 */ 1467 */
1465 if (inet_csk_reqsk_queue_is_full(sk) && !isn) { 1468 if ((sysctl_tcp_syncookies == 2 ||
1469 inet_csk_reqsk_queue_is_full(sk)) && !isn) {
1466 want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); 1470 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1467 if (!want_cookie) 1471 if (!want_cookie)
1468 goto drop; 1472 goto drop;
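[Editor's note] sysctl_tcp_syncookies == 2 unconditionally forces cookie mode: the listener behaves as if its request queue were always full, and the "possible SYN flooding" warning above is suppressed because cookies are then expected rather than a flood symptom. A condensed model of the decision (ignoring the !isn and CONFIG_SYN_COOKIES details):

	#include <stdbool.h>

	/* Mode 2 forces cookies even with room in the queue; mode 1 only
	 * falls back to cookies under queue pressure; mode 0 never does
	 * (over-pressure SYNs are simply dropped instead).
	 */
	static bool want_syncookie(int sysctl_tcp_syncookies, bool reqsk_queue_full)
	{
		if (sysctl_tcp_syncookies == 2 || reqsk_queue_full)
			return sysctl_tcp_syncookies != 0;
		return false;
	}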
@@ -1671,8 +1675,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1671 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1675 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1672 1676
1673 tcp_initialize_rcv_mss(newsk); 1677 tcp_initialize_rcv_mss(newsk);
1674 tcp_synack_rtt_meas(newsk, req);
1675 newtp->total_retrans = req->num_retrans;
1676 1678
1677#ifdef CONFIG_TCP_MD5SIG 1679#ifdef CONFIG_TCP_MD5SIG
1678 /* Copy over the MD5 key from the original socket */ 1680 /* Copy over the MD5 key from the original socket */
@@ -1797,10 +1799,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1797 sk->sk_rx_dst = NULL; 1799 sk->sk_rx_dst = NULL;
1798 } 1800 }
1799 } 1801 }
1800 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1802 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1801 rsk = sk;
1802 goto reset;
1803 }
1804 return 0; 1803 return 0;
1805 } 1804 }
1806 1805
@@ -2605,7 +2604,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2605 long delta = req->expires - jiffies; 2604 long delta = req->expires - jiffies;
2606 2605
2607 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2606 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2608 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", 2607 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n",
2609 i, 2608 i,
2610 ireq->loc_addr, 2609 ireq->loc_addr,
2611 ntohs(inet_sk(sk)->inet_sport), 2610 ntohs(inet_sk(sk)->inet_sport),
@@ -2663,7 +2662,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2663 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); 2662 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2664 2663
2665 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2664 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2666 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n", 2665 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n",
2667 i, src, srcp, dest, destp, sk->sk_state, 2666 i, src, srcp, dest, destp, sk->sk_state,
2668 tp->write_seq - tp->snd_una, 2667 tp->write_seq - tp->snd_una,
2669 rx_queue, 2668 rx_queue,
@@ -2802,6 +2801,7 @@ struct proto tcp_prot = {
2802 .unhash = inet_unhash, 2801 .unhash = inet_unhash,
2803 .get_port = inet_csk_get_port, 2802 .get_port = inet_csk_get_port,
2804 .enter_memory_pressure = tcp_enter_memory_pressure, 2803 .enter_memory_pressure = tcp_enter_memory_pressure,
2804 .stream_memory_free = tcp_stream_memory_free,
2805 .sockets_allocated = &tcp_sockets_allocated, 2805 .sockets_allocated = &tcp_sockets_allocated,
2806 .orphan_count = &tcp_orphan_count, 2806 .orphan_count = &tcp_orphan_count,
2807 .memory_allocated = &tcp_memory_allocated, 2807 .memory_allocated = &tcp_memory_allocated,
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index da14436c1735..8a57d79b0b16 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -132,10 +132,10 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
132 return 0; 132 return 0;
133} 133}
134 134
135static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, 135static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
136 const char *buffer) 136 const char *buffer)
137{ 137{
138 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 138 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
139 unsigned long long val; 139 unsigned long long val;
140 int ret = 0; 140 int ret = 0;
141 141
@@ -180,9 +180,9 @@ static u64 tcp_read_usage(struct mem_cgroup *memcg)
180 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); 180 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
181} 181}
182 182
183static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) 183static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
184{ 184{
185 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 185 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
186 u64 val; 186 u64 val;
187 187
188 switch (cft->private) { 188 switch (cft->private) {
@@ -202,13 +202,13 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft)
202 return val; 202 return val;
203} 203}
204 204
205static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) 205static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
206{ 206{
207 struct mem_cgroup *memcg; 207 struct mem_cgroup *memcg;
208 struct tcp_memcontrol *tcp; 208 struct tcp_memcontrol *tcp;
209 struct cg_proto *cg_proto; 209 struct cg_proto *cg_proto;
210 210
211 memcg = mem_cgroup_from_cont(cont); 211 memcg = mem_cgroup_from_css(css);
212 cg_proto = tcp_prot.proto_cgroup(memcg); 212 cg_proto = tcp_prot.proto_cgroup(memcg);
213 if (!cg_proto) 213 if (!cg_proto)
214 return 0; 214 return 0;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index f6a005c485a9..4a22f3e715df 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -443,7 +443,7 @@ void tcp_init_metrics(struct sock *sk)
443 struct dst_entry *dst = __sk_dst_get(sk); 443 struct dst_entry *dst = __sk_dst_get(sk);
444 struct tcp_sock *tp = tcp_sk(sk); 444 struct tcp_sock *tp = tcp_sk(sk);
445 struct tcp_metrics_block *tm; 445 struct tcp_metrics_block *tm;
446 u32 val; 446 u32 val, crtt = 0; /* cached RTT scaled by 8 */
447 447
448 if (dst == NULL) 448 if (dst == NULL)
449 goto reset; 449 goto reset;
@@ -478,15 +478,19 @@ void tcp_init_metrics(struct sock *sk)
478 tp->reordering = val; 478 tp->reordering = val;
479 } 479 }
480 480
481 val = tcp_metric_get(tm, TCP_METRIC_RTT); 481 crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
482 if (val == 0 || tp->srtt == 0) { 482 rcu_read_unlock();
483 rcu_read_unlock(); 483reset:
484 goto reset; 484 /* The initial RTT measurement from the SYN/SYN-ACK is not ideal
485 } 485 * to seed the RTO for later data packets because SYN packets are
486 /* Initial rtt is determined from SYN,SYN-ACK. 486 * small. Use the per-dst cached values to seed the RTO but keep
487 * The segment is small and rtt may appear much 487 * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
488 * less than real one. Use per-dst memory 488 * Later the RTO will be updated immediately upon obtaining the first
489 * to make it more realistic. 489 * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
490 * influences the first RTO but not later RTT estimation.
491 *
492 * But if RTT is not available from the SYN (due to retransmits or
493 * syn cookies) or the cache, force a conservative 3secs timeout.
490 * 494 *
491 * A bit of theory. RTT is time passed after "normal" sized packet 495 * A bit of theory. RTT is time passed after "normal" sized packet
492 * is sent until it is ACKed. In normal circumstances sending small 496 * is sent until it is ACKed. In normal circumstances sending small
@@ -497,21 +501,9 @@ void tcp_init_metrics(struct sock *sk)
497 * to low value, and then abruptly stops to do it and starts to delay 501 * to low value, and then abruptly stops to do it and starts to delay
498 * ACKs, wait for troubles. 502 * ACKs, wait for troubles.
499 */ 503 */
500 val = msecs_to_jiffies(val); 504 if (crtt > tp->srtt) {
501 if (val > tp->srtt) { 505 inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk));
502 tp->srtt = val; 506 } else if (tp->srtt == 0) {
503 tp->rtt_seq = tp->snd_nxt;
504 }
505 val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
506 if (val > tp->mdev) {
507 tp->mdev = val;
508 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
509 }
510 rcu_read_unlock();
511
512 tcp_set_rto(sk);
513reset:
514 if (tp->srtt == 0) {
515 /* RFC6298: 5.7 We've failed to get a valid RTT sample from 507 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
516 * 3WHS. This is most likely due to retransmission, 508 * 3WHS. This is most likely due to retransmission,
 517 * including a spurious one. Reset the RTO back to 3secs 509
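[Editor's note] The rewritten path never touches srtt/mdev from the cache; it only seeds the first RTO. When the cached RTT (crtt) exceeds whatever the 3WHS measured, the RTO becomes crtt plus a variance cushion of at least the minimum RTO; with no sample at all the code keeps the RFC 6298 3 s fallback. As arithmetic (units simplified; the kernel keeps crtt and srtt scaled by 8):

	#include <stdint.h>

	#define TCP_TIMEOUT_FALLBACK_TICKS	(3 * 1000)	/* 3 s, RFC 6298 5.7 */

	/* Seed the first RTO from a cached RTT without polluting the RTT
	 * estimator state; the cushion is crtt/4 or rto_min, whichever is
	 * larger, mirroring crtt + max(crtt >> 2, tcp_rto_min(sk)) above.
	 */
	static uint32_t seed_rto(uint32_t crtt, uint32_t srtt, uint32_t rto_min,
				 uint32_t cur_rto)
	{
		if (crtt > srtt)
			return crtt + (crtt / 4 > rto_min ? crtt / 4 : rto_min);
		if (srtt == 0)
			return TCP_TIMEOUT_FALLBACK_TICKS;
		return cur_rto;	/* the 3WHS sample already produced a usable RTO */
	}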
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ab1c08658528..58a3e69aef64 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -411,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
411 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 411 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
412 tcp_enable_early_retrans(newtp); 412 tcp_enable_early_retrans(newtp);
413 newtp->tlp_high_seq = 0; 413 newtp->tlp_high_seq = 0;
414 newtp->lsndtime = treq->snt_synack;
415 newtp->total_retrans = req->num_retrans;
414 416
415 /* So many TCP implementations out there (incorrectly) count the 417 /* So many TCP implementations out there (incorrectly) count the
416 * initial SYN frame in their delayed-ACK and congestion control 418 * initial SYN frame in their delayed-ACK and congestion control
@@ -666,12 +668,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
666 if (!(flg & TCP_FLAG_ACK)) 668 if (!(flg & TCP_FLAG_ACK))
667 return NULL; 669 return NULL;
668 670
669 /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
670 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
671 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
672 else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
673 tcp_rsk(req)->snt_synack = 0;
674
675 /* For Fast Open no more processing is needed (sk is the 671 /* For Fast Open no more processing is needed (sk is the
676 * child socket). 672 * child socket).
677 */ 673 */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 92fde8d1aa82..7c83cb8bf137 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
65/* By default, RFC2861 behavior. */ 65/* By default, RFC2861 behavior. */
66int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 66int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
67 67
68unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
69EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
70
68static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 71static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
69 int push_one, gfp_t gfp); 72 int push_one, gfp_t gfp);
70 73
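[Editor's note] sysctl_tcp_notsent_lowat backs the stream_memory_free hook added to tcp_prot in the tcp_ipv4.c hunk earlier: a socket only reports as writable while its not-yet-sent backlog stays below the low-water mark, capping per-socket write-queue memory. The check amounts to (a sketch; UINT32_MAX preserves the old always-writable behavior):

	#include <stdbool.h>
	#include <stdint.h>

	/* Writable only while bytes queued but not yet transmitted stay
	 * under the low-water mark.
	 */
	static bool stream_memory_free(uint32_t write_seq, uint32_t snd_nxt,
				       uint32_t notsent_lowat)
	{
		return (uint32_t)(write_seq - snd_nxt) < notsent_lowat;
	}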
@@ -1628,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1628 1631
1629 /* If a full-sized TSO skb can be sent, do it. */ 1632 /* If a full-sized TSO skb can be sent, do it. */
1630 if (limit >= min_t(unsigned int, sk->sk_gso_max_size, 1633 if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
1631 sk->sk_gso_max_segs * tp->mss_cache)) 1634 tp->xmit_size_goal_segs * tp->mss_cache))
1632 goto send_now; 1635 goto send_now;
1633 1636
1634 /* Middle in queue won't get any more data, full sendable already? */ 1637 /* Middle in queue won't get any more data, full sendable already? */
@@ -2670,7 +2673,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2670 int tcp_header_size; 2673 int tcp_header_size;
2671 int mss; 2674 int mss;
2672 2675
2673 skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC)); 2676 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
2674 if (unlikely(!skb)) { 2677 if (unlikely(!skb)) {
2675 dst_release(dst); 2678 dst_release(dst);
2676 return NULL; 2679 return NULL;
@@ -2814,6 +2817,8 @@ void tcp_connect_init(struct sock *sk)
2814 2817
2815 if (likely(!tp->repair)) 2818 if (likely(!tp->repair))
2816 tp->rcv_nxt = 0; 2819 tp->rcv_nxt = 0;
2820 else
2821 tp->rcv_tstamp = tcp_time_stamp;
2817 tp->rcv_wup = tp->rcv_nxt; 2822 tp->rcv_wup = tp->rcv_nxt;
2818 tp->copied_seq = tp->rcv_nxt; 2823 tp->copied_seq = tp->rcv_nxt;
2819 2824
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index d4943f67aff2..611beab38a00 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -46,6 +46,10 @@ static unsigned int bufsize __read_mostly = 4096;
46MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); 46MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
47module_param(bufsize, uint, 0); 47module_param(bufsize, uint, 0);
48 48
49static unsigned int fwmark __read_mostly = 0;
50MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
51module_param(fwmark, uint, 0);
52
49static int full __read_mostly; 53static int full __read_mostly;
50MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); 54MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
51module_param(full, int, 0); 55module_param(full, int, 0);
@@ -54,12 +58,16 @@ static const char procname[] = "tcpprobe";
54 58
55struct tcp_log { 59struct tcp_log {
56 ktime_t tstamp; 60 ktime_t tstamp;
57 __be32 saddr, daddr; 61 union {
58 __be16 sport, dport; 62 struct sockaddr raw;
63 struct sockaddr_in v4;
64 struct sockaddr_in6 v6;
65 } src, dst;
59 u16 length; 66 u16 length;
60 u32 snd_nxt; 67 u32 snd_nxt;
61 u32 snd_una; 68 u32 snd_una;
62 u32 snd_wnd; 69 u32 snd_wnd;
70 u32 rcv_wnd;
63 u32 snd_cwnd; 71 u32 snd_cwnd;
64 u32 ssthresh; 72 u32 ssthresh;
65 u32 srtt; 73 u32 srtt;
@@ -86,19 +94,45 @@ static inline int tcp_probe_avail(void)
86 return bufsize - tcp_probe_used() - 1; 94 return bufsize - tcp_probe_used() - 1;
87} 95}
88 96
97#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \
98 do { \
99 si4.sin_family = AF_INET; \
100 si4.sin_port = inet->inet_##mem##port; \
101 si4.sin_addr.s_addr = inet->inet_##mem##addr; \
 102 } while (0)
103
104#if IS_ENABLED(CONFIG_IPV6)
105#define tcp_probe_copy_fl_to_si6(inet, si6, mem) \
106 do { \
107 struct ipv6_pinfo *pi6 = inet->pinet6; \
108 si6.sin6_family = AF_INET6; \
109 si6.sin6_port = inet->inet_##mem##port; \
110 si6.sin6_addr = pi6->mem##addr; \
111 si6.sin6_flowinfo = 0; /* No need here. */ \
112 si6.sin6_scope_id = 0; /* No need here. */ \
113 } while (0)
114#else
115#define tcp_probe_copy_fl_to_si6(fl, si6, mem) \
116 do { \
117 memset(&si6, 0, sizeof(si6)); \
118 } while (0)
119#endif
120
89/* 121/*
90 * Hook inserted to be called before each receive packet. 122 * Hook inserted to be called before each receive packet.
91 * Note: arguments must match tcp_rcv_established()! 123 * Note: arguments must match tcp_rcv_established()!
92 */ 124 */
93static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, 125static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
94 struct tcphdr *th, unsigned int len) 126 const struct tcphdr *th, unsigned int len)
95{ 127{
96 const struct tcp_sock *tp = tcp_sk(sk); 128 const struct tcp_sock *tp = tcp_sk(sk);
97 const struct inet_sock *inet = inet_sk(sk); 129 const struct inet_sock *inet = inet_sk(sk);
98 130
99 /* Only update if port matches */ 131 /* Only update if port or skb mark matches */
100 if ((port == 0 || ntohs(inet->inet_dport) == port || 132 if (((port == 0 && fwmark == 0) ||
101 ntohs(inet->inet_sport) == port) && 133 ntohs(inet->inet_dport) == port ||
134 ntohs(inet->inet_sport) == port ||
135 (fwmark > 0 && skb->mark == fwmark)) &&
102 (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { 136 (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
103 137
104 spin_lock(&tcp_probe.lock); 138 spin_lock(&tcp_probe.lock);
@@ -107,15 +141,25 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
107 struct tcp_log *p = tcp_probe.log + tcp_probe.head; 141 struct tcp_log *p = tcp_probe.log + tcp_probe.head;
108 142
109 p->tstamp = ktime_get(); 143 p->tstamp = ktime_get();
110 p->saddr = inet->inet_saddr; 144 switch (sk->sk_family) {
111 p->sport = inet->inet_sport; 145 case AF_INET:
112 p->daddr = inet->inet_daddr; 146 tcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
113 p->dport = inet->inet_dport; 147 tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
148 break;
149 case AF_INET6:
150 tcp_probe_copy_fl_to_si6(inet, p->src.v6, s);
151 tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d);
152 break;
153 default:
154 BUG();
155 }
156
114 p->length = skb->len; 157 p->length = skb->len;
115 p->snd_nxt = tp->snd_nxt; 158 p->snd_nxt = tp->snd_nxt;
116 p->snd_una = tp->snd_una; 159 p->snd_una = tp->snd_una;
117 p->snd_cwnd = tp->snd_cwnd; 160 p->snd_cwnd = tp->snd_cwnd;
118 p->snd_wnd = tp->snd_wnd; 161 p->snd_wnd = tp->snd_wnd;
162 p->rcv_wnd = tp->rcv_wnd;
119 p->ssthresh = tcp_current_ssthresh(sk); 163 p->ssthresh = tcp_current_ssthresh(sk);
120 p->srtt = tp->srtt >> 3; 164 p->srtt = tp->srtt >> 3;
121 165
@@ -128,7 +172,6 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
128 } 172 }
129 173
130 jprobe_return(); 174 jprobe_return();
131 return 0;
132} 175}
133 176
134static struct jprobe tcp_jprobe = { 177static struct jprobe tcp_jprobe = {
@@ -157,13 +200,11 @@ static int tcpprobe_sprint(char *tbuf, int n)
157 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); 200 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
158 201
159 return scnprintf(tbuf, n, 202 return scnprintf(tbuf, n,
160 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", 203 "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
161 (unsigned long) tv.tv_sec, 204 (unsigned long) tv.tv_sec,
162 (unsigned long) tv.tv_nsec, 205 (unsigned long) tv.tv_nsec,
163 &p->saddr, ntohs(p->sport), 206 &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
164 &p->daddr, ntohs(p->dport), 207 p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
165 p->length, p->snd_nxt, p->snd_una,
166 p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt);
167} 208}
168 209
169static ssize_t tcpprobe_read(struct file *file, char __user *buf, 210static ssize_t tcpprobe_read(struct file *file, char __user *buf,
@@ -176,7 +217,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
176 return -EINVAL; 217 return -EINVAL;
177 218
178 while (cnt < len) { 219 while (cnt < len) {
179 char tbuf[164]; 220 char tbuf[256];
180 int width; 221 int width;
181 222
182 /* Wait for data in buffer */ 223 /* Wait for data in buffer */
@@ -223,6 +264,13 @@ static __init int tcpprobe_init(void)
223{ 264{
224 int ret = -ENOMEM; 265 int ret = -ENOMEM;
225 266
 267 /* Warning: if the function signature of tcp_rcv_established
268 * has been changed, you also have to change the signature of
269 * jtcp_rcv_established, otherwise you end up right here!
270 */
271 BUILD_BUG_ON(__same_type(tcp_rcv_established,
272 jtcp_rcv_established) == 0);
273
226 init_waitqueue_head(&tcp_probe.wait); 274 init_waitqueue_head(&tcp_probe.wait);
227 spin_lock_init(&tcp_probe.lock); 275 spin_lock_init(&tcp_probe.lock);
228 276
@@ -241,7 +289,8 @@ static __init int tcpprobe_init(void)
241 if (ret) 289 if (ret)
242 goto err1; 290 goto err1;
243 291
244 pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize); 292 pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
293 port, fwmark, bufsize);
245 return 0; 294 return 0;
246 err1: 295 err1:
247 remove_proc_entry(procname, init_net.proc_net); 296 remove_proc_entry(procname, init_net.proc_net);
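[Editor's note] Storing each endpoint as a sockaddr union lets one tcp_probe log record hold either address family, and the kernel's %pISpc format specifier prints whichever is populated. A user-space equivalent using inet_ntop() (the union layout mirrors the patch; the print helper is a portable stand-in for %pISpc):

	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <stdio.h>
	#include <string.h>

	union log_addr {
		struct sockaddr		raw;
		struct sockaddr_in	v4;
		struct sockaddr_in6	v6;
	};

	/* Print "addr:port" for whichever family the record carries. */
	static void print_endpoint(const union log_addr *a)
	{
		char buf[INET6_ADDRSTRLEN];

		if (a->raw.sa_family == AF_INET) {
			inet_ntop(AF_INET, &a->v4.sin_addr, buf, sizeof(buf));
			printf("%s:%u", buf, ntohs(a->v4.sin_port));
		} else if (a->raw.sa_family == AF_INET6) {
			inet_ntop(AF_INET6, &a->v6.sin6_addr, buf, sizeof(buf));
			printf("[%s]:%u", buf, ntohs(a->v6.sin6_port));
		}
	}

	int main(void)
	{
		union log_addr a;

		memset(&a, 0, sizeof(a));
		a.v4.sin_family = AF_INET;
		a.v4.sin_port = htons(80);
		inet_pton(AF_INET, "192.0.2.1", &a.v4.sin_addr);
		print_endpoint(&a);
		putchar('\n');
		return 0;
	}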
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 766e6bab9113..74d2c95db57f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -704,7 +704,7 @@ EXPORT_SYMBOL(udp_flush_pending_frames);
704 * @src: source IP address 704 * @src: source IP address
705 * @dst: destination IP address 705 * @dst: destination IP address
706 */ 706 */
707static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) 707void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
708{ 708{
709 struct udphdr *uh = udp_hdr(skb); 709 struct udphdr *uh = udp_hdr(skb);
710 struct sk_buff *frags = skb_shinfo(skb)->frag_list; 710 struct sk_buff *frags = skb_shinfo(skb)->frag_list;
@@ -740,6 +740,7 @@ static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
740 uh->check = CSUM_MANGLED_0; 740 uh->check = CSUM_MANGLED_0;
741 } 741 }
742} 742}
743EXPORT_SYMBOL_GPL(udp4_hwcsum);
743 744
744static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) 745static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
745{ 746{
@@ -2158,7 +2159,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
2158 __u16 srcp = ntohs(inet->inet_sport); 2159 __u16 srcp = ntohs(inet->inet_sport);
2159 2160
2160 seq_printf(f, "%5d: %08X:%04X %08X:%04X" 2161 seq_printf(f, "%5d: %08X:%04X %08X:%04X"
2161 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", 2162 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
2162 bucket, src, srcp, dest, destp, sp->sk_state, 2163 bucket, src, srcp, dest, destp, sp->sk_state,
2163 sk_wmem_alloc_get(sp), 2164 sk_wmem_alloc_get(sp),
2164 sk_rmem_alloc_get(sp), 2165 sk_rmem_alloc_get(sp),
@@ -2336,7 +2337,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2336 uh->len = htons(skb->len - udp_offset); 2337 uh->len = htons(skb->len - udp_offset);
2337 2338
2338 /* csum segment if tunnel sets skb with csum. */ 2339 /* csum segment if tunnel sets skb with csum. */
2339 if (unlikely(uh->check)) { 2340 if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) {
2340 struct iphdr *iph = ip_hdr(skb); 2341 struct iphdr *iph = ip_hdr(skb);
2341 2342
2342 uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, 2343 uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
@@ -2347,7 +2348,18 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2347 if (uh->check == 0) 2348 if (uh->check == 0)
2348 uh->check = CSUM_MANGLED_0; 2349 uh->check = CSUM_MANGLED_0;
2349 2350
2351 } else if (protocol == htons(ETH_P_IPV6)) {
2352 struct ipv6hdr *ipv6h = ipv6_hdr(skb);
2353 u32 len = skb->len - udp_offset;
2354
2355 uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
2356 len, IPPROTO_UDP, 0);
2357 uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0));
2358 if (uh->check == 0)
2359 uh->check = CSUM_MANGLED_0;
2360 skb->ip_summed = CHECKSUM_NONE;
2350 } 2361 }
2362
2351 skb->protocol = protocol; 2363 skb->protocol = protocol;
2352 } while ((skb = skb->next)); 2364 } while ((skb = skb->next));
2353out: 2365out:
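[Editor's note] For IPv6 the UDP checksum is mandatory, so each GSO segment now gets a fresh checksum seeded from the IPv6 pseudo-header. A self-contained sketch of the same arithmetic: a one's-complement sum over the pseudo-header plus the datagram, with a computed zero transmitted as 0xFFFF (the role CSUM_MANGLED_0 plays above):

	#include <stddef.h>
	#include <stdint.h>

	/* Fold a 32-bit one's-complement accumulator to 16 bits. */
	static uint16_t csum_fold32(uint32_t sum)
	{
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)sum;
	}

	static uint32_t sum_bytes(uint32_t sum, const uint8_t *p, size_t len)
	{
		while (len > 1) {
			sum += (uint32_t)p[0] << 8 | p[1];
			p += 2;
			len -= 2;
		}
		if (len)
			sum += (uint32_t)p[0] << 8;	/* pad odd byte with zero */
		return sum;
	}

	/* UDP checksum over the IPv6 pseudo-header (RFC 2460 s8.1) plus the
	 * datagram; the checksum field inside udp[] must be zero while
	 * summing. next header = 17 (IPPROTO_UDP).
	 */
	static uint16_t udp6_checksum(const uint8_t saddr[16],
				      const uint8_t daddr[16],
				      const uint8_t *udp, uint32_t udp_len)
	{
		uint32_t sum = 0;
		uint16_t csum;

		sum = sum_bytes(sum, saddr, 16);
		sum = sum_bytes(sum, daddr, 16);
		sum += udp_len;			/* upper-layer packet length */
		sum += 17;			/* next header = IPPROTO_UDP */
		sum = sum_bytes(sum, udp, udp_len);

		csum = ~csum_fold32(sum);
		return csum ? csum : 0xffff;	/* zero is mangled on the wire */
	}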
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 327a617d594c..baa0f63731fd 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -21,7 +21,6 @@
21static int xfrm4_tunnel_check_size(struct sk_buff *skb) 21static int xfrm4_tunnel_check_size(struct sk_buff *skb)
22{ 22{
23 int mtu, ret = 0; 23 int mtu, ret = 0;
24 struct dst_entry *dst;
25 24
26 if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) 25 if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
27 goto out; 26 goto out;
@@ -29,12 +28,10 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
29 if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) 28 if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df)
30 goto out; 29 goto out;
31 30
32 dst = skb_dst(skb); 31 mtu = dst_mtu(skb_dst(skb));
33 mtu = dst_mtu(dst);
34 if (skb->len > mtu) { 32 if (skb->len > mtu) {
35 if (skb->sk) 33 if (skb->sk)
36 ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr, 34 xfrm_local_error(skb, mtu);
37 inet_sk(skb->sk)->inet_dport, mtu);
38 else 35 else
39 icmp_send(skb, ICMP_DEST_UNREACH, 36 icmp_send(skb, ICMP_DEST_UNREACH,
40 ICMP_FRAG_NEEDED, htonl(mtu)); 37 ICMP_FRAG_NEEDED, htonl(mtu));
@@ -99,3 +96,12 @@ int xfrm4_output(struct sk_buff *skb)
99 x->outer_mode->afinfo->output_finish, 96 x->outer_mode->afinfo->output_finish,
100 !(IPCB(skb)->flags & IPSKB_REROUTED)); 97 !(IPCB(skb)->flags & IPSKB_REROUTED));
101} 98}
99
100void xfrm4_local_error(struct sk_buff *skb, u32 mtu)
101{
102 struct iphdr *hdr;
103
104 hdr = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb);
105 ip_local_error(skb->sk, EMSGSIZE, hdr->daddr,
106 inet_sk(skb->sk)->inet_dport, mtu);
107}
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 9258e751baba..0b2a0641526a 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -83,6 +83,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
83 .extract_input = xfrm4_extract_input, 83 .extract_input = xfrm4_extract_input,
84 .extract_output = xfrm4_extract_output, 84 .extract_output = xfrm4_extract_output,
85 .transport_finish = xfrm4_transport_finish, 85 .transport_finish = xfrm4_transport_finish,
86 .local_error = xfrm4_local_error,
86}; 87};
87 88
88void __init xfrm4_state_init(void) 89void __init xfrm4_state_init(void)