Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c                             |  27
-rw-r--r--  net/ipv4/devinet.c                             |  67
-rw-r--r--  net/ipv4/fib_frontend.c                        |  25
-rw-r--r--  net/ipv4/fib_semantics.c                       |   8
-rw-r--r--  net/ipv4/fib_trie.c                            |  15
-rw-r--r--  net/ipv4/igmp.c                                |  38
-rw-r--r--  net/ipv4/inet_connection_sock.c                |  57
-rw-r--r--  net/ipv4/inet_diag.c                           |  32
-rw-r--r--  net/ipv4/inet_fragment.c                       |   9
-rw-r--r--  net/ipv4/ip_fragment.c                         |  13
-rw-r--r--  net/ipv4/ip_gre.c                              | 128
-rw-r--r--  net/ipv4/ip_output.c                           |  74
-rw-r--r--  net/ipv4/ip_vti.c                              |   5
-rw-r--r--  net/ipv4/ipconfig.c                            |  43
-rw-r--r--  net/ipv4/ipip.c                                |  51
-rw-r--r--  net/ipv4/ipmr.c                                |  12
-rw-r--r--  net/ipv4/netfilter.c                           |  41
-rw-r--r--  net/ipv4/netfilter/Kconfig                     |  90
-rw-r--r--  net/ipv4/netfilter/Makefile                    |  18
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c            |  18
-rw-r--r--  net/ipv4/netfilter/ipt_NETMAP.c                |  98
-rw-r--r--  net/ipv4/netfilter/ipt_REDIRECT.c              | 110
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c                  |   3
-rw-r--r--  net/ipv4/netfilter/ipt_rpfilter.c              |   2
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c            |  10
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c            |  10
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c (renamed from net/ipv4/netfilter/nf_nat_standalone.c) | 264
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c               |  10
-rw-r--r--  net/ipv4/netfilter/iptable_security.c          |   5
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |   8
-rw-r--r--  net/ipv4/netfilter/nf_nat_amanda.c             |  85
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c               | 763
-rw-r--r--  net/ipv4/netfilter/nf_nat_ftp.c                | 137
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c               |  71
-rw-r--r--  net/ipv4/netfilter/nf_nat_helper.c             | 458
-rw-r--r--  net/ipv4/netfilter/nf_nat_irc.c                |  99
-rw-r--r--  net/ipv4/netfilter/nf_nat_l3proto_ipv4.c       | 281
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c               |  21
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_common.c       | 114
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_dccp.c         | 106
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_gre.c          |  30
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_icmp.c         |  24
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_sctp.c         |  96
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_tcp.c          |  91
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udp.c          |  82
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udplite.c      |  98
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_unknown.c      |  52
-rw-r--r--  net/ipv4/netfilter/nf_nat_rule.c               | 214
-rw-r--r--  net/ipv4/netfilter/nf_nat_sip.c                | 572
-rw-r--r--  net/ipv4/netfilter/nf_nat_tftp.c               |  51
-rw-r--r--  net/ipv4/proc.c                                |   4
-rw-r--r--  net/ipv4/route.c                               |  30
-rw-r--r--  net/ipv4/syncookies.c                          |   1
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c                     |  45
-rw-r--r--  net/ipv4/tcp.c                                 | 135
-rw-r--r--  net/ipv4/tcp_fastopen.c                        |  83
-rw-r--r--  net/ipv4/tcp_input.c                           | 281
-rw-r--r--  net/ipv4/tcp_ipv4.c                            | 326
-rw-r--r--  net/ipv4/tcp_metrics.c                         | 354
-rw-r--r--  net/ipv4/tcp_minisocks.c                       |  75
-rw-r--r--  net/ipv4/tcp_output.c                          |  27
-rw-r--r--  net/ipv4/tcp_timer.c                           |  39
-rw-r--r--  net/ipv4/udp_diag.c                            |   6
63 files changed, 2021 insertions(+), 4121 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index fe4582ca969a..766c59658563 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -212,6 +212,26 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
+		/* Check special setups for testing purpose to enable TFO w/o
+		 * requiring TCP_FASTOPEN sockopt.
+		 * Note that only TCP sockets (SOCK_STREAM) will reach here.
+		 * Also fastopenq may already been allocated because this
+		 * socket was in TCP_LISTEN state previously but was
+		 * shutdown() (rather than close()).
+		 */
+		if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+		    inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+			if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+				err = fastopen_init_queue(sk, backlog);
+			else if ((sysctl_tcp_fastopen &
+				  TFO_SERVER_WO_SOCKOPT2) != 0)
+				err = fastopen_init_queue(sk,
+				    ((uint)sysctl_tcp_fastopen) >> 16);
+			else
+				err = 0;
+			if (err)
+				goto out;
+		}
 		err = inet_csk_listen_start(sk, backlog);
 		if (err)
 			goto out;
@@ -701,7 +721,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	sock_rps_record_flow(sk2);
 	WARN_ON(!((1 << sk2->sk_state) &
-		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+		  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+		  TCPF_CLOSE_WAIT | TCPF_CLOSE)));
 
 	sock_graft(sk2, newsock);
 
@@ -1364,7 +1385,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	if (*(u8 *)iph != 0x45)
 		goto out_unlock;
 
-	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+	if (unlikely(ip_fast_csum((u8 *)iph, 5)))
 		goto out_unlock;
 
 	id = ntohl(*(__be32 *)&iph->id);
@@ -1380,7 +1401,6 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	iph2 = ip_hdr(p);
 
 	if ((iph->protocol ^ iph2->protocol) |
-	    (iph->tos ^ iph2->tos) |
 	    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
 	    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
 		NAPI_GRO_CB(p)->same_flow = 0;
@@ -1390,6 +1410,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	/* All fields must match except length and checksum. */
 	NAPI_GRO_CB(p)->flush |=
 		(iph->ttl ^ iph2->ttl) |
+		(iph->tos ^ iph2->tos) |
 		((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
 
 	NAPI_GRO_CB(p)->flush |= flush;
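A note on the inet_listen() hunk above: it lets a listener enable TCP Fast Open without the TCP_FASTOPEN sockopt, purely from bits in the tcp_fastopen sysctl, and with TFO_SERVER_WO_SOCKOPT2 the fastopen queue length is packed into the sysctl's upper 16 bits. A minimal user-space sketch of that decoding — the bit values below are illustrative assumptions, not copied from include/net/tcp.h:

#include <stdio.h>

/* Assumed bit layout; the kernel's real values live in include/net/tcp.h. */
#define TFO_SERVER_ENABLE	0x2
#define TFO_SERVER_WO_SOCKOPT1	0x400
#define TFO_SERVER_WO_SOCKOPT2	0x800

static void decode_tfo(unsigned int sysctl_tcp_fastopen, int backlog)
{
	if (!(sysctl_tcp_fastopen & TFO_SERVER_ENABLE))
		return;
	if (sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1)
		printf("fastopen queue length = %d (listen backlog)\n", backlog);
	else if (sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT2)
		printf("fastopen queue length = %u (sysctl bits 16..31)\n",
		       sysctl_tcp_fastopen >> 16);
}

int main(void)
{
	decode_tfo(0x802 | (256u << 16), 128);	/* WO_SOCKOPT2, queue = 256 */
	return 0;
}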
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e12fad773852..2a6abc163ed2 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -94,25 +94,22 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_LABEL]		= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
 };
 
-/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
- * value.  So if you change this define, make appropriate changes to
- * inet_addr_hash as well.
- */
-#define IN4_ADDR_HSIZE	256
+#define IN4_ADDR_HSIZE_SHIFT	8
+#define IN4_ADDR_HSIZE		(1U << IN4_ADDR_HSIZE_SHIFT)
+
 static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
 static DEFINE_SPINLOCK(inet_addr_hash_lock);
 
-static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+static u32 inet_addr_hash(struct net *net, __be32 addr)
 {
-	u32 val = (__force u32) addr ^ hash_ptr(net, 8);
+	u32 val = (__force u32) addr ^ net_hash_mix(net);
 
-	return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
-		(IN4_ADDR_HSIZE - 1));
+	return hash_32(val, IN4_ADDR_HSIZE_SHIFT);
 }
 
 static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
 {
-	unsigned int hash = inet_addr_hash(net, ifa->ifa_local);
+	u32 hash = inet_addr_hash(net, ifa->ifa_local);
 
 	spin_lock(&inet_addr_hash_lock);
 	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
@@ -136,18 +133,18 @@ static void inet_hash_remove(struct in_ifaddr *ifa)
  */
 struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 {
-	unsigned int hash = inet_addr_hash(net, addr);
+	u32 hash = inet_addr_hash(net, addr);
 	struct net_device *result = NULL;
 	struct in_ifaddr *ifa;
 	struct hlist_node *node;
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
-		struct net_device *dev = ifa->ifa_dev->dev;
-
-		if (!net_eq(dev_net(dev), net))
-			continue;
 		if (ifa->ifa_local == addr) {
+			struct net_device *dev = ifa->ifa_dev->dev;
+
+			if (!net_eq(dev_net(dev), net))
+				continue;
 			result = dev;
 			break;
 		}
@@ -182,10 +179,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 static void devinet_sysctl_register(struct in_device *idev);
 static void devinet_sysctl_unregister(struct in_device *idev);
 #else
-static inline void devinet_sysctl_register(struct in_device *idev)
+static void devinet_sysctl_register(struct in_device *idev)
 {
 }
-static inline void devinet_sysctl_unregister(struct in_device *idev)
+static void devinet_sysctl_unregister(struct in_device *idev)
 {
 }
 #endif
@@ -205,7 +202,7 @@ static void inet_rcu_free_ifa(struct rcu_head *head)
 	kfree(ifa);
 }
 
-static inline void inet_free_ifa(struct in_ifaddr *ifa)
+static void inet_free_ifa(struct in_ifaddr *ifa)
 {
 	call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 }
@@ -314,7 +311,7 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
 }
 
 static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
-			   int destroy, struct nlmsghdr *nlh, u32 pid)
+			   int destroy, struct nlmsghdr *nlh, u32 portid)
 {
 	struct in_ifaddr *promote = NULL;
 	struct in_ifaddr *ifa, *ifa1 = *ifap;
@@ -348,7 +345,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 			inet_hash_remove(ifa);
 			*ifap1 = ifa->ifa_next;
 
-			rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
+			rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
 			blocking_notifier_call_chain(&inetaddr_chain,
 					NETDEV_DOWN, ifa);
 			inet_free_ifa(ifa);
@@ -385,7 +382,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	   is valid, it will try to restore deleted routes... Grr.
 	   So that, this order is correct.
 	 */
-	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid);
+	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 
 	if (promote) {
@@ -398,7 +395,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 		}
 
 		promote->ifa_flags &= ~IFA_F_SECONDARY;
-		rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
+		rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
 		blocking_notifier_call_chain(&inetaddr_chain,
 				NETDEV_UP, promote);
 		for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
@@ -420,7 +417,7 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 }
 
 static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
-			     u32 pid)
+			     u32 portid)
 {
 	struct in_device *in_dev = ifa->ifa_dev;
 	struct in_ifaddr *ifa1, **ifap, **last_primary;
@@ -467,7 +464,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	/* Send message first, then call notifier.
 	   Notifier will trigger FIB update, so that
 	   listeners of netlink will know about new ifaddr */
-	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid);
+	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);
 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
 
 	return 0;
@@ -566,7 +563,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
 		    !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))
 			continue;
 
-		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid);
+		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
 		return 0;
 	}
 
@@ -652,14 +649,14 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
 	if (IS_ERR(ifa))
 		return PTR_ERR(ifa);
 
-	return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid);
+	return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
 }
 
 /*
  *	Determine a default network mask, based on the IP address.
  */
 
-static inline int inet_abc_len(__be32 addr)
+static int inet_abc_len(__be32 addr)
 {
 	int rc = -1;	/* Something else, probably a multicast. */
 
@@ -1124,7 +1121,7 @@ skip:
 	}
 }
 
-static inline bool inetdev_valid_mtu(unsigned int mtu)
+static bool inetdev_valid_mtu(unsigned int mtu)
 {
 	return mtu >= 68;
 }
@@ -1239,7 +1236,7 @@ static struct notifier_block ip_netdev_notifier = {
 	.notifier_call = inetdev_event,
 };
 
-static inline size_t inet_nlmsg_size(void)
+static size_t inet_nlmsg_size(void)
 {
 	return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
 	       + nla_total_size(4) /* IFA_ADDRESS */
@@ -1249,12 +1246,12 @@ static inline size_t inet_nlmsg_size(void)
 }
 
 static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
-			    u32 pid, u32 seq, int event, unsigned int flags)
+			    u32 portid, u32 seq, int event, unsigned int flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr *nlh;
 
-	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
 	if (nlh == NULL)
 		return -EMSGSIZE;
 
@@ -1316,7 +1313,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 				if (ip_idx < s_ip_idx)
 					continue;
 				if (inet_fill_ifaddr(skb, ifa,
-					     NETLINK_CB(cb->skb).pid,
+					     NETLINK_CB(cb->skb).portid,
 					     cb->nlh->nlmsg_seq,
 					     RTM_NEWADDR, NLM_F_MULTI) <= 0) {
 					rcu_read_unlock();
@@ -1338,7 +1335,7 @@ done:
 }
 
 static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
-		      u32 pid)
+		      u32 portid)
 {
 	struct sk_buff *skb;
 	u32 seq = nlh ? nlh->nlmsg_seq : 0;
@@ -1350,14 +1347,14 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	if (skb == NULL)
 		goto errout;
 
-	err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0);
+	err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(skb);
 		goto errout;
 	}
-	rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+	rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
 	return;
 errout:
 	if (err < 0)
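The devinet.c change above swaps a hand-rolled xor-fold for hash_32(), the kernel's multiplicative hash: multiply by a 32-bit golden-ratio constant and keep the top IN4_ADDR_HSIZE_SHIFT bits. A standalone sketch of the same computation — the constant is the one include/linux/hash.h used in this era, and net_hash_mix(net), which xors in per-namespace entropy before hashing, is omitted here:

#include <stdint.h>
#include <stdio.h>

#define GOLDEN_RATIO_PRIME_32	0x9e370001U	/* from include/linux/hash.h */
#define IN4_ADDR_HSIZE_SHIFT	8

static uint32_t hash_32(uint32_t val, unsigned int bits)
{
	/* multiply, then take the highest 'bits' bits of the product */
	return (val * GOLDEN_RATIO_PRIME_32) >> (32 - bits);
}

int main(void)
{
	uint32_t addr = 0xc0a80001;	/* 192.168.0.1, host byte order */

	printf("bucket %u of %u\n",
	       hash_32(addr, IN4_ADDR_HSIZE_SHIFT), 1U << IN4_ADDR_HSIZE_SHIFT);
	return 0;
}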
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 8e2b475da9fa..68c93d1bb03a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -218,7 +218,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 	scope = RT_SCOPE_UNIVERSE;
 	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
 		fl4.flowi4_oif = 0;
-		fl4.flowi4_iif = net->loopback_dev->ifindex;
+		fl4.flowi4_iif = LOOPBACK_IFINDEX;
 		fl4.daddr = ip_hdr(skb)->saddr;
 		fl4.saddr = 0;
 		fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
@@ -557,7 +557,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 	cfg->fc_flags = rtm->rtm_flags;
 	cfg->fc_nlflags = nlh->nlmsg_flags;
 
-	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
+	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
 	cfg->fc_nlinfo.nlh = nlh;
 	cfg->fc_nlinfo.nl_net = net;
 
@@ -955,7 +955,7 @@ static void nl_fib_input(struct sk_buff *skb)
 	struct fib_result_nl *frn;
 	struct nlmsghdr *nlh;
 	struct fib_table *tb;
-	u32 pid;
+	u32 portid;
 
 	net = sock_net(skb->sk);
 	nlh = nlmsg_hdr(skb);
@@ -973,10 +973,10 @@ static void nl_fib_input(struct sk_buff *skb)
 
 	nl_fib_lookup(frn, tb);
 
-	pid = NETLINK_CB(skb).pid;	/* pid of sending process */
-	NETLINK_CB(skb).pid = 0;	/* from kernel */
+	portid = NETLINK_CB(skb).portid;	/* pid of sending process */
+	NETLINK_CB(skb).portid = 0;		/* from kernel */
 	NETLINK_CB(skb).dst_group = 0;	/* unicast */
-	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
+	netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
 }
 
 static int __net_init nl_fib_lookup_init(struct net *net)
@@ -986,7 +986,7 @@ static int __net_init nl_fib_lookup_init(struct net *net)
 		.input	= nl_fib_input,
 	};
 
-	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg);
+	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
 	if (sk == NULL)
 		return -EAFNOSUPPORT;
 	net->ipv4.fibnl = sk;
@@ -1041,7 +1041,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct in_device *in_dev;
 	struct net *net = dev_net(dev);
 
 	if (event == NETDEV_UNREGISTER) {
@@ -1050,8 +1050,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		return NOTIFY_DONE;
 	}
 
-	if (!in_dev)
-		return NOTIFY_DONE;
+	in_dev = __in_dev_get_rtnl(dev);
 
 	switch (event) {
 	case NETDEV_UP:
@@ -1062,16 +1061,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		fib_sync_up(dev);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
-		rt_cache_flush(dev_net(dev));
+		rt_cache_flush(net);
 		break;
 	case NETDEV_DOWN:
 		fib_disable_ip(dev, 0);
 		break;
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGE:
-		rt_cache_flush(dev_net(dev));
-		break;
-	case NETDEV_UNREGISTER_BATCH:
+		rt_cache_flush(net);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index da80dc14cc76..3509065e409a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -391,7 +391,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 	if (skb == NULL)
 		goto errout;
 
-	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
+	err = fib_dump_info(skb, info->portid, seq, event, tb_id,
 			    fa->fa_type, key, dst_len,
 			    fa->fa_tos, fa->fa_info, nlm_flags);
 	if (err < 0) {
@@ -400,7 +400,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 		kfree_skb(skb);
 		goto errout;
 	}
-	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
+	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
 		    info->nlh, GFP_KERNEL);
 	return;
 errout:
@@ -989,14 +989,14 @@ failure:
 	return ERR_PTR(err);
 }
 
-int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
 		  struct fib_info *fi, unsigned int flags)
 {
 	struct nlmsghdr *nlh;
 	struct rtmsg *rtm;
 
-	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
 	if (nlh == NULL)
 		return -EMSGSIZE;
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d1b93595b4a7..31d771ca9a70 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1550,7 +1550,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		 * state.directly.
 		 */
 		if (pref_mismatch) {
-			int mp = KEYLENGTH - fls(pref_mismatch);
+			/* fls(x) = __fls(x) + 1 */
+			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
 
 			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
 				goto backtrace;
@@ -1655,7 +1656,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	if (!l)
 		return -ESRCH;
 
-	fa_head = get_fa_head(l, plen);
+	li = find_leaf_info(l, plen);
+
+	if (!li)
+		return -ESRCH;
+
+	fa_head = &li->falh;
 	fa = fib_find_alias(fa_head, tos, 0);
 
 	if (!fa)
@@ -1691,9 +1697,6 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
 
-	l = fib_find_node(t, key);
-	li = find_leaf_info(l, plen);
-
 	list_del_rcu(&fa->fa_list);
 
 	if (!plen)
@@ -1870,7 +1873,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
 			continue;
 		}
 
-		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
 				  cb->nlh->nlmsg_seq,
 				  RTM_NEWROUTE,
 				  tb->tb_id,
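The fib_trie.c lookup hunk above leans on the identity its new comment spells out: for x != 0, fls(x) (1-based index of the most significant set bit) equals __fls(x) (0-based index) plus one, so KEYLENGTH - fls(x) rewrites to KEYLENGTH - __fls(x) - 1. A quick check of that identity using compiler builtins as stand-ins (assumes GCC/Clang and 64-bit unsigned long):

#include <assert.h>

static int fls_like(unsigned long x)	/* 1-based, 0 for x == 0 */
{
	return x ? 64 - __builtin_clzl(x) : 0;
}

static int fls_under(unsigned long x)	/* 0-based __fls(); caller ensures x != 0 */
{
	return 63 - __builtin_clzl(x);
}

int main(void)
{
	for (unsigned long x = 1; x < (1UL << 20); x++)
		assert(fls_like(x) == fls_under(x) + 1);
	return 0;
}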
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6699f23e6f55..736ab70fd179 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -815,14 +815,15 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
 	return 1;
 }
 
-static void igmp_heard_report(struct in_device *in_dev, __be32 group)
+/* return true if packet was dropped */
+static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
 {
 	struct ip_mc_list *im;
 
 	/* Timers are only set for non-local groups */
 
 	if (group == IGMP_ALL_HOSTS)
-		return;
+		return false;
 
 	rcu_read_lock();
 	for_each_pmc_rcu(in_dev, im) {
@@ -832,9 +833,11 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group)
 		}
 	}
 	rcu_read_unlock();
+	return false;
 }
 
-static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+/* return true if packet was dropped */
+static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 	int len)
 {
 	struct igmphdr *ih = igmp_hdr(skb);
@@ -866,7 +869,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		/* clear deleted report items */
 		igmpv3_clear_delrec(in_dev);
 	} else if (len < 12) {
-		return;	/* ignore bogus packet; freed by caller */
+		return true;	/* ignore bogus packet; freed by caller */
 	} else if (IGMP_V1_SEEN(in_dev)) {
 		/* This is a v3 query with v1 queriers present */
 		max_delay = IGMP_Query_Response_Interval;
@@ -883,13 +886,13 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 			max_delay = 1;	/* can't mod w/ 0 */
 	} else { /* v3 */
 		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
-			return;
+			return true;
 
 		ih3 = igmpv3_query_hdr(skb);
 		if (ih3->nsrcs) {
 			if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
 					   + ntohs(ih3->nsrcs)*sizeof(__be32)))
-				return;
+				return true;
 			ih3 = igmpv3_query_hdr(skb);
 		}
 
@@ -901,9 +904,9 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		in_dev->mr_qrv = ih3->qrv;
 	if (!group) { /* general query */
 		if (ih3->nsrcs)
-			return;	/* no sources allowed */
+			return false;	/* no sources allowed */
 		igmp_gq_start_timer(in_dev);
-		return;
+		return false;
 	}
 	/* mark sources to include, if group & source-specific */
 	mark = ih3->nsrcs != 0;
@@ -939,6 +942,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 			igmp_mod_timer(im, max_delay);
 	}
 	rcu_read_unlock();
+	return false;
 }
 
 /* called in rcu_read_lock() section */
@@ -948,6 +952,7 @@ int igmp_rcv(struct sk_buff *skb)
 	struct igmphdr *ih;
 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 	int len = skb->len;
+	bool dropped = true;
 
 	if (in_dev == NULL)
 		goto drop;
@@ -969,7 +974,7 @@ int igmp_rcv(struct sk_buff *skb)
 	ih = igmp_hdr(skb);
 	switch (ih->type) {
 	case IGMP_HOST_MEMBERSHIP_QUERY:
-		igmp_heard_query(in_dev, skb, len);
+		dropped = igmp_heard_query(in_dev, skb, len);
 		break;
 	case IGMP_HOST_MEMBERSHIP_REPORT:
 	case IGMPV2_HOST_MEMBERSHIP_REPORT:
@@ -979,7 +984,7 @@ int igmp_rcv(struct sk_buff *skb)
 		/* don't rely on MC router hearing unicast reports */
 		if (skb->pkt_type == PACKET_MULTICAST ||
 		    skb->pkt_type == PACKET_BROADCAST)
-			igmp_heard_report(in_dev, ih->group);
+			dropped = igmp_heard_report(in_dev, ih->group);
 		break;
 	case IGMP_PIM:
 #ifdef CONFIG_IP_PIMSM_V1
@@ -997,7 +1002,10 @@ int igmp_rcv(struct sk_buff *skb)
 	}
 
 drop:
-	kfree_skb(skb);
+	if (dropped)
+		kfree_skb(skb);
+	else
+		consume_skb(skb);
 	return 0;
 }
 
@@ -1896,6 +1904,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 	rtnl_unlock();
 	return ret;
 }
+EXPORT_SYMBOL(ip_mc_leave_group);
 
 int ip_mc_source(int add, int omode, struct sock *sk, struct
 	ip_mreq_source *mreqs, int ifindex)
@@ -2435,6 +2444,8 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 	struct ip_mc_list *im = (struct ip_mc_list *)v;
 	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
 	char *querier;
+	long delta;
+
 #ifdef CONFIG_IP_MULTICAST
 	querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
 		  IGMP_V2_SEEN(state->in_dev) ? "V2" :
@@ -2448,11 +2459,12 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 			   state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
 	}
 
+	delta = im->timer.expires - jiffies;
 	seq_printf(seq,
 		   "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
 		   im->multiaddr, im->users,
-		   im->tm_running, im->tm_running ?
-		   jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
+		   im->tm_running,
+		   im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
 		   im->reporter);
 	}
 	return 0;
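The igmp.c changes above thread a dropped/consumed verdict from the query and report handlers back up to igmp_rcv(), so the final free can distinguish kfree_skb() (counted as a packet drop by tracepoints and drop monitors) from consume_skb() (a normal end of life). A toy user-space model of the pattern — the sk_buff and the two free functions here are stand-ins, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

struct sk_buff { bool bogus; };

static void kfree_skb(struct sk_buff *skb)   { (void)skb; printf("dropped\n"); }
static void consume_skb(struct sk_buff *skb) { (void)skb; printf("consumed\n"); }

/* return true if the packet should be accounted as dropped */
static bool handle(struct sk_buff *skb)
{
	return skb->bogus;
}

int main(void)
{
	struct sk_buff good = { false }, bad = { true };
	struct sk_buff *pkts[] = { &good, &bad };

	for (unsigned i = 0; i < 2; i++) {
		if (handle(pkts[i]))
			kfree_skb(pkts[i]);	/* error path, visible to drop monitors */
		else
			consume_skb(pkts[i]);	/* normal, non-error free */
	}
	return 0;
}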
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7f75f21d7b83..f0c5b9c1a957 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct sock *newsk;
+	struct request_sock *req;
 	int error;
 
 	lock_sock(sk);
@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		goto out_err;
 
 	/* Find already established connection */
-	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+	if (reqsk_queue_empty(queue)) {
 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 
 		/* If this is a non blocking socket don't sleep */
@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		if (error)
 			goto out_err;
 	}
-
-	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
-	WARN_ON(newsk->sk_state == TCP_SYN_RECV);
+	req = reqsk_queue_remove(queue);
+	newsk = req->sk;
+
+	sk_acceptq_removed(sk);
+	if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
+		spin_lock_bh(&queue->fastopenq->lock);
+		if (tcp_rsk(req)->listener) {
+			/* We are still waiting for the final ACK from 3WHS
+			 * so can't free req now. Instead, we set req->sk to
+			 * NULL to signify that the child socket is taken
+			 * so reqsk_fastopen_remove() will free the req
+			 * when 3WHS finishes (or is aborted).
+			 */
+			req->sk = NULL;
+			req = NULL;
+		}
+		spin_unlock_bh(&queue->fastopenq->lock);
+	}
 out:
 	release_sock(sk);
+	if (req)
+		__reqsk_free(req);
 	return newsk;
 out_err:
 	newsk = NULL;
+	req = NULL;
 	*err = error;
 	goto out;
 }
@@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 void inet_csk_listen_stop(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct request_sock *acc_req;
 	struct request_sock *req;
 
 	inet_csk_delete_keepalive_timer(sk);
 
 	/* make all the listen_opt local to us */
-	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+	acc_req = reqsk_queue_yank_acceptq(queue);
 
 	/* Following specs, it would be better either to send FIN
 	 * (and enter FIN-WAIT-1, it is normal close)
@@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk)
 	 * To be honest, we are not able to make either
 	 * of the variants now.			--ANK
 	 */
-	reqsk_queue_destroy(&icsk->icsk_accept_queue);
+	reqsk_queue_destroy(queue);
 
 	while ((req = acc_req) != NULL) {
 		struct sock *child = req->sk;
@@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		percpu_counter_inc(sk->sk_prot->orphan_count);
 
+		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
+			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+			BUG_ON(sk != tcp_rsk(req)->listener);
+
+			/* Paranoid, to prevent race condition if
+			 * an inbound pkt destined for child is
+			 * blocked by sock lock in tcp_v4_rcv().
+			 * Also to satisfy an assertion in
+			 * tcp_v4_destroy_sock().
+			 */
+			tcp_sk(child)->fastopen_rsk = NULL;
+			sock_put(sk);
+		}
 		inet_csk_destroy_sock(child);
 
 		bh_unlock_sock(child);
@@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk)
 		sk_acceptq_removed(sk);
 		__reqsk_free(req);
 	}
+	if (queue->fastopenq != NULL) {
+		/* Free all the reqs queued in rskq_rst_head. */
+		spin_lock_bh(&queue->fastopenq->lock);
+		acc_req = queue->fastopenq->rskq_rst_head;
+		queue->fastopenq->rskq_rst_head = NULL;
+		spin_unlock_bh(&queue->fastopenq->lock);
+		while ((req = acc_req) != NULL) {
+			acc_req = req->dl_next;
+			__reqsk_free(req);
+		}
+	}
 	WARN_ON(sk->sk_ack_backlog);
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
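The inet_csk_accept() hunk above deals with a Fast Open wrinkle: the child socket exists before the three-way handshake completes, so accept() can win the race. Setting req->sk = NULL hands the child to the caller while leaving the request alive for whichever path finishes (or aborts) the handshake to free. A toy model of that ownership handoff, with plain structs standing in for the kernel objects:

#include <stdio.h>
#include <stdlib.h>

struct child   { int id; };
struct request { struct child *sk; };

/* Returns the child; frees the request only if the handshake already ended. */
static struct child *toy_accept(struct request *req, int handshake_done)
{
	struct child *newsk = req->sk;

	if (!handshake_done) {
		req->sk = NULL;	/* mark child as taken; 3WHS path frees req later */
		return newsk;
	}
	free(req);
	return newsk;
}

int main(void)
{
	struct request *req = calloc(1, sizeof(*req));
	static struct child c = { 42 };

	req->sk = &c;
	printf("accepted child %d\n", toy_accept(req, 0)->id);
	free(req);	/* here: roughly what reqsk_fastopen_remove() does in-kernel */
	return 0;
}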
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 8bc005b1435f..535584c00f91 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -70,7 +70,7 @@ static inline void inet_diag_unlock_handler(
 int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		      struct sk_buff *skb, struct inet_diag_req_v2 *req,
 		      struct user_namespace *user_ns,
-		      u32 pid, u32 seq, u16 nlmsg_flags,
+		      u32 portid, u32 seq, u16 nlmsg_flags,
 		      const struct nlmsghdr *unlh)
 {
 	const struct inet_sock *inet = inet_sk(sk);
@@ -84,7 +84,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	handler = inet_diag_table[req->sdiag_protocol];
 	BUG_ON(handler == NULL);
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
 			nlmsg_flags);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -201,23 +201,23 @@ EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
 static int inet_csk_diag_fill(struct sock *sk,
 			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
 			      struct user_namespace *user_ns,
-			      u32 pid, u32 seq, u16 nlmsg_flags,
+			      u32 portid, u32 seq, u16 nlmsg_flags,
 			      const struct nlmsghdr *unlh)
 {
 	return inet_sk_diag_fill(sk, inet_csk(sk),
-			skb, req, user_ns, pid, seq, nlmsg_flags, unlh);
+			skb, req, user_ns, portid, seq, nlmsg_flags, unlh);
 }
 
 static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 			       struct sk_buff *skb, struct inet_diag_req_v2 *req,
-			       u32 pid, u32 seq, u16 nlmsg_flags,
+			       u32 portid, u32 seq, u16 nlmsg_flags,
 			       const struct nlmsghdr *unlh)
 {
 	long tmo;
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
 			nlmsg_flags);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -260,14 +260,14 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 			struct inet_diag_req_v2 *r,
 			struct user_namespace *user_ns,
-			u32 pid, u32 seq, u16 nlmsg_flags,
+			u32 portid, u32 seq, u16 nlmsg_flags,
 			const struct nlmsghdr *unlh)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
 		return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
-					   skb, r, pid, seq, nlmsg_flags,
+					   skb, r, portid, seq, nlmsg_flags,
 					   unlh);
-	return inet_csk_diag_fill(sk, skb, r, user_ns, pid, seq, nlmsg_flags, unlh);
+	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh);
 }
 
 int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
@@ -316,14 +316,14 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 
 	err = sk_diag_fill(sk, rep, req,
 			   sk_user_ns(NETLINK_CB(in_skb).ssk),
-			   NETLINK_CB(in_skb).pid,
+			   NETLINK_CB(in_skb).portid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
 		nlmsg_free(rep);
 		goto out;
 	}
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
@@ -557,7 +557,7 @@ static int inet_csk_diag_dump(struct sock *sk,
 
 	return inet_csk_diag_fill(sk, skb, r,
 				  sk_user_ns(NETLINK_CB(cb->skb).ssk),
-				  NETLINK_CB(cb->skb).pid,
+				  NETLINK_CB(cb->skb).portid,
 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
@@ -592,14 +592,14 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
 	}
 
 	return inet_twsk_diag_fill(tw, skb, r,
-				   NETLINK_CB(cb->skb).pid,
+				   NETLINK_CB(cb->skb).portid,
 				   cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
 static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 			      struct request_sock *req,
 			      struct user_namespace *user_ns,
-			      u32 pid, u32 seq,
+			      u32 portid, u32 seq,
 			      const struct nlmsghdr *unlh)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -608,7 +608,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	struct nlmsghdr *nlh;
 	long tmo;
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
 			NLM_F_MULTI);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -711,7 +711,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 
 	err = inet_diag_fill_req(skb, sk, req,
 				 sk_user_ns(NETLINK_CB(cb->skb).ssk),
-				 NETLINK_CB(cb->skb).pid,
+				 NETLINK_CB(cb->skb).portid,
 				 cb->nlh->nlmsg_seq, cb->nlh);
 	if (err < 0) {
 		cb->args[3] = j + 1;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 85190e69297b..4750d2b74d79 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -89,7 +89,7 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 	nf->low_thresh = 0;
 
 	local_bh_disable();
-	inet_frag_evictor(nf, f);
+	inet_frag_evictor(nf, f, true);
 	local_bh_enable();
 }
 EXPORT_SYMBOL(inet_frags_exit_net);
@@ -158,11 +158,16 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f)
+int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
 {
 	struct inet_frag_queue *q;
 	int work, evicted = 0;
 
+	if (!force) {
+		if (atomic_read(&nf->mem) <= nf->high_thresh)
+			return 0;
+	}
+
 	work = atomic_read(&nf->mem) - nf->low_thresh;
 	while (work > 0) {
 		read_lock(&f->lock);
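With the inet_fragment.c change above, the high-watermark check moves out of the callers and into inet_frag_evictor() itself, behind a force flag; namespace teardown passes force=true (after zeroing low_thresh) so everything is flushed. A loose user-space model of that control flow — thresholds and the 64-unit queue size are made up for illustration:

#include <stdbool.h>
#include <stdio.h>

static int mem = 300, high_thresh = 256, low_thresh = 192;

static int evictor(bool force)
{
	int evicted = 0;

	if (!force && mem <= high_thresh)
		return 0;	/* under the high watermark: nothing to do */
	while (mem > (force ? 0 : low_thresh)) {
		mem -= 64;	/* pretend each evicted queue frees 64 units */
		evicted++;
	}
	return evicted;
}

int main(void)
{
	printf("evicted %d, mem %d\n", evictor(false), mem);	/* down to low_thresh */
	printf("evicted %d, mem %d\n", evictor(true), mem);	/* teardown: drain all */
	return 0;
}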
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c973409c..448e68546827 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -219,7 +219,7 @@ static void ip_evictor(struct net *net)
 {
 	int evicted;
 
-	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags);
+	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
 	if (evicted)
 		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
 }
@@ -523,6 +523,10 @@ found:
 	if (offset == 0)
 		qp->q.last_in |= INET_FRAG_FIRST_IN;
 
+	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
+	    skb->len + ihl > qp->q.max_size)
+		qp->q.max_size = skb->len + ihl;
+
 	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    qp->q.meat == qp->q.len)
 		return ip_frag_reasm(qp, prev, dev);
@@ -646,9 +650,11 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	head->next = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
+	IPCB(head)->frag_max_size = qp->q.max_size;
 
 	iph = ip_hdr(head);
-	iph->frag_off = 0;
+	/* max_size != 0 implies at least one fragment had IP_DF set */
+	iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
 	iph->tot_len = htons(len);
 	iph->tos |= ecn;
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
@@ -678,8 +684,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
 
 	/* Start by cleaning up the memory. */
-	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
-		ip_evictor(net);
+	ip_evictor(net);
 
 	/* Lookup (or create) queue header */
 	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
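The ip_fragment.c hunks above make reassembly remember the largest received fragment (header included) whenever a fragment carried IP_DF, expose it as IPCB(head)->frag_max_size, and re-mark the reassembled datagram with IP_DF so later forwarding can avoid emitting fragments bigger than what actually arrived on the wire. A toy rendition of the bookkeeping, with made-up fragment sizes:

#include <stdio.h>

struct frag { int len, ihl, df; };	/* payload length, header length, DF bit */

int main(void)
{
	struct frag frags[] = { {1480, 20, 1}, {1480, 20, 1}, {520, 20, 1} };
	int max_size = 0;

	for (unsigned i = 0; i < sizeof(frags) / sizeof(frags[0]); i++) {
		int sz = frags[i].len + frags[i].ihl;

		if (frags[i].df && sz > max_size)
			max_size = sz;
	}
	printf("frag_max_size = %d, reassembled DF = %d\n", max_size, max_size != 0);
	return 0;
}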
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index b062a98574f2..7240f8e2dd45 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -120,6 +120,10 @@
120 Alexey Kuznetsov. 120 Alexey Kuznetsov.
121 */ 121 */
122 122
123static bool log_ecn_error = true;
124module_param(log_ecn_error, bool, 0644);
125MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126
123static struct rtnl_link_ops ipgre_link_ops __read_mostly; 127static struct rtnl_link_ops ipgre_link_ops __read_mostly;
124static int ipgre_tunnel_init(struct net_device *dev); 128static int ipgre_tunnel_init(struct net_device *dev);
125static void ipgre_tunnel_setup(struct net_device *dev); 129static void ipgre_tunnel_setup(struct net_device *dev);
@@ -204,7 +208,9 @@ static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
204 tot->rx_crc_errors = dev->stats.rx_crc_errors; 208 tot->rx_crc_errors = dev->stats.rx_crc_errors;
205 tot->rx_fifo_errors = dev->stats.rx_fifo_errors; 209 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
206 tot->rx_length_errors = dev->stats.rx_length_errors; 210 tot->rx_length_errors = dev->stats.rx_length_errors;
211 tot->rx_frame_errors = dev->stats.rx_frame_errors;
207 tot->rx_errors = dev->stats.rx_errors; 212 tot->rx_errors = dev->stats.rx_errors;
213
208 tot->tx_fifo_errors = dev->stats.tx_fifo_errors; 214 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
209 tot->tx_carrier_errors = dev->stats.tx_carrier_errors; 215 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
210 tot->tx_dropped = dev->stats.tx_dropped; 216 tot->tx_dropped = dev->stats.tx_dropped;
@@ -214,11 +220,25 @@ static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
214 return tot; 220 return tot;
215} 221}
216 222
223/* Does key in tunnel parameters match packet */
224static bool ipgre_key_match(const struct ip_tunnel_parm *p,
225 __be16 flags, __be32 key)
226{
227 if (p->i_flags & GRE_KEY) {
228 if (flags & GRE_KEY)
229 return key == p->i_key;
230 else
231 return false; /* key expected, none present */
232 } else
233 return !(flags & GRE_KEY);
234}
235
217/* Given src, dst and key, find appropriate for input tunnel. */ 236/* Given src, dst and key, find appropriate for input tunnel. */
218 237
219static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, 238static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
220 __be32 remote, __be32 local, 239 __be32 remote, __be32 local,
221 __be32 key, __be16 gre_proto) 240 __be16 flags, __be32 key,
241 __be16 gre_proto)
222{ 242{
223 struct net *net = dev_net(dev); 243 struct net *net = dev_net(dev);
224 int link = dev->ifindex; 244 int link = dev->ifindex;
@@ -233,10 +253,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
233 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) { 253 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
234 if (local != t->parms.iph.saddr || 254 if (local != t->parms.iph.saddr ||
235 remote != t->parms.iph.daddr || 255 remote != t->parms.iph.daddr ||
236 key != t->parms.i_key ||
237 !(t->dev->flags & IFF_UP)) 256 !(t->dev->flags & IFF_UP))
238 continue; 257 continue;
239 258
259 if (!ipgre_key_match(&t->parms, flags, key))
260 continue;
261
240 if (t->dev->type != ARPHRD_IPGRE && 262 if (t->dev->type != ARPHRD_IPGRE &&
241 t->dev->type != dev_type) 263 t->dev->type != dev_type)
242 continue; 264 continue;
@@ -257,10 +279,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
257 279
258 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) { 280 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
259 if (remote != t->parms.iph.daddr || 281 if (remote != t->parms.iph.daddr ||
260 key != t->parms.i_key ||
261 !(t->dev->flags & IFF_UP)) 282 !(t->dev->flags & IFF_UP))
262 continue; 283 continue;
263 284
285 if (!ipgre_key_match(&t->parms, flags, key))
286 continue;
287
264 if (t->dev->type != ARPHRD_IPGRE && 288 if (t->dev->type != ARPHRD_IPGRE &&
265 t->dev->type != dev_type) 289 t->dev->type != dev_type)
266 continue; 290 continue;
@@ -283,10 +307,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
283 if ((local != t->parms.iph.saddr && 307 if ((local != t->parms.iph.saddr &&
284 (local != t->parms.iph.daddr || 308 (local != t->parms.iph.daddr ||
285 !ipv4_is_multicast(local))) || 309 !ipv4_is_multicast(local))) ||
286 key != t->parms.i_key ||
287 !(t->dev->flags & IFF_UP)) 310 !(t->dev->flags & IFF_UP))
288 continue; 311 continue;
289 312
313 if (!ipgre_key_match(&t->parms, flags, key))
314 continue;
315
290 if (t->dev->type != ARPHRD_IPGRE && 316 if (t->dev->type != ARPHRD_IPGRE &&
291 t->dev->type != dev_type) 317 t->dev->type != dev_type)
292 continue; 318 continue;
@@ -489,6 +515,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
489 const int code = icmp_hdr(skb)->code; 515 const int code = icmp_hdr(skb)->code;
490 struct ip_tunnel *t; 516 struct ip_tunnel *t;
491 __be16 flags; 517 __be16 flags;
518 __be32 key = 0;
492 519
493 flags = p[0]; 520 flags = p[0];
494 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { 521 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
@@ -505,6 +532,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
505 if (skb_headlen(skb) < grehlen) 532 if (skb_headlen(skb) < grehlen)
506 return; 533 return;
507 534
535 if (flags & GRE_KEY)
536 key = *(((__be32 *)p) + (grehlen / 4) - 1);
537
508 switch (type) { 538 switch (type) {
509 default: 539 default:
510 case ICMP_PARAMETERPROB: 540 case ICMP_PARAMETERPROB:
@@ -533,49 +563,34 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
533 break; 563 break;
534 } 564 }
535 565
536 rcu_read_lock();
537 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, 566 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
538 flags & GRE_KEY ? 567 flags, key, p[1]);
539 *(((__be32 *)p) + (grehlen / 4) - 1) : 0, 568
540 p[1]);
541 if (t == NULL) 569 if (t == NULL)
542 goto out; 570 return;
543 571
544 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { 572 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
545 ipv4_update_pmtu(skb, dev_net(skb->dev), info, 573 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
546 t->parms.link, 0, IPPROTO_GRE, 0); 574 t->parms.link, 0, IPPROTO_GRE, 0);
547 goto out; 575 return;
548 } 576 }
549 if (type == ICMP_REDIRECT) { 577 if (type == ICMP_REDIRECT) {
550 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, 578 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
551 IPPROTO_GRE, 0); 579 IPPROTO_GRE, 0);
552 goto out; 580 return;
553 } 581 }
554 if (t->parms.iph.daddr == 0 || 582 if (t->parms.iph.daddr == 0 ||
555 ipv4_is_multicast(t->parms.iph.daddr)) 583 ipv4_is_multicast(t->parms.iph.daddr))
556 goto out; 584 return;
557 585
558 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) 586 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
559 goto out; 587 return;
560 588
561 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) 589 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
562 t->err_count++; 590 t->err_count++;
563 else 591 else
564 t->err_count = 1; 592 t->err_count = 1;
565 t->err_time = jiffies; 593 t->err_time = jiffies;
566out:
567 rcu_read_unlock();
568}
569
570static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
571{
572 if (INET_ECN_is_ce(iph->tos)) {
573 if (skb->protocol == htons(ETH_P_IP)) {
574 IP_ECN_set_ce(ip_hdr(skb));
575 } else if (skb->protocol == htons(ETH_P_IPV6)) {
576 IP6_ECN_set_ce(ipv6_hdr(skb));
577 }
578 }
579} 594}
580 595
581static inline u8 596static inline u8
@@ -600,9 +615,10 @@ static int ipgre_rcv(struct sk_buff *skb)
600 struct ip_tunnel *tunnel; 615 struct ip_tunnel *tunnel;
601 int offset = 4; 616 int offset = 4;
602 __be16 gre_proto; 617 __be16 gre_proto;
618 int err;
603 619
604 if (!pskb_may_pull(skb, 16)) 620 if (!pskb_may_pull(skb, 16))
605 goto drop_nolock; 621 goto drop;
606 622
607 iph = ip_hdr(skb); 623 iph = ip_hdr(skb);
608 h = skb->data; 624 h = skb->data;
@@ -613,7 +629,7 @@ static int ipgre_rcv(struct sk_buff *skb)
613 - We do not support routing headers. 629 - We do not support routing headers.
614 */ 630 */
615 if (flags&(GRE_VERSION|GRE_ROUTING)) 631 if (flags&(GRE_VERSION|GRE_ROUTING))
616 goto drop_nolock; 632 goto drop;
617 633
618 if (flags&GRE_CSUM) { 634 if (flags&GRE_CSUM) {
619 switch (skb->ip_summed) { 635 switch (skb->ip_summed) {
@@ -641,10 +657,10 @@ static int ipgre_rcv(struct sk_buff *skb)
641 657
642 gre_proto = *(__be16 *)(h + 2); 658 gre_proto = *(__be16 *)(h + 2);
643 659
644 rcu_read_lock(); 660 tunnel = ipgre_tunnel_lookup(skb->dev,
645 if ((tunnel = ipgre_tunnel_lookup(skb->dev, 661 iph->saddr, iph->daddr, flags, key,
646 iph->saddr, iph->daddr, key, 662 gre_proto);
647 gre_proto))) { 663 if (tunnel) {
648 struct pcpu_tstats *tstats; 664 struct pcpu_tstats *tstats;
649 665
650 secpath_reset(skb); 666 secpath_reset(skb);
@@ -703,27 +719,33 @@ static int ipgre_rcv(struct sk_buff *skb)
703 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 719 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
704 } 720 }
705 721
722 __skb_tunnel_rx(skb, tunnel->dev);
723
724 skb_reset_network_header(skb);
725 err = IP_ECN_decapsulate(iph, skb);
726 if (unlikely(err)) {
727 if (log_ecn_error)
728 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
729 &iph->saddr, iph->tos);
730 if (err > 1) {
731 ++tunnel->dev->stats.rx_frame_errors;
732 ++tunnel->dev->stats.rx_errors;
733 goto drop;
734 }
735 }
736
706 tstats = this_cpu_ptr(tunnel->dev->tstats); 737 tstats = this_cpu_ptr(tunnel->dev->tstats);
707 u64_stats_update_begin(&tstats->syncp); 738 u64_stats_update_begin(&tstats->syncp);
708 tstats->rx_packets++; 739 tstats->rx_packets++;
709 tstats->rx_bytes += skb->len; 740 tstats->rx_bytes += skb->len;
710 u64_stats_update_end(&tstats->syncp); 741 u64_stats_update_end(&tstats->syncp);
711 742
712 __skb_tunnel_rx(skb, tunnel->dev); 743 gro_cells_receive(&tunnel->gro_cells, skb);
713
714 skb_reset_network_header(skb);
715 ipgre_ecn_decapsulate(iph, skb);
716
717 netif_rx(skb);
718
719 rcu_read_unlock();
720 return 0; 744 return 0;
721 } 745 }
722 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 746 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
723 747
724drop: 748drop:
725 rcu_read_unlock();
726drop_nolock:
727 kfree_skb(skb); 749 kfree_skb(skb);
728 return 0; 750 return 0;
729} 751}
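
The hand-rolled ipgre_ecn_decapsulate() gives way to the shared IP_ECN_decapsulate() helper, whose return value encodes how the outer and inner ECN fields disagree; the err > 1 test above relies on that contract. A sketch of the assumed semantics:

	err = IP_ECN_decapsulate(iph, skb);
	/* assumed contract:
	 *   0 - headers consistent; CE propagated to the inner packet if set
	 *   1 - outer ECT but inner not-ECT: log-worthy, packet still usable
	 *   2 - outer CE but inner not-ECT: the CE mark would be lost, drop
	 */
	if (err > 1)
		goto drop;
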
@@ -745,6 +767,10 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
745 __be32 dst; 767 __be32 dst;
746 int mtu; 768 int mtu;
747 769
770 if (skb->ip_summed == CHECKSUM_PARTIAL &&
771 skb_checksum_help(skb))
772 goto tx_error;
773
748 if (dev->type == ARPHRD_ETHER) 774 if (dev->type == ARPHRD_ETHER)
749 IPCB(skb)->flags = 0; 775 IPCB(skb)->flags = 0;
750 776
@@ -1292,10 +1318,18 @@ static const struct net_device_ops ipgre_netdev_ops = {
1292 1318
1293static void ipgre_dev_free(struct net_device *dev) 1319static void ipgre_dev_free(struct net_device *dev)
1294{ 1320{
1321 struct ip_tunnel *tunnel = netdev_priv(dev);
1322
1323 gro_cells_destroy(&tunnel->gro_cells);
1295 free_percpu(dev->tstats); 1324 free_percpu(dev->tstats);
1296 free_netdev(dev); 1325 free_netdev(dev);
1297} 1326}
1298 1327
1328#define GRE_FEATURES (NETIF_F_SG | \
1329 NETIF_F_FRAGLIST | \
1330 NETIF_F_HIGHDMA | \
1331 NETIF_F_HW_CSUM)
1332
1299static void ipgre_tunnel_setup(struct net_device *dev) 1333static void ipgre_tunnel_setup(struct net_device *dev)
1300{ 1334{
1301 dev->netdev_ops = &ipgre_netdev_ops; 1335 dev->netdev_ops = &ipgre_netdev_ops;
@@ -1309,12 +1343,16 @@ static void ipgre_tunnel_setup(struct net_device *dev)
1309 dev->addr_len = 4; 1343 dev->addr_len = 4;
1310 dev->features |= NETIF_F_NETNS_LOCAL; 1344 dev->features |= NETIF_F_NETNS_LOCAL;
1311 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 1345 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1346
1347 dev->features |= GRE_FEATURES;
1348 dev->hw_features |= GRE_FEATURES;
1312} 1349}
1313 1350
1314static int ipgre_tunnel_init(struct net_device *dev) 1351static int ipgre_tunnel_init(struct net_device *dev)
1315{ 1352{
1316 struct ip_tunnel *tunnel; 1353 struct ip_tunnel *tunnel;
1317 struct iphdr *iph; 1354 struct iphdr *iph;
1355 int err;
1318 1356
1319 tunnel = netdev_priv(dev); 1357 tunnel = netdev_priv(dev);
1320 iph = &tunnel->parms.iph; 1358 iph = &tunnel->parms.iph;
@@ -1341,6 +1379,12 @@ static int ipgre_tunnel_init(struct net_device *dev)
1341 if (!dev->tstats) 1379 if (!dev->tstats)
1342 return -ENOMEM; 1380 return -ENOMEM;
1343 1381
1382 err = gro_cells_init(&tunnel->gro_cells, dev);
1383 if (err) {
1384 free_percpu(dev->tstats);
1385 return err;
1386 }
1387
1344 return 0; 1388 return 0;
1345} 1389}
1346 1390
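
Receive completion also changes: instead of handing each decapsulated skb to netif_rx(), the tunnel feeds a gro_cells instance, which queues packets to a per-CPU NAPI context so GRE traffic benefits from GRO batching. The lifecycle, assuming the gro_cells API from include/net/gro_cells.h:

	#include <net/gro_cells.h>

	/* ndo_init: allocate the per-CPU cells */
	err = gro_cells_init(&tunnel->gro_cells, dev);

	/* rx path: replaces netif_rx() for each decapsulated skb */
	gro_cells_receive(&tunnel->gro_cells, skb);

	/* destructor: drain and free, before free_netdev() */
	gro_cells_destroy(&tunnel->gro_cells);
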
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c196d749daf2..24a29a39e9a8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -467,7 +467,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
467 467
468 iph = ip_hdr(skb); 468 iph = ip_hdr(skb);
469 469
470 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { 470 if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
471 (IPCB(skb)->frag_max_size &&
472 IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) {
471 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 473 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
472 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 474 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
473 htonl(ip_skb_dst_mtu(skb))); 475 htonl(ip_skb_dst_mtu(skb)));
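
The second clause makes re-fragmentation honour the largest fragment seen at reassembly time, so a flow defragmented by conntrack is never re-emitted in bigger pieces than the sender used, even without DF. This assumes frag_max_size is recorded by the IPv4 defrag path in the skb control block, roughly:

	/* assumed layout; frag_max_size is filled in by ip_defrag() */
	struct inet_skb_parm {
		struct ip_options	opt;		/* compiled options */
		unsigned char		flags;
		u16			frag_max_size;	/* largest fragment seen */
	};
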
@@ -791,6 +793,7 @@ static int __ip_append_data(struct sock *sk,
791 struct flowi4 *fl4, 793 struct flowi4 *fl4,
792 struct sk_buff_head *queue, 794 struct sk_buff_head *queue,
793 struct inet_cork *cork, 795 struct inet_cork *cork,
796 struct page_frag *pfrag,
794 int getfrag(void *from, char *to, int offset, 797 int getfrag(void *from, char *to, int offset,
795 int len, int odd, struct sk_buff *skb), 798 int len, int odd, struct sk_buff *skb),
796 void *from, int length, int transhdrlen, 799 void *from, int length, int transhdrlen,
@@ -985,47 +988,30 @@ alloc_new_skb:
985 } 988 }
986 } else { 989 } else {
987 int i = skb_shinfo(skb)->nr_frags; 990 int i = skb_shinfo(skb)->nr_frags;
988 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
989 struct page *page = cork->page;
990 int off = cork->off;
991 unsigned int left;
992
993 if (page && (left = PAGE_SIZE - off) > 0) {
994 if (copy >= left)
995 copy = left;
996 if (page != skb_frag_page(frag)) {
997 if (i == MAX_SKB_FRAGS) {
998 err = -EMSGSIZE;
999 goto error;
1000 }
1001 skb_fill_page_desc(skb, i, page, off, 0);
1002 skb_frag_ref(skb, i);
1003 frag = &skb_shinfo(skb)->frags[i];
1004 }
1005 } else if (i < MAX_SKB_FRAGS) {
1006 if (copy > PAGE_SIZE)
1007 copy = PAGE_SIZE;
1008 page = alloc_pages(sk->sk_allocation, 0);
1009 if (page == NULL) {
1010 err = -ENOMEM;
1011 goto error;
1012 }
1013 cork->page = page;
1014 cork->off = 0;
1015 991
1016 skb_fill_page_desc(skb, i, page, 0, 0); 992 err = -ENOMEM;
1017 frag = &skb_shinfo(skb)->frags[i]; 993 if (!sk_page_frag_refill(sk, pfrag))
1018 } else {
1019 err = -EMSGSIZE;
1020 goto error;
1021 }
1022 if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1023 offset, copy, skb->len, skb) < 0) {
1024 err = -EFAULT;
1025 goto error; 994 goto error;
995
996 if (!skb_can_coalesce(skb, i, pfrag->page,
997 pfrag->offset)) {
998 err = -EMSGSIZE;
999 if (i == MAX_SKB_FRAGS)
1000 goto error;
1001
1002 __skb_fill_page_desc(skb, i, pfrag->page,
1003 pfrag->offset, 0);
1004 skb_shinfo(skb)->nr_frags = ++i;
1005 get_page(pfrag->page);
1026 } 1006 }
1027 cork->off += copy; 1007 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1028 skb_frag_size_add(frag, copy); 1008 if (getfrag(from,
1009 page_address(pfrag->page) + pfrag->offset,
1010 offset, copy, skb->len, skb) < 0)
1011 goto error_efault;
1012
1013 pfrag->offset += copy;
1014 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1029 skb->len += copy; 1015 skb->len += copy;
1030 skb->data_len += copy; 1016 skb->data_len += copy;
1031 skb->truesize += copy; 1017 skb->truesize += copy;
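
The per-cork page bookkeeping (cork->page/cork->off) is replaced by a reusable struct page_frag: sk_page_frag() is assumed to pick current->task_frag for sockets that may sleep and sk->sk_frag otherwise, and sk_page_frag_refill() tops the fragment up on demand. The resulting append pattern, sketched from the hunk above:

	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))	/* guarantee free room */
		return -ENOMEM;

	if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) {
		/* cannot extend the last fragment: open a new, empty one */
		__skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, 0);
		skb_shinfo(skb)->nr_frags = ++i;
		get_page(pfrag->page);
	}
	/* either way the data lands at pfrag->offset, growing frag i-1 */
	pfrag->offset += copy;
	skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
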
@@ -1037,6 +1023,8 @@ alloc_new_skb:
1037 1023
1038 return 0; 1024 return 0;
1039 1025
1026error_efault:
1027 err = -EFAULT;
1040error: 1028error:
1041 cork->length -= length; 1029 cork->length -= length;
1042 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1030 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
@@ -1077,8 +1065,6 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1077 cork->dst = &rt->dst; 1065 cork->dst = &rt->dst;
1078 cork->length = 0; 1066 cork->length = 0;
1079 cork->tx_flags = ipc->tx_flags; 1067 cork->tx_flags = ipc->tx_flags;
1080 cork->page = NULL;
1081 cork->off = 0;
1082 1068
1083 return 0; 1069 return 0;
1084} 1070}
@@ -1115,7 +1101,8 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1115 transhdrlen = 0; 1101 transhdrlen = 0;
1116 } 1102 }
1117 1103
1118 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag, 1104 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
1105 sk_page_frag(sk), getfrag,
1119 from, length, transhdrlen, flags); 1106 from, length, transhdrlen, flags);
1120} 1107}
1121 1108
@@ -1437,7 +1424,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
1437 if (err) 1424 if (err)
1438 return ERR_PTR(err); 1425 return ERR_PTR(err);
1439 1426
1440 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag, 1427 err = __ip_append_data(sk, fl4, &queue, &cork,
1428 &current->task_frag, getfrag,
1441 from, length, transhdrlen, flags); 1429 from, length, transhdrlen, flags);
1442 if (err) { 1430 if (err) {
1443 __ip_flush_pending_frames(sk, &queue, &cork); 1431 __ip_flush_pending_frames(sk, &queue, &cork);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3511ffba7bd4..978bca4818ae 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -304,7 +304,6 @@ static int vti_err(struct sk_buff *skb, u32 info)
304 304
305 err = -ENOENT; 305 err = -ENOENT;
306 306
307 rcu_read_lock();
308 t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); 307 t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
309 if (t == NULL) 308 if (t == NULL)
310 goto out; 309 goto out;
@@ -326,7 +325,6 @@ static int vti_err(struct sk_buff *skb, u32 info)
326 t->err_count = 1; 325 t->err_count = 1;
327 t->err_time = jiffies; 326 t->err_time = jiffies;
328out: 327out:
329 rcu_read_unlock();
330 return err; 328 return err;
331} 329}
332 330
@@ -336,7 +334,6 @@ static int vti_rcv(struct sk_buff *skb)
336 struct ip_tunnel *tunnel; 334 struct ip_tunnel *tunnel;
337 const struct iphdr *iph = ip_hdr(skb); 335 const struct iphdr *iph = ip_hdr(skb);
338 336
339 rcu_read_lock();
340 tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); 337 tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
341 if (tunnel != NULL) { 338 if (tunnel != NULL) {
342 struct pcpu_tstats *tstats; 339 struct pcpu_tstats *tstats;
@@ -348,10 +345,8 @@ static int vti_rcv(struct sk_buff *skb)
348 u64_stats_update_end(&tstats->syncp); 345 u64_stats_update_end(&tstats->syncp);
349 346
350 skb->dev = tunnel->dev; 347 skb->dev = tunnel->dev;
351 rcu_read_unlock();
352 return 1; 348 return 1;
353 } 349 }
354 rcu_read_unlock();
355 350
356 return -1; 351 return -1;
357} 352}
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 67e8a6b086ea..798358b10717 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -583,6 +583,17 @@ static void __init ic_rarp_send_if(struct ic_device *d)
583#endif 583#endif
584 584
585/* 585/*
586 * Predefine Nameservers
587 */
588static inline void __init ic_nameservers_predef(void)
589{
590 int i;
591
592 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
593 ic_nameservers[i] = NONE;
594}
595
596/*
586 * DHCP/BOOTP support. 597 * DHCP/BOOTP support.
587 */ 598 */
588 599
@@ -747,10 +758,7 @@ static void __init ic_bootp_init_ext(u8 *e)
747 */ 758 */
748static inline void __init ic_bootp_init(void) 759static inline void __init ic_bootp_init(void)
749{ 760{
750 int i; 761 ic_nameservers_predef();
751
752 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
753 ic_nameservers[i] = NONE;
754 762
755 dev_add_pack(&bootp_packet_type); 763 dev_add_pack(&bootp_packet_type);
756} 764}
@@ -1379,6 +1387,7 @@ static int __init ip_auto_config(void)
1379 int retries = CONF_OPEN_RETRIES; 1387 int retries = CONF_OPEN_RETRIES;
1380#endif 1388#endif
1381 int err; 1389 int err;
1390 unsigned int i;
1382 1391
1383#ifdef CONFIG_PROC_FS 1392#ifdef CONFIG_PROC_FS
1384 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); 1393 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
@@ -1499,7 +1508,15 @@ static int __init ip_auto_config(void)
1499 &ic_servaddr, &root_server_addr, root_server_path); 1508 &ic_servaddr, &root_server_addr, root_server_path);
1500 if (ic_dev_mtu) 1509 if (ic_dev_mtu)
1501 pr_cont(", mtu=%d", ic_dev_mtu); 1510 pr_cont(", mtu=%d", ic_dev_mtu);
1502 pr_cont("\n"); 1511 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
1512 if (ic_nameservers[i] != NONE) {
1513 pr_info(" nameserver%u=%pI4",
1514 i, &ic_nameservers[i]);
1515 break;
1516 }
1517 for (i++; i < CONF_NAMESERVERS_MAX; i++)
1518 if (ic_nameservers[i] != NONE)
1519 pr_cont(", nameserver%u=%pI4\n", i, &ic_nameservers[i]);
1503#endif /* !SILENT */ 1520#endif /* !SILENT */
1504 1521
1505 return 0; 1522 return 0;
@@ -1570,6 +1587,8 @@ static int __init ip_auto_config_setup(char *addrs)
1570 return 1; 1587 return 1;
1571 } 1588 }
1572 1589
1590 ic_nameservers_predef();
1591
1573 /* Parse string for static IP assignment. */ 1592 /* Parse string for static IP assignment. */
1574 ip = addrs; 1593 ip = addrs;
1575 while (ip && *ip) { 1594 while (ip && *ip) {
@@ -1613,6 +1632,20 @@ static int __init ip_auto_config_setup(char *addrs)
1613 ic_enable = 0; 1632 ic_enable = 0;
1614 } 1633 }
1615 break; 1634 break;
1635 case 7:
1636 if (CONF_NAMESERVERS_MAX >= 1) {
1637 ic_nameservers[0] = in_aton(ip);
1638 if (ic_nameservers[0] == ANY)
1639 ic_nameservers[0] = NONE;
1640 }
1641 break;
1642 case 8:
1643 if (CONF_NAMESERVERS_MAX >= 2) {
1644 ic_nameservers[1] = in_aton(ip);
1645 if (ic_nameservers[1] == ANY)
1646 ic_nameservers[1] = NONE;
1647 }
1648 break;
1616 } 1649 }
1617 } 1650 }
1618 ip = cp; 1651 ip = cp;
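
Cases 7 and 8 extend the ip= boot parameter with two nameserver fields after the autoconf field; "any" (matching ANY) or an empty field leaves the slot at NONE. An illustrative boot line with hypothetical addresses:

	ip=10.0.0.2:10.0.0.1:10.0.0.1:255.255.255.0:client:eth0:off:10.0.0.53:10.0.0.54

i.e. <client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:<dns0-ip>:<dns1-ip>.
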
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 99af1f0cc658..e15b45297c09 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -120,6 +120,10 @@
120#define HASH_SIZE 16 120#define HASH_SIZE 16
121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) 121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
122 122
123static bool log_ecn_error = true;
124module_param(log_ecn_error, bool, 0644);
125MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126
123static int ipip_net_id __read_mostly; 127static int ipip_net_id __read_mostly;
124struct ipip_net { 128struct ipip_net {
125 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; 129 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
@@ -365,8 +369,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
365 } 369 }
366 370
367 err = -ENOENT; 371 err = -ENOENT;
368
369 rcu_read_lock();
370 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); 372 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
371 if (t == NULL) 373 if (t == NULL)
372 goto out; 374 goto out;
@@ -398,34 +400,22 @@ static int ipip_err(struct sk_buff *skb, u32 info)
398 t->err_count = 1; 400 t->err_count = 1;
399 t->err_time = jiffies; 401 t->err_time = jiffies;
400out: 402out:
401 rcu_read_unlock();
402 return err;
403}
404
405static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
406 struct sk_buff *skb)
407{
408 struct iphdr *inner_iph = ip_hdr(skb);
409 403
410 if (INET_ECN_is_ce(outer_iph->tos)) 404 return err;
411 IP_ECN_set_ce(inner_iph);
412} 405}
413 406
414static int ipip_rcv(struct sk_buff *skb) 407static int ipip_rcv(struct sk_buff *skb)
415{ 408{
416 struct ip_tunnel *tunnel; 409 struct ip_tunnel *tunnel;
417 const struct iphdr *iph = ip_hdr(skb); 410 const struct iphdr *iph = ip_hdr(skb);
411 int err;
418 412
419 rcu_read_lock();
420 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); 413 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
421 if (tunnel != NULL) { 414 if (tunnel != NULL) {
422 struct pcpu_tstats *tstats; 415 struct pcpu_tstats *tstats;
423 416
424 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 417 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
425 rcu_read_unlock(); 418 goto drop;
426 kfree_skb(skb);
427 return 0;
428 }
429 419
430 secpath_reset(skb); 420 secpath_reset(skb);
431 421
@@ -434,24 +424,35 @@ static int ipip_rcv(struct sk_buff *skb)
434 skb->protocol = htons(ETH_P_IP); 424 skb->protocol = htons(ETH_P_IP);
435 skb->pkt_type = PACKET_HOST; 425 skb->pkt_type = PACKET_HOST;
436 426
427 __skb_tunnel_rx(skb, tunnel->dev);
428
429 err = IP_ECN_decapsulate(iph, skb);
430 if (unlikely(err)) {
431 if (log_ecn_error)
432 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
433 &iph->saddr, iph->tos);
434 if (err > 1) {
435 ++tunnel->dev->stats.rx_frame_errors;
436 ++tunnel->dev->stats.rx_errors;
437 goto drop;
438 }
439 }
440
437 tstats = this_cpu_ptr(tunnel->dev->tstats); 441 tstats = this_cpu_ptr(tunnel->dev->tstats);
438 u64_stats_update_begin(&tstats->syncp); 442 u64_stats_update_begin(&tstats->syncp);
439 tstats->rx_packets++; 443 tstats->rx_packets++;
440 tstats->rx_bytes += skb->len; 444 tstats->rx_bytes += skb->len;
441 u64_stats_update_end(&tstats->syncp); 445 u64_stats_update_end(&tstats->syncp);
442 446
443 __skb_tunnel_rx(skb, tunnel->dev);
444
445 ipip_ecn_decapsulate(iph, skb);
446
447 netif_rx(skb); 447 netif_rx(skb);
448
449 rcu_read_unlock();
450 return 0; 448 return 0;
451 } 449 }
452 rcu_read_unlock();
453 450
454 return -1; 451 return -1;
452
453drop:
454 kfree_skb(skb);
455 return 0;
455} 456}
456 457
457/* 458/*
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ebdf06f938bf..1daa95c2a0ba 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -626,7 +626,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
626 e->error = -ETIMEDOUT; 626 e->error = -ETIMEDOUT;
627 memset(&e->msg, 0, sizeof(e->msg)); 627 memset(&e->msg, 0, sizeof(e->msg));
628 628
629 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 629 rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
630 } else { 630 } else {
631 kfree_skb(skb); 631 kfree_skb(skb);
632 } 632 }
@@ -870,7 +870,7 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
870 memset(&e->msg, 0, sizeof(e->msg)); 870 memset(&e->msg, 0, sizeof(e->msg));
871 } 871 }
872 872
873 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 873 rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
874 } else { 874 } else {
875 ip_mr_forward(net, mrt, skb, c, 0); 875 ip_mr_forward(net, mrt, skb, c, 0);
876 } 876 }
@@ -1808,7 +1808,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
1808 .flowi4_oif = (rt_is_output_route(rt) ? 1808 .flowi4_oif = (rt_is_output_route(rt) ?
1809 skb->dev->ifindex : 0), 1809 skb->dev->ifindex : 0),
1810 .flowi4_iif = (rt_is_output_route(rt) ? 1810 .flowi4_iif = (rt_is_output_route(rt) ?
1811 net->loopback_dev->ifindex : 1811 LOOPBACK_IFINDEX :
1812 skb->dev->ifindex), 1812 skb->dev->ifindex),
1813 .flowi4_mark = skb->mark, 1813 .flowi4_mark = skb->mark,
1814 }; 1814 };
@@ -2117,12 +2117,12 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
2117} 2117}
2118 2118
2119static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, 2119static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2120 u32 pid, u32 seq, struct mfc_cache *c) 2120 u32 portid, u32 seq, struct mfc_cache *c)
2121{ 2121{
2122 struct nlmsghdr *nlh; 2122 struct nlmsghdr *nlh;
2123 struct rtmsg *rtm; 2123 struct rtmsg *rtm;
2124 2124
2125 nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI); 2125 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2126 if (nlh == NULL) 2126 if (nlh == NULL)
2127 return -EMSGSIZE; 2127 return -EMSGSIZE;
2128 2128
@@ -2176,7 +2176,7 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2176 if (e < s_e) 2176 if (e < s_e)
2177 goto next_entry; 2177 goto next_entry;
2178 if (ipmr_fill_mroute(mrt, skb, 2178 if (ipmr_fill_mroute(mrt, skb,
2179 NETLINK_CB(cb->skb).pid, 2179 NETLINK_CB(cb->skb).portid,
2180 cb->nlh->nlmsg_seq, 2180 cb->nlh->nlmsg_seq,
2181 mfc) < 0) 2181 mfc) < 0)
2182 goto done; 2182 goto done;
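
The pid -> portid renames here are part of a tree-wide cleanup: the value stored in the netlink control block is the sender's netlink port id (the socket's bound address, which equals the process id only by autobind convention), not a task pid. Illustration only:

	/* reply to whichever socket issued the dump request */
	u32 portid = NETLINK_CB(cb->skb).portid;

	nlh = nlmsg_put(skb, portid, cb->nlh->nlmsg_seq,
			RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
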
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index ed1b36783192..4c0cf63dd92e 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -72,43 +72,6 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
72} 72}
73EXPORT_SYMBOL(ip_route_me_harder); 73EXPORT_SYMBOL(ip_route_me_harder);
74 74
75#ifdef CONFIG_XFRM
76int ip_xfrm_me_harder(struct sk_buff *skb)
77{
78 struct flowi fl;
79 unsigned int hh_len;
80 struct dst_entry *dst;
81
82 if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
83 return 0;
84 if (xfrm_decode_session(skb, &fl, AF_INET) < 0)
85 return -1;
86
87 dst = skb_dst(skb);
88 if (dst->xfrm)
89 dst = ((struct xfrm_dst *)dst)->route;
90 dst_hold(dst);
91
92 dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
93 if (IS_ERR(dst))
94 return -1;
95
96 skb_dst_drop(skb);
97 skb_dst_set(skb, dst);
98
99 /* Change in oif may mean change in hh_len. */
100 hh_len = skb_dst(skb)->dev->hard_header_len;
101 if (skb_headroom(skb) < hh_len &&
102 pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
103 return -1;
104 return 0;
105}
106EXPORT_SYMBOL(ip_xfrm_me_harder);
107#endif
108
109void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
110EXPORT_SYMBOL(ip_nat_decode_session);
111
112/* 75/*
113 * Extra routing may needed on local out, as the QUEUE target never 76 * Extra routing may needed on local out, as the QUEUE target never
114 * returns control to the table. 77 * returns control to the table.
@@ -225,12 +188,12 @@ static const struct nf_afinfo nf_ip_afinfo = {
225 .route_key_size = sizeof(struct ip_rt_info), 188 .route_key_size = sizeof(struct ip_rt_info),
226}; 189};
227 190
228static int ipv4_netfilter_init(void) 191static int __init ipv4_netfilter_init(void)
229{ 192{
230 return nf_register_afinfo(&nf_ip_afinfo); 193 return nf_register_afinfo(&nf_ip_afinfo);
231} 194}
232 195
233static void ipv4_netfilter_fini(void) 196static void __exit ipv4_netfilter_fini(void)
234{ 197{
235 nf_unregister_afinfo(&nf_ip_afinfo); 198 nf_unregister_afinfo(&nf_ip_afinfo);
236} 199}
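
ip_xfrm_me_harder() is not lost: it moves, together with the NAT decode_session hook, into the family-independent NAT core. Judging from the call sites in iptable_nat.c further down, the shared replacement takes the address family as a parameter; assumed prototype:

	/* assumed prototype of the shared helper */
	int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family);

	/* IPv4 call site, as used by the NAT hooks below */
	if (nf_xfrm_me_harder(skb, AF_INET) < 0)
		ret = NF_DROP;
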
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index fcc543cd987a..d8d6f2a5bf12 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -143,25 +143,22 @@ config IP_NF_TARGET_ULOG
143 To compile it as a module, choose M here. If unsure, say N. 143 To compile it as a module, choose M here. If unsure, say N.
144 144
145# NAT + specific targets: nf_conntrack 145# NAT + specific targets: nf_conntrack
146config NF_NAT 146config NF_NAT_IPV4
147 tristate "Full NAT" 147 tristate "IPv4 NAT"
148 depends on NF_CONNTRACK_IPV4 148 depends on NF_CONNTRACK_IPV4
149 default m if NETFILTER_ADVANCED=n 149 default m if NETFILTER_ADVANCED=n
150 select NF_NAT
150 help 151 help
151 The Full NAT option allows masquerading, port forwarding and other 152 The IPv4 NAT option allows masquerading, port forwarding and other
152 forms of full Network Address Port Translation. It is controlled by 153 forms of full Network Address Port Translation. It is controlled by
153 the `nat' table in iptables: see the man page for iptables(8). 154 the `nat' table in iptables: see the man page for iptables(8).
154 155
155 To compile it as a module, choose M here. If unsure, say N. 156 To compile it as a module, choose M here. If unsure, say N.
156 157
157config NF_NAT_NEEDED 158if NF_NAT_IPV4
158 bool
159 depends on NF_NAT
160 default y
161 159
162config IP_NF_TARGET_MASQUERADE 160config IP_NF_TARGET_MASQUERADE
163 tristate "MASQUERADE target support" 161 tristate "MASQUERADE target support"
164 depends on NF_NAT
165 default m if NETFILTER_ADVANCED=n 162 default m if NETFILTER_ADVANCED=n
166 help 163 help
167 Masquerading is a special case of NAT: all outgoing connections are 164 Masquerading is a special case of NAT: all outgoing connections are
@@ -174,30 +171,27 @@ config IP_NF_TARGET_MASQUERADE
174 171
175config IP_NF_TARGET_NETMAP 172config IP_NF_TARGET_NETMAP
176 tristate "NETMAP target support" 173 tristate "NETMAP target support"
177 depends on NF_NAT
178 depends on NETFILTER_ADVANCED 174 depends on NETFILTER_ADVANCED
179 help 175 select NETFILTER_XT_TARGET_NETMAP
180 NETMAP is an implementation of static 1:1 NAT mapping of network 176 ---help---
181 addresses. It maps the network address part, while keeping the host 177 This is a backwards-compat option for the user's convenience
182 address part intact. 178 (e.g. when running oldconfig). It selects
183 179 CONFIG_NETFILTER_XT_TARGET_NETMAP.
184 To compile it as a module, choose M here. If unsure, say N.
185 180
186config IP_NF_TARGET_REDIRECT 181config IP_NF_TARGET_REDIRECT
187 tristate "REDIRECT target support" 182 tristate "REDIRECT target support"
188 depends on NF_NAT
189 depends on NETFILTER_ADVANCED 183 depends on NETFILTER_ADVANCED
190 help 184 select NETFILTER_XT_TARGET_REDIRECT
191 REDIRECT is a special case of NAT: all incoming connections are 185 ---help---
192 mapped onto the incoming interface's address, causing the packets to 186 This is a backwards-compat option for the user's convenience
193 come to the local machine instead of passing through. This is 187 (e.g. when running oldconfig). It selects
194 useful for transparent proxies. 188 CONFIG_NETFILTER_XT_TARGET_REDIRECT.
195 189
196 To compile it as a module, choose M here. If unsure, say N. 190endif
197 191
198config NF_NAT_SNMP_BASIC 192config NF_NAT_SNMP_BASIC
199 tristate "Basic SNMP-ALG support" 193 tristate "Basic SNMP-ALG support"
200 depends on NF_CONNTRACK_SNMP && NF_NAT 194 depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4
201 depends on NETFILTER_ADVANCED 195 depends on NETFILTER_ADVANCED
202 default NF_NAT && NF_CONNTRACK_SNMP 196 default NF_NAT && NF_CONNTRACK_SNMP
203 ---help--- 197 ---help---
@@ -219,61 +213,21 @@ config NF_NAT_SNMP_BASIC
219# <expr> '&&' <expr> (6) 213# <expr> '&&' <expr> (6)
220# 214#
221# (6) Returns the result of min(/expr/, /expr/). 215# (6) Returns the result of min(/expr/, /expr/).
222config NF_NAT_PROTO_DCCP
223 tristate
224 depends on NF_NAT && NF_CT_PROTO_DCCP
225 default NF_NAT && NF_CT_PROTO_DCCP
226 216
227config NF_NAT_PROTO_GRE 217config NF_NAT_PROTO_GRE
228 tristate 218 tristate
229 depends on NF_NAT && NF_CT_PROTO_GRE 219 depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE
230
231config NF_NAT_PROTO_UDPLITE
232 tristate
233 depends on NF_NAT && NF_CT_PROTO_UDPLITE
234 default NF_NAT && NF_CT_PROTO_UDPLITE
235
236config NF_NAT_PROTO_SCTP
237 tristate
238 default NF_NAT && NF_CT_PROTO_SCTP
239 depends on NF_NAT && NF_CT_PROTO_SCTP
240 select LIBCRC32C
241
242config NF_NAT_FTP
243 tristate
244 depends on NF_CONNTRACK && NF_NAT
245 default NF_NAT && NF_CONNTRACK_FTP
246
247config NF_NAT_IRC
248 tristate
249 depends on NF_CONNTRACK && NF_NAT
250 default NF_NAT && NF_CONNTRACK_IRC
251
252config NF_NAT_TFTP
253 tristate
254 depends on NF_CONNTRACK && NF_NAT
255 default NF_NAT && NF_CONNTRACK_TFTP
256
257config NF_NAT_AMANDA
258 tristate
259 depends on NF_CONNTRACK && NF_NAT
260 default NF_NAT && NF_CONNTRACK_AMANDA
261 220
262config NF_NAT_PPTP 221config NF_NAT_PPTP
263 tristate 222 tristate
264 depends on NF_CONNTRACK && NF_NAT 223 depends on NF_CONNTRACK && NF_NAT_IPV4
265 default NF_NAT && NF_CONNTRACK_PPTP 224 default NF_NAT_IPV4 && NF_CONNTRACK_PPTP
266 select NF_NAT_PROTO_GRE 225 select NF_NAT_PROTO_GRE
267 226
268config NF_NAT_H323 227config NF_NAT_H323
269 tristate 228 tristate
270 depends on NF_CONNTRACK && NF_NAT 229 depends on NF_CONNTRACK && NF_NAT_IPV4
271 default NF_NAT && NF_CONNTRACK_H323 230 default NF_NAT_IPV4 && NF_CONNTRACK_H323
272
273config NF_NAT_SIP
274 tristate
275 depends on NF_CONNTRACK && NF_NAT
276 default NF_NAT && NF_CONNTRACK_SIP
277 231
278# mangle + specific targets 232# mangle + specific targets
279config IP_NF_MANGLE 233config IP_NF_MANGLE
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index c20674dc9452..007b128eecc9 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -10,32 +10,22 @@ nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
10endif 10endif
11endif 11endif
12 12
13nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
14iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o
15
16# connection tracking 13# connection tracking
17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 14obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
18 15
19obj-$(CONFIG_NF_NAT) += nf_nat.o 16nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
17obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
20 18
21# defrag 19# defrag
22obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o 20obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
23 21
24# NAT helpers (nf_conntrack) 22# NAT helpers (nf_conntrack)
25obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
26obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
27obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o 23obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
28obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
29obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o 24obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
30obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
31obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o 25obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
32obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
33 26
34# NAT protocols (nf_nat) 27# NAT protocols (nf_nat)
35obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
36obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o 28obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
37obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
38obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
39 29
40# generic IP tables 30# generic IP tables
41obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o 31obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
@@ -43,7 +33,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
43# the three instances of ip_tables 33# the three instances of ip_tables
44obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o 34obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
45obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o 35obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
46obj-$(CONFIG_NF_NAT) += iptable_nat.o 36obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o
47obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o 37obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o 38obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
49 39
@@ -55,8 +45,6 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
55obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o 45obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
56obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o 46obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
57obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o 47obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
58obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
59obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
60obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o 48obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
61obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o 49obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
62 50
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index cbb6a1a6f6f7..5d5d4d1be9c2 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -19,9 +19,9 @@
19#include <net/ip.h> 19#include <net/ip.h>
20#include <net/checksum.h> 20#include <net/checksum.h>
21#include <net/route.h> 21#include <net/route.h>
22#include <net/netfilter/nf_nat_rule.h>
23#include <linux/netfilter_ipv4.h> 22#include <linux/netfilter_ipv4.h>
24#include <linux/netfilter/x_tables.h> 23#include <linux/netfilter/x_tables.h>
24#include <net/netfilter/nf_nat.h>
25 25
26MODULE_LICENSE("GPL"); 26MODULE_LICENSE("GPL");
27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -49,7 +49,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
49 struct nf_conn *ct; 49 struct nf_conn *ct;
50 struct nf_conn_nat *nat; 50 struct nf_conn_nat *nat;
51 enum ip_conntrack_info ctinfo; 51 enum ip_conntrack_info ctinfo;
52 struct nf_nat_ipv4_range newrange; 52 struct nf_nat_range newrange;
53 const struct nf_nat_ipv4_multi_range_compat *mr; 53 const struct nf_nat_ipv4_multi_range_compat *mr;
54 const struct rtable *rt; 54 const struct rtable *rt;
55 __be32 newsrc, nh; 55 __be32 newsrc, nh;
@@ -80,10 +80,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
80 nat->masq_index = par->out->ifindex; 80 nat->masq_index = par->out->ifindex;
81 81
82 /* Transfer from original range. */ 82 /* Transfer from original range. */
83 newrange = ((struct nf_nat_ipv4_range) 83 memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
84 { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS, 84 memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
85 newsrc, newsrc, 85 newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
86 mr->range[0].min, mr->range[0].max }); 86 newrange.min_addr.ip = newsrc;
87 newrange.max_addr.ip = newsrc;
88 newrange.min_proto = mr->range[0].min;
89 newrange.max_proto = mr->range[0].max;
87 90
88 /* Hand modified range to generic setup. */ 91 /* Hand modified range to generic setup. */
89 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); 92 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
@@ -96,7 +99,8 @@ device_cmp(struct nf_conn *i, void *ifindex)
96 99
97 if (!nat) 100 if (!nat)
98 return 0; 101 return 0;
99 102 if (nf_ct_l3num(i) != NFPROTO_IPV4)
103 return 0;
100 return nat->masq_index == (int)(long)ifindex; 104 return nat->masq_index == (int)(long)ifindex;
101} 105}
102 106
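
The explicit memsets are needed because the target now fills the family-independent struct nf_nat_range, whose addresses are unions covering IPv4 and IPv6, instead of the IPv4-only compat layout it still accepts from userspace. Assumed shape of the new structure:

	/* sketch of the family-agnostic range */
	struct nf_nat_range {
		unsigned int			flags;
		union nf_inet_addr		min_addr;
		union nf_inet_addr		max_addr;
		union nf_conntrack_man_proto	min_proto;
		union nf_conntrack_man_proto	max_proto;
	};
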
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
deleted file mode 100644
index b5bfbbabf70d..000000000000
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ /dev/null
@@ -1,98 +0,0 @@
1/* NETMAP - static NAT mapping of IP network addresses (1:1).
2 * The mapping can be applied to source (POSTROUTING),
3 * destination (PREROUTING), or both (with separate rules).
4 */
5
6/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13#include <linux/ip.h>
14#include <linux/module.h>
15#include <linux/netdevice.h>
16#include <linux/netfilter.h>
17#include <linux/netfilter_ipv4.h>
18#include <linux/netfilter/x_tables.h>
19#include <net/netfilter/nf_nat_rule.h>
20
21MODULE_LICENSE("GPL");
22MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
23MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
24
25static int netmap_tg_check(const struct xt_tgchk_param *par)
26{
27 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
28
29 if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) {
30 pr_debug("bad MAP_IPS.\n");
31 return -EINVAL;
32 }
33 if (mr->rangesize != 1) {
34 pr_debug("bad rangesize %u.\n", mr->rangesize);
35 return -EINVAL;
36 }
37 return 0;
38}
39
40static unsigned int
41netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
42{
43 struct nf_conn *ct;
44 enum ip_conntrack_info ctinfo;
45 __be32 new_ip, netmask;
46 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
47 struct nf_nat_ipv4_range newrange;
48
49 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
50 par->hooknum == NF_INET_POST_ROUTING ||
51 par->hooknum == NF_INET_LOCAL_OUT ||
52 par->hooknum == NF_INET_LOCAL_IN);
53 ct = nf_ct_get(skb, &ctinfo);
54
55 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
56
57 if (par->hooknum == NF_INET_PRE_ROUTING ||
58 par->hooknum == NF_INET_LOCAL_OUT)
59 new_ip = ip_hdr(skb)->daddr & ~netmask;
60 else
61 new_ip = ip_hdr(skb)->saddr & ~netmask;
62 new_ip |= mr->range[0].min_ip & netmask;
63
64 newrange = ((struct nf_nat_ipv4_range)
65 { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
66 new_ip, new_ip,
67 mr->range[0].min, mr->range[0].max });
68
69 /* Hand modified range to generic setup. */
70 return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
71}
72
73static struct xt_target netmap_tg_reg __read_mostly = {
74 .name = "NETMAP",
75 .family = NFPROTO_IPV4,
76 .target = netmap_tg,
77 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
78 .table = "nat",
79 .hooks = (1 << NF_INET_PRE_ROUTING) |
80 (1 << NF_INET_POST_ROUTING) |
81 (1 << NF_INET_LOCAL_OUT) |
82 (1 << NF_INET_LOCAL_IN),
83 .checkentry = netmap_tg_check,
84 .me = THIS_MODULE
85};
86
87static int __init netmap_tg_init(void)
88{
89 return xt_register_target(&netmap_tg_reg);
90}
91
92static void __exit netmap_tg_exit(void)
93{
94 xt_unregister_target(&netmap_tg_reg);
95}
96
97module_init(netmap_tg_init);
98module_exit(netmap_tg_exit);
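
The target itself survives as the family-independent xtables NETMAP module selected by the Kconfig compat option above, so existing rulesets keep loading. A typical rule, for illustration:

	iptables -t nat -A PREROUTING -d 192.0.2.0/24 -j NETMAP --to 198.51.100.0/24
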
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
deleted file mode 100644
index 7c0103a5203e..000000000000
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ /dev/null
@@ -1,110 +0,0 @@
1/* Redirect. Simple mapping which alters dst to a local IP address. */
2/* (C) 1999-2001 Paul `Rusty' Russell
3 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10#include <linux/types.h>
11#include <linux/ip.h>
12#include <linux/timer.h>
13#include <linux/module.h>
14#include <linux/netfilter.h>
15#include <linux/netdevice.h>
16#include <linux/if.h>
17#include <linux/inetdevice.h>
18#include <net/protocol.h>
19#include <net/checksum.h>
20#include <linux/netfilter_ipv4.h>
21#include <linux/netfilter/x_tables.h>
22#include <net/netfilter/nf_nat_rule.h>
23
24MODULE_LICENSE("GPL");
25MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
26MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
27
28/* FIXME: Take multiple ranges --RR */
29static int redirect_tg_check(const struct xt_tgchk_param *par)
30{
31 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
32
33 if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
34 pr_debug("bad MAP_IPS.\n");
35 return -EINVAL;
36 }
37 if (mr->rangesize != 1) {
38 pr_debug("bad rangesize %u.\n", mr->rangesize);
39 return -EINVAL;
40 }
41 return 0;
42}
43
44static unsigned int
45redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
46{
47 struct nf_conn *ct;
48 enum ip_conntrack_info ctinfo;
49 __be32 newdst;
50 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
51 struct nf_nat_ipv4_range newrange;
52
53 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
54 par->hooknum == NF_INET_LOCAL_OUT);
55
56 ct = nf_ct_get(skb, &ctinfo);
57 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
58
59 /* Local packets: make them go to loopback */
60 if (par->hooknum == NF_INET_LOCAL_OUT)
61 newdst = htonl(0x7F000001);
62 else {
63 struct in_device *indev;
64 struct in_ifaddr *ifa;
65
66 newdst = 0;
67
68 rcu_read_lock();
69 indev = __in_dev_get_rcu(skb->dev);
70 if (indev && (ifa = indev->ifa_list))
71 newdst = ifa->ifa_local;
72 rcu_read_unlock();
73
74 if (!newdst)
75 return NF_DROP;
76 }
77
78 /* Transfer from original range. */
79 newrange = ((struct nf_nat_ipv4_range)
80 { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
81 newdst, newdst,
82 mr->range[0].min, mr->range[0].max });
83
84 /* Hand modified range to generic setup. */
85 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
86}
87
88static struct xt_target redirect_tg_reg __read_mostly = {
89 .name = "REDIRECT",
90 .family = NFPROTO_IPV4,
91 .target = redirect_tg,
92 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
93 .table = "nat",
94 .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
95 .checkentry = redirect_tg_check,
96 .me = THIS_MODULE,
97};
98
99static int __init redirect_tg_init(void)
100{
101 return xt_register_target(&redirect_tg_reg);
102}
103
104static void __exit redirect_tg_exit(void)
105{
106 xt_unregister_target(&redirect_tg_reg);
107}
108
109module_init(redirect_tg_init);
110module_exit(redirect_tg_exit);
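
REDIRECT likewise moves to the shared xtables target. The classic transparent-proxy use, for illustration:

	iptables -t nat -A PREROUTING -p tcp --dport 80 -j REDIRECT --to-ports 3128
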
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 1109f7f6c254..b5ef3cba2250 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -396,8 +396,7 @@ static int __init ulog_tg_init(void)
396 for (i = 0; i < ULOG_MAXNLGROUPS; i++) 396 for (i = 0; i < ULOG_MAXNLGROUPS; i++)
397 setup_timer(&ulog_buffers[i].timer, ulog_timer, i); 397 setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
398 398
399 nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, 399 nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg);
400 THIS_MODULE, &cfg);
401 if (!nflognl) 400 if (!nflognl)
402 return -ENOMEM; 401 return -ENOMEM;
403 402
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 31371be8174b..c30130062cd6 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -85,7 +85,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
85 return ipv4_is_local_multicast(iph->daddr) ^ invert; 85 return ipv4_is_local_multicast(iph->daddr) ^ invert;
86 flow.flowi4_iif = 0; 86 flow.flowi4_iif = 0;
87 } else { 87 } else {
88 flow.flowi4_iif = dev_net(par->in)->loopback_dev->ifindex; 88 flow.flowi4_iif = LOOPBACK_IFINDEX;
89 } 89 }
90 90
91 flow.daddr = iph->saddr; 91 flow.daddr = iph->saddr;
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 851acec852d2..6b3da5cf54e9 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -69,9 +69,7 @@ static int __net_init iptable_filter_net_init(struct net *net)
69 net->ipv4.iptable_filter = 69 net->ipv4.iptable_filter =
70 ipt_register_table(net, &packet_filter, repl); 70 ipt_register_table(net, &packet_filter, repl);
71 kfree(repl); 71 kfree(repl);
72 if (IS_ERR(net->ipv4.iptable_filter)) 72 return PTR_RET(net->ipv4.iptable_filter);
73 return PTR_ERR(net->ipv4.iptable_filter);
74 return 0;
75} 73}
76 74
77static void __net_exit iptable_filter_net_exit(struct net *net) 75static void __net_exit iptable_filter_net_exit(struct net *net)
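
PTR_RET() is shorthand for exactly the IS_ERR()/PTR_ERR()/0 sequence being deleted (later kernels rename it PTR_ERR_OR_ZERO). Its assumed definition:

	static inline int PTR_RET(const void *ptr)
	{
		return IS_ERR(ptr) ? PTR_ERR(ptr) : 0;
	}
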
@@ -96,14 +94,10 @@ static int __init iptable_filter_init(void)
96 filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); 94 filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
97 if (IS_ERR(filter_ops)) { 95 if (IS_ERR(filter_ops)) {
98 ret = PTR_ERR(filter_ops); 96 ret = PTR_ERR(filter_ops);
99 goto cleanup_table; 97 unregister_pernet_subsys(&iptable_filter_net_ops);
100 } 98 }
101 99
102 return ret; 100 return ret;
103
104 cleanup_table:
105 unregister_pernet_subsys(&iptable_filter_net_ops);
106 return ret;
107} 101}
108 102
109static void __exit iptable_filter_fini(void) 103static void __exit iptable_filter_fini(void)
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index aef5d1fbe77d..85d88f206447 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -104,9 +104,7 @@ static int __net_init iptable_mangle_net_init(struct net *net)
104 net->ipv4.iptable_mangle = 104 net->ipv4.iptable_mangle =
105 ipt_register_table(net, &packet_mangler, repl); 105 ipt_register_table(net, &packet_mangler, repl);
106 kfree(repl); 106 kfree(repl);
107 if (IS_ERR(net->ipv4.iptable_mangle)) 107 return PTR_RET(net->ipv4.iptable_mangle);
108 return PTR_ERR(net->ipv4.iptable_mangle);
109 return 0;
110} 108}
111 109
112static void __net_exit iptable_mangle_net_exit(struct net *net) 110static void __net_exit iptable_mangle_net_exit(struct net *net)
@@ -131,14 +129,10 @@ static int __init iptable_mangle_init(void)
131 mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); 129 mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
132 if (IS_ERR(mangle_ops)) { 130 if (IS_ERR(mangle_ops)) {
133 ret = PTR_ERR(mangle_ops); 131 ret = PTR_ERR(mangle_ops);
134 goto cleanup_table; 132 unregister_pernet_subsys(&iptable_mangle_net_ops);
135 } 133 }
136 134
137 return ret; 135 return ret;
138
139 cleanup_table:
140 unregister_pernet_subsys(&iptable_mangle_net_ops);
141 return ret;
142} 136}
143 137
144static void __exit iptable_mangle_fini(void) 138static void __exit iptable_mangle_fini(void)
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/iptable_nat.c
index 3828a4229822..9e0ffaf1d942 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -1,84 +1,71 @@
1/* (C) 1999-2001 Paul `Rusty' Russell 1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> 2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2011 Patrick McHardy <kaber@trash.net>
3 * 4 *
4 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
7 */ 8 */
8#include <linux/types.h> 9
9#include <linux/icmp.h> 10#include <linux/module.h>
10#include <linux/gfp.h>
11#include <linux/ip.h>
12#include <linux/netfilter.h> 11#include <linux/netfilter.h>
13#include <linux/netfilter_ipv4.h> 12#include <linux/netfilter_ipv4.h>
14#include <linux/module.h> 13#include <linux/netfilter_ipv4/ip_tables.h>
15#include <linux/skbuff.h> 14#include <linux/ip.h>
16#include <linux/proc_fs.h>
17#include <net/ip.h> 15#include <net/ip.h>
18#include <net/checksum.h>
19#include <linux/spinlock.h>
20 16
21#include <net/netfilter/nf_conntrack.h>
22#include <net/netfilter/nf_conntrack_core.h>
23#include <net/netfilter/nf_conntrack_extend.h>
24#include <net/netfilter/nf_nat.h> 17#include <net/netfilter/nf_nat.h>
25#include <net/netfilter/nf_nat_rule.h>
26#include <net/netfilter/nf_nat_protocol.h>
27#include <net/netfilter/nf_nat_core.h> 18#include <net/netfilter/nf_nat_core.h>
28#include <net/netfilter/nf_nat_helper.h> 19#include <net/netfilter/nf_nat_l3proto.h>
29#include <linux/netfilter_ipv4/ip_tables.h> 20
21static const struct xt_table nf_nat_ipv4_table = {
22 .name = "nat",
23 .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
24 (1 << NF_INET_POST_ROUTING) |
25 (1 << NF_INET_LOCAL_OUT) |
26 (1 << NF_INET_LOCAL_IN),
27 .me = THIS_MODULE,
28 .af = NFPROTO_IPV4,
29};
30 30
31#ifdef CONFIG_XFRM 31static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
32static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
33{ 32{
34 struct flowi4 *fl4 = &fl->u.ip4; 33 /* Force range to this IP; let proto decide mapping for
35 const struct nf_conn *ct; 34 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
36 const struct nf_conntrack_tuple *t; 35 */
37 enum ip_conntrack_info ctinfo; 36 struct nf_nat_range range;
38 enum ip_conntrack_dir dir; 37
39 unsigned long statusbit; 38 range.flags = 0;
40 39 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
41 ct = nf_ct_get(skb, &ctinfo); 40 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
42 if (ct == NULL) 41 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
43 return; 42 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
44 dir = CTINFO2DIR(ctinfo); 43
45 t = &ct->tuplehash[dir].tuple; 44 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
46 45}
47 if (dir == IP_CT_DIR_ORIGINAL)
48 statusbit = IPS_DST_NAT;
49 else
50 statusbit = IPS_SRC_NAT;
51
52 if (ct->status & statusbit) {
53 fl4->daddr = t->dst.u3.ip;
54 if (t->dst.protonum == IPPROTO_TCP ||
55 t->dst.protonum == IPPROTO_UDP ||
56 t->dst.protonum == IPPROTO_UDPLITE ||
57 t->dst.protonum == IPPROTO_DCCP ||
58 t->dst.protonum == IPPROTO_SCTP)
59 fl4->fl4_dport = t->dst.u.tcp.port;
60 }
61 46
62 statusbit ^= IPS_NAT_MASK; 47static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
48 const struct net_device *in,
49 const struct net_device *out,
50 struct nf_conn *ct)
51{
52 struct net *net = nf_ct_net(ct);
53 unsigned int ret;
63 54
64 if (ct->status & statusbit) { 55 ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
65 fl4->saddr = t->src.u3.ip; 56 if (ret == NF_ACCEPT) {
66 if (t->dst.protonum == IPPROTO_TCP || 57 if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
67 t->dst.protonum == IPPROTO_UDP || 58 ret = alloc_null_binding(ct, hooknum);
68 t->dst.protonum == IPPROTO_UDPLITE ||
69 t->dst.protonum == IPPROTO_DCCP ||
70 t->dst.protonum == IPPROTO_SCTP)
71 fl4->fl4_sport = t->src.u.tcp.port;
72 } 59 }
60 return ret;
73} 61}
74#endif
75 62
76static unsigned int 63static unsigned int
77nf_nat_fn(unsigned int hooknum, 64nf_nat_ipv4_fn(unsigned int hooknum,
78 struct sk_buff *skb, 65 struct sk_buff *skb,
79 const struct net_device *in, 66 const struct net_device *in,
80 const struct net_device *out, 67 const struct net_device *out,
81 int (*okfn)(struct sk_buff *)) 68 int (*okfn)(struct sk_buff *))
82{ 69{
83 struct nf_conn *ct; 70 struct nf_conn *ct;
84 enum ip_conntrack_info ctinfo; 71 enum ip_conntrack_info ctinfo;
@@ -87,14 +74,16 @@ nf_nat_fn(unsigned int hooknum,
87 enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum); 74 enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
88 75
89 /* We never see fragments: conntrack defrags on pre-routing 76 /* We never see fragments: conntrack defrags on pre-routing
90 and local-out, and nf_nat_out protects post-routing. */ 77 * and local-out, and nf_nat_out protects post-routing.
78 */
91 NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); 79 NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
92 80
93 ct = nf_ct_get(skb, &ctinfo); 81 ct = nf_ct_get(skb, &ctinfo);
94 /* Can't track? It's not due to stress, or conntrack would 82 /* Can't track? It's not due to stress, or conntrack would
95 have dropped it. Hence it's the user's responsibilty to 83 * have dropped it. Hence it's the user's responsibilty to
96 packet filter it out, or implement conntrack/NAT for that 84 * packet filter it out, or implement conntrack/NAT for that
97 protocol. 8) --RR */ 85 * protocol. 8) --RR
86 */
98 if (!ct) 87 if (!ct)
99 return NF_ACCEPT; 88 return NF_ACCEPT;
100 89
@@ -118,17 +107,17 @@ nf_nat_fn(unsigned int hooknum,
118 case IP_CT_RELATED: 107 case IP_CT_RELATED:
119 case IP_CT_RELATED_REPLY: 108 case IP_CT_RELATED_REPLY:
120 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { 109 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
121 if (!nf_nat_icmp_reply_translation(ct, ctinfo, 110 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
122 hooknum, skb)) 111 hooknum))
123 return NF_DROP; 112 return NF_DROP;
124 else 113 else
125 return NF_ACCEPT; 114 return NF_ACCEPT;
126 } 115 }
127 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ 116 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
128 case IP_CT_NEW: 117 case IP_CT_NEW:
129
130 /* Seen it before? This can happen for loopback, retrans, 118 /* Seen it before? This can happen for loopback, retrans,
131 or local packets.. */ 119 * or local packets.
120 */
132 if (!nf_nat_initialized(ct, maniptype)) { 121 if (!nf_nat_initialized(ct, maniptype)) {
133 unsigned int ret; 122 unsigned int ret;
134 123
@@ -151,16 +140,16 @@ nf_nat_fn(unsigned int hooknum,
151} 140}
152 141
153static unsigned int 142static unsigned int
154nf_nat_in(unsigned int hooknum, 143nf_nat_ipv4_in(unsigned int hooknum,
155 struct sk_buff *skb, 144 struct sk_buff *skb,
156 const struct net_device *in, 145 const struct net_device *in,
157 const struct net_device *out, 146 const struct net_device *out,
158 int (*okfn)(struct sk_buff *)) 147 int (*okfn)(struct sk_buff *))
159{ 148{
160 unsigned int ret; 149 unsigned int ret;
161 __be32 daddr = ip_hdr(skb)->daddr; 150 __be32 daddr = ip_hdr(skb)->daddr;
162 151
163 ret = nf_nat_fn(hooknum, skb, in, out, okfn); 152 ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
164 if (ret != NF_DROP && ret != NF_STOLEN && 153 if (ret != NF_DROP && ret != NF_STOLEN &&
165 daddr != ip_hdr(skb)->daddr) 154 daddr != ip_hdr(skb)->daddr)
166 skb_dst_drop(skb); 155 skb_dst_drop(skb);
@@ -169,11 +158,11 @@ nf_nat_in(unsigned int hooknum,
169} 158}
170 159
171static unsigned int 160static unsigned int
172nf_nat_out(unsigned int hooknum, 161nf_nat_ipv4_out(unsigned int hooknum,
173 struct sk_buff *skb, 162 struct sk_buff *skb,
174 const struct net_device *in, 163 const struct net_device *in,
175 const struct net_device *out, 164 const struct net_device *out,
176 int (*okfn)(struct sk_buff *)) 165 int (*okfn)(struct sk_buff *))
177{ 166{
178#ifdef CONFIG_XFRM 167#ifdef CONFIG_XFRM
179 const struct nf_conn *ct; 168 const struct nf_conn *ct;
@@ -186,29 +175,30 @@ nf_nat_out(unsigned int hooknum,
186 ip_hdrlen(skb) < sizeof(struct iphdr)) 175 ip_hdrlen(skb) < sizeof(struct iphdr))
187 return NF_ACCEPT; 176 return NF_ACCEPT;
188 177
189 ret = nf_nat_fn(hooknum, skb, in, out, okfn); 178 ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
190#ifdef CONFIG_XFRM 179#ifdef CONFIG_XFRM
191 if (ret != NF_DROP && ret != NF_STOLEN && 180 if (ret != NF_DROP && ret != NF_STOLEN &&
181 !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
192 (ct = nf_ct_get(skb, &ctinfo)) != NULL) { 182 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
193 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 183 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
194 184
195 if ((ct->tuplehash[dir].tuple.src.u3.ip != 185 if ((ct->tuplehash[dir].tuple.src.u3.ip !=
196 ct->tuplehash[!dir].tuple.dst.u3.ip) || 186 ct->tuplehash[!dir].tuple.dst.u3.ip) ||
197 (ct->tuplehash[dir].tuple.src.u.all != 187 (ct->tuplehash[dir].tuple.src.u.all !=
198 ct->tuplehash[!dir].tuple.dst.u.all) 188 ct->tuplehash[!dir].tuple.dst.u.all))
199 ) 189 if (nf_xfrm_me_harder(skb, AF_INET) < 0)
200 return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP; 190 ret = NF_DROP;
201 } 191 }
202#endif 192#endif
203 return ret; 193 return ret;
204} 194}
205 195
206static unsigned int 196static unsigned int
207nf_nat_local_fn(unsigned int hooknum, 197nf_nat_ipv4_local_fn(unsigned int hooknum,
208 struct sk_buff *skb, 198 struct sk_buff *skb,
209 const struct net_device *in, 199 const struct net_device *in,
210 const struct net_device *out, 200 const struct net_device *out,
211 int (*okfn)(struct sk_buff *)) 201 int (*okfn)(struct sk_buff *))
212{ 202{
213 const struct nf_conn *ct; 203 const struct nf_conn *ct;
214 enum ip_conntrack_info ctinfo; 204 enum ip_conntrack_info ctinfo;
@@ -219,7 +209,7 @@ nf_nat_local_fn(unsigned int hooknum,
219 ip_hdrlen(skb) < sizeof(struct iphdr)) 209 ip_hdrlen(skb) < sizeof(struct iphdr))
220 return NF_ACCEPT; 210 return NF_ACCEPT;
221 211
222 ret = nf_nat_fn(hooknum, skb, in, out, okfn); 212 ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
223 if (ret != NF_DROP && ret != NF_STOLEN && 213 if (ret != NF_DROP && ret != NF_STOLEN &&
224 (ct = nf_ct_get(skb, &ctinfo)) != NULL) { 214 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
225 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 215 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -230,21 +220,20 @@ nf_nat_local_fn(unsigned int hooknum,
230 ret = NF_DROP; 220 ret = NF_DROP;
231 } 221 }
232#ifdef CONFIG_XFRM 222#ifdef CONFIG_XFRM
233 else if (ct->tuplehash[dir].tuple.dst.u.all != 223 else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
224 ct->tuplehash[dir].tuple.dst.u.all !=
234 ct->tuplehash[!dir].tuple.src.u.all) 225 ct->tuplehash[!dir].tuple.src.u.all)
235 if (ip_xfrm_me_harder(skb)) 226 if (nf_xfrm_me_harder(skb, AF_INET) < 0)
236 ret = NF_DROP; 227 ret = NF_DROP;
237#endif 228#endif
238 } 229 }
239 return ret; 230 return ret;
240} 231}
241 232
242/* We must be after connection tracking and before packet filtering. */ 233static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
243
244static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
245 /* Before packet filtering, change destination */ 234 /* Before packet filtering, change destination */
246 { 235 {
247 .hook = nf_nat_in, 236 .hook = nf_nat_ipv4_in,
248 .owner = THIS_MODULE, 237 .owner = THIS_MODULE,
249 .pf = NFPROTO_IPV4, 238 .pf = NFPROTO_IPV4,
250 .hooknum = NF_INET_PRE_ROUTING, 239 .hooknum = NF_INET_PRE_ROUTING,
@@ -252,7 +241,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
252 }, 241 },
253 /* After packet filtering, change source */ 242 /* After packet filtering, change source */
254 { 243 {
255 .hook = nf_nat_out, 244 .hook = nf_nat_ipv4_out,
256 .owner = THIS_MODULE, 245 .owner = THIS_MODULE,
257 .pf = NFPROTO_IPV4, 246 .pf = NFPROTO_IPV4,
258 .hooknum = NF_INET_POST_ROUTING, 247 .hooknum = NF_INET_POST_ROUTING,
@@ -260,7 +249,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
260 }, 249 },
261 /* Before packet filtering, change destination */ 250 /* Before packet filtering, change destination */
262 { 251 {
263 .hook = nf_nat_local_fn, 252 .hook = nf_nat_ipv4_local_fn,
264 .owner = THIS_MODULE, 253 .owner = THIS_MODULE,
265 .pf = NFPROTO_IPV4, 254 .pf = NFPROTO_IPV4,
266 .hooknum = NF_INET_LOCAL_OUT, 255 .hooknum = NF_INET_LOCAL_OUT,
@@ -268,7 +257,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
268 }, 257 },
269 /* After packet filtering, change source */ 258 /* After packet filtering, change source */
270 { 259 {
271 .hook = nf_nat_fn, 260 .hook = nf_nat_ipv4_fn,
272 .owner = THIS_MODULE, 261 .owner = THIS_MODULE,
273 .pf = NFPROTO_IPV4, 262 .pf = NFPROTO_IPV4,
274 .hooknum = NF_INET_LOCAL_IN, 263 .hooknum = NF_INET_LOCAL_IN,
@@ -276,51 +265,56 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
276 }, 265 },
277}; 266};
278 267
279static int __init nf_nat_standalone_init(void) 268static int __net_init iptable_nat_net_init(struct net *net)
280{ 269{
281 int ret = 0; 270 struct ipt_replace *repl;
271
272 repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
273 if (repl == NULL)
274 return -ENOMEM;
275 net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
276 kfree(repl);
277 if (IS_ERR(net->ipv4.nat_table))
278 return PTR_ERR(net->ipv4.nat_table);
279 return 0;
280}
282 281
283 need_ipv4_conntrack(); 282static void __net_exit iptable_nat_net_exit(struct net *net)
283{
284 ipt_unregister_table(net, net->ipv4.nat_table);
285}
284 286
285#ifdef CONFIG_XFRM 287static struct pernet_operations iptable_nat_net_ops = {
286 BUG_ON(ip_nat_decode_session != NULL); 288 .init = iptable_nat_net_init,
287 RCU_INIT_POINTER(ip_nat_decode_session, nat_decode_session); 289 .exit = iptable_nat_net_exit,
288#endif 290};
289 ret = nf_nat_rule_init();
290 if (ret < 0) {
291 pr_err("nf_nat_init: can't setup rules.\n");
292 goto cleanup_decode_session;
293 }
294 ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
295 if (ret < 0) {
296 pr_err("nf_nat_init: can't register hooks.\n");
297 goto cleanup_rule_init;
298 }
299 return ret;
300 291
301 cleanup_rule_init: 292static int __init iptable_nat_init(void)
302 nf_nat_rule_cleanup(); 293{
303 cleanup_decode_session: 294 int err;
304#ifdef CONFIG_XFRM 295
305 RCU_INIT_POINTER(ip_nat_decode_session, NULL); 296 err = register_pernet_subsys(&iptable_nat_net_ops);
306 synchronize_net(); 297 if (err < 0)
307#endif 298 goto err1;
308 return ret; 299
300 err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
301 if (err < 0)
302 goto err2;
303 return 0;
304
305err2:
306 unregister_pernet_subsys(&iptable_nat_net_ops);
307err1:
308 return err;
309} 309}
310 310
311static void __exit nf_nat_standalone_fini(void) 311static void __exit iptable_nat_exit(void)
312{ 312{
313 nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); 313 nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
314 nf_nat_rule_cleanup(); 314 unregister_pernet_subsys(&iptable_nat_net_ops);
315#ifdef CONFIG_XFRM
316 RCU_INIT_POINTER(ip_nat_decode_session, NULL);
317 synchronize_net();
318#endif
319 /* Conntrack caches are unregistered in nf_conntrack_cleanup */
320} 315}
321 316
322module_init(nf_nat_standalone_init); 317module_init(iptable_nat_init);
323module_exit(nf_nat_standalone_fini); 318module_exit(iptable_nat_exit);
324 319
325MODULE_LICENSE("GPL"); 320MODULE_LICENSE("GPL");
326MODULE_ALIAS("ip_nat");
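The conversion above replaces the old standalone init with the kernel's standard pernet-table pattern: the NAT table is registered once per network namespace, the hooks once globally, and failures unwind in reverse order. A minimal sketch of that pattern, reusing the names from the hunks above (example_nat_init is a hypothetical stand-in; illustrative only, not a drop-in):

#include <linux/module.h>
#include <linux/netfilter.h>
#include <net/net_namespace.h>

/* sketch; example_nat_init stands in for iptable_nat_init above */
static int __init example_nat_init(void)
{
	int err;

	/* per-netns state first: each namespace gets its own table */
	err = register_pernet_subsys(&iptable_nat_net_ops);
	if (err < 0)
		return err;

	/* global hooks second; unwind the pernet state on failure */
	err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
	if (err < 0)
		unregister_pernet_subsys(&iptable_nat_net_ops);
	return err;
}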
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 07fb710cd722..03d9696d3c6e 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -48,9 +48,7 @@ static int __net_init iptable_raw_net_init(struct net *net)
48 net->ipv4.iptable_raw = 48 net->ipv4.iptable_raw =
49 ipt_register_table(net, &packet_raw, repl); 49 ipt_register_table(net, &packet_raw, repl);
50 kfree(repl); 50 kfree(repl);
51 if (IS_ERR(net->ipv4.iptable_raw)) 51 return PTR_RET(net->ipv4.iptable_raw);
52 return PTR_ERR(net->ipv4.iptable_raw);
53 return 0;
54} 52}
55 53
56static void __net_exit iptable_raw_net_exit(struct net *net) 54static void __net_exit iptable_raw_net_exit(struct net *net)
@@ -75,14 +73,10 @@ static int __init iptable_raw_init(void)
75 rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); 73 rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
76 if (IS_ERR(rawtable_ops)) { 74 if (IS_ERR(rawtable_ops)) {
77 ret = PTR_ERR(rawtable_ops); 75 ret = PTR_ERR(rawtable_ops);
78 goto cleanup_table; 76 unregister_pernet_subsys(&iptable_raw_net_ops);
79 } 77 }
80 78
81 return ret; 79 return ret;
82
83 cleanup_table:
84 unregister_pernet_subsys(&iptable_raw_net_ops);
85 return ret;
86} 80}
87 81
88static void __exit iptable_raw_fini(void) 82static void __exit iptable_raw_fini(void)
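PTR_RET(), used here and in iptable_security below, folds the common IS_ERR()/PTR_ERR()/return 0 tail into a single call; it was later renamed PTR_ERR_OR_ZERO(). Semantically it computes what this sketch shows (ptr_ret_sketch is a hypothetical name):

#include <linux/err.h>

/* sketch: what PTR_RET(ptr) evaluates to */
static inline int ptr_ret_sketch(const void *ptr)
{
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);
	return 0;
}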
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index be45bdc4c602..b283d8e2601a 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -66,10 +66,7 @@ static int __net_init iptable_security_net_init(struct net *net)
66 net->ipv4.iptable_security = 66 net->ipv4.iptable_security =
67 ipt_register_table(net, &security_table, repl); 67 ipt_register_table(net, &security_table, repl);
68 kfree(repl); 68 kfree(repl);
69 if (IS_ERR(net->ipv4.iptable_security)) 69 return PTR_RET(net->ipv4.iptable_security);
70 return PTR_ERR(net->ipv4.iptable_security);
71
72 return 0;
73} 70}
74 71
75static void __net_exit iptable_security_net_exit(struct net *net) 72static void __net_exit iptable_security_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index e7ff2dcab6ce..fcdd0c2406e6 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -29,11 +29,6 @@
29#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 29#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
30#include <net/netfilter/nf_log.h> 30#include <net/netfilter/nf_log.h>
31 31
32int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
33 struct nf_conn *ct,
34 enum ip_conntrack_info ctinfo);
35EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook);
36
37static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, 32static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
38 struct nf_conntrack_tuple *tuple) 33 struct nf_conntrack_tuple *tuple)
39{ 34{
@@ -149,7 +144,8 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
149 typeof(nf_nat_seq_adjust_hook) seq_adjust; 144 typeof(nf_nat_seq_adjust_hook) seq_adjust;
150 145
151 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); 146 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
152 if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) { 147 if (!seq_adjust ||
148 !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
153 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 149 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
154 return NF_DROP; 150 return NF_DROP;
155 } 151 }
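The new fourth argument gives the sequence-adjustment callback an explicit protocol-header offset instead of letting the helper assume an IPv4 header; the IPv4 call site above passes ip_hdrlen(skb). The shape of the hook after this change, written as an illustrative typedef (the kernel declares it as a plain function pointer, not a typedef):

/* illustrative typedef; the real declaration is a function pointer */
typedef int (*seq_adjust_fn)(struct sk_buff *skb, struct nf_conn *ct,
			     enum ip_conntrack_info ctinfo,
			     unsigned int protoff);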
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
deleted file mode 100644
index 3c04d24e2976..000000000000
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ /dev/null
@@ -1,85 +0,0 @@
1/* Amanda extension for TCP NAT alteration.
2 * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
3 * based on a copy of HW's ip_nat_irc.c as well as other modules
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/udp.h>
15
16#include <net/netfilter/nf_conntrack_helper.h>
17#include <net/netfilter/nf_conntrack_expect.h>
18#include <net/netfilter/nf_nat_helper.h>
19#include <net/netfilter/nf_nat_rule.h>
20#include <linux/netfilter/nf_conntrack_amanda.h>
21
22MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
23MODULE_DESCRIPTION("Amanda NAT helper");
24MODULE_LICENSE("GPL");
25MODULE_ALIAS("ip_nat_amanda");
26
27static unsigned int help(struct sk_buff *skb,
28 enum ip_conntrack_info ctinfo,
29 unsigned int matchoff,
30 unsigned int matchlen,
31 struct nf_conntrack_expect *exp)
32{
33 char buffer[sizeof("65535")];
34 u_int16_t port;
35 unsigned int ret;
36
37 /* Connection comes from client. */
38 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
39 exp->dir = IP_CT_DIR_ORIGINAL;
40
 41 /* When you see the packet, we need to NAT it the same as
42 * this one (ie. same IP: it will be TCP and master is UDP). */
43 exp->expectfn = nf_nat_follow_master;
44
45 /* Try to get same port: if not, try to change it. */
46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
47 int res;
48
49 exp->tuple.dst.u.tcp.port = htons(port);
50 res = nf_ct_expect_related(exp);
51 if (res == 0)
52 break;
53 else if (res != -EBUSY) {
54 port = 0;
55 break;
56 }
57 }
58
59 if (port == 0)
60 return NF_DROP;
61
62 sprintf(buffer, "%u", port);
63 ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
64 matchoff, matchlen,
65 buffer, strlen(buffer));
66 if (ret != NF_ACCEPT)
67 nf_ct_unexpect_related(exp);
68 return ret;
69}
70
71static void __exit nf_nat_amanda_fini(void)
72{
73 RCU_INIT_POINTER(nf_nat_amanda_hook, NULL);
74 synchronize_rcu();
75}
76
77static int __init nf_nat_amanda_init(void)
78{
79 BUG_ON(nf_nat_amanda_hook != NULL);
80 RCU_INIT_POINTER(nf_nat_amanda_hook, help);
81 return 0;
82}
83
84module_init(nf_nat_amanda_init);
85module_exit(nf_nat_amanda_fini);
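The deleted helper shows the port-search idiom these NAT helpers share: keep the client's own port when it is free, otherwise walk upward until conntrack accepts the expectation, with the u16 wrap back to 0 signalling exhaustion. A condensed sketch of the loop from help() above (pick_expect_port is a hypothetical wrapper name):

#include <net/netfilter/nf_conntrack_expect.h>

/* hypothetical wrapper around the search loop in help() above */
static u16 pick_expect_port(struct nf_conntrack_expect *exp)
{
	u16 port;
	int res;

	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
		exp->tuple.dst.u.tcp.port = htons(port);
		res = nf_ct_expect_related(exp);
		if (res == 0)
			return port;	/* free port claimed */
		if (res != -EBUSY)	/* hard error: stop searching */
			break;
	}
	return 0;	/* wrapped past 65535 (or error): none free */
}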
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
deleted file mode 100644
index 44b082fd48ab..000000000000
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ /dev/null
@@ -1,763 +0,0 @@
1/* NAT for netfilter; shared with compatibility layer. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/types.h>
13#include <linux/timer.h>
14#include <linux/skbuff.h>
15#include <linux/gfp.h>
16#include <net/checksum.h>
17#include <net/icmp.h>
18#include <net/ip.h>
19#include <net/tcp.h> /* For tcp_prot in getorigdst */
20#include <linux/icmp.h>
21#include <linux/udp.h>
22#include <linux/jhash.h>
23
24#include <linux/netfilter_ipv4.h>
25#include <net/netfilter/nf_conntrack.h>
26#include <net/netfilter/nf_conntrack_core.h>
27#include <net/netfilter/nf_nat.h>
28#include <net/netfilter/nf_nat_protocol.h>
29#include <net/netfilter/nf_nat_core.h>
30#include <net/netfilter/nf_nat_helper.h>
31#include <net/netfilter/nf_conntrack_helper.h>
32#include <net/netfilter/nf_conntrack_l3proto.h>
33#include <net/netfilter/nf_conntrack_zones.h>
34
35static DEFINE_SPINLOCK(nf_nat_lock);
36
37static struct nf_conntrack_l3proto *l3proto __read_mostly;
38
39#define MAX_IP_NAT_PROTO 256
40static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
41 __read_mostly;
42
43static inline const struct nf_nat_protocol *
44__nf_nat_proto_find(u_int8_t protonum)
45{
46 return rcu_dereference(nf_nat_protos[protonum]);
47}
48
49/* We keep an extra hash for each conntrack, for fast searching. */
50static inline unsigned int
51hash_by_src(const struct net *net, u16 zone,
52 const struct nf_conntrack_tuple *tuple)
53{
54 unsigned int hash;
55
 56 /* Original src, to ensure we map it consistently if possible. */
57 hash = jhash_3words((__force u32)tuple->src.u3.ip,
58 (__force u32)tuple->src.u.all ^ zone,
59 tuple->dst.protonum, nf_conntrack_hash_rnd);
60 return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
61}
62
63/* Is this tuple already taken? (not by us) */
64int
65nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
66 const struct nf_conn *ignored_conntrack)
67{
 68 /* Conntrack doesn't keep track of outgoing tuples; only
69 incoming ones. NAT means they don't have a fixed mapping,
70 so we invert the tuple and look for the incoming reply.
71
72 We could keep a separate hash if this proves too slow. */
73 struct nf_conntrack_tuple reply;
74
75 nf_ct_invert_tuplepr(&reply, tuple);
76 return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
77}
78EXPORT_SYMBOL(nf_nat_used_tuple);
79
80/* If we source map this tuple so reply looks like reply_tuple, will
 81 * that meet the constraints of range? */
82static int
83in_range(const struct nf_conntrack_tuple *tuple,
84 const struct nf_nat_ipv4_range *range)
85{
86 const struct nf_nat_protocol *proto;
87 int ret = 0;
88
89 /* If we are supposed to map IPs, then we must be in the
90 range specified, otherwise let this drag us onto a new src IP. */
91 if (range->flags & NF_NAT_RANGE_MAP_IPS) {
92 if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) ||
93 ntohl(tuple->src.u3.ip) > ntohl(range->max_ip))
94 return 0;
95 }
96
97 rcu_read_lock();
98 proto = __nf_nat_proto_find(tuple->dst.protonum);
99 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) ||
100 proto->in_range(tuple, NF_NAT_MANIP_SRC,
101 &range->min, &range->max))
102 ret = 1;
103 rcu_read_unlock();
104
105 return ret;
106}
107
108static inline int
109same_src(const struct nf_conn *ct,
110 const struct nf_conntrack_tuple *tuple)
111{
112 const struct nf_conntrack_tuple *t;
113
114 t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
115 return (t->dst.protonum == tuple->dst.protonum &&
116 t->src.u3.ip == tuple->src.u3.ip &&
117 t->src.u.all == tuple->src.u.all);
118}
119
120/* Only called for SRC manip */
121static int
122find_appropriate_src(struct net *net, u16 zone,
123 const struct nf_conntrack_tuple *tuple,
124 struct nf_conntrack_tuple *result,
125 const struct nf_nat_ipv4_range *range)
126{
127 unsigned int h = hash_by_src(net, zone, tuple);
128 const struct nf_conn_nat *nat;
129 const struct nf_conn *ct;
130 const struct hlist_node *n;
131
132 rcu_read_lock();
133 hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
134 ct = nat->ct;
135 if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
136 /* Copy source part from reply tuple. */
137 nf_ct_invert_tuplepr(result,
138 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
139 result->dst = tuple->dst;
140
141 if (in_range(result, range)) {
142 rcu_read_unlock();
143 return 1;
144 }
145 }
146 }
147 rcu_read_unlock();
148 return 0;
149}
150
151/* For [FUTURE] fragmentation handling, we want the least-used
152 src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
153 if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
154 1-65535, we don't do pro-rata allocation based on ports; we choose
155 the ip with the lowest src-ip/dst-ip/proto usage.
156*/
157static void
158find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
159 const struct nf_nat_ipv4_range *range,
160 const struct nf_conn *ct,
161 enum nf_nat_manip_type maniptype)
162{
163 __be32 *var_ipp;
164 /* Host order */
165 u_int32_t minip, maxip, j;
166
167 /* No IP mapping? Do nothing. */
168 if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
169 return;
170
171 if (maniptype == NF_NAT_MANIP_SRC)
172 var_ipp = &tuple->src.u3.ip;
173 else
174 var_ipp = &tuple->dst.u3.ip;
175
176 /* Fast path: only one choice. */
177 if (range->min_ip == range->max_ip) {
178 *var_ipp = range->min_ip;
179 return;
180 }
181
182 /* Hashing source and destination IPs gives a fairly even
183 * spread in practice (if there are a small number of IPs
184 * involved, there usually aren't that many connections
185 * anyway). The consistency means that servers see the same
186 * client coming from the same IP (some Internet Banking sites
187 * like this), even across reboots. */
188 minip = ntohl(range->min_ip);
189 maxip = ntohl(range->max_ip);
190 j = jhash_2words((__force u32)tuple->src.u3.ip,
191 range->flags & NF_NAT_RANGE_PERSISTENT ?
192 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0);
193 j = ((u64)j * (maxip - minip + 1)) >> 32;
194 *var_ipp = htonl(minip + j);
195}
196
197/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
198 * we change the source to map into the range. For NF_INET_PRE_ROUTING
199 * and NF_INET_LOCAL_OUT, we change the destination to map into the
200 * range. It might not be possible to get a unique tuple, but we try.
201 * At worst (or if we race), we will end up with a final duplicate in
202 * __ip_conntrack_confirm and drop the packet. */
203static void
204get_unique_tuple(struct nf_conntrack_tuple *tuple,
205 const struct nf_conntrack_tuple *orig_tuple,
206 const struct nf_nat_ipv4_range *range,
207 struct nf_conn *ct,
208 enum nf_nat_manip_type maniptype)
209{
210 struct net *net = nf_ct_net(ct);
211 const struct nf_nat_protocol *proto;
212 u16 zone = nf_ct_zone(ct);
213
214 /* 1) If this srcip/proto/src-proto-part is currently mapped,
215 and that same mapping gives a unique tuple within the given
216 range, use that.
217
218 This is only required for source (ie. NAT/masq) mappings.
219 So far, we don't do local source mappings, so multiple
220 manips are not an issue. */
221 if (maniptype == NF_NAT_MANIP_SRC &&
222 !(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
223 /* try the original tuple first */
224 if (in_range(orig_tuple, range)) {
225 if (!nf_nat_used_tuple(orig_tuple, ct)) {
226 *tuple = *orig_tuple;
227 return;
228 }
229 } else if (find_appropriate_src(net, zone, orig_tuple, tuple,
230 range)) {
231 pr_debug("get_unique_tuple: Found current src map\n");
232 if (!nf_nat_used_tuple(tuple, ct))
233 return;
234 }
235 }
236
237 /* 2) Select the least-used IP/proto combination in the given
238 range. */
239 *tuple = *orig_tuple;
240 find_best_ips_proto(zone, tuple, range, ct, maniptype);
241
242 /* 3) The per-protocol part of the manip is made to map into
243 the range to make a unique tuple. */
244
245 rcu_read_lock();
246 proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
247
248 /* Only bother mapping if it's not already in range and unique */
249 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
250 if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
251 if (proto->in_range(tuple, maniptype, &range->min,
252 &range->max) &&
253 (range->min.all == range->max.all ||
254 !nf_nat_used_tuple(tuple, ct)))
255 goto out;
256 } else if (!nf_nat_used_tuple(tuple, ct)) {
257 goto out;
258 }
259 }
260
261 /* Last chance: get the protocol to try to obtain a unique tuple. */
262 proto->unique_tuple(tuple, range, maniptype, ct);
263out:
264 rcu_read_unlock();
265}
266
267unsigned int
268nf_nat_setup_info(struct nf_conn *ct,
269 const struct nf_nat_ipv4_range *range,
270 enum nf_nat_manip_type maniptype)
271{
272 struct net *net = nf_ct_net(ct);
273 struct nf_conntrack_tuple curr_tuple, new_tuple;
274 struct nf_conn_nat *nat;
275
276 /* nat helper or nfctnetlink also setup binding */
277 nat = nfct_nat(ct);
278 if (!nat) {
279 nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
280 if (nat == NULL) {
281 pr_debug("failed to add NAT extension\n");
282 return NF_ACCEPT;
283 }
284 }
285
286 NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
287 maniptype == NF_NAT_MANIP_DST);
288 BUG_ON(nf_nat_initialized(ct, maniptype));
289
290 /* What we've got will look like inverse of reply. Normally
291 this is what is in the conntrack, except for prior
292 manipulations (future optimization: if num_manips == 0,
293 orig_tp =
294 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
295 nf_ct_invert_tuplepr(&curr_tuple,
296 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
297
298 get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
299
300 if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
301 struct nf_conntrack_tuple reply;
302
303 /* Alter conntrack table so will recognize replies. */
304 nf_ct_invert_tuplepr(&reply, &new_tuple);
305 nf_conntrack_alter_reply(ct, &reply);
306
307 /* Non-atomic: we own this at the moment. */
308 if (maniptype == NF_NAT_MANIP_SRC)
309 ct->status |= IPS_SRC_NAT;
310 else
311 ct->status |= IPS_DST_NAT;
312 }
313
314 if (maniptype == NF_NAT_MANIP_SRC) {
315 unsigned int srchash;
316
317 srchash = hash_by_src(net, nf_ct_zone(ct),
318 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
319 spin_lock_bh(&nf_nat_lock);
320 /* nf_conntrack_alter_reply might re-allocate extension area */
321 nat = nfct_nat(ct);
322 nat->ct = ct;
323 hlist_add_head_rcu(&nat->bysource,
324 &net->ipv4.nat_bysource[srchash]);
325 spin_unlock_bh(&nf_nat_lock);
326 }
327
328 /* It's done. */
329 if (maniptype == NF_NAT_MANIP_DST)
330 ct->status |= IPS_DST_NAT_DONE;
331 else
332 ct->status |= IPS_SRC_NAT_DONE;
333
334 return NF_ACCEPT;
335}
336EXPORT_SYMBOL(nf_nat_setup_info);
337
338/* Returns true if succeeded. */
339static bool
340manip_pkt(u_int16_t proto,
341 struct sk_buff *skb,
342 unsigned int iphdroff,
343 const struct nf_conntrack_tuple *target,
344 enum nf_nat_manip_type maniptype)
345{
346 struct iphdr *iph;
347 const struct nf_nat_protocol *p;
348
349 if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
350 return false;
351
352 iph = (void *)skb->data + iphdroff;
353
354 /* Manipulate the protocol part. */
355
356 /* rcu_read_lock()ed by nf_hook_slow */
357 p = __nf_nat_proto_find(proto);
358 if (!p->manip_pkt(skb, iphdroff, target, maniptype))
359 return false;
360
361 iph = (void *)skb->data + iphdroff;
362
363 if (maniptype == NF_NAT_MANIP_SRC) {
364 csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
365 iph->saddr = target->src.u3.ip;
366 } else {
367 csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
368 iph->daddr = target->dst.u3.ip;
369 }
370 return true;
371}
372
373/* Do packet manipulations according to nf_nat_setup_info. */
374unsigned int nf_nat_packet(struct nf_conn *ct,
375 enum ip_conntrack_info ctinfo,
376 unsigned int hooknum,
377 struct sk_buff *skb)
378{
379 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
380 unsigned long statusbit;
381 enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
382
383 if (mtype == NF_NAT_MANIP_SRC)
384 statusbit = IPS_SRC_NAT;
385 else
386 statusbit = IPS_DST_NAT;
387
388 /* Invert if this is reply dir. */
389 if (dir == IP_CT_DIR_REPLY)
390 statusbit ^= IPS_NAT_MASK;
391
392 /* Non-atomic: these bits don't change. */
393 if (ct->status & statusbit) {
394 struct nf_conntrack_tuple target;
395
396 /* We are aiming to look like inverse of other direction. */
397 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
398
399 if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype))
400 return NF_DROP;
401 }
402 return NF_ACCEPT;
403}
404EXPORT_SYMBOL_GPL(nf_nat_packet);
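/* Annotation, not part of the original file: the statusbit logic in
 * nf_nat_packet() above reflects that one NAT binding manifests in both
 * directions. An SNAT mapping rewrites the source of original-direction
 * packets and the destination of replies, hence the XOR with
 * IPS_NAT_MASK (IPS_SRC_NAT | IPS_DST_NAT) for the reply direction:
 *
 *	manip	direction	bit tested
 *	SRC	ORIGINAL	IPS_SRC_NAT
 *	SRC	REPLY		IPS_DST_NAT
 *	DST	ORIGINAL	IPS_DST_NAT
 *	DST	REPLY		IPS_SRC_NAT
 */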
405
406/* Dir is direction ICMP is coming from (opposite to packet it contains) */
407int nf_nat_icmp_reply_translation(struct nf_conn *ct,
408 enum ip_conntrack_info ctinfo,
409 unsigned int hooknum,
410 struct sk_buff *skb)
411{
412 struct {
413 struct icmphdr icmp;
414 struct iphdr ip;
415 } *inside;
416 struct nf_conntrack_tuple target;
417 int hdrlen = ip_hdrlen(skb);
418 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
419 unsigned long statusbit;
420 enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
421
422 if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
423 return 0;
424
425 inside = (void *)skb->data + hdrlen;
426
427 /* We're actually going to mangle it beyond trivial checksum
428 adjustment, so make sure the current checksum is correct. */
429 if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
430 return 0;
431
432 /* Must be RELATED */
433 NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED ||
434 skb->nfctinfo == IP_CT_RELATED_REPLY);
435
436 /* Redirects on non-null nats must be dropped, else they'll
437 start talking to each other without our translation, and be
438 confused... --RR */
439 if (inside->icmp.type == ICMP_REDIRECT) {
440 /* If NAT isn't finished, assume it and drop. */
441 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
442 return 0;
443
444 if (ct->status & IPS_NAT_MASK)
445 return 0;
446 }
447
448 if (manip == NF_NAT_MANIP_SRC)
449 statusbit = IPS_SRC_NAT;
450 else
451 statusbit = IPS_DST_NAT;
452
453 /* Invert if this is reply dir. */
454 if (dir == IP_CT_DIR_REPLY)
455 statusbit ^= IPS_NAT_MASK;
456
457 if (!(ct->status & statusbit))
458 return 1;
459
460 pr_debug("icmp_reply_translation: translating error %p manip %u "
461 "dir %s\n", skb, manip,
462 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
463
464 /* Change inner back to look like incoming packet. We do the
465 opposite manip on this hook to normal, because it might not
466 pass all hooks (locally-generated ICMP). Consider incoming
467 packet: PREROUTING (DST manip), routing produces ICMP, goes
468 through POSTROUTING (which must correct the DST manip). */
469 if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp),
470 &ct->tuplehash[!dir].tuple, !manip))
471 return 0;
472
473 if (skb->ip_summed != CHECKSUM_PARTIAL) {
474 /* Reload "inside": manip_pkt on the inner packet may have moved skb data. */
475 inside = (void *)skb->data + hdrlen;
476 inside->icmp.checksum = 0;
477 inside->icmp.checksum =
478 csum_fold(skb_checksum(skb, hdrlen,
479 skb->len - hdrlen, 0));
480 }
481
482 /* Change outer to look like the reply to an incoming packet
483 * (proto 0 means don't invert per-proto part). */
484 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
485 if (!manip_pkt(0, skb, 0, &target, manip))
486 return 0;
487
488 return 1;
489}
490EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
491
492/* Protocol registration. */
493int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
494{
495 int ret = 0;
496
497 spin_lock_bh(&nf_nat_lock);
498 if (rcu_dereference_protected(
499 nf_nat_protos[proto->protonum],
500 lockdep_is_held(&nf_nat_lock)
501 ) != &nf_nat_unknown_protocol) {
502 ret = -EBUSY;
503 goto out;
504 }
505 RCU_INIT_POINTER(nf_nat_protos[proto->protonum], proto);
506 out:
507 spin_unlock_bh(&nf_nat_lock);
508 return ret;
509}
510EXPORT_SYMBOL(nf_nat_protocol_register);
511
512/* No one stores the protocol anywhere; simply delete it. */
513void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
514{
515 spin_lock_bh(&nf_nat_lock);
516 RCU_INIT_POINTER(nf_nat_protos[proto->protonum],
517 &nf_nat_unknown_protocol);
518 spin_unlock_bh(&nf_nat_lock);
519 synchronize_rcu();
520}
521EXPORT_SYMBOL(nf_nat_protocol_unregister);
522
523/* No one is using conntrack by the time this is called. */
524static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
525{
526 struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
527
528 if (nat == NULL || nat->ct == NULL)
529 return;
530
531 NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
532
533 spin_lock_bh(&nf_nat_lock);
534 hlist_del_rcu(&nat->bysource);
535 spin_unlock_bh(&nf_nat_lock);
536}
537
538static void nf_nat_move_storage(void *new, void *old)
539{
540 struct nf_conn_nat *new_nat = new;
541 struct nf_conn_nat *old_nat = old;
542 struct nf_conn *ct = old_nat->ct;
543
544 if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
545 return;
546
547 spin_lock_bh(&nf_nat_lock);
548 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
549 spin_unlock_bh(&nf_nat_lock);
550}
551
552static struct nf_ct_ext_type nat_extend __read_mostly = {
553 .len = sizeof(struct nf_conn_nat),
554 .align = __alignof__(struct nf_conn_nat),
555 .destroy = nf_nat_cleanup_conntrack,
556 .move = nf_nat_move_storage,
557 .id = NF_CT_EXT_NAT,
558 .flags = NF_CT_EXT_F_PREALLOC,
559};
560
561#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
562
563#include <linux/netfilter/nfnetlink.h>
564#include <linux/netfilter/nfnetlink_conntrack.h>
565
566static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
567 [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 },
568 [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
569};
570
571static int nfnetlink_parse_nat_proto(struct nlattr *attr,
572 const struct nf_conn *ct,
573 struct nf_nat_ipv4_range *range)
574{
575 struct nlattr *tb[CTA_PROTONAT_MAX+1];
576 const struct nf_nat_protocol *npt;
577 int err;
578
579 err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
580 if (err < 0)
581 return err;
582
583 rcu_read_lock();
584 npt = __nf_nat_proto_find(nf_ct_protonum(ct));
585 if (npt->nlattr_to_range)
586 err = npt->nlattr_to_range(tb, range);
587 rcu_read_unlock();
588 return err;
589}
590
591static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
592 [CTA_NAT_MINIP] = { .type = NLA_U32 },
593 [CTA_NAT_MAXIP] = { .type = NLA_U32 },
594 [CTA_NAT_PROTO] = { .type = NLA_NESTED },
595};
596
597static int
598nfnetlink_parse_nat(const struct nlattr *nat,
599 const struct nf_conn *ct, struct nf_nat_ipv4_range *range)
600{
601 struct nlattr *tb[CTA_NAT_MAX+1];
602 int err;
603
604 memset(range, 0, sizeof(*range));
605
606 err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
607 if (err < 0)
608 return err;
609
610 if (tb[CTA_NAT_MINIP])
611 range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]);
612
613 if (!tb[CTA_NAT_MAXIP])
614 range->max_ip = range->min_ip;
615 else
616 range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]);
617
618 if (range->min_ip)
619 range->flags |= NF_NAT_RANGE_MAP_IPS;
620
621 if (!tb[CTA_NAT_PROTO])
622 return 0;
623
624 err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
625 if (err < 0)
626 return err;
627
628 return 0;
629}
630
631static int
632nfnetlink_parse_nat_setup(struct nf_conn *ct,
633 enum nf_nat_manip_type manip,
634 const struct nlattr *attr)
635{
636 struct nf_nat_ipv4_range range;
637
638 if (nfnetlink_parse_nat(attr, ct, &range) < 0)
639 return -EINVAL;
640 if (nf_nat_initialized(ct, manip))
641 return -EEXIST;
642
643 return nf_nat_setup_info(ct, &range, manip);
644}
645#else
646static int
647nfnetlink_parse_nat_setup(struct nf_conn *ct,
648 enum nf_nat_manip_type manip,
649 const struct nlattr *attr)
650{
651 return -EOPNOTSUPP;
652}
653#endif
654
655static int __net_init nf_nat_net_init(struct net *net)
656{
657 /* Leave them the same for the moment. */
658 net->ipv4.nat_htable_size = net->ct.htable_size;
659 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
660 if (!net->ipv4.nat_bysource)
661 return -ENOMEM;
662 return 0;
663}
664
665/* Clear NAT section of all conntracks, in case we're loaded again. */
666static int clean_nat(struct nf_conn *i, void *data)
667{
668 struct nf_conn_nat *nat = nfct_nat(i);
669
670 if (!nat)
671 return 0;
672 memset(nat, 0, sizeof(*nat));
673 i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
674 return 0;
675}
676
677static void __net_exit nf_nat_net_exit(struct net *net)
678{
679 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
680 synchronize_rcu();
681 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
682}
683
684static struct pernet_operations nf_nat_net_ops = {
685 .init = nf_nat_net_init,
686 .exit = nf_nat_net_exit,
687};
688
689static struct nf_ct_helper_expectfn follow_master_nat = {
690 .name = "nat-follow-master",
691 .expectfn = nf_nat_follow_master,
692};
693
694static struct nfq_ct_nat_hook nfq_ct_nat = {
695 .seq_adjust = nf_nat_tcp_seq_adjust,
696};
697
698static int __init nf_nat_init(void)
699{
700 size_t i;
701 int ret;
702
703 need_ipv4_conntrack();
704
705 ret = nf_ct_extend_register(&nat_extend);
706 if (ret < 0) {
707 printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
708 return ret;
709 }
710
711 ret = register_pernet_subsys(&nf_nat_net_ops);
712 if (ret < 0)
713 goto cleanup_extend;
714
715 /* Sew in builtin protocols. */
716 spin_lock_bh(&nf_nat_lock);
717 for (i = 0; i < MAX_IP_NAT_PROTO; i++)
718 RCU_INIT_POINTER(nf_nat_protos[i], &nf_nat_unknown_protocol);
719 RCU_INIT_POINTER(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp);
720 RCU_INIT_POINTER(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp);
721 RCU_INIT_POINTER(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp);
722 spin_unlock_bh(&nf_nat_lock);
723
724 /* Initialize fake conntrack so that NAT will skip it */
725 nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
726
727 l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
728
729 nf_ct_helper_expectfn_register(&follow_master_nat);
730
731 BUG_ON(nf_nat_seq_adjust_hook != NULL);
732 RCU_INIT_POINTER(nf_nat_seq_adjust_hook, nf_nat_seq_adjust);
733 BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
734 RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
735 nfnetlink_parse_nat_setup);
736 BUG_ON(nf_ct_nat_offset != NULL);
737 RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset);
738 RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat);
739 return 0;
740
741 cleanup_extend:
742 nf_ct_extend_unregister(&nat_extend);
743 return ret;
744}
745
746static void __exit nf_nat_cleanup(void)
747{
748 unregister_pernet_subsys(&nf_nat_net_ops);
749 nf_ct_l3proto_put(l3proto);
750 nf_ct_extend_unregister(&nat_extend);
751 nf_ct_helper_expectfn_unregister(&follow_master_nat);
752 RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL);
753 RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
754 RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
755 RCU_INIT_POINTER(nfq_ct_nat_hook, NULL);
756 synchronize_net();
757}
758
759MODULE_LICENSE("GPL");
760MODULE_ALIAS("nf-nat-ipv4");
761
762module_init(nf_nat_init);
763module_exit(nf_nat_cleanup);
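This IPv4-only NAT core is removed as the series makes the NAT core family-independent; one property worth noting carries over: the bysource hash keys only on the original source, zone and protocol, so all connections from one client land in the same bucket and an existing SNAT mapping can be reused consistently. Simplified sketch of that hash (hash_by_src_sketch is a stand-in; the real code scales by net->ipv4.nat_htable_size):

#include <linux/jhash.h>

/* sketch of hash_by_src() above; parameters flattened for clarity */
static unsigned int hash_by_src_sketch(u32 src_ip, u32 src_port_xor_zone,
				       u8 protonum, u32 rnd, u32 htable_size)
{
	u32 hash = jhash_3words(src_ip, src_port_xor_zone, protonum, rnd);

	/* multiply-shift maps the 32-bit hash into [0, htable_size) */
	return ((u64)hash * htable_size) >> 32;
}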
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
deleted file mode 100644
index e462a957d080..000000000000
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ /dev/null
@@ -1,137 +0,0 @@
1/* FTP extension for TCP NAT alteration. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/moduleparam.h>
13#include <linux/ip.h>
14#include <linux/tcp.h>
15#include <linux/netfilter_ipv4.h>
16#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_helper.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_conntrack_helper.h>
20#include <net/netfilter/nf_conntrack_expect.h>
21#include <linux/netfilter/nf_conntrack_ftp.h>
22
23MODULE_LICENSE("GPL");
24MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
25MODULE_DESCRIPTION("ftp NAT helper");
26MODULE_ALIAS("ip_nat_ftp");
27
28/* FIXME: Time out? --RR */
29
30static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type,
31 char *buffer, size_t buflen,
32 __be32 addr, u16 port)
33{
34 switch (type) {
35 case NF_CT_FTP_PORT:
36 case NF_CT_FTP_PASV:
37 return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
38 ((unsigned char *)&addr)[0],
39 ((unsigned char *)&addr)[1],
40 ((unsigned char *)&addr)[2],
41 ((unsigned char *)&addr)[3],
42 port >> 8,
43 port & 0xFF);
44 case NF_CT_FTP_EPRT:
45 return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port);
46 case NF_CT_FTP_EPSV:
47 return snprintf(buffer, buflen, "|||%u|", port);
48 }
49
50 return 0;
51}
52
53/* So, this packet has hit the connection tracking matching code.
54 Mangle it, and change the expectation to match the new version. */
55static unsigned int nf_nat_ftp(struct sk_buff *skb,
56 enum ip_conntrack_info ctinfo,
57 enum nf_ct_ftp_type type,
58 unsigned int matchoff,
59 unsigned int matchlen,
60 struct nf_conntrack_expect *exp)
61{
62 __be32 newip;
63 u_int16_t port;
64 int dir = CTINFO2DIR(ctinfo);
65 struct nf_conn *ct = exp->master;
66 char buffer[sizeof("|1|255.255.255.255|65535|")];
67 unsigned int buflen;
68
69 pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
70
71 /* Connection will come from wherever this packet goes, hence !dir */
72 newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
73 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
74 exp->dir = !dir;
75
 76 /* When you see the packet, we need to NAT it the same as
77 * this one. */
78 exp->expectfn = nf_nat_follow_master;
79
80 /* Try to get same port: if not, try to change it. */
81 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
82 int ret;
83
84 exp->tuple.dst.u.tcp.port = htons(port);
85 ret = nf_ct_expect_related(exp);
86 if (ret == 0)
87 break;
88 else if (ret != -EBUSY) {
89 port = 0;
90 break;
91 }
92 }
93
94 if (port == 0)
95 return NF_DROP;
96
97 buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port);
98 if (!buflen)
99 goto out;
100
101 pr_debug("calling nf_nat_mangle_tcp_packet\n");
102
103 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
104 matchlen, buffer, buflen))
105 goto out;
106
107 return NF_ACCEPT;
108
109out:
110 nf_ct_unexpect_related(exp);
111 return NF_DROP;
112}
113
114static void __exit nf_nat_ftp_fini(void)
115{
116 RCU_INIT_POINTER(nf_nat_ftp_hook, NULL);
117 synchronize_rcu();
118}
119
120static int __init nf_nat_ftp_init(void)
121{
122 BUG_ON(nf_nat_ftp_hook != NULL);
123 RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp);
124 return 0;
125}
126
127/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
128static int warn_set(const char *val, struct kernel_param *kp)
129{
130 printk(KERN_INFO KBUILD_MODNAME
131 ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
132 return 0;
133}
134module_param_call(ports, warn_set, NULL, NULL, 0);
135
136module_init(nf_nat_ftp_init);
137module_exit(nf_nat_ftp_fini);
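For reference, the string rewriting the deleted nf_nat_ftp_fmt_cmd() performs for PORT/PASV is plain decimal formatting: the four octets of the IPv4 address followed by the port split into high and low bytes. Standalone userspace sketch (fmt_port_cmd is a hypothetical name):

#include <stdio.h>
#include <stdint.h>

/* hypothetical userspace equivalent of the PORT/PASV branch above */
static int fmt_port_cmd(char *buf, size_t len,
			const uint8_t ip[4], uint16_t port)
{
	/* e.g. 10.0.0.1, port 51000 -> "10,0,0,1,199,56" */
	return snprintf(buf, len, "%u,%u,%u,%u,%u,%u",
			ip[0], ip[1], ip[2], ip[3],
			port >> 8, port & 0xFF);
}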
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index c6784a18c1c4..9c3db10b22d3 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -15,13 +15,12 @@
15 15
16#include <net/netfilter/nf_nat.h> 16#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_helper.h> 17#include <net/netfilter/nf_nat_helper.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_conntrack_helper.h> 18#include <net/netfilter/nf_conntrack_helper.h>
20#include <net/netfilter/nf_conntrack_expect.h> 19#include <net/netfilter/nf_conntrack_expect.h>
21#include <linux/netfilter/nf_conntrack_h323.h> 20#include <linux/netfilter/nf_conntrack_h323.h>
22 21
23/****************************************************************************/ 22/****************************************************************************/
24static int set_addr(struct sk_buff *skb, 23static int set_addr(struct sk_buff *skb, unsigned int protoff,
25 unsigned char **data, int dataoff, 24 unsigned char **data, int dataoff,
26 unsigned int addroff, __be32 ip, __be16 port) 25 unsigned int addroff, __be32 ip, __be16 port)
27{ 26{
@@ -40,7 +39,7 @@ static int set_addr(struct sk_buff *skb,
40 39
41 if (ip_hdr(skb)->protocol == IPPROTO_TCP) { 40 if (ip_hdr(skb)->protocol == IPPROTO_TCP) {
42 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, 41 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
43 addroff, sizeof(buf), 42 protoff, addroff, sizeof(buf),
44 (char *) &buf, sizeof(buf))) { 43 (char *) &buf, sizeof(buf))) {
45 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n"); 44 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
46 return -1; 45 return -1;
@@ -54,7 +53,7 @@ static int set_addr(struct sk_buff *skb,
54 *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff; 53 *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
55 } else { 54 } else {
56 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, 55 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
57 addroff, sizeof(buf), 56 protoff, addroff, sizeof(buf),
58 (char *) &buf, sizeof(buf))) { 57 (char *) &buf, sizeof(buf))) {
59 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n"); 58 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
60 return -1; 59 return -1;
@@ -69,22 +68,22 @@ static int set_addr(struct sk_buff *skb,
69} 68}
70 69
71/****************************************************************************/ 70/****************************************************************************/
72static int set_h225_addr(struct sk_buff *skb, 71static int set_h225_addr(struct sk_buff *skb, unsigned int protoff,
73 unsigned char **data, int dataoff, 72 unsigned char **data, int dataoff,
74 TransportAddress *taddr, 73 TransportAddress *taddr,
75 union nf_inet_addr *addr, __be16 port) 74 union nf_inet_addr *addr, __be16 port)
76{ 75{
77 return set_addr(skb, data, dataoff, taddr->ipAddress.ip, 76 return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip,
78 addr->ip, port); 77 addr->ip, port);
79} 78}
80 79
81/****************************************************************************/ 80/****************************************************************************/
 82static int set_h245_addr(struct sk_buff *skb, 81static int set_h245_addr(struct sk_buff *skb, unsigned int protoff,
83 unsigned char **data, int dataoff, 82 unsigned char **data, int dataoff,
84 H245_TransportAddress *taddr, 83 H245_TransportAddress *taddr,
85 union nf_inet_addr *addr, __be16 port) 84 union nf_inet_addr *addr, __be16 port)
86{ 85{
87 return set_addr(skb, data, dataoff, 86 return set_addr(skb, protoff, data, dataoff,
88 taddr->unicastAddress.iPAddress.network, 87 taddr->unicastAddress.iPAddress.network,
89 addr->ip, port); 88 addr->ip, port);
90} 89}
@@ -92,7 +91,7 @@ static int set_h245_addr(struct sk_buff *skb,
92/****************************************************************************/ 91/****************************************************************************/
93static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, 92static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
94 enum ip_conntrack_info ctinfo, 93 enum ip_conntrack_info ctinfo,
95 unsigned char **data, 94 unsigned int protoff, unsigned char **data,
96 TransportAddress *taddr, int count) 95 TransportAddress *taddr, int count)
97{ 96{
98 const struct nf_ct_h323_master *info = nfct_help_data(ct); 97 const struct nf_ct_h323_master *info = nfct_help_data(ct);
@@ -118,7 +117,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
118 &addr.ip, port, 117 &addr.ip, port,
119 &ct->tuplehash[!dir].tuple.dst.u3.ip, 118 &ct->tuplehash[!dir].tuple.dst.u3.ip,
120 info->sig_port[!dir]); 119 info->sig_port[!dir]);
121 return set_h225_addr(skb, data, 0, &taddr[i], 120 return set_h225_addr(skb, protoff, data, 0,
121 &taddr[i],
122 &ct->tuplehash[!dir]. 122 &ct->tuplehash[!dir].
123 tuple.dst.u3, 123 tuple.dst.u3,
124 info->sig_port[!dir]); 124 info->sig_port[!dir]);
@@ -129,7 +129,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
129 &addr.ip, port, 129 &addr.ip, port,
130 &ct->tuplehash[!dir].tuple.src.u3.ip, 130 &ct->tuplehash[!dir].tuple.src.u3.ip,
131 info->sig_port[!dir]); 131 info->sig_port[!dir]);
132 return set_h225_addr(skb, data, 0, &taddr[i], 132 return set_h225_addr(skb, protoff, data, 0,
133 &taddr[i],
133 &ct->tuplehash[!dir]. 134 &ct->tuplehash[!dir].
134 tuple.src.u3, 135 tuple.src.u3,
135 info->sig_port[!dir]); 136 info->sig_port[!dir]);
@@ -143,7 +144,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
143/****************************************************************************/ 144/****************************************************************************/
144static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, 145static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
145 enum ip_conntrack_info ctinfo, 146 enum ip_conntrack_info ctinfo,
146 unsigned char **data, 147 unsigned int protoff, unsigned char **data,
147 TransportAddress *taddr, int count) 148 TransportAddress *taddr, int count)
148{ 149{
149 int dir = CTINFO2DIR(ctinfo); 150 int dir = CTINFO2DIR(ctinfo);
@@ -159,7 +160,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
159 &addr.ip, ntohs(port), 160 &addr.ip, ntohs(port),
160 &ct->tuplehash[!dir].tuple.dst.u3.ip, 161 &ct->tuplehash[!dir].tuple.dst.u3.ip,
161 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port)); 162 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
162 return set_h225_addr(skb, data, 0, &taddr[i], 163 return set_h225_addr(skb, protoff, data, 0, &taddr[i],
163 &ct->tuplehash[!dir].tuple.dst.u3, 164 &ct->tuplehash[!dir].tuple.dst.u3,
164 ct->tuplehash[!dir].tuple. 165 ct->tuplehash[!dir].tuple.
165 dst.u.udp.port); 166 dst.u.udp.port);
@@ -172,7 +173,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
172/****************************************************************************/ 173/****************************************************************************/
173static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, 174static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
174 enum ip_conntrack_info ctinfo, 175 enum ip_conntrack_info ctinfo,
175 unsigned char **data, int dataoff, 176 unsigned int protoff, unsigned char **data, int dataoff,
176 H245_TransportAddress *taddr, 177 H245_TransportAddress *taddr,
177 __be16 port, __be16 rtp_port, 178 __be16 port, __be16 rtp_port,
178 struct nf_conntrack_expect *rtp_exp, 179 struct nf_conntrack_expect *rtp_exp,
@@ -244,7 +245,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
244 } 245 }
245 246
246 /* Modify signal */ 247 /* Modify signal */
247 if (set_h245_addr(skb, data, dataoff, taddr, 248 if (set_h245_addr(skb, protoff, data, dataoff, taddr,
248 &ct->tuplehash[!dir].tuple.dst.u3, 249 &ct->tuplehash[!dir].tuple.dst.u3,
249 htons((port & htons(1)) ? nated_port + 1 : 250 htons((port & htons(1)) ? nated_port + 1 :
250 nated_port)) == 0) { 251 nated_port)) == 0) {
@@ -275,7 +276,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
275/****************************************************************************/ 276/****************************************************************************/
276static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, 277static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
277 enum ip_conntrack_info ctinfo, 278 enum ip_conntrack_info ctinfo,
278 unsigned char **data, int dataoff, 279 unsigned int protoff, unsigned char **data, int dataoff,
279 H245_TransportAddress *taddr, __be16 port, 280 H245_TransportAddress *taddr, __be16 port,
280 struct nf_conntrack_expect *exp) 281 struct nf_conntrack_expect *exp)
281{ 282{
@@ -307,7 +308,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
307 } 308 }
308 309
309 /* Modify signal */ 310 /* Modify signal */
310 if (set_h245_addr(skb, data, dataoff, taddr, 311 if (set_h245_addr(skb, protoff, data, dataoff, taddr,
311 &ct->tuplehash[!dir].tuple.dst.u3, 312 &ct->tuplehash[!dir].tuple.dst.u3,
312 htons(nated_port)) < 0) { 313 htons(nated_port)) < 0) {
313 nf_ct_unexpect_related(exp); 314 nf_ct_unexpect_related(exp);
@@ -326,7 +327,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
326/****************************************************************************/ 327/****************************************************************************/
327static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, 328static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
328 enum ip_conntrack_info ctinfo, 329 enum ip_conntrack_info ctinfo,
329 unsigned char **data, int dataoff, 330 unsigned int protoff, unsigned char **data, int dataoff,
330 TransportAddress *taddr, __be16 port, 331 TransportAddress *taddr, __be16 port,
331 struct nf_conntrack_expect *exp) 332 struct nf_conntrack_expect *exp)
332{ 333{
@@ -363,7 +364,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
363 } 364 }
364 365
365 /* Modify signal */ 366 /* Modify signal */
366 if (set_h225_addr(skb, data, dataoff, taddr, 367 if (set_h225_addr(skb, protoff, data, dataoff, taddr,
367 &ct->tuplehash[!dir].tuple.dst.u3, 368 &ct->tuplehash[!dir].tuple.dst.u3,
368 htons(nated_port)) == 0) { 369 htons(nated_port)) == 0) {
369 /* Save ports */ 370 /* Save ports */
@@ -390,7 +391,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
390static void ip_nat_q931_expect(struct nf_conn *new, 391static void ip_nat_q931_expect(struct nf_conn *new,
391 struct nf_conntrack_expect *this) 392 struct nf_conntrack_expect *this)
392{ 393{
393 struct nf_nat_ipv4_range range; 394 struct nf_nat_range range;
394 395
395 if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ 396 if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */
396 nf_nat_follow_master(new, this); 397 nf_nat_follow_master(new, this);
@@ -402,21 +403,23 @@ static void ip_nat_q931_expect(struct nf_conn *new,
402 403
403 /* Change src to where master sends to */ 404 /* Change src to where master sends to */
404 range.flags = NF_NAT_RANGE_MAP_IPS; 405 range.flags = NF_NAT_RANGE_MAP_IPS;
405 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; 406 range.min_addr = range.max_addr =
407 new->tuplehash[!this->dir].tuple.src.u3;
406 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC); 408 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
407 409
408 /* For DST manip, map port here to where it's expected. */ 410 /* For DST manip, map port here to where it's expected. */
409 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); 411 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
410 range.min = range.max = this->saved_proto; 412 range.min_proto = range.max_proto = this->saved_proto;
411 range.min_ip = range.max_ip = 413 range.min_addr = range.max_addr =
412 new->master->tuplehash[!this->dir].tuple.src.u3.ip; 414 new->master->tuplehash[!this->dir].tuple.src.u3;
413 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); 415 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
414} 416}
415 417
416/****************************************************************************/ 418/****************************************************************************/
417static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, 419static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
418 enum ip_conntrack_info ctinfo, 420 enum ip_conntrack_info ctinfo,
419 unsigned char **data, TransportAddress *taddr, int idx, 421 unsigned int protoff, unsigned char **data,
422 TransportAddress *taddr, int idx,
420 __be16 port, struct nf_conntrack_expect *exp) 423 __be16 port, struct nf_conntrack_expect *exp)
421{ 424{
422 struct nf_ct_h323_master *info = nfct_help_data(ct); 425 struct nf_ct_h323_master *info = nfct_help_data(ct);
@@ -453,7 +456,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
453 } 456 }
454 457
455 /* Modify signal */ 458 /* Modify signal */
456 if (set_h225_addr(skb, data, 0, &taddr[idx], 459 if (set_h225_addr(skb, protoff, data, 0, &taddr[idx],
457 &ct->tuplehash[!dir].tuple.dst.u3, 460 &ct->tuplehash[!dir].tuple.dst.u3,
458 htons(nated_port)) == 0) { 461 htons(nated_port)) == 0) {
459 /* Save ports */ 462 /* Save ports */
@@ -464,7 +467,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
464 if (idx > 0 && 467 if (idx > 0 &&
465 get_h225_addr(ct, *data, &taddr[0], &addr, &port) && 468 get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
466 (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { 469 (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
467 set_h225_addr(skb, data, 0, &taddr[0], 470 set_h225_addr(skb, protoff, data, 0, &taddr[0],
468 &ct->tuplehash[!dir].tuple.dst.u3, 471 &ct->tuplehash[!dir].tuple.dst.u3,
469 info->sig_port[!dir]); 472 info->sig_port[!dir]);
470 } 473 }
@@ -487,26 +490,28 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
487static void ip_nat_callforwarding_expect(struct nf_conn *new, 490static void ip_nat_callforwarding_expect(struct nf_conn *new,
488 struct nf_conntrack_expect *this) 491 struct nf_conntrack_expect *this)
489{ 492{
490 struct nf_nat_ipv4_range range; 493 struct nf_nat_range range;
491 494
492 /* This must be a fresh one. */ 495 /* This must be a fresh one. */
493 BUG_ON(new->status & IPS_NAT_DONE_MASK); 496 BUG_ON(new->status & IPS_NAT_DONE_MASK);
494 497
495 /* Change src to where master sends to */ 498 /* Change src to where master sends to */
496 range.flags = NF_NAT_RANGE_MAP_IPS; 499 range.flags = NF_NAT_RANGE_MAP_IPS;
497 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; 500 range.min_addr = range.max_addr =
501 new->tuplehash[!this->dir].tuple.src.u3;
498 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC); 502 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
499 503
500 /* For DST manip, map port here to where it's expected. */ 504 /* For DST manip, map port here to where it's expected. */
501 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); 505 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
502 range.min = range.max = this->saved_proto; 506 range.min_proto = range.max_proto = this->saved_proto;
503 range.min_ip = range.max_ip = this->saved_ip; 507 range.min_addr = range.max_addr = this->saved_addr;
504 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); 508 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
505} 509}
506 510
507/****************************************************************************/ 511/****************************************************************************/
508static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, 512static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
509 enum ip_conntrack_info ctinfo, 513 enum ip_conntrack_info ctinfo,
514 unsigned int protoff,
510 unsigned char **data, int dataoff, 515 unsigned char **data, int dataoff,
511 TransportAddress *taddr, __be16 port, 516 TransportAddress *taddr, __be16 port,
512 struct nf_conntrack_expect *exp) 517 struct nf_conntrack_expect *exp)
@@ -515,7 +520,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
515 u_int16_t nated_port; 520 u_int16_t nated_port;
516 521
517 /* Set expectations for NAT */ 522 /* Set expectations for NAT */
518 exp->saved_ip = exp->tuple.dst.u3.ip; 523 exp->saved_addr = exp->tuple.dst.u3;
519 exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip; 524 exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
520 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; 525 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
521 exp->expectfn = ip_nat_callforwarding_expect; 526 exp->expectfn = ip_nat_callforwarding_expect;
@@ -541,7 +546,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
541 } 546 }
542 547
543 /* Modify signal */ 548 /* Modify signal */
544 if (!set_h225_addr(skb, data, dataoff, taddr, 549 if (!set_h225_addr(skb, protoff, data, dataoff, taddr,
545 &ct->tuplehash[!dir].tuple.dst.u3, 550 &ct->tuplehash[!dir].tuple.dst.u3,
546 htons(nated_port)) == 0) { 551 htons(nated_port)) == 0) {
547 nf_ct_unexpect_related(exp); 552 nf_ct_unexpect_related(exp);
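The recurring change in this file is mechanical: the IPv4-only struct nf_nat_ipv4_range gives way to the family-agnostic struct nf_nat_range, so min_ip/max_ip become min_addr/max_addr and min/max become min_proto/max_proto. Abridged shapes of the two structs as used in the hunks above (kernel types assumed, other members elided):

struct nf_nat_ipv4_range {			/* old, IPv4-only */
	unsigned int flags;
	__be32 min_ip, max_ip;
	union nf_conntrack_man_proto min, max;
};

struct nf_nat_range {				/* new, v4/v6 capable */
	unsigned int flags;
	union nf_inet_addr min_addr, max_addr;
	union nf_conntrack_man_proto min_proto, max_proto;
};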
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
deleted file mode 100644
index 2e59ad0b90ca..000000000000
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ /dev/null
@@ -1,458 +0,0 @@
 1/* nf_nat_helper.c - generic support functions for NAT helpers
2 *
3 * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
4 * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10#include <linux/module.h>
11#include <linux/gfp.h>
12#include <linux/kmod.h>
13#include <linux/types.h>
14#include <linux/timer.h>
15#include <linux/skbuff.h>
16#include <linux/tcp.h>
17#include <linux/udp.h>
18#include <net/checksum.h>
19#include <net/tcp.h>
20#include <net/route.h>
21
22#include <linux/netfilter_ipv4.h>
23#include <net/netfilter/nf_conntrack.h>
24#include <net/netfilter/nf_conntrack_helper.h>
25#include <net/netfilter/nf_conntrack_ecache.h>
26#include <net/netfilter/nf_conntrack_expect.h>
27#include <net/netfilter/nf_nat.h>
28#include <net/netfilter/nf_nat_protocol.h>
29#include <net/netfilter/nf_nat_core.h>
30#include <net/netfilter/nf_nat_helper.h>
31
32#define DUMP_OFFSET(x) \
33 pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \
34 x->offset_before, x->offset_after, x->correction_pos);
35
36static DEFINE_SPINLOCK(nf_nat_seqofs_lock);
37
38/* Setup TCP sequence correction given this change at this sequence */
39static inline void
40adjust_tcp_sequence(u32 seq,
41 int sizediff,
42 struct nf_conn *ct,
43 enum ip_conntrack_info ctinfo)
44{
45 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
46 struct nf_conn_nat *nat = nfct_nat(ct);
47 struct nf_nat_seq *this_way = &nat->seq[dir];
48
49 pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
50 seq, sizediff);
51
52 pr_debug("adjust_tcp_sequence: Seq_offset before: ");
53 DUMP_OFFSET(this_way);
54
55 spin_lock_bh(&nf_nat_seqofs_lock);
56
57	 /* SYN adjust. If it's uninitialized, or this is after the last
58	  * correction, record it: we don't handle more than one
59	  * adjustment in the window, but do deal with the common case of a
60	  * retransmit */
61 if (this_way->offset_before == this_way->offset_after ||
62 before(this_way->correction_pos, seq)) {
63 this_way->correction_pos = seq;
64 this_way->offset_before = this_way->offset_after;
65 this_way->offset_after += sizediff;
66 }
67 spin_unlock_bh(&nf_nat_seqofs_lock);
68
69 pr_debug("adjust_tcp_sequence: Seq_offset after: ");
70 DUMP_OFFSET(this_way);
71}
72
73/* Get the offset value, for conntrack */
74s16 nf_nat_get_offset(const struct nf_conn *ct,
75 enum ip_conntrack_dir dir,
76 u32 seq)
77{
78 struct nf_conn_nat *nat = nfct_nat(ct);
79 struct nf_nat_seq *this_way;
80 s16 offset;
81
82 if (!nat)
83 return 0;
84
85 this_way = &nat->seq[dir];
86 spin_lock_bh(&nf_nat_seqofs_lock);
87 offset = after(seq, this_way->correction_pos)
88 ? this_way->offset_after : this_way->offset_before;
89 spin_unlock_bh(&nf_nat_seqofs_lock);
90
91 return offset;
92}
93EXPORT_SYMBOL_GPL(nf_nat_get_offset);
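The bookkeeping above keeps a single correction point per direction: sequence numbers at or before correction_pos still get offset_before, anything later gets offset_after. A minimal user-space model of that rule follows (a sketch, not part of the patch; all names are illustrative, and seq_after() is re-derived from the kernel's wrapping 32-bit comparison):

#include <stdint.h>
#include <stdio.h>

struct seq_adjust {
	uint32_t correction_pos;	/* seq where the last size change happened */
	int16_t offset_before, offset_after;
};

/* true if a comes after b in wrapping 32-bit sequence space */
static int seq_after(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

static int16_t pick_offset(const struct seq_adjust *s, uint32_t seq)
{
	return seq_after(seq, s->correction_pos) ? s->offset_after
						 : s->offset_before;
}

int main(void)
{
	/* a helper grew the payload by 4 bytes at sequence 1000 */
	struct seq_adjust s = { 1000, 0, 4 };

	printf("seq  900 -> %+d\n", pick_offset(&s, 900));	/* +0 */
	printf("seq 1500 -> %+d\n", pick_offset(&s, 1500));	/* +4 */
	return 0;
}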
94
95/* Frobs data inside this packet, which is linear. */
96static void mangle_contents(struct sk_buff *skb,
97 unsigned int dataoff,
98 unsigned int match_offset,
99 unsigned int match_len,
100 const char *rep_buffer,
101 unsigned int rep_len)
102{
103 unsigned char *data;
104
105 BUG_ON(skb_is_nonlinear(skb));
106 data = skb_network_header(skb) + dataoff;
107
108 /* move post-replacement */
109 memmove(data + match_offset + rep_len,
110 data + match_offset + match_len,
111 skb->tail - (skb->network_header + dataoff +
112 match_offset + match_len));
113
114 /* insert data from buffer */
115 memcpy(data + match_offset, rep_buffer, rep_len);
116
117 /* update skb info */
118 if (rep_len > match_len) {
119 pr_debug("nf_nat_mangle_packet: Extending packet by "
120 "%u from %u bytes\n", rep_len - match_len, skb->len);
121 skb_put(skb, rep_len - match_len);
122 } else {
123 pr_debug("nf_nat_mangle_packet: Shrinking packet from "
124 "%u from %u bytes\n", match_len - rep_len, skb->len);
125 __skb_trim(skb, skb->len + rep_len - match_len);
126 }
127
128 /* fix IP hdr checksum information */
129 ip_hdr(skb)->tot_len = htons(skb->len);
130 ip_send_check(ip_hdr(skb));
131}
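mangle_contents() is, at its core, a substring splice on a linear buffer: shift the tail to its post-replacement position, drop the replacement in, then fix the recorded lengths. A standalone sketch of just the splice (the kernel path additionally guarantees tailroom via enlarge_skb() and refreshes the IP header, as above; names here are illustrative):

#include <assert.h>
#include <stdio.h>
#include <string.h>

static size_t splice_buf(char *buf, size_t len, size_t cap,
			 size_t off, size_t match_len,
			 const char *rep, size_t rep_len)
{
	assert(len - match_len + rep_len <= cap);
	/* move post-replacement data out of (or into) the way */
	memmove(buf + off + rep_len, buf + off + match_len,
		len - off - match_len);
	/* insert data from the replacement buffer */
	memcpy(buf + off, rep, rep_len);
	return len - match_len + rep_len;
}

int main(void)
{
	char pkt[64] = "PORT 10,0,0,1,4,1";
	size_t len = strlen(pkt);

	len = splice_buf(pkt, len, sizeof(pkt), 5, len - 5,
			 "192,0,2,1,200,10", 16);
	printf("%.*s\n", (int)len, pkt);	/* PORT 192,0,2,1,200,10 */
	return 0;
}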
132
133/* Unusual, but possible case. */
134static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
135{
136 if (skb->len + extra > 65535)
137 return 0;
138
139 if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC))
140 return 0;
141
142 return 1;
143}
144
145void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
146 __be32 seq, s16 off)
147{
148 if (!off)
149 return;
150 set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
151 adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo);
152 nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
153}
154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
155
156void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
157 u32 ctinfo, int off)
158{
159 const struct tcphdr *th;
160
161 if (nf_ct_protonum(ct) != IPPROTO_TCP)
162 return;
163
164	th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb));
165 nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
166}
167EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust);
168
169static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data,
170 int datalen, __sum16 *check, int oldlen)
171{
172 struct rtable *rt = skb_rtable(skb);
173
174 if (skb->ip_summed != CHECKSUM_PARTIAL) {
175 if (!(rt->rt_flags & RTCF_LOCAL) &&
176 (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
177 skb->ip_summed = CHECKSUM_PARTIAL;
178 skb->csum_start = skb_headroom(skb) +
179 skb_network_offset(skb) +
180 iph->ihl * 4;
181 skb->csum_offset = (void *)check - data;
182 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
183 datalen, iph->protocol, 0);
184 } else {
185 *check = 0;
186 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
187 datalen, iph->protocol,
188 csum_partial(data, datalen,
189 0));
190 if (iph->protocol == IPPROTO_UDP && !*check)
191 *check = CSUM_MANGLED_0;
192 }
193 } else
194 inet_proto_csum_replace2(check, skb,
195 htons(oldlen), htons(datalen), 1);
196}
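When the packet is not offloaded (CHECKSUM_PARTIAL), the inet_proto_csum_replace*() calls used above apply the incremental update from RFC 1624, HC' = ~(~HC + ~m + m'), rather than summing the whole payload again. A 16-bit model of that identity (standalone sketch; function names are illustrative):

#include <stdint.h>
#include <stdio.h>

/* one's-complement add with end-around carry */
static uint16_t csum16_add(uint16_t a, uint16_t b)
{
	uint32_t sum = (uint32_t)a + b;
	return (uint16_t)(sum + (sum >> 16));
}

static uint16_t csum_words(const uint16_t *w, int n)
{
	uint16_t sum = 0;
	while (n--)
		sum = csum16_add(sum, *w++);
	return ~sum;
}

/* RFC 1624: HC' = ~(~HC + ~m + m') */
static uint16_t csum_replace(uint16_t check, uint16_t old_w, uint16_t new_w)
{
	return ~csum16_add(csum16_add(~check, ~old_w), new_w);
}

int main(void)
{
	uint16_t w[4] = { 0x4500, 0x0054, 0xc0a8, 0x0001 };
	uint16_t check = csum_words(w, 4), old_w = w[3];

	w[3] = 0xc0ff;		/* rewrite one 16-bit word */
	printf("incremental %04x, recomputed %04x\n",
	       csum_replace(check, old_w, w[3]), csum_words(w, 4));
	return 0;
}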
197
198/* Generic function for mangling variable-length address changes inside
199 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
200 * command in FTP).
201 *
202 * Takes care of all the nasty sequence number changes, checksumming,
203 * skb enlargement, ...
204 *
205 */
206int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
207 struct nf_conn *ct,
208 enum ip_conntrack_info ctinfo,
209 unsigned int match_offset,
210 unsigned int match_len,
211 const char *rep_buffer,
212 unsigned int rep_len, bool adjust)
213{
214 struct iphdr *iph;
215 struct tcphdr *tcph;
216 int oldlen, datalen;
217
218 if (!skb_make_writable(skb, skb->len))
219 return 0;
220
221 if (rep_len > match_len &&
222 rep_len - match_len > skb_tailroom(skb) &&
223 !enlarge_skb(skb, rep_len - match_len))
224 return 0;
225
226 SKB_LINEAR_ASSERT(skb);
227
228 iph = ip_hdr(skb);
229 tcph = (void *)iph + iph->ihl*4;
230
231 oldlen = skb->len - iph->ihl*4;
232 mangle_contents(skb, iph->ihl*4 + tcph->doff*4,
233 match_offset, match_len, rep_buffer, rep_len);
234
235 datalen = skb->len - iph->ihl*4;
236 nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
237
238 if (adjust && rep_len != match_len)
239 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
240 (int)rep_len - (int)match_len);
241
242 return 1;
243}
244EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
245
246/* Generic function for mangling variable-length address changes inside
247 * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
248 * command in the Amanda protocol)
249 *
250 * Takes care of all the nasty sequence number changes, checksumming,
251 * skb enlargement, ...
252 *
253 * XXX - This function could be merged with nf_nat_mangle_tcp_packet which
254 * should be fairly easy to do.
255 */
256int
257nf_nat_mangle_udp_packet(struct sk_buff *skb,
258 struct nf_conn *ct,
259 enum ip_conntrack_info ctinfo,
260 unsigned int match_offset,
261 unsigned int match_len,
262 const char *rep_buffer,
263 unsigned int rep_len)
264{
265 struct iphdr *iph;
266 struct udphdr *udph;
267 int datalen, oldlen;
268
269 if (!skb_make_writable(skb, skb->len))
270 return 0;
271
272 if (rep_len > match_len &&
273 rep_len - match_len > skb_tailroom(skb) &&
274 !enlarge_skb(skb, rep_len - match_len))
275 return 0;
276
277 iph = ip_hdr(skb);
278 udph = (void *)iph + iph->ihl*4;
279
280 oldlen = skb->len - iph->ihl*4;
281 mangle_contents(skb, iph->ihl*4 + sizeof(*udph),
282 match_offset, match_len, rep_buffer, rep_len);
283
284 /* update the length of the UDP packet */
285 datalen = skb->len - iph->ihl*4;
286 udph->len = htons(datalen);
287
288 /* fix udp checksum if udp checksum was previously calculated */
289 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
290 return 1;
291
292 nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
293
294 return 1;
295}
296EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
297
298/* Adjust one found SACK option including checksum correction */
299static void
300sack_adjust(struct sk_buff *skb,
301 struct tcphdr *tcph,
302 unsigned int sackoff,
303 unsigned int sackend,
304 struct nf_nat_seq *natseq)
305{
306 while (sackoff < sackend) {
307 struct tcp_sack_block_wire *sack;
308 __be32 new_start_seq, new_end_seq;
309
310 sack = (void *)skb->data + sackoff;
311 if (after(ntohl(sack->start_seq) - natseq->offset_before,
312 natseq->correction_pos))
313 new_start_seq = htonl(ntohl(sack->start_seq)
314 - natseq->offset_after);
315 else
316 new_start_seq = htonl(ntohl(sack->start_seq)
317 - natseq->offset_before);
318
319 if (after(ntohl(sack->end_seq) - natseq->offset_before,
320 natseq->correction_pos))
321 new_end_seq = htonl(ntohl(sack->end_seq)
322 - natseq->offset_after);
323 else
324 new_end_seq = htonl(ntohl(sack->end_seq)
325 - natseq->offset_before);
326
327 pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
328 ntohl(sack->start_seq), new_start_seq,
329 ntohl(sack->end_seq), new_end_seq);
330
331 inet_proto_csum_replace4(&tcph->check, skb,
332 sack->start_seq, new_start_seq, 0);
333 inet_proto_csum_replace4(&tcph->check, skb,
334 sack->end_seq, new_end_seq, 0);
335 sack->start_seq = new_start_seq;
336 sack->end_seq = new_end_seq;
337 sackoff += sizeof(*sack);
338 }
339}
340
341/* TCP SACK sequence number adjustment */
342static inline unsigned int
343nf_nat_sack_adjust(struct sk_buff *skb,
344 struct tcphdr *tcph,
345 struct nf_conn *ct,
346 enum ip_conntrack_info ctinfo)
347{
348 unsigned int dir, optoff, optend;
349 struct nf_conn_nat *nat = nfct_nat(ct);
350
351 optoff = ip_hdrlen(skb) + sizeof(struct tcphdr);
352 optend = ip_hdrlen(skb) + tcph->doff * 4;
353
354 if (!skb_make_writable(skb, optend))
355 return 0;
356
357 dir = CTINFO2DIR(ctinfo);
358
359 while (optoff < optend) {
360 /* Usually: option, length. */
361 unsigned char *op = skb->data + optoff;
362
363 switch (op[0]) {
364 case TCPOPT_EOL:
365 return 1;
366 case TCPOPT_NOP:
367 optoff++;
368 continue;
369 default:
370 /* no partial options */
371 if (optoff + 1 == optend ||
372 optoff + op[1] > optend ||
373 op[1] < 2)
374 return 0;
375 if (op[0] == TCPOPT_SACK &&
376 op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
377 ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
378 sack_adjust(skb, tcph, optoff+2,
379 optoff+op[1], &nat->seq[!dir]);
380 optoff += op[1];
381 }
382 }
383 return 1;
384}
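The walk above is the standard TCP option TLV parse: EOL terminates the list, NOP is a single byte, and every other option is kind/length/data with the length byte covering all three, rejecting truncated or absurd lengths. The same loop in standalone form, with a callback standing in for the SACK rewrite (a sketch; names are illustrative):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define TCPOPT_EOL	0
#define TCPOPT_NOP	1

/* returns 0 on malformed options, 1 otherwise */
static int walk_tcp_options(const uint8_t *opt, size_t len,
			    void (*cb)(uint8_t kind,
				       const uint8_t *data, uint8_t dlen))
{
	size_t off = 0;

	while (off < len) {
		uint8_t kind = opt[off];

		if (kind == TCPOPT_EOL)
			return 1;
		if (kind == TCPOPT_NOP) {
			off++;
			continue;
		}
		/* no partial options: length byte present, sane, in bounds */
		if (off + 1 >= len || opt[off + 1] < 2 ||
		    off + opt[off + 1] > len)
			return 0;
		cb(kind, opt + off + 2, opt[off + 1] - 2);
		off += opt[off + 1];
	}
	return 1;
}

static void show(uint8_t kind, const uint8_t *data, uint8_t dlen)
{
	(void)data;
	printf("option %u, %u data bytes\n", kind, dlen);
}

int main(void)
{
	/* NOP, NOP, SACK (kind 5) with one block, EOL */
	uint8_t opts[] = { 1, 1, 5, 10, 0,0,0,1, 0,0,0,2, 0 };
	return !walk_tcp_options(opts, sizeof(opts), show);
}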
385
386/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */
387int
388nf_nat_seq_adjust(struct sk_buff *skb,
389 struct nf_conn *ct,
390 enum ip_conntrack_info ctinfo)
391{
392 struct tcphdr *tcph;
393 int dir;
394 __be32 newseq, newack;
395 s16 seqoff, ackoff;
396 struct nf_conn_nat *nat = nfct_nat(ct);
397 struct nf_nat_seq *this_way, *other_way;
398
399 dir = CTINFO2DIR(ctinfo);
400
401 this_way = &nat->seq[dir];
402 other_way = &nat->seq[!dir];
403
404 if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
405 return 0;
406
407 tcph = (void *)skb->data + ip_hdrlen(skb);
408 if (after(ntohl(tcph->seq), this_way->correction_pos))
409 seqoff = this_way->offset_after;
410 else
411 seqoff = this_way->offset_before;
412
413 if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
414 other_way->correction_pos))
415 ackoff = other_way->offset_after;
416 else
417 ackoff = other_way->offset_before;
418
419 newseq = htonl(ntohl(tcph->seq) + seqoff);
420 newack = htonl(ntohl(tcph->ack_seq) - ackoff);
421
422 inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
423 inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
424
425 pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
426 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
427 ntohl(newack));
428
429 tcph->seq = newseq;
430 tcph->ack_seq = newack;
431
432 return nf_nat_sack_adjust(skb, tcph, ct, ctinfo);
433}
434
435/* Setup NAT on this expected conntrack so it follows master. */
436/* If we fail to get a free NAT slot, we'll get dropped on confirm */
437void nf_nat_follow_master(struct nf_conn *ct,
438 struct nf_conntrack_expect *exp)
439{
440 struct nf_nat_ipv4_range range;
441
442 /* This must be a fresh one. */
443 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
444
445 /* Change src to where master sends to */
446 range.flags = NF_NAT_RANGE_MAP_IPS;
447 range.min_ip = range.max_ip
448 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
449 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
450
451 /* For DST manip, map port here to where it's expected. */
452 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
453 range.min = range.max = exp->saved_proto;
454 range.min_ip = range.max_ip
455 = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
456 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
457}
458EXPORT_SYMBOL(nf_nat_follow_master);
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
deleted file mode 100644
index 979ae165f4ef..000000000000
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ /dev/null
@@ -1,99 +0,0 @@
1/* IRC extension for TCP NAT alteration.
2 *
3 * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
4 * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
5 * based on a copy of RR's ip_nat_ftp.c
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/module.h>
14#include <linux/moduleparam.h>
15#include <linux/tcp.h>
16#include <linux/kernel.h>
17
18#include <net/netfilter/nf_nat.h>
19#include <net/netfilter/nf_nat_helper.h>
20#include <net/netfilter/nf_nat_rule.h>
21#include <net/netfilter/nf_conntrack_helper.h>
22#include <net/netfilter/nf_conntrack_expect.h>
23#include <linux/netfilter/nf_conntrack_irc.h>
24
25MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
26MODULE_DESCRIPTION("IRC (DCC) NAT helper");
27MODULE_LICENSE("GPL");
28MODULE_ALIAS("ip_nat_irc");
29
30static unsigned int help(struct sk_buff *skb,
31 enum ip_conntrack_info ctinfo,
32 unsigned int matchoff,
33 unsigned int matchlen,
34 struct nf_conntrack_expect *exp)
35{
36	char buffer[sizeof("4294967295 65535")];
37 u_int32_t ip;
38 u_int16_t port;
39 unsigned int ret;
40
41 /* Reply comes from server. */
42 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
43 exp->dir = IP_CT_DIR_REPLY;
44 exp->expectfn = nf_nat_follow_master;
45
46 /* Try to get same port: if not, try to change it. */
47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
48 int ret;
49
50 exp->tuple.dst.u.tcp.port = htons(port);
51 ret = nf_ct_expect_related(exp);
52 if (ret == 0)
53 break;
54 else if (ret != -EBUSY) {
55 port = 0;
56 break;
57 }
58 }
59
60 if (port == 0)
61 return NF_DROP;
62
63 ip = ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip);
64 sprintf(buffer, "%u %u", ip, port);
65 pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n",
66 buffer, &ip, port);
67
68 ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo,
69 matchoff, matchlen, buffer,
70 strlen(buffer));
71 if (ret != NF_ACCEPT)
72 nf_ct_unexpect_related(exp);
73 return ret;
74}
75
76static void __exit nf_nat_irc_fini(void)
77{
78 RCU_INIT_POINTER(nf_nat_irc_hook, NULL);
79 synchronize_rcu();
80}
81
82static int __init nf_nat_irc_init(void)
83{
84 BUG_ON(nf_nat_irc_hook != NULL);
85 RCU_INIT_POINTER(nf_nat_irc_hook, help);
86 return 0;
87}
88
89/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
90static int warn_set(const char *val, struct kernel_param *kp)
91{
92 printk(KERN_INFO KBUILD_MODNAME
93 ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
94 return 0;
95}
96module_param_call(ports, warn_set, NULL, NULL, 0);
97
98module_init(nf_nat_irc_init);
99module_exit(nf_nat_irc_fini);
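That buffer is sized for the worst-case DCC argument string, because DCC sends the IPv4 address as a plain decimal 32-bit integer followed by a port, which is exactly what the sprintf("%u %u", ...) above produces. A quick standalone check of the encoding (a sketch, illustrative only):

#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
	struct in_addr a;

	if (inet_pton(AF_INET, "192.0.2.1", &a) != 1)
		return 1;
	/* DCC wire form: host-order address as decimal, then the port */
	printf("%u %u\n", ntohl(a.s_addr), 4000);	/* 3221225985 4000 */
	return 0;
}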
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
new file mode 100644
index 000000000000..d8b2e14efddc
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -0,0 +1,281 @@
1/*
2 * (C) 1999-2001 Paul `Rusty' Russell
3 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
4 * (C) 2011 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/types.h>
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/ip.h>
15#include <linux/icmp.h>
16#include <linux/netfilter.h>
17#include <linux/netfilter_ipv4.h>
18#include <net/secure_seq.h>
19#include <net/checksum.h>
20#include <net/route.h>
21#include <net/ip.h>
22
23#include <net/netfilter/nf_conntrack_core.h>
24#include <net/netfilter/nf_conntrack.h>
25#include <net/netfilter/nf_nat_core.h>
26#include <net/netfilter/nf_nat_l3proto.h>
27#include <net/netfilter/nf_nat_l4proto.h>
28
29static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
30
31#ifdef CONFIG_XFRM
32static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
33 const struct nf_conn *ct,
34 enum ip_conntrack_dir dir,
35 unsigned long statusbit,
36 struct flowi *fl)
37{
38 const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
39 struct flowi4 *fl4 = &fl->u.ip4;
40
41 if (ct->status & statusbit) {
42 fl4->daddr = t->dst.u3.ip;
43 if (t->dst.protonum == IPPROTO_TCP ||
44 t->dst.protonum == IPPROTO_UDP ||
45 t->dst.protonum == IPPROTO_UDPLITE ||
46 t->dst.protonum == IPPROTO_DCCP ||
47 t->dst.protonum == IPPROTO_SCTP)
48 fl4->fl4_dport = t->dst.u.all;
49 }
50
51 statusbit ^= IPS_NAT_MASK;
52
53 if (ct->status & statusbit) {
54 fl4->saddr = t->src.u3.ip;
55 if (t->dst.protonum == IPPROTO_TCP ||
56 t->dst.protonum == IPPROTO_UDP ||
57 t->dst.protonum == IPPROTO_UDPLITE ||
58 t->dst.protonum == IPPROTO_DCCP ||
59 t->dst.protonum == IPPROTO_SCTP)
60 fl4->fl4_sport = t->src.u.all;
61 }
62}
63#endif /* CONFIG_XFRM */
64
65static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
66 const struct nf_nat_range *range)
67{
68 return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
69 ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
70}
71
72static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
73 __be16 dport)
74{
75 return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
76}
77
78static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
79 unsigned int iphdroff,
80 const struct nf_nat_l4proto *l4proto,
81 const struct nf_conntrack_tuple *target,
82 enum nf_nat_manip_type maniptype)
83{
84 struct iphdr *iph;
85 unsigned int hdroff;
86
87 if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
88 return false;
89
90 iph = (void *)skb->data + iphdroff;
91 hdroff = iphdroff + iph->ihl * 4;
92
93 if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
94 target, maniptype))
95 return false;
96 iph = (void *)skb->data + iphdroff;
97
98 if (maniptype == NF_NAT_MANIP_SRC) {
99 csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
100 iph->saddr = target->src.u3.ip;
101 } else {
102 csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
103 iph->daddr = target->dst.u3.ip;
104 }
105 return true;
106}
107
108static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
109 unsigned int iphdroff, __sum16 *check,
110 const struct nf_conntrack_tuple *t,
111 enum nf_nat_manip_type maniptype)
112{
113 struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
114 __be32 oldip, newip;
115
116 if (maniptype == NF_NAT_MANIP_SRC) {
117 oldip = iph->saddr;
118 newip = t->src.u3.ip;
119 } else {
120 oldip = iph->daddr;
121 newip = t->dst.u3.ip;
122 }
123 inet_proto_csum_replace4(check, skb, oldip, newip, 1);
124}
125
126static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
127 u8 proto, void *data, __sum16 *check,
128 int datalen, int oldlen)
129{
130 const struct iphdr *iph = ip_hdr(skb);
131 struct rtable *rt = skb_rtable(skb);
132
133 if (skb->ip_summed != CHECKSUM_PARTIAL) {
134 if (!(rt->rt_flags & RTCF_LOCAL) &&
135 (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
136 skb->ip_summed = CHECKSUM_PARTIAL;
137 skb->csum_start = skb_headroom(skb) +
138 skb_network_offset(skb) +
139 ip_hdrlen(skb);
140 skb->csum_offset = (void *)check - data;
141 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
142 datalen, proto, 0);
143 } else {
144 *check = 0;
145 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
146 datalen, proto,
147 csum_partial(data, datalen,
148 0));
149 if (proto == IPPROTO_UDP && !*check)
150 *check = CSUM_MANGLED_0;
151 }
152 } else
153 inet_proto_csum_replace2(check, skb,
154 htons(oldlen), htons(datalen), 1);
155}
156
157static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
158 struct nf_nat_range *range)
159{
160 if (tb[CTA_NAT_V4_MINIP]) {
161 range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
162 range->flags |= NF_NAT_RANGE_MAP_IPS;
163 }
164
165 if (tb[CTA_NAT_V4_MAXIP])
166 range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
167 else
168 range->max_addr.ip = range->min_addr.ip;
169
170 return 0;
171}
172
173static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
174 .l3proto = NFPROTO_IPV4,
175 .in_range = nf_nat_ipv4_in_range,
176 .secure_port = nf_nat_ipv4_secure_port,
177 .manip_pkt = nf_nat_ipv4_manip_pkt,
178 .csum_update = nf_nat_ipv4_csum_update,
179 .csum_recalc = nf_nat_ipv4_csum_recalc,
180 .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
181#ifdef CONFIG_XFRM
182 .decode_session = nf_nat_ipv4_decode_session,
183#endif
184};
185
186int nf_nat_icmp_reply_translation(struct sk_buff *skb,
187 struct nf_conn *ct,
188 enum ip_conntrack_info ctinfo,
189 unsigned int hooknum)
190{
191 struct {
192 struct icmphdr icmp;
193 struct iphdr ip;
194 } *inside;
195 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
196 enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
197 unsigned int hdrlen = ip_hdrlen(skb);
198 const struct nf_nat_l4proto *l4proto;
199 struct nf_conntrack_tuple target;
200 unsigned long statusbit;
201
202 NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
203
204 if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
205 return 0;
206 if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
207 return 0;
208
209 inside = (void *)skb->data + hdrlen;
210 if (inside->icmp.type == ICMP_REDIRECT) {
211 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
212 return 0;
213 if (ct->status & IPS_NAT_MASK)
214 return 0;
215 }
216
217 if (manip == NF_NAT_MANIP_SRC)
218 statusbit = IPS_SRC_NAT;
219 else
220 statusbit = IPS_DST_NAT;
221
222 /* Invert if this is reply direction */
223 if (dir == IP_CT_DIR_REPLY)
224 statusbit ^= IPS_NAT_MASK;
225
226 if (!(ct->status & statusbit))
227 return 1;
228
229 l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
230 if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
231 l4proto, &ct->tuplehash[!dir].tuple, !manip))
232 return 0;
233
234 if (skb->ip_summed != CHECKSUM_PARTIAL) {
235 /* Reloading "inside" here since manip_pkt may reallocate */
236 inside = (void *)skb->data + hdrlen;
237 inside->icmp.checksum = 0;
238 inside->icmp.checksum =
239 csum_fold(skb_checksum(skb, hdrlen,
240 skb->len - hdrlen, 0));
241 }
242
243 /* Change outer to look like the reply to an incoming packet */
244 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
245 l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
246 if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
247 return 0;
248
249 return 1;
250}
251EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
252
253static int __init nf_nat_l3proto_ipv4_init(void)
254{
255 int err;
256
257 err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
258 if (err < 0)
259 goto err1;
260 err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
261 if (err < 0)
262 goto err2;
263 return err;
264
265err2:
266 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
267err1:
268 return err;
269}
270
271static void __exit nf_nat_l3proto_ipv4_exit(void)
272{
273 nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
274 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
275}
276
277MODULE_LICENSE("GPL");
278MODULE_ALIAS("nf-nat-" __stringify(AF_INET));
279
280module_init(nf_nat_l3proto_ipv4_init);
281module_exit(nf_nat_l3proto_ipv4_exit);
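The init/exit pair above follows the usual kernel unwind idiom: register in order, and on failure jump to a label that undoes only what has already succeeded, in reverse order. Reduced to its skeleton (standalone sketch; the two register functions are stand-ins):

#include <stdio.h>

static int register_a(void) { return 0; }
static int register_b(void) { return -1; }	/* pretend this fails */
static void unregister_a(void) { }

static int init(void)
{
	int err;

	err = register_a();
	if (err < 0)
		goto err1;
	err = register_b();
	if (err < 0)
		goto err2;
	return 0;

err2:
	/* undo everything that succeeded, newest first */
	unregister_a();
err1:
	return err;
}

int main(void)
{
	printf("init: %d\n", init());	/* -1, with register_a() unwound */
	return 0;
}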
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 388140881ebe..a06d7d74817d 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -22,7 +22,6 @@
22 22
23#include <net/netfilter/nf_nat.h> 23#include <net/netfilter/nf_nat.h>
24#include <net/netfilter/nf_nat_helper.h> 24#include <net/netfilter/nf_nat_helper.h>
25#include <net/netfilter/nf_nat_rule.h>
26#include <net/netfilter/nf_conntrack_helper.h> 25#include <net/netfilter/nf_conntrack_helper.h>
27#include <net/netfilter/nf_conntrack_expect.h> 26#include <net/netfilter/nf_conntrack_expect.h>
28#include <net/netfilter/nf_conntrack_zones.h> 27#include <net/netfilter/nf_conntrack_zones.h>
@@ -47,7 +46,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
47 struct nf_conntrack_tuple t; 46 struct nf_conntrack_tuple t;
48 const struct nf_ct_pptp_master *ct_pptp_info; 47 const struct nf_ct_pptp_master *ct_pptp_info;
49 const struct nf_nat_pptp *nat_pptp_info; 48 const struct nf_nat_pptp *nat_pptp_info;
50 struct nf_nat_ipv4_range range; 49 struct nf_nat_range range;
51 50
52 ct_pptp_info = nfct_help_data(master); 51 ct_pptp_info = nfct_help_data(master);
53 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; 52 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
@@ -89,21 +88,21 @@ static void pptp_nat_expected(struct nf_conn *ct,
89 88
90 /* Change src to where master sends to */ 89 /* Change src to where master sends to */
91 range.flags = NF_NAT_RANGE_MAP_IPS; 90 range.flags = NF_NAT_RANGE_MAP_IPS;
92 range.min_ip = range.max_ip 91 range.min_addr = range.max_addr
93 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; 92 = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
94 if (exp->dir == IP_CT_DIR_ORIGINAL) { 93 if (exp->dir == IP_CT_DIR_ORIGINAL) {
95 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 94 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
96 range.min = range.max = exp->saved_proto; 95 range.min_proto = range.max_proto = exp->saved_proto;
97 } 96 }
98 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); 97 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
99 98
100 /* For DST manip, map port here to where it's expected. */ 99 /* For DST manip, map port here to where it's expected. */
101 range.flags = NF_NAT_RANGE_MAP_IPS; 100 range.flags = NF_NAT_RANGE_MAP_IPS;
102 range.min_ip = range.max_ip 101 range.min_addr = range.max_addr
103 = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; 102 = ct->master->tuplehash[!exp->dir].tuple.src.u3;
104 if (exp->dir == IP_CT_DIR_REPLY) { 103 if (exp->dir == IP_CT_DIR_REPLY) {
105 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 104 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
106 range.min = range.max = exp->saved_proto; 105 range.min_proto = range.max_proto = exp->saved_proto;
107 } 106 }
108 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); 107 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
109} 108}
@@ -113,6 +112,7 @@ static int
113pptp_outbound_pkt(struct sk_buff *skb, 112pptp_outbound_pkt(struct sk_buff *skb,
114 struct nf_conn *ct, 113 struct nf_conn *ct,
115 enum ip_conntrack_info ctinfo, 114 enum ip_conntrack_info ctinfo,
115 unsigned int protoff,
116 struct PptpControlHeader *ctlh, 116 struct PptpControlHeader *ctlh,
117 union pptp_ctrl_union *pptpReq) 117 union pptp_ctrl_union *pptpReq)
118 118
@@ -175,7 +175,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
175 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid)); 175 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
176 176
177 /* mangle packet */ 177 /* mangle packet */
178 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, 178 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
179 cid_off + sizeof(struct pptp_pkt_hdr) + 179 cid_off + sizeof(struct pptp_pkt_hdr) +
180 sizeof(struct PptpControlHeader), 180 sizeof(struct PptpControlHeader),
181 sizeof(new_callid), (char *)&new_callid, 181 sizeof(new_callid), (char *)&new_callid,
@@ -216,6 +216,7 @@ static int
216pptp_inbound_pkt(struct sk_buff *skb, 216pptp_inbound_pkt(struct sk_buff *skb,
217 struct nf_conn *ct, 217 struct nf_conn *ct,
218 enum ip_conntrack_info ctinfo, 218 enum ip_conntrack_info ctinfo,
219 unsigned int protoff,
219 struct PptpControlHeader *ctlh, 220 struct PptpControlHeader *ctlh,
220 union pptp_ctrl_union *pptpReq) 221 union pptp_ctrl_union *pptpReq)
221{ 222{
@@ -268,7 +269,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
268 pr_debug("altering peer call id from 0x%04x to 0x%04x\n", 269 pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
269 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); 270 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
270 271
271 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, 272 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
272 pcid_off + sizeof(struct pptp_pkt_hdr) + 273 pcid_off + sizeof(struct pptp_pkt_hdr) +
273 sizeof(struct PptpControlHeader), 274 sizeof(struct PptpControlHeader),
274 sizeof(new_pcid), (char *)&new_pcid, 275 sizeof(new_pcid), (char *)&new_pcid,
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
deleted file mode 100644
index 9993bc93e102..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ /dev/null
@@ -1,114 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2008 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/random.h>
12#include <linux/ip.h>
13
14#include <linux/netfilter.h>
15#include <linux/export.h>
16#include <net/secure_seq.h>
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_core.h>
19#include <net/netfilter/nf_nat_rule.h>
20#include <net/netfilter/nf_nat_protocol.h>
21
22bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
23 enum nf_nat_manip_type maniptype,
24 const union nf_conntrack_man_proto *min,
25 const union nf_conntrack_man_proto *max)
26{
27 __be16 port;
28
29 if (maniptype == NF_NAT_MANIP_SRC)
30 port = tuple->src.u.all;
31 else
32 port = tuple->dst.u.all;
33
34 return ntohs(port) >= ntohs(min->all) &&
35 ntohs(port) <= ntohs(max->all);
36}
37EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
38
39void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
40 const struct nf_nat_ipv4_range *range,
41 enum nf_nat_manip_type maniptype,
42 const struct nf_conn *ct,
43 u_int16_t *rover)
44{
45 unsigned int range_size, min, i;
46 __be16 *portptr;
47 u_int16_t off;
48
49 if (maniptype == NF_NAT_MANIP_SRC)
50 portptr = &tuple->src.u.all;
51 else
52 portptr = &tuple->dst.u.all;
53
54 /* If no range specified... */
55 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
56 /* If it's dst rewrite, can't change port */
57 if (maniptype == NF_NAT_MANIP_DST)
58 return;
59
60 if (ntohs(*portptr) < 1024) {
61 /* Loose convention: >> 512 is credential passing */
62 if (ntohs(*portptr) < 512) {
63 min = 1;
64 range_size = 511 - min + 1;
65 } else {
66 min = 600;
67 range_size = 1023 - min + 1;
68 }
69 } else {
70 min = 1024;
71 range_size = 65535 - 1024 + 1;
72 }
73 } else {
74 min = ntohs(range->min.all);
75 range_size = ntohs(range->max.all) - min + 1;
76 }
77
78 if (range->flags & NF_NAT_RANGE_PROTO_RANDOM)
79 off = secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip,
80 maniptype == NF_NAT_MANIP_SRC
81 ? tuple->dst.u.all
82 : tuple->src.u.all);
83 else
84 off = *rover;
85
86 for (i = 0; ; ++off) {
87 *portptr = htons(min + off % range_size);
88 if (++i != range_size && nf_nat_used_tuple(tuple, ct))
89 continue;
90 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM))
91 *rover = off;
92 return;
93 }
94 return;
95}
96EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
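The search above walks the port range starting at either a per-protocol rover or a hash-derived offset, wrapping modulo the range size, and stops at the first tuple not already taken. Its core, with in_use() standing in for nf_nat_used_tuple() (standalone sketch; names are illustrative):

#include <stdint.h>
#include <stdio.h>

static int in_use(uint16_t port)
{
	return port == 1024 || port == 1025;	/* pretend these are taken */
}

/* returns 0 if the whole range is exhausted */
static uint16_t pick_port(uint16_t min, uint32_t range_size, uint16_t *rover)
{
	uint32_t i;
	uint16_t off = *rover;

	for (i = 0; i < range_size; i++, off++) {
		uint16_t port = min + off % range_size;

		if (in_use(port))
			continue;
		*rover = off;	/* remember where we stopped */
		return port;
	}
	return 0;
}

int main(void)
{
	uint16_t rover = 0;

	printf("picked %u\n", pick_port(1024, 64512, &rover));	/* 1026 */
	return 0;
}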
97
98#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
99int nf_nat_proto_nlattr_to_range(struct nlattr *tb[],
100 struct nf_nat_ipv4_range *range)
101{
102 if (tb[CTA_PROTONAT_PORT_MIN]) {
103 range->min.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
104 range->max.all = range->min.tcp.port;
105 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
106 }
107 if (tb[CTA_PROTONAT_PORT_MAX]) {
108 range->max.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
109 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
110 }
111 return 0;
112}
113EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range);
114#endif
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
deleted file mode 100644
index 3f67138d187c..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ /dev/null
@@ -1,106 +0,0 @@
1/*
2 * DCCP NAT protocol helper
3 *
4 * Copyright (c) 2005, 2006. 2008 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 */
11
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/skbuff.h>
16#include <linux/ip.h>
17#include <linux/dccp.h>
18
19#include <net/netfilter/nf_conntrack.h>
20#include <net/netfilter/nf_nat.h>
21#include <net/netfilter/nf_nat_protocol.h>
22
23static u_int16_t dccp_port_rover;
24
25static void
26dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
27 const struct nf_nat_ipv4_range *range,
28 enum nf_nat_manip_type maniptype,
29 const struct nf_conn *ct)
30{
31 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
32 &dccp_port_rover);
33}
34
35static bool
36dccp_manip_pkt(struct sk_buff *skb,
37 unsigned int iphdroff,
38 const struct nf_conntrack_tuple *tuple,
39 enum nf_nat_manip_type maniptype)
40{
41 const struct iphdr *iph = (const void *)(skb->data + iphdroff);
42 struct dccp_hdr *hdr;
43 unsigned int hdroff = iphdroff + iph->ihl * 4;
44 __be32 oldip, newip;
45 __be16 *portptr, oldport, newport;
46 int hdrsize = 8; /* DCCP connection tracking guarantees this much */
47
48 if (skb->len >= hdroff + sizeof(struct dccp_hdr))
49 hdrsize = sizeof(struct dccp_hdr);
50
51 if (!skb_make_writable(skb, hdroff + hdrsize))
52 return false;
53
54 iph = (struct iphdr *)(skb->data + iphdroff);
55 hdr = (struct dccp_hdr *)(skb->data + hdroff);
56
57 if (maniptype == NF_NAT_MANIP_SRC) {
58 oldip = iph->saddr;
59 newip = tuple->src.u3.ip;
60 newport = tuple->src.u.dccp.port;
61 portptr = &hdr->dccph_sport;
62 } else {
63 oldip = iph->daddr;
64 newip = tuple->dst.u3.ip;
65 newport = tuple->dst.u.dccp.port;
66 portptr = &hdr->dccph_dport;
67 }
68
69 oldport = *portptr;
70 *portptr = newport;
71
72 if (hdrsize < sizeof(*hdr))
73 return true;
74
75 inet_proto_csum_replace4(&hdr->dccph_checksum, skb, oldip, newip, 1);
76 inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
77 0);
78 return true;
79}
80
81static const struct nf_nat_protocol nf_nat_protocol_dccp = {
82 .protonum = IPPROTO_DCCP,
83 .manip_pkt = dccp_manip_pkt,
84 .in_range = nf_nat_proto_in_range,
85 .unique_tuple = dccp_unique_tuple,
86#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
87 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
88#endif
89};
90
91static int __init nf_nat_proto_dccp_init(void)
92{
93 return nf_nat_protocol_register(&nf_nat_protocol_dccp);
94}
95
96static void __exit nf_nat_proto_dccp_fini(void)
97{
98 nf_nat_protocol_unregister(&nf_nat_protocol_dccp);
99}
100
101module_init(nf_nat_proto_dccp_init);
102module_exit(nf_nat_proto_dccp_fini);
103
104MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
105MODULE_DESCRIPTION("DCCP NAT protocol helper");
106MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 46ba0b9ab985..ea44f02563b5 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -28,8 +28,7 @@
28#include <linux/ip.h> 28#include <linux/ip.h>
29 29
30#include <net/netfilter/nf_nat.h> 30#include <net/netfilter/nf_nat.h>
31#include <net/netfilter/nf_nat_rule.h> 31#include <net/netfilter/nf_nat_l4proto.h>
32#include <net/netfilter/nf_nat_protocol.h>
33#include <linux/netfilter/nf_conntrack_proto_gre.h> 32#include <linux/netfilter/nf_conntrack_proto_gre.h>
34 33
35MODULE_LICENSE("GPL"); 34MODULE_LICENSE("GPL");
@@ -38,8 +37,9 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
38 37
39/* generate unique tuple ... */ 38/* generate unique tuple ... */
40static void 39static void
41gre_unique_tuple(struct nf_conntrack_tuple *tuple, 40gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
42 const struct nf_nat_ipv4_range *range, 41 struct nf_conntrack_tuple *tuple,
42 const struct nf_nat_range *range,
43 enum nf_nat_manip_type maniptype, 43 enum nf_nat_manip_type maniptype,
44 const struct nf_conn *ct) 44 const struct nf_conn *ct)
45{ 45{
@@ -62,8 +62,8 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
62 min = 1; 62 min = 1;
63 range_size = 0xffff; 63 range_size = 0xffff;
64 } else { 64 } else {
65 min = ntohs(range->min.gre.key); 65 min = ntohs(range->min_proto.gre.key);
66 range_size = ntohs(range->max.gre.key) - min + 1; 66 range_size = ntohs(range->max_proto.gre.key) - min + 1;
67 } 67 }
68 68
69 pr_debug("min = %u, range_size = %u\n", min, range_size); 69 pr_debug("min = %u, range_size = %u\n", min, range_size);
@@ -80,14 +80,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
80 80
81/* manipulate a GRE packet according to maniptype */ 81/* manipulate a GRE packet according to maniptype */
82static bool 82static bool
83gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, 83gre_manip_pkt(struct sk_buff *skb,
84 const struct nf_nat_l3proto *l3proto,
85 unsigned int iphdroff, unsigned int hdroff,
84 const struct nf_conntrack_tuple *tuple, 86 const struct nf_conntrack_tuple *tuple,
85 enum nf_nat_manip_type maniptype) 87 enum nf_nat_manip_type maniptype)
86{ 88{
87 const struct gre_hdr *greh; 89 const struct gre_hdr *greh;
88 struct gre_hdr_pptp *pgreh; 90 struct gre_hdr_pptp *pgreh;
89 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
90 unsigned int hdroff = iphdroff + iph->ihl * 4;
91 91
92 /* pgreh includes two optional 32bit fields which are not required 92 /* pgreh includes two optional 32bit fields which are not required
93 * to be there. That's where the magic '8' comes from */ 93 * to be there. That's where the magic '8' comes from */
@@ -117,24 +117,24 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
117 return true; 117 return true;
118} 118}
119 119
120static const struct nf_nat_protocol gre = { 120static const struct nf_nat_l4proto gre = {
121 .protonum = IPPROTO_GRE, 121 .l4proto = IPPROTO_GRE,
122 .manip_pkt = gre_manip_pkt, 122 .manip_pkt = gre_manip_pkt,
123 .in_range = nf_nat_proto_in_range, 123 .in_range = nf_nat_l4proto_in_range,
124 .unique_tuple = gre_unique_tuple, 124 .unique_tuple = gre_unique_tuple,
125#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 125#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
126 .nlattr_to_range = nf_nat_proto_nlattr_to_range, 126 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
127#endif 127#endif
128}; 128};
129 129
130static int __init nf_nat_proto_gre_init(void) 130static int __init nf_nat_proto_gre_init(void)
131{ 131{
132 return nf_nat_protocol_register(&gre); 132 return nf_nat_l4proto_register(NFPROTO_IPV4, &gre);
133} 133}
134 134
135static void __exit nf_nat_proto_gre_fini(void) 135static void __exit nf_nat_proto_gre_fini(void)
136{ 136{
137 nf_nat_protocol_unregister(&gre); 137 nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre);
138} 138}
139 139
140module_init(nf_nat_proto_gre_init); 140module_init(nf_nat_proto_gre_init);
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index b35172851bae..eb303471bcf6 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -15,8 +15,7 @@
15#include <linux/netfilter.h> 15#include <linux/netfilter.h>
16#include <net/netfilter/nf_nat.h> 16#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_core.h> 17#include <net/netfilter/nf_nat_core.h>
18#include <net/netfilter/nf_nat_rule.h> 18#include <net/netfilter/nf_nat_l4proto.h>
19#include <net/netfilter/nf_nat_protocol.h>
20 19
21static bool 20static bool
22icmp_in_range(const struct nf_conntrack_tuple *tuple, 21icmp_in_range(const struct nf_conntrack_tuple *tuple,
@@ -29,8 +28,9 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
29} 28}
30 29
31static void 30static void
32icmp_unique_tuple(struct nf_conntrack_tuple *tuple, 31icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
33 const struct nf_nat_ipv4_range *range, 32 struct nf_conntrack_tuple *tuple,
33 const struct nf_nat_range *range,
34 enum nf_nat_manip_type maniptype, 34 enum nf_nat_manip_type maniptype,
35 const struct nf_conn *ct) 35 const struct nf_conn *ct)
36{ 36{
@@ -38,13 +38,14 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
38 unsigned int range_size; 38 unsigned int range_size;
39 unsigned int i; 39 unsigned int i;
40 40
41 range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1; 41 range_size = ntohs(range->max_proto.icmp.id) -
42 ntohs(range->min_proto.icmp.id) + 1;
42 /* If no range specified... */ 43 /* If no range specified... */
43 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) 44 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
44 range_size = 0xFFFF; 45 range_size = 0xFFFF;
45 46
46 for (i = 0; ; ++id) { 47 for (i = 0; ; ++id) {
47 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + 48 tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
48 (id % range_size)); 49 (id % range_size));
49 if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) 50 if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
50 return; 51 return;
@@ -54,13 +55,12 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
54 55
55static bool 56static bool
56icmp_manip_pkt(struct sk_buff *skb, 57icmp_manip_pkt(struct sk_buff *skb,
57 unsigned int iphdroff, 58 const struct nf_nat_l3proto *l3proto,
59 unsigned int iphdroff, unsigned int hdroff,
58 const struct nf_conntrack_tuple *tuple, 60 const struct nf_conntrack_tuple *tuple,
59 enum nf_nat_manip_type maniptype) 61 enum nf_nat_manip_type maniptype)
60{ 62{
61 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
62 struct icmphdr *hdr; 63 struct icmphdr *hdr;
63 unsigned int hdroff = iphdroff + iph->ihl*4;
64 64
65 if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) 65 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
66 return false; 66 return false;
@@ -72,12 +72,12 @@ icmp_manip_pkt(struct sk_buff *skb,
72 return true; 72 return true;
73} 73}
74 74
75const struct nf_nat_protocol nf_nat_protocol_icmp = { 75const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
76 .protonum = IPPROTO_ICMP, 76 .l4proto = IPPROTO_ICMP,
77 .manip_pkt = icmp_manip_pkt, 77 .manip_pkt = icmp_manip_pkt,
78 .in_range = icmp_in_range, 78 .in_range = icmp_in_range,
79 .unique_tuple = icmp_unique_tuple, 79 .unique_tuple = icmp_unique_tuple,
80#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 80#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
81 .nlattr_to_range = nf_nat_proto_nlattr_to_range, 81 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
82#endif 82#endif
83}; 83};
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
deleted file mode 100644
index 3cce9b6c1c29..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ /dev/null
@@ -1,96 +0,0 @@
1/*
2 * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/ip.h>
12#include <linux/sctp.h>
13#include <linux/module.h>
14#include <net/sctp/checksum.h>
15
16#include <net/netfilter/nf_nat_protocol.h>
17
18static u_int16_t nf_sctp_port_rover;
19
20static void
21sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
22 const struct nf_nat_ipv4_range *range,
23 enum nf_nat_manip_type maniptype,
24 const struct nf_conn *ct)
25{
26 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
27 &nf_sctp_port_rover);
28}
29
30static bool
31sctp_manip_pkt(struct sk_buff *skb,
32 unsigned int iphdroff,
33 const struct nf_conntrack_tuple *tuple,
34 enum nf_nat_manip_type maniptype)
35{
36 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
37 struct sk_buff *frag;
38 sctp_sctphdr_t *hdr;
39 unsigned int hdroff = iphdroff + iph->ihl*4;
40 __be32 oldip, newip;
41 __be32 crc32;
42
43 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
44 return false;
45
46 iph = (struct iphdr *)(skb->data + iphdroff);
47 hdr = (struct sctphdr *)(skb->data + hdroff);
48
49 if (maniptype == NF_NAT_MANIP_SRC) {
50 /* Get rid of src ip and src pt */
51 oldip = iph->saddr;
52 newip = tuple->src.u3.ip;
53 hdr->source = tuple->src.u.sctp.port;
54 } else {
55 /* Get rid of dst ip and dst pt */
56 oldip = iph->daddr;
57 newip = tuple->dst.u3.ip;
58 hdr->dest = tuple->dst.u.sctp.port;
59 }
60
61 crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff);
62 skb_walk_frags(skb, frag)
63 crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag),
64 crc32);
65 crc32 = sctp_end_cksum(crc32);
66 hdr->checksum = crc32;
67
68 return true;
69}
70
71static const struct nf_nat_protocol nf_nat_protocol_sctp = {
72 .protonum = IPPROTO_SCTP,
73 .manip_pkt = sctp_manip_pkt,
74 .in_range = nf_nat_proto_in_range,
75 .unique_tuple = sctp_unique_tuple,
76#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
77 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
78#endif
79};
80
81static int __init nf_nat_proto_sctp_init(void)
82{
83 return nf_nat_protocol_register(&nf_nat_protocol_sctp);
84}
85
86static void __exit nf_nat_proto_sctp_exit(void)
87{
88 nf_nat_protocol_unregister(&nf_nat_protocol_sctp);
89}
90
91module_init(nf_nat_proto_sctp_init);
92module_exit(nf_nat_proto_sctp_exit);
93
94MODULE_LICENSE("GPL");
95MODULE_DESCRIPTION("SCTP NAT protocol helper");
96MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
deleted file mode 100644
index 9fb4b4e72bbf..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ /dev/null
@@ -1,91 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/export.h>
12#include <linux/ip.h>
13#include <linux/tcp.h>
14
15#include <linux/netfilter.h>
16#include <linux/netfilter/nfnetlink_conntrack.h>
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_nat_protocol.h>
20#include <net/netfilter/nf_nat_core.h>
21
22static u_int16_t tcp_port_rover;
23
24static void
25tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
26 const struct nf_nat_ipv4_range *range,
27 enum nf_nat_manip_type maniptype,
28 const struct nf_conn *ct)
29{
30 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover);
31}
32
33static bool
34tcp_manip_pkt(struct sk_buff *skb,
35 unsigned int iphdroff,
36 const struct nf_conntrack_tuple *tuple,
37 enum nf_nat_manip_type maniptype)
38{
39 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
40 struct tcphdr *hdr;
41 unsigned int hdroff = iphdroff + iph->ihl*4;
42 __be32 oldip, newip;
43 __be16 *portptr, newport, oldport;
44 int hdrsize = 8; /* TCP connection tracking guarantees this much */
45
46	/* this could be an inner header returned in an icmp packet; in such
47	   cases we cannot update the checksum field since it is outside of
48	   the 8 bytes of transport layer headers we are guaranteed */
49 if (skb->len >= hdroff + sizeof(struct tcphdr))
50 hdrsize = sizeof(struct tcphdr);
51
52 if (!skb_make_writable(skb, hdroff + hdrsize))
53 return false;
54
55 iph = (struct iphdr *)(skb->data + iphdroff);
56 hdr = (struct tcphdr *)(skb->data + hdroff);
57
58 if (maniptype == NF_NAT_MANIP_SRC) {
59 /* Get rid of src ip and src pt */
60 oldip = iph->saddr;
61 newip = tuple->src.u3.ip;
62 newport = tuple->src.u.tcp.port;
63 portptr = &hdr->source;
64 } else {
65 /* Get rid of dst ip and dst pt */
66 oldip = iph->daddr;
67 newip = tuple->dst.u3.ip;
68 newport = tuple->dst.u.tcp.port;
69 portptr = &hdr->dest;
70 }
71
72 oldport = *portptr;
73 *portptr = newport;
74
75 if (hdrsize < sizeof(*hdr))
76 return true;
77
78 inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
79 inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
80 return true;
81}
82
83const struct nf_nat_protocol nf_nat_protocol_tcp = {
84 .protonum = IPPROTO_TCP,
85 .manip_pkt = tcp_manip_pkt,
86 .in_range = nf_nat_proto_in_range,
87 .unique_tuple = tcp_unique_tuple,
88#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
89 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
90#endif
91};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
deleted file mode 100644
index 9883336e628f..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ /dev/null
@@ -1,82 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/export.h>
11#include <linux/init.h>
12#include <linux/ip.h>
13#include <linux/udp.h>
14
15#include <linux/netfilter.h>
16#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_core.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_nat_protocol.h>
20
21static u_int16_t udp_port_rover;
22
23static void
24udp_unique_tuple(struct nf_conntrack_tuple *tuple,
25 const struct nf_nat_ipv4_range *range,
26 enum nf_nat_manip_type maniptype,
27 const struct nf_conn *ct)
28{
29 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover);
30}
31
32static bool
33udp_manip_pkt(struct sk_buff *skb,
34 unsigned int iphdroff,
35 const struct nf_conntrack_tuple *tuple,
36 enum nf_nat_manip_type maniptype)
37{
38 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
39 struct udphdr *hdr;
40 unsigned int hdroff = iphdroff + iph->ihl*4;
41 __be32 oldip, newip;
42 __be16 *portptr, newport;
43
44 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
45 return false;
46
47 iph = (struct iphdr *)(skb->data + iphdroff);
48 hdr = (struct udphdr *)(skb->data + hdroff);
49
50 if (maniptype == NF_NAT_MANIP_SRC) {
51 /* Get rid of src ip and src pt */
52 oldip = iph->saddr;
53 newip = tuple->src.u3.ip;
54 newport = tuple->src.u.udp.port;
55 portptr = &hdr->source;
56 } else {
57 /* Get rid of dst ip and dst pt */
58 oldip = iph->daddr;
59 newip = tuple->dst.u3.ip;
60 newport = tuple->dst.u.udp.port;
61 portptr = &hdr->dest;
62 }
63 if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
64 inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
65 inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
66 0);
67 if (!hdr->check)
68 hdr->check = CSUM_MANGLED_0;
69 }
70 *portptr = newport;
71 return true;
72}
73
74const struct nf_nat_protocol nf_nat_protocol_udp = {
75 .protonum = IPPROTO_UDP,
76 .manip_pkt = udp_manip_pkt,
77 .in_range = nf_nat_proto_in_range,
78 .unique_tuple = udp_unique_tuple,
79#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
80 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
81#endif
82};
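The guard above (if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL)) exists because a UDP checksum of zero means "sender computed no checksum": a packet that carried no checksum must not suddenly gain one, and a freshly computed checksum that happens to come out as zero must go on the wire as 0xFFFF, which is what CSUM_MANGLED_0 encodes. The rule in isolation (a sketch):

#include <stdint.h>
#include <stdio.h>

#define CSUM_MANGLED_0	0xffffu		/* one's-complement "negative zero" */

static uint16_t udp_store_csum(uint16_t computed)
{
	/* zero on the wire would mean "no checksum", so never emit it */
	return computed ? computed : CSUM_MANGLED_0;
}

int main(void)
{
	printf("%#x %#x\n", udp_store_csum(0), udp_store_csum(0x1234));
	return 0;
}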
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
deleted file mode 100644
index d24d10a7beb2..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ /dev/null
@@ -1,98 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2008 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/init.h>
12#include <linux/ip.h>
13#include <linux/udp.h>
14
15#include <linux/netfilter.h>
16#include <linux/module.h>
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_protocol.h>
19
20static u_int16_t udplite_port_rover;
21
22static void
23udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
24 const struct nf_nat_ipv4_range *range,
25 enum nf_nat_manip_type maniptype,
26 const struct nf_conn *ct)
27{
28 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
29 &udplite_port_rover);
30}
31
32static bool
33udplite_manip_pkt(struct sk_buff *skb,
34 unsigned int iphdroff,
35 const struct nf_conntrack_tuple *tuple,
36 enum nf_nat_manip_type maniptype)
37{
38 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
39 struct udphdr *hdr;
40 unsigned int hdroff = iphdroff + iph->ihl*4;
41 __be32 oldip, newip;
42 __be16 *portptr, newport;
43
44 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
45 return false;
46
47 iph = (struct iphdr *)(skb->data + iphdroff);
48 hdr = (struct udphdr *)(skb->data + hdroff);
49
50 if (maniptype == NF_NAT_MANIP_SRC) {
51 /* Get rid of src ip and src pt */
52 oldip = iph->saddr;
53 newip = tuple->src.u3.ip;
54 newport = tuple->src.u.udp.port;
55 portptr = &hdr->source;
56 } else {
57 /* Get rid of dst ip and dst pt */
58 oldip = iph->daddr;
59 newip = tuple->dst.u3.ip;
60 newport = tuple->dst.u.udp.port;
61 portptr = &hdr->dest;
62 }
63
64 inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
65 inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0);
66 if (!hdr->check)
67 hdr->check = CSUM_MANGLED_0;
68
69 *portptr = newport;
70 return true;
71}
72
73static const struct nf_nat_protocol nf_nat_protocol_udplite = {
74 .protonum = IPPROTO_UDPLITE,
75 .manip_pkt = udplite_manip_pkt,
76 .in_range = nf_nat_proto_in_range,
77 .unique_tuple = udplite_unique_tuple,
78#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
79 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
80#endif
81};
82
83static int __init nf_nat_proto_udplite_init(void)
84{
85 return nf_nat_protocol_register(&nf_nat_protocol_udplite);
86}
87
88static void __exit nf_nat_proto_udplite_fini(void)
89{
90 nf_nat_protocol_unregister(&nf_nat_protocol_udplite);
91}
92
93module_init(nf_nat_proto_udplite_init);
94module_exit(nf_nat_proto_udplite_fini);
95
96MODULE_LICENSE("GPL");
97MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
98MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
deleted file mode 100644
index e0afe8112b1c..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ /dev/null
@@ -1,52 +0,0 @@
1/* The "unknown" protocol. This is what is used for protocols we
2 * don't understand. It's returned by ip_ct_find_proto().
3 */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/types.h>
14#include <linux/init.h>
15
16#include <linux/netfilter.h>
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_nat_protocol.h>
20
21static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
22 enum nf_nat_manip_type manip_type,
23 const union nf_conntrack_man_proto *min,
24 const union nf_conntrack_man_proto *max)
25{
26 return true;
27}
28
29static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
30 const struct nf_nat_ipv4_range *range,
31 enum nf_nat_manip_type maniptype,
32 const struct nf_conn *ct)
33{
34 /* Sorry: we can't help you; if it's not unique, we can't frob
35 anything. */
36 return;
37}
38
39static bool
40unknown_manip_pkt(struct sk_buff *skb,
41 unsigned int iphdroff,
42 const struct nf_conntrack_tuple *tuple,
43 enum nf_nat_manip_type maniptype)
44{
45 return true;
46}
47
48const struct nf_nat_protocol nf_nat_unknown_protocol = {
49 .manip_pkt = unknown_manip_pkt,
50 .in_range = unknown_in_range,
51 .unique_tuple = unknown_unique_tuple,
52};
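
nf_nat_unknown_protocol is a classic null object: every operation succeeds while changing nothing, so the NAT core can call through the ops table for unrecognized protocols without NULL checks. A stand-alone C sketch of the same pattern (names here are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

struct proto_ops {
	bool (*manip)(int pkt);
	bool (*in_range)(int val);
};

static bool noop_manip(int pkt)    { (void)pkt; return true; }
static bool noop_in_range(int val) { (void)val; return true; }

static const struct proto_ops unknown_ops = {
	.manip    = noop_manip,
	.in_range = noop_in_range,
};

static const struct proto_ops *lookup(int protonum)
{
	(void)protonum;		/* no table here: everything is "unknown" */
	return &unknown_ops;
}

int main(void)
{
	const struct proto_ops *ops = lookup(99);

	/* callers never branch on NULL; the default just does nothing */
	printf("%d\n", ops->manip(0) && ops->in_range(0));
	return 0;
}
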
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
deleted file mode 100644
index d2a9dc314e0e..000000000000
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ /dev/null
@@ -1,214 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9/* Everything about the rules for NAT. */
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11#include <linux/types.h>
12#include <linux/ip.h>
13#include <linux/netfilter.h>
14#include <linux/netfilter_ipv4.h>
15#include <linux/module.h>
16#include <linux/kmod.h>
17#include <linux/skbuff.h>
18#include <linux/proc_fs.h>
19#include <linux/slab.h>
20#include <net/checksum.h>
21#include <net/route.h>
22#include <linux/bitops.h>
23
24#include <linux/netfilter_ipv4/ip_tables.h>
25#include <net/netfilter/nf_nat.h>
26#include <net/netfilter/nf_nat_core.h>
27#include <net/netfilter/nf_nat_rule.h>
28
29#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
30 (1 << NF_INET_POST_ROUTING) | \
31 (1 << NF_INET_LOCAL_OUT) | \
32 (1 << NF_INET_LOCAL_IN))
33
34static const struct xt_table nat_table = {
35 .name = "nat",
36 .valid_hooks = NAT_VALID_HOOKS,
37 .me = THIS_MODULE,
38 .af = NFPROTO_IPV4,
39};
40
41/* Source NAT */
42static unsigned int
43ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
44{
45 struct nf_conn *ct;
46 enum ip_conntrack_info ctinfo;
47 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
48
49 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
50 par->hooknum == NF_INET_LOCAL_IN);
51
52 ct = nf_ct_get(skb, &ctinfo);
53
54 /* Connection must be valid and new. */
55 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
56 ctinfo == IP_CT_RELATED_REPLY));
57 NF_CT_ASSERT(par->out != NULL);
58
59 return nf_nat_setup_info(ct, &mr->range[0], NF_NAT_MANIP_SRC);
60}
61
62static unsigned int
63ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par)
64{
65 struct nf_conn *ct;
66 enum ip_conntrack_info ctinfo;
67 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
68
69 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
70 par->hooknum == NF_INET_LOCAL_OUT);
71
72 ct = nf_ct_get(skb, &ctinfo);
73
74 /* Connection must be valid and new. */
75 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
76
77 return nf_nat_setup_info(ct, &mr->range[0], NF_NAT_MANIP_DST);
78}
79
80static int ipt_snat_checkentry(const struct xt_tgchk_param *par)
81{
82 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
83
84 /* Must be a valid range */
85 if (mr->rangesize != 1) {
86 pr_info("SNAT: multiple ranges no longer supported\n");
87 return -EINVAL;
88 }
89 return 0;
90}
91
92static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
93{
94 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
95
96 /* Must be a valid range */
97 if (mr->rangesize != 1) {
98 pr_info("DNAT: multiple ranges no longer supported\n");
99 return -EINVAL;
100 }
101 return 0;
102}
103
104static unsigned int
105alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
106{
107 /* Force range to this IP; let proto decide mapping for
108 per-proto parts (hence not NF_NAT_RANGE_PROTO_SPECIFIED).
109 */
110 struct nf_nat_ipv4_range range;
111
112 range.flags = 0;
113 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
114 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
115 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
116 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
117
118 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
119}
120
121int nf_nat_rule_find(struct sk_buff *skb,
122 unsigned int hooknum,
123 const struct net_device *in,
124 const struct net_device *out,
125 struct nf_conn *ct)
126{
127 struct net *net = nf_ct_net(ct);
128 int ret;
129
130 ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
131
132 if (ret == NF_ACCEPT) {
133 if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
134 /* NULL mapping */
135 ret = alloc_null_binding(ct, hooknum);
136 }
137 return ret;
138}
139
140static struct xt_target ipt_snat_reg __read_mostly = {
141 .name = "SNAT",
142 .target = ipt_snat_target,
143 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
144 .table = "nat",
145 .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
146 .checkentry = ipt_snat_checkentry,
147 .family = AF_INET,
148};
149
150static struct xt_target ipt_dnat_reg __read_mostly = {
151 .name = "DNAT",
152 .target = ipt_dnat_target,
153 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
154 .table = "nat",
155 .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
156 .checkentry = ipt_dnat_checkentry,
157 .family = AF_INET,
158};
159
160static int __net_init nf_nat_rule_net_init(struct net *net)
161{
162 struct ipt_replace *repl;
163
164 repl = ipt_alloc_initial_table(&nat_table);
165 if (repl == NULL)
166 return -ENOMEM;
167 net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl);
168 kfree(repl);
169 if (IS_ERR(net->ipv4.nat_table))
170 return PTR_ERR(net->ipv4.nat_table);
171 return 0;
172}
173
174static void __net_exit nf_nat_rule_net_exit(struct net *net)
175{
176 ipt_unregister_table(net, net->ipv4.nat_table);
177}
178
179static struct pernet_operations nf_nat_rule_net_ops = {
180 .init = nf_nat_rule_net_init,
181 .exit = nf_nat_rule_net_exit,
182};
183
184int __init nf_nat_rule_init(void)
185{
186 int ret;
187
188 ret = register_pernet_subsys(&nf_nat_rule_net_ops);
189 if (ret != 0)
190 goto out;
191 ret = xt_register_target(&ipt_snat_reg);
192 if (ret != 0)
193 goto unregister_table;
194
195 ret = xt_register_target(&ipt_dnat_reg);
196 if (ret != 0)
197 goto unregister_snat;
198
199 return ret;
200
201 unregister_snat:
202 xt_unregister_target(&ipt_snat_reg);
203 unregister_table:
204 unregister_pernet_subsys(&nf_nat_rule_net_ops);
205 out:
206 return ret;
207}
208
209void nf_nat_rule_cleanup(void)
210{
211 xt_unregister_target(&ipt_dnat_reg);
212 xt_unregister_target(&ipt_snat_reg);
213 unregister_pernet_subsys(&nf_nat_rule_net_ops);
214}
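
nf_nat_rule_init() above uses the kernel's usual goto-based unwind: resources are acquired in order and, on failure, released in exactly the reverse order, with one label per partially-constructed state. A self-contained sketch of the idiom, with step()/undo() standing in for the register/unregister calls:

#include <stdio.h>

static int step(const char *name, int fail)
{
	printf("register %s\n", name);
	return fail ? -1 : 0;
}

static void undo(const char *name)
{
	printf("unregister %s\n", name);
}

static int init(void)
{
	int ret;

	ret = step("pernet", 0);
	if (ret)
		goto out;
	ret = step("snat", 0);
	if (ret)
		goto unregister_pernet;
	ret = step("dnat", 1);		/* simulate a late failure */
	if (ret)
		goto unregister_snat;
	return 0;

unregister_snat:
	undo("snat");
unregister_pernet:
	undo("pernet");
out:
	return ret;
}

int main(void) { return init() ? 1 : 0; }
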
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
deleted file mode 100644
index 9c87cde28ff8..000000000000
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ /dev/null
@@ -1,572 +0,0 @@
1/* SIP extension for NAT alteration.
2 *
3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
4 * based on RR's ip_nat_ftp.c and other modules.
5 * (C) 2007 United Security Providers
6 * (C) 2007, 2008 Patrick McHardy <kaber@trash.net>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/ip.h>
16#include <net/ip.h>
17#include <linux/udp.h>
18#include <linux/tcp.h>
19
20#include <net/netfilter/nf_nat.h>
21#include <net/netfilter/nf_nat_helper.h>
22#include <net/netfilter/nf_nat_rule.h>
23#include <net/netfilter/nf_conntrack_helper.h>
24#include <net/netfilter/nf_conntrack_expect.h>
25#include <linux/netfilter/nf_conntrack_sip.h>
26
27MODULE_LICENSE("GPL");
28MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
29MODULE_DESCRIPTION("SIP NAT helper");
30MODULE_ALIAS("ip_nat_sip");
31
32
33static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
34 const char **dptr, unsigned int *datalen,
35 unsigned int matchoff, unsigned int matchlen,
36 const char *buffer, unsigned int buflen)
37{
38 enum ip_conntrack_info ctinfo;
39 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
40 struct tcphdr *th;
41 unsigned int baseoff;
42
43 if (nf_ct_protonum(ct) == IPPROTO_TCP) {
44 th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
45 baseoff = ip_hdrlen(skb) + th->doff * 4;
46 matchoff += dataoff - baseoff;
47
48 if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
49 matchoff, matchlen,
50 buffer, buflen, false))
51 return 0;
52 } else {
53 baseoff = ip_hdrlen(skb) + sizeof(struct udphdr);
54 matchoff += dataoff - baseoff;
55
56 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
57 matchoff, matchlen,
58 buffer, buflen))
59 return 0;
60 }
61
62 /* Reload data pointer and adjust datalen value */
63 *dptr = skb->data + dataoff;
64 *datalen += buflen - matchlen;
65 return 1;
66}
67
68static int map_addr(struct sk_buff *skb, unsigned int dataoff,
69 const char **dptr, unsigned int *datalen,
70 unsigned int matchoff, unsigned int matchlen,
71 union nf_inet_addr *addr, __be16 port)
72{
73 enum ip_conntrack_info ctinfo;
74 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
75 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
76 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
77 unsigned int buflen;
78 __be32 newaddr;
79 __be16 newport;
80
81 if (ct->tuplehash[dir].tuple.src.u3.ip == addr->ip &&
82 ct->tuplehash[dir].tuple.src.u.udp.port == port) {
83 newaddr = ct->tuplehash[!dir].tuple.dst.u3.ip;
84 newport = ct->tuplehash[!dir].tuple.dst.u.udp.port;
85 } else if (ct->tuplehash[dir].tuple.dst.u3.ip == addr->ip &&
86 ct->tuplehash[dir].tuple.dst.u.udp.port == port) {
87 newaddr = ct->tuplehash[!dir].tuple.src.u3.ip;
88 newport = ct->tuplehash[!dir].tuple.src.u.udp.port;
89 } else
90 return 1;
91
92 if (newaddr == addr->ip && newport == port)
93 return 1;
94
95 buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport));
96
97 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
98 buffer, buflen);
99}
100
101static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff,
102 const char **dptr, unsigned int *datalen,
103 enum sip_header_types type)
104{
105 enum ip_conntrack_info ctinfo;
106 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
107 unsigned int matchlen, matchoff;
108 union nf_inet_addr addr;
109 __be16 port;
110
111 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
112 &matchoff, &matchlen, &addr, &port) <= 0)
113 return 1;
114 return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
115 &addr, port);
116}
117
118static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
119 const char **dptr, unsigned int *datalen)
120{
121 enum ip_conntrack_info ctinfo;
122 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
123 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
124 unsigned int coff, matchoff, matchlen;
125 enum sip_header_types hdr;
126 union nf_inet_addr addr;
127 __be16 port;
128 int request, in_header;
129
130 /* Basic rules: requests and responses. */
131 if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) {
132 if (ct_sip_parse_request(ct, *dptr, *datalen,
133 &matchoff, &matchlen,
134 &addr, &port) > 0 &&
135 !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
136 &addr, port))
137 return NF_DROP;
138 request = 1;
139 } else
140 request = 0;
141
142 if (nf_ct_protonum(ct) == IPPROTO_TCP)
143 hdr = SIP_HDR_VIA_TCP;
144 else
145 hdr = SIP_HDR_VIA_UDP;
146
147 /* Translate topmost Via header and parameters */
148 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
149 hdr, NULL, &matchoff, &matchlen,
150 &addr, &port) > 0) {
151 unsigned int olen, matchend, poff, plen, buflen, n;
152 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
153
154 /* We're only interested in headers related to this
155 * connection */
156 if (request) {
157 if (addr.ip != ct->tuplehash[dir].tuple.src.u3.ip ||
158 port != ct->tuplehash[dir].tuple.src.u.udp.port)
159 goto next;
160 } else {
161 if (addr.ip != ct->tuplehash[dir].tuple.dst.u3.ip ||
162 port != ct->tuplehash[dir].tuple.dst.u.udp.port)
163 goto next;
164 }
165
166 olen = *datalen;
167 if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
168 &addr, port))
169 return NF_DROP;
170
171 matchend = matchoff + matchlen + *datalen - olen;
172
173 /* The maddr= parameter (RFC 3261) specifies where to send
174 * the reply. */
175 if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
176 "maddr=", &poff, &plen,
177 &addr, true) > 0 &&
178 addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
179 addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
180 buflen = sprintf(buffer, "%pI4",
181 &ct->tuplehash[!dir].tuple.dst.u3.ip);
182 if (!mangle_packet(skb, dataoff, dptr, datalen,
183 poff, plen, buffer, buflen))
184 return NF_DROP;
185 }
186
187 /* The received= parameter (RFC 3261) contains the address
188 * from which the server received the request. */
189 if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
190 "received=", &poff, &plen,
191 &addr, false) > 0 &&
192 addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
193 addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
194 buflen = sprintf(buffer, "%pI4",
195 &ct->tuplehash[!dir].tuple.src.u3.ip);
196 if (!mangle_packet(skb, dataoff, dptr, datalen,
197 poff, plen, buffer, buflen))
198 return NF_DROP;
199 }
200
201 /* The rport= parameter (RFC 3581) contains the port number
202 * from which the server received the request. */
203 if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen,
204 "rport=", &poff, &plen,
205 &n) > 0 &&
206 htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
207 htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
208 __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
209 buflen = sprintf(buffer, "%u", ntohs(p));
210 if (!mangle_packet(skb, dataoff, dptr, datalen,
211 poff, plen, buffer, buflen))
212 return NF_DROP;
213 }
214 }
215
216next:
217 /* Translate Contact headers */
218 coff = 0;
219 in_header = 0;
220 while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
221 SIP_HDR_CONTACT, &in_header,
222 &matchoff, &matchlen,
223 &addr, &port) > 0) {
224 if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
225 &addr, port))
226 return NF_DROP;
227 }
228
229 if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) ||
230 !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO))
231 return NF_DROP;
232
233 return NF_ACCEPT;
234}
235
236static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off)
237{
238 enum ip_conntrack_info ctinfo;
239 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
240 const struct tcphdr *th;
241
242 if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
243 return;
244
245 th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
246 nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
247}
248
249/* Handles expected signalling connections and media streams */
250static void ip_nat_sip_expected(struct nf_conn *ct,
251 struct nf_conntrack_expect *exp)
252{
253 struct nf_nat_ipv4_range range;
254
255 /* This must be a fresh one. */
256 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
257
258 /* For DST manip, map port here to where it's expected. */
259 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
260 range.min = range.max = exp->saved_proto;
261 range.min_ip = range.max_ip = exp->saved_ip;
262 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
263
264 /* Change src to where master sends to, but only if the connection
265 * actually came from the same source. */
266 if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip ==
267 ct->master->tuplehash[exp->dir].tuple.src.u3.ip) {
268 range.flags = NF_NAT_RANGE_MAP_IPS;
269 range.min_ip = range.max_ip
270 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
271 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
272 }
273}
274
275static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
276 const char **dptr, unsigned int *datalen,
277 struct nf_conntrack_expect *exp,
278 unsigned int matchoff,
279 unsigned int matchlen)
280{
281 enum ip_conntrack_info ctinfo;
282 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
283 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
284 __be32 newip;
285 u_int16_t port;
286 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
287 unsigned int buflen;
288
289 /* Connection will come from reply */
290 if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip)
291 newip = exp->tuple.dst.u3.ip;
292 else
293 newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
294
295 /* If the signalling port matches the connection's source port in the
296 * original direction, try to use the destination port in the opposite
297 * direction. */
298 if (exp->tuple.dst.u.udp.port ==
299 ct->tuplehash[dir].tuple.src.u.udp.port)
300 port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port);
301 else
302 port = ntohs(exp->tuple.dst.u.udp.port);
303
304 exp->saved_ip = exp->tuple.dst.u3.ip;
305 exp->tuple.dst.u3.ip = newip;
306 exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
307 exp->dir = !dir;
308 exp->expectfn = ip_nat_sip_expected;
309
310 for (; port != 0; port++) {
311 int ret;
312
313 exp->tuple.dst.u.udp.port = htons(port);
314 ret = nf_ct_expect_related(exp);
315 if (ret == 0)
316 break;
317 else if (ret != -EBUSY) {
318 port = 0;
319 break;
320 }
321 }
322
323 if (port == 0)
324 return NF_DROP;
325
326 if (exp->tuple.dst.u3.ip != exp->saved_ip ||
327 exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
328 buflen = sprintf(buffer, "%pI4:%u", &newip, port);
329 if (!mangle_packet(skb, dataoff, dptr, datalen,
330 matchoff, matchlen, buffer, buflen))
331 goto err;
332 }
333 return NF_ACCEPT;
334
335err:
336 nf_ct_unexpect_related(exp);
337 return NF_DROP;
338}
339
340static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff,
341 const char **dptr, unsigned int *datalen)
342{
343 enum ip_conntrack_info ctinfo;
344 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
345 unsigned int matchoff, matchlen;
346 char buffer[sizeof("65536")];
347 int buflen, c_len;
348
349 /* Get actual SDP length */
350 if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
351 SDP_HDR_VERSION, SDP_HDR_UNSPEC,
352 &matchoff, &matchlen) <= 0)
353 return 0;
354 c_len = *datalen - matchoff + strlen("v=");
355
356 /* Now, update SDP length */
357 if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH,
358 &matchoff, &matchlen) <= 0)
359 return 0;
360
361 buflen = sprintf(buffer, "%u", c_len);
362 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
363 buffer, buflen);
364}
365
366static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff,
367 const char **dptr, unsigned int *datalen,
368 unsigned int sdpoff,
369 enum sdp_header_types type,
370 enum sdp_header_types term,
371 char *buffer, int buflen)
372{
373 enum ip_conntrack_info ctinfo;
374 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
375 unsigned int matchlen, matchoff;
376
377 if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
378 &matchoff, &matchlen) <= 0)
379 return -ENOENT;
380 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
381 buffer, buflen) ? 0 : -EINVAL;
382}
383
384static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff,
385 const char **dptr, unsigned int *datalen,
386 unsigned int sdpoff,
387 enum sdp_header_types type,
388 enum sdp_header_types term,
389 const union nf_inet_addr *addr)
390{
391 char buffer[sizeof("nnn.nnn.nnn.nnn")];
392 unsigned int buflen;
393
394 buflen = sprintf(buffer, "%pI4", &addr->ip);
395 if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term,
396 buffer, buflen))
397 return 0;
398
399 return mangle_content_len(skb, dataoff, dptr, datalen);
400}
401
402static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff,
403 const char **dptr, unsigned int *datalen,
404 unsigned int matchoff,
405 unsigned int matchlen,
406 u_int16_t port)
407{
408 char buffer[sizeof("nnnnn")];
409 unsigned int buflen;
410
411 buflen = sprintf(buffer, "%u", port);
412 if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
413 buffer, buflen))
414 return 0;
415
416 return mangle_content_len(skb, dataoff, dptr, datalen);
417}
418
419static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff,
420 const char **dptr, unsigned int *datalen,
421 unsigned int sdpoff,
422 const union nf_inet_addr *addr)
423{
424 char buffer[sizeof("nnn.nnn.nnn.nnn")];
425 unsigned int buflen;
426
427 /* Mangle session description owner and contact addresses */
428 buflen = sprintf(buffer, "%pI4", &addr->ip);
429 if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
430 SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA,
431 buffer, buflen))
432 return 0;
433
434 switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
435 SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA,
436 buffer, buflen)) {
437 case 0:
438 /*
439 * RFC 2327:
440 *
441 * Session description
442 *
443 * c=* (connection information - not required if included in all media)
444 */
445 case -ENOENT:
446 break;
447 default:
448 return 0;
449 }
450
451 return mangle_content_len(skb, dataoff, dptr, datalen);
452}
453
454/* So, this packet has hit the connection tracking matching code.
455 Mangle it, and change the expectation to match the new version. */
456static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
457 const char **dptr, unsigned int *datalen,
458 struct nf_conntrack_expect *rtp_exp,
459 struct nf_conntrack_expect *rtcp_exp,
460 unsigned int mediaoff,
461 unsigned int medialen,
462 union nf_inet_addr *rtp_addr)
463{
464 enum ip_conntrack_info ctinfo;
465 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
466 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
467 u_int16_t port;
468
469 /* Connection will come from reply */
470 if (ct->tuplehash[dir].tuple.src.u3.ip ==
471 ct->tuplehash[!dir].tuple.dst.u3.ip)
472 rtp_addr->ip = rtp_exp->tuple.dst.u3.ip;
473 else
474 rtp_addr->ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
475
476 rtp_exp->saved_ip = rtp_exp->tuple.dst.u3.ip;
477 rtp_exp->tuple.dst.u3.ip = rtp_addr->ip;
478 rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
479 rtp_exp->dir = !dir;
480 rtp_exp->expectfn = ip_nat_sip_expected;
481
482 rtcp_exp->saved_ip = rtcp_exp->tuple.dst.u3.ip;
483 rtcp_exp->tuple.dst.u3.ip = rtp_addr->ip;
484 rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
485 rtcp_exp->dir = !dir;
486 rtcp_exp->expectfn = ip_nat_sip_expected;
487
488 /* Try to get same pair of ports: if not, try to change them. */
489 for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
490 port != 0; port += 2) {
491 int ret;
492
493 rtp_exp->tuple.dst.u.udp.port = htons(port);
494 ret = nf_ct_expect_related(rtp_exp);
495 if (ret == -EBUSY)
496 continue;
497 else if (ret < 0) {
498 port = 0;
499 break;
500 }
501 rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
502 ret = nf_ct_expect_related(rtcp_exp);
503 if (ret == 0)
504 break;
505 else if (ret == -EBUSY) {
506 nf_ct_unexpect_related(rtp_exp);
507 continue;
508 } else if (ret < 0) {
509 nf_ct_unexpect_related(rtp_exp);
510 port = 0;
511 break;
512 }
513 }
514
515 if (port == 0)
516 goto err1;
517
518 /* Update media port. */
519 if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
520 !ip_nat_sdp_port(skb, dataoff, dptr, datalen,
521 mediaoff, medialen, port))
522 goto err2;
523
524 return NF_ACCEPT;
525
526err2:
527 nf_ct_unexpect_related(rtp_exp);
528 nf_ct_unexpect_related(rtcp_exp);
529err1:
530 return NF_DROP;
531}
532
533static struct nf_ct_helper_expectfn sip_nat = {
534 .name = "sip",
535 .expectfn = ip_nat_sip_expected,
536};
537
538static void __exit nf_nat_sip_fini(void)
539{
540 RCU_INIT_POINTER(nf_nat_sip_hook, NULL);
541 RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, NULL);
542 RCU_INIT_POINTER(nf_nat_sip_expect_hook, NULL);
543 RCU_INIT_POINTER(nf_nat_sdp_addr_hook, NULL);
544 RCU_INIT_POINTER(nf_nat_sdp_port_hook, NULL);
545 RCU_INIT_POINTER(nf_nat_sdp_session_hook, NULL);
546 RCU_INIT_POINTER(nf_nat_sdp_media_hook, NULL);
547 nf_ct_helper_expectfn_unregister(&sip_nat);
548 synchronize_rcu();
549}
550
551static int __init nf_nat_sip_init(void)
552{
553 BUG_ON(nf_nat_sip_hook != NULL);
554 BUG_ON(nf_nat_sip_seq_adjust_hook != NULL);
555 BUG_ON(nf_nat_sip_expect_hook != NULL);
556 BUG_ON(nf_nat_sdp_addr_hook != NULL);
557 BUG_ON(nf_nat_sdp_port_hook != NULL);
558 BUG_ON(nf_nat_sdp_session_hook != NULL);
559 BUG_ON(nf_nat_sdp_media_hook != NULL);
560 RCU_INIT_POINTER(nf_nat_sip_hook, ip_nat_sip);
561 RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust);
562 RCU_INIT_POINTER(nf_nat_sip_expect_hook, ip_nat_sip_expect);
563 RCU_INIT_POINTER(nf_nat_sdp_addr_hook, ip_nat_sdp_addr);
564 RCU_INIT_POINTER(nf_nat_sdp_port_hook, ip_nat_sdp_port);
565 RCU_INIT_POINTER(nf_nat_sdp_session_hook, ip_nat_sdp_session);
566 RCU_INIT_POINTER(nf_nat_sdp_media_hook, ip_nat_sdp_media);
567 nf_ct_helper_expectfn_register(&sip_nat);
568 return 0;
569}
570
571module_init(nf_nat_sip_init);
572module_exit(nf_nat_sip_fini);
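
The port loop in ip_nat_sdp_media() above tries to keep RTP and RTCP on a conventional even/odd port pair: it reserves two expectations at a time, backs the RTP one out when the RTCP slot is busy, and advances by two. A simplified user-space sketch of that search, with reserve()/release() standing in for nf_ct_expect_related()/nf_ct_unexpect_related():

#include <stdbool.h>
#include <stdio.h>

static bool used[65536];

static bool reserve(int port)
{
	if (used[port])
		return false;
	used[port] = true;
	return true;
}

static void release(int port) { used[port] = false; }

static int grab_pair(int start)
{
	int port;

	for (port = start & ~1; port != 0 && port < 65534; port += 2) {
		if (!reserve(port))
			continue;
		if (reserve(port + 1))
			return port;	/* got both RTP and RTCP */
		release(port);		/* RTCP slot busy: give RTP back */
	}
	return -1;
}

int main(void)
{
	used[9000] = used[9003] = true;	/* pretend these are taken */

	/* prints 9004: 9000 is busy, and 9002 loses 9003 to the collision */
	printf("pair starts at %d\n", grab_pair(9000));
	return 0;
}
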
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
deleted file mode 100644
index 9dbb8d284f99..000000000000
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ /dev/null
@@ -1,51 +0,0 @@
1/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 as
5 * published by the Free Software Foundation.
6 */
7
8#include <linux/module.h>
9#include <linux/udp.h>
10
11#include <net/netfilter/nf_conntrack_helper.h>
12#include <net/netfilter/nf_conntrack_expect.h>
13#include <net/netfilter/nf_nat_helper.h>
14#include <net/netfilter/nf_nat_rule.h>
15#include <linux/netfilter/nf_conntrack_tftp.h>
16
17MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
18MODULE_DESCRIPTION("TFTP NAT helper");
19MODULE_LICENSE("GPL");
20MODULE_ALIAS("ip_nat_tftp");
21
22static unsigned int help(struct sk_buff *skb,
23 enum ip_conntrack_info ctinfo,
24 struct nf_conntrack_expect *exp)
25{
26 const struct nf_conn *ct = exp->master;
27
28 exp->saved_proto.udp.port
29 = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
30 exp->dir = IP_CT_DIR_REPLY;
31 exp->expectfn = nf_nat_follow_master;
32 if (nf_ct_expect_related(exp) != 0)
33 return NF_DROP;
34 return NF_ACCEPT;
35}
36
37static void __exit nf_nat_tftp_fini(void)
38{
39 RCU_INIT_POINTER(nf_nat_tftp_hook, NULL);
40 synchronize_rcu();
41}
42
43static int __init nf_nat_tftp_init(void)
44{
45 BUG_ON(nf_nat_tftp_hook != NULL);
46 RCU_INIT_POINTER(nf_nat_tftp_hook, help);
47 return 0;
48}
49
50module_init(nf_nat_tftp_init);
51module_exit(nf_nat_tftp_fini);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 957acd12250b..8de53e1ddd54 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -263,6 +263,10 @@ static const struct snmp_mib snmp4_net_list[] = {
263 SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), 263 SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
264 SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), 264 SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
265 SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), 265 SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
266 SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
267 SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
268 SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
269 SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
266 SNMP_MIB_SENTINEL 270 SNMP_MIB_SENTINEL
267}; 271};
268 272
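
The four new counters surface on the TcpExt rows of /proc/net/netstat, which alternates a header line and a value line. A small reader that picks them out (a sketch; it assumes the file's long-standing two-row layout):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char names[4096], vals[4096];
	FILE *f = fopen("/proc/net/netstat", "r");

	if (!f)
		return 1;
	while (fgets(names, sizeof(names), f) &&
	       fgets(vals, sizeof(vals), f)) {
		char *np, *vp, *n, *v;

		if (strncmp(names, "TcpExt:", 7))
			continue;
		/* walk the header and value rows in lockstep */
		n = strtok_r(names, " \n", &np);
		v = strtok_r(vals, " \n", &vp);
		while (n && v) {
			if (!strncmp(n, "TCPFastOpen", 11))
				printf("%s = %s\n", n, v);
			n = strtok_r(NULL, " \n", &np);
			v = strtok_r(NULL, " \n", &vp);
		}
	}
	fclose(f);
	return 0;
}
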
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fd9af60397b5..ff622069fcef 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1111,10 +1111,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1111 const struct rtable *rt = (const struct rtable *) dst; 1111 const struct rtable *rt = (const struct rtable *) dst;
1112 unsigned int mtu = rt->rt_pmtu; 1112 unsigned int mtu = rt->rt_pmtu;
1113 1113
1114 if (mtu && time_after_eq(jiffies, rt->dst.expires)) 1114 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1115 mtu = 0;
1116
1117 if (!mtu)
1118 mtu = dst_metric_raw(dst, RTAX_MTU); 1115 mtu = dst_metric_raw(dst, RTAX_MTU);
1119 1116
1120 if (mtu && rt_is_output_route(rt)) 1117 if (mtu && rt_is_output_route(rt))
@@ -1566,11 +1563,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1566 if (ipv4_is_zeronet(daddr)) 1563 if (ipv4_is_zeronet(daddr))
1567 goto martian_destination; 1564 goto martian_destination;
1568 1565
1569 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) { 1566 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1570 if (ipv4_is_loopback(daddr)) 1567 * and calls it at most once when daddr and/or saddr is a loopback address
1568 */
1569 if (ipv4_is_loopback(daddr)) {
1570 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1571 goto martian_destination; 1571 goto martian_destination;
1572 1572 } else if (ipv4_is_loopback(saddr)) {
1573 if (ipv4_is_loopback(saddr)) 1573 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1574 goto martian_source; 1574 goto martian_source;
1575 } 1575 }
1576 1576
@@ -1595,7 +1595,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1595 1595
1596 if (res.type == RTN_LOCAL) { 1596 if (res.type == RTN_LOCAL) {
1597 err = fib_validate_source(skb, saddr, daddr, tos, 1597 err = fib_validate_source(skb, saddr, daddr, tos,
1598 net->loopback_dev->ifindex, 1598 LOOPBACK_IFINDEX,
1599 dev, in_dev, &itag); 1599 dev, in_dev, &itag);
1600 if (err < 0) 1600 if (err < 0)
1601 goto martian_source_keep_err; 1601 goto martian_source_keep_err;
@@ -1871,7 +1871,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1871 1871
1872 orig_oif = fl4->flowi4_oif; 1872 orig_oif = fl4->flowi4_oif;
1873 1873
1874 fl4->flowi4_iif = net->loopback_dev->ifindex; 1874 fl4->flowi4_iif = LOOPBACK_IFINDEX;
1875 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 1875 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1876 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 1876 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1877 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 1877 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
@@ -1960,7 +1960,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1960 if (!fl4->daddr) 1960 if (!fl4->daddr)
1961 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 1961 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1962 dev_out = net->loopback_dev; 1962 dev_out = net->loopback_dev;
1963 fl4->flowi4_oif = net->loopback_dev->ifindex; 1963 fl4->flowi4_oif = LOOPBACK_IFINDEX;
1964 res.type = RTN_LOCAL; 1964 res.type = RTN_LOCAL;
1965 flags |= RTCF_LOCAL; 1965 flags |= RTCF_LOCAL;
1966 goto make_route; 1966 goto make_route;
@@ -2131,7 +2131,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2131EXPORT_SYMBOL_GPL(ip_route_output_flow); 2131EXPORT_SYMBOL_GPL(ip_route_output_flow);
2132 2132
2133static int rt_fill_info(struct net *net, __be32 dst, __be32 src, 2133static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2134 struct flowi4 *fl4, struct sk_buff *skb, u32 pid, 2134 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2135 u32 seq, int event, int nowait, unsigned int flags) 2135 u32 seq, int event, int nowait, unsigned int flags)
2136{ 2136{
2137 struct rtable *rt = skb_rtable(skb); 2137 struct rtable *rt = skb_rtable(skb);
@@ -2141,7 +2141,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2141 u32 error; 2141 u32 error;
2142 u32 metrics[RTAX_MAX]; 2142 u32 metrics[RTAX_MAX];
2143 2143
2144 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2144 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2145 if (nlh == NULL) 2145 if (nlh == NULL)
2146 return -EMSGSIZE; 2146 return -EMSGSIZE;
2147 2147
@@ -2301,12 +2301,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
2301 rt->rt_flags |= RTCF_NOTIFY; 2301 rt->rt_flags |= RTCF_NOTIFY;
2302 2302
2303 err = rt_fill_info(net, dst, src, &fl4, skb, 2303 err = rt_fill_info(net, dst, src, &fl4, skb,
2304 NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2304 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2305 RTM_NEWROUTE, 0, 0); 2305 RTM_NEWROUTE, 0, 0);
2306 if (err <= 0) 2306 if (err <= 0)
2307 goto errout_free; 2307 goto errout_free;
2308 2308
2309 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); 2309 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2310errout: 2310errout:
2311 return err; 2311 return err;
2312 2312
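
The expiry test in ipv4_mtu() above leans on time_after_eq(), which stays correct when jiffies wraps because it compares the signed difference of unsigned counters rather than the raw values. A 32-bit user-space rendition of the trick:

#include <stdint.h>
#include <stdio.h>

/* 32-bit rendition of the kernel's time_after_eq() */
static int time_after_eq32(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	uint32_t expires = 0xfffffff0u;	/* set shortly before the counter wraps */
	uint32_t now     = 0x00000010u;	/* sampled just after the wrap */

	/* A plain 'now >= expires' would claim the timer has not fired. */
	printf("naive: %d  wrap-safe: %d\n", now >= expires,
	       time_after_eq32(now, expires));
	return 0;
}
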
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650e1528e1e6..ba48e799b031 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
319 ireq->tstamp_ok = tcp_opt.saw_tstamp; 319 ireq->tstamp_ok = tcp_opt.saw_tstamp;
320 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; 320 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
321 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; 321 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
322 treq->listener = NULL;
322 323
323 /* We throwed the options of the initial SYN away, so we hope 324 /* We throwed the options of the initial SYN away, so we hope
324 * the ACK carries the same options again (see RFC1122 4.2.3.8) 325 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3e78c79b5586..9205e492dc9d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -232,6 +232,45 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
232 return 0; 232 return 0;
233} 233}
234 234
235int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
236 size_t *lenp, loff_t *ppos)
237{
238 ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
239 struct tcp_fastopen_context *ctxt;
240 int ret;
241 u32 user_key[4] = { 0 }; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
242
243 tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
244 if (!tbl.data)
245 return -ENOMEM;
246
247 rcu_read_lock();
248 ctxt = rcu_dereference(tcp_fastopen_ctx);
249 if (ctxt)
250 memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
251 rcu_read_unlock();
252
253 snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
254 user_key[0], user_key[1], user_key[2], user_key[3]);
255 ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
256
257 if (write && ret == 0) {
258 if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
259 user_key + 2, user_key + 3) != 4) {
260 ret = -EINVAL;
261 goto bad_key;
262 }
263 tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
264 }
265
266bad_key:
267 pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
268 user_key[0], user_key[1], user_key[2], user_key[3],
269 (char *)tbl.data, ret);
270 kfree(tbl.data);
271 return ret;
272}
273
235static struct ctl_table ipv4_table[] = { 274static struct ctl_table ipv4_table[] = {
236 { 275 {
237 .procname = "tcp_timestamps", 276 .procname = "tcp_timestamps",
@@ -386,6 +425,12 @@ static struct ctl_table ipv4_table[] = {
386 .proc_handler = proc_dointvec, 425 .proc_handler = proc_dointvec,
387 }, 426 },
388 { 427 {
428 .procname = "tcp_fastopen_key",
429 .mode = 0600,
430 .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
431 .proc_handler = proc_tcp_fastopen_key,
432 },
433 {
389 .procname = "tcp_tw_recycle", 434 .procname = "tcp_tw_recycle",
390 .data = &tcp_death_row.sysctl_tw_recycle, 435 .data = &tcp_death_row.sysctl_tw_recycle,
391 .maxlen = sizeof(int), 436 .maxlen = sizeof(int),
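
The new net.ipv4.tcp_fastopen_key sysctl exchanges the 16-byte key with user space as four dash-separated 32-bit hex words ("%08x-%08x-%08x-%08x" on read, "%x-%x-%x-%x" on write). A quick round trip of that encoding (the key value is made up):

#include <stdio.h>

int main(void)
{
	unsigned int key[4] = { 0x01234567, 0x89abcdef, 0xdeadbeef, 0xfeedface };
	unsigned int back[4];
	char buf[64];

	snprintf(buf, sizeof(buf), "%08x-%08x-%08x-%08x",
		 key[0], key[1], key[2], key[3]);
	printf("%s\n", buf);		/* what a read of the sysctl shows */

	if (sscanf(buf, "%x-%x-%x-%x",
		   &back[0], &back[1], &back[2], &back[3]) != 4)
		return 1;		/* the write path rejects this with -EINVAL */
	return back[0] != key[0];	/* 0 on a clean round trip */
}
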
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5f6419341821..f32c02e2a543 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
486 if (sk->sk_shutdown & RCV_SHUTDOWN) 486 if (sk->sk_shutdown & RCV_SHUTDOWN)
487 mask |= POLLIN | POLLRDNORM | POLLRDHUP; 487 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
488 488
489 /* Connected? */ 489 /* Connected or passive Fast Open socket? */
490 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { 490 if (sk->sk_state != TCP_SYN_SENT &&
491 (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
491 int target = sock_rcvlowat(sk, 0, INT_MAX); 492 int target = sock_rcvlowat(sk, 0, INT_MAX);
492 493
493 if (tp->urg_seq == tp->copied_seq && 494 if (tp->urg_seq == tp->copied_seq &&
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
840 ssize_t copied; 841 ssize_t copied;
841 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 842 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
842 843
843 /* Wait for a connection to finish. */ 844 /* Wait for a connection to finish. One exception is TCP Fast Open
844 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 845 * (passive side) where data is allowed to be sent before a connection
846 * is fully established.
847 */
848 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
849 !tcp_passive_fastopen(sk)) {
845 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 850 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
846 goto out_err; 851 goto out_err;
852 }
847 853
848 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 854 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
849 855
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1042 1048
1043 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 1049 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1044 1050
1045 /* Wait for a connection to finish. */ 1051 /* Wait for a connection to finish. One exception is TCP Fast Open
1046 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 1052 * (passive side) where data is allowed to be sent before a connection
1053 * is fully established.
1054 */
1055 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1056 !tcp_passive_fastopen(sk)) {
1047 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 1057 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1048 goto do_error; 1058 goto do_error;
1059 }
1049 1060
1050 if (unlikely(tp->repair)) { 1061 if (unlikely(tp->repair)) {
1051 if (tp->repair_queue == TCP_RECV_QUEUE) { 1062 if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -1139,78 +1150,43 @@ new_segment:
1139 if (err) 1150 if (err)
1140 goto do_fault; 1151 goto do_fault;
1141 } else { 1152 } else {
1142 bool merge = false; 1153 bool merge = true;
1143 int i = skb_shinfo(skb)->nr_frags; 1154 int i = skb_shinfo(skb)->nr_frags;
1144 struct page *page = sk->sk_sndmsg_page; 1155 struct page_frag *pfrag = sk_page_frag(sk);
1145 int off; 1156
1146 1157 if (!sk_page_frag_refill(sk, pfrag))
1147 if (page && page_count(page) == 1) 1158 goto wait_for_memory;
1148 sk->sk_sndmsg_off = 0; 1159
1149 1160 if (!skb_can_coalesce(skb, i, pfrag->page,
1150 off = sk->sk_sndmsg_off; 1161 pfrag->offset)) {
1151 1162 if (i == MAX_SKB_FRAGS || !sg) {
1152 if (skb_can_coalesce(skb, i, page, off) && 1163 tcp_mark_push(tp, skb);
1153 off != PAGE_SIZE) { 1164 goto new_segment;
1154 /* We can extend the last page
1155 * fragment. */
1156 merge = true;
1157 } else if (i == MAX_SKB_FRAGS || !sg) {
1158 /* Need to add new fragment and cannot
1159 * do this because interface is non-SG,
1160 * or because all the page slots are
1161 * busy. */
1162 tcp_mark_push(tp, skb);
1163 goto new_segment;
1164 } else if (page) {
1165 if (off == PAGE_SIZE) {
1166 put_page(page);
1167 sk->sk_sndmsg_page = page = NULL;
1168 off = 0;
1169 } 1165 }
1170 } else 1166 merge = false;
1171 off = 0; 1167 }
1172 1168
1173 if (copy > PAGE_SIZE - off) 1169 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1174 copy = PAGE_SIZE - off;
1175 1170
1176 if (!sk_wmem_schedule(sk, copy)) 1171 if (!sk_wmem_schedule(sk, copy))
1177 goto wait_for_memory; 1172 goto wait_for_memory;
1178 1173
1179 if (!page) {
1180 /* Allocate new cache page. */
1181 if (!(page = sk_stream_alloc_page(sk)))
1182 goto wait_for_memory;
1183 }
1184
1185 /* Time to copy data. We are close to
1186 * the end! */
1187 err = skb_copy_to_page_nocache(sk, from, skb, 1174 err = skb_copy_to_page_nocache(sk, from, skb,
1188 page, off, copy); 1175 pfrag->page,
1189 if (err) { 1176 pfrag->offset,
1190 /* If this page was new, give it to the 1177 copy);
1191 * socket so it does not get leaked. 1178 if (err)
1192 */
1193 if (!sk->sk_sndmsg_page) {
1194 sk->sk_sndmsg_page = page;
1195 sk->sk_sndmsg_off = 0;
1196 }
1197 goto do_error; 1179 goto do_error;
1198 }
1199 1180
1200 /* Update the skb. */ 1181 /* Update the skb. */
1201 if (merge) { 1182 if (merge) {
1202 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1183 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1203 } else { 1184 } else {
1204 skb_fill_page_desc(skb, i, page, off, copy); 1185 skb_fill_page_desc(skb, i, pfrag->page,
1205 if (sk->sk_sndmsg_page) { 1186 pfrag->offset, copy);
1206 get_page(page); 1187 get_page(pfrag->page);
1207 } else if (off + copy < PAGE_SIZE) {
1208 get_page(page);
1209 sk->sk_sndmsg_page = page;
1210 }
1211 } 1188 }
1212 1189 pfrag->offset += copy;
1213 sk->sk_sndmsg_off = off + copy;
1214 } 1190 }
1215 1191
1216 if (!copied) 1192 if (!copied)
@@ -2150,6 +2126,10 @@ void tcp_close(struct sock *sk, long timeout)
2150 * they look as CLOSING or LAST_ACK for Linux) 2126 * they look as CLOSING or LAST_ACK for Linux)
2151 * Probably, I missed some more holelets. 2127 * Probably, I missed some more holelets.
2152 * --ANK 2128 * --ANK
2129 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2130 * in a single packet! (May consider it later but will
2131 * probably need API support or TCP_CORK SYN-ACK until
2132 * data is written and socket is closed.)
2153 */ 2133 */
2154 tcp_send_fin(sk); 2134 tcp_send_fin(sk);
2155 } 2135 }
@@ -2221,8 +2201,16 @@ adjudge_to_death:
2221 } 2201 }
2222 } 2202 }
2223 2203
2224 if (sk->sk_state == TCP_CLOSE) 2204 if (sk->sk_state == TCP_CLOSE) {
2205 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2206 /* We could get here with a non-NULL req if the socket is
2207 * aborted (e.g., closed with unread data) before 3WHS
2208 * finishes.
2209 */
2210 if (req != NULL)
2211 reqsk_fastopen_remove(sk, req, false);
2225 inet_csk_destroy_sock(sk); 2212 inet_csk_destroy_sock(sk);
2213 }
2226 /* Otherwise, socket is reprieved until protocol close. */ 2214 /* Otherwise, socket is reprieved until protocol close. */
2227 2215
2228out: 2216out:
@@ -2308,6 +2296,13 @@ int tcp_disconnect(struct sock *sk, int flags)
2308} 2296}
2309EXPORT_SYMBOL(tcp_disconnect); 2297EXPORT_SYMBOL(tcp_disconnect);
2310 2298
2299void tcp_sock_destruct(struct sock *sk)
2300{
2301 inet_sock_destruct(sk);
2302
2303 kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
2304}
2305
2311static inline bool tcp_can_repair_sock(const struct sock *sk) 2306static inline bool tcp_can_repair_sock(const struct sock *sk)
2312{ 2307{
2313 return capable(CAP_NET_ADMIN) && 2308 return capable(CAP_NET_ADMIN) &&
@@ -2701,6 +2696,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2701 else 2696 else
2702 icsk->icsk_user_timeout = msecs_to_jiffies(val); 2697 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2703 break; 2698 break;
2699
2700 case TCP_FASTOPEN:
2701 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2702 TCPF_LISTEN)))
2703 err = fastopen_init_queue(sk, val);
2704 else
2705 err = -EINVAL;
2706 break;
2704 default: 2707 default:
2705 err = -ENOPROTOOPT; 2708 err = -ENOPROTOOPT;
2706 break; 2709 break;
@@ -3514,11 +3517,15 @@ EXPORT_SYMBOL(tcp_cookie_generator);
3514 3517
3515void tcp_done(struct sock *sk) 3518void tcp_done(struct sock *sk)
3516{ 3519{
3520 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3521
3517 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) 3522 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3518 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); 3523 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3519 3524
3520 tcp_set_state(sk, TCP_CLOSE); 3525 tcp_set_state(sk, TCP_CLOSE);
3521 tcp_clear_xmit_timers(sk); 3526 tcp_clear_xmit_timers(sk);
3527 if (req != NULL)
3528 reqsk_fastopen_remove(sk, req, false);
3522 3529
3523 sk->sk_shutdown = SHUTDOWN_MASK; 3530 sk->sk_shutdown = SHUTDOWN_MASK;
3524 3531
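
On the passive side, the new TCP_FASTOPEN socket option arms a listener for Fast Open by sizing the pending-TFO queue; per the setsockopt hunk above it is accepted in the CLOSE or LISTEN state. A sketch of the expected usage, assuming headers new enough to define TCP_FASTOPEN (error handling trimmed, port arbitrary):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int qlen = 5;			/* max pending TFO requests */
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(8000);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;
	/* allocates the fastopen queue (fastopen_init_queue() above) */
	if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0)
		perror("TCP_FASTOPEN");
	if (listen(fd, 16) < 0)
		return 1;
	close(fd);
	return 0;
}

The active side pairs with this via sendto() with the companion MSG_FASTOPEN flag, so the SYN itself can carry data.
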
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index a7f729c409d7..8f7ef0ad80e5 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,10 +1,91 @@
1#include <linux/err.h>
1#include <linux/init.h> 2#include <linux/init.h>
2#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/list.h>
5#include <linux/tcp.h>
6#include <linux/rcupdate.h>
7#include <linux/rculist.h>
8#include <net/inetpeer.h>
9#include <net/tcp.h>
3 10
4int sysctl_tcp_fastopen; 11int sysctl_tcp_fastopen __read_mostly;
12
13struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
14
15static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
16
17static void tcp_fastopen_ctx_free(struct rcu_head *head)
18{
19 struct tcp_fastopen_context *ctx =
20 container_of(head, struct tcp_fastopen_context, rcu);
21 crypto_free_cipher(ctx->tfm);
22 kfree(ctx);
23}
24
25int tcp_fastopen_reset_cipher(void *key, unsigned int len)
26{
27 int err;
28 struct tcp_fastopen_context *ctx, *octx;
29
30 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
31 if (!ctx)
32 return -ENOMEM;
33 ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
34
35 if (IS_ERR(ctx->tfm)) {
36 err = PTR_ERR(ctx->tfm);
37error: kfree(ctx);
38 pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
39 return err;
40 }
41 err = crypto_cipher_setkey(ctx->tfm, key, len);
42 if (err) {
43 pr_err("TCP: TFO cipher key error: %d\n", err);
44 crypto_free_cipher(ctx->tfm);
45 goto error;
46 }
47 memcpy(ctx->key, key, len);
48
49 spin_lock(&tcp_fastopen_ctx_lock);
50
51 octx = rcu_dereference_protected(tcp_fastopen_ctx,
52 lockdep_is_held(&tcp_fastopen_ctx_lock));
53 rcu_assign_pointer(tcp_fastopen_ctx, ctx);
54 spin_unlock(&tcp_fastopen_ctx_lock);
55
56 if (octx)
57 call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
58 return err;
59}
60
61/* Computes the fastopen cookie for the peer.
62 * The peer address is 128 bits long (padded with zeros for IPv4).
63 *
64 * The caller must check foc->len to determine if a valid cookie
65 * has been generated successfully.
66*/
67void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc)
68{
69 __be32 peer_addr[4] = { addr, 0, 0, 0 };
70 struct tcp_fastopen_context *ctx;
71
72 rcu_read_lock();
73 ctx = rcu_dereference(tcp_fastopen_ctx);
74 if (ctx) {
75 crypto_cipher_encrypt_one(ctx->tfm,
76 foc->val,
77 (__u8 *)peer_addr);
78 foc->len = TCP_FASTOPEN_COOKIE_SIZE;
79 }
80 rcu_read_unlock();
81}
5 82
6static int __init tcp_fastopen_init(void) 83static int __init tcp_fastopen_init(void)
7{ 84{
85 __u8 key[TCP_FASTOPEN_KEY_LENGTH];
86
87 get_random_bytes(key, sizeof(key));
88 tcp_fastopen_reset_cipher(key, sizeof(key));
8 return 0; 89 return 0;
9} 90}
10 91
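
tcp_fastopen_cookie_gen() above is a single AES block: the 16-byte secret encrypts the peer address, zero-padded to 128 bits, and the first TCP_FASTOPEN_COOKIE_SIZE (8) bytes of the ciphertext become the cookie. The same computation restated with OpenSSL's low-level AES primitive (a sketch for illustration; the kernel uses its own crypto API, and the key here is made up). Build with: cc demo.c -lcrypto

#include <openssl/aes.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char key[16] = "0123456789abcdef";	/* 16-byte secret */
	unsigned char in[16] = { 0 }, out[16];
	struct in_addr peer;
	AES_KEY aes;

	inet_pton(AF_INET, "192.0.2.1", &peer);
	memcpy(in, &peer, 4);		/* IPv4 address, zero-padded to 128 bits */

	AES_set_encrypt_key(key, 128, &aes);
	AES_encrypt(in, out, &aes);

	for (int i = 0; i < 8; i++)	/* cookie = leading bytes of the block */
		printf("%02x", out[i]);
	printf("\n");
	return 0;
}
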
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d377f4854cb8..432c36649db3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -237,7 +237,11 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
237 tcp_enter_quickack_mode((struct sock *)tp); 237 tcp_enter_quickack_mode((struct sock *)tp);
238 break; 238 break;
239 case INET_ECN_CE: 239 case INET_ECN_CE:
240 tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 240 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
241 /* Better not delay acks, sender can have a very low cwnd */
242 tcp_enter_quickack_mode((struct sock *)tp);
243 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
244 }
241 /* fall through */ 245 /* fall through */
242 default: 246 default:
243 tp->ecn_flags |= TCP_ECN_SEEN; 247 tp->ecn_flags |= TCP_ECN_SEEN;
@@ -374,7 +378,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
374/* 4. Try to fixup all. It is made immediately after connection enters 378/* 4. Try to fixup all. It is made immediately after connection enters
375 * established state. 379 * established state.
376 */ 380 */
377static void tcp_init_buffer_space(struct sock *sk) 381void tcp_init_buffer_space(struct sock *sk)
378{ 382{
379 struct tcp_sock *tp = tcp_sk(sk); 383 struct tcp_sock *tp = tcp_sk(sk);
380 int maxwin; 384 int maxwin;
@@ -739,29 +743,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
739 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 743 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
740} 744}
741 745
742/* Set slow start threshold and cwnd not falling to slow start */
743void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
744{
745 struct tcp_sock *tp = tcp_sk(sk);
746 const struct inet_connection_sock *icsk = inet_csk(sk);
747
748 tp->prior_ssthresh = 0;
749 tp->bytes_acked = 0;
750 if (icsk->icsk_ca_state < TCP_CA_CWR) {
751 tp->undo_marker = 0;
752 if (set_ssthresh)
753 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
754 tp->snd_cwnd = min(tp->snd_cwnd,
755 tcp_packets_in_flight(tp) + 1U);
756 tp->snd_cwnd_cnt = 0;
757 tp->high_seq = tp->snd_nxt;
758 tp->snd_cwnd_stamp = tcp_time_stamp;
759 TCP_ECN_queue_cwr(tp);
760
761 tcp_set_ca_state(sk, TCP_CA_CWR);
762 }
763}
764
765/* 746/*
766 * Packet counting of FACK is based on in-order assumptions, therefore TCP 747 * Packet counting of FACK is based on in-order assumptions, therefore TCP
767 * disables it when reordering is detected 748 * disables it when reordering is detected
@@ -2489,35 +2470,6 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
2489 tp->snd_cwnd_stamp = tcp_time_stamp; 2470 tp->snd_cwnd_stamp = tcp_time_stamp;
2490} 2471}
2491 2472
2492/* Lower bound on congestion window is slow start threshold
2493 * unless congestion avoidance choice decides to override it.
2494 */
2495static inline u32 tcp_cwnd_min(const struct sock *sk)
2496{
2497 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
2498
2499 return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
2500}
2501
2502/* Decrease cwnd each second ack. */
2503static void tcp_cwnd_down(struct sock *sk, int flag)
2504{
2505 struct tcp_sock *tp = tcp_sk(sk);
2506 int decr = tp->snd_cwnd_cnt + 1;
2507
2508 if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
2509 (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
2510 tp->snd_cwnd_cnt = decr & 1;
2511 decr >>= 1;
2512
2513 if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
2514 tp->snd_cwnd -= decr;
2515
2516 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
2517 tp->snd_cwnd_stamp = tcp_time_stamp;
2518 }
2519}
2520
2521/* Nothing was retransmitted or returned timestamp is less 2473/* Nothing was retransmitted or returned timestamp is less
2522 * than timestamp of the first retransmission. 2474 * than timestamp of the first retransmission.
2523 */ 2475 */
@@ -2719,24 +2671,80 @@ static bool tcp_try_undo_loss(struct sock *sk)
2719 return false; 2671 return false;
2720} 2672}
2721 2673
2722static inline void tcp_complete_cwr(struct sock *sk) 2674/* The cwnd reduction in CWR and Recovery uses the PRR algorithm
2675 * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
2676 * It computes the number of packets to send (sndcnt) based on packets newly
2677 * delivered:
2678 * 1) If the packets in flight is larger than ssthresh, PRR spreads the
2679 * cwnd reductions across a full RTT.
2680 * 2) If packets in flight is lower than ssthresh (such as due to excess
2681 * losses and/or application stalls), do not perform any further cwnd
2682 * reductions, but instead slow start up to ssthresh.
2683 */
2684static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
2723{ 2685{
2724 struct tcp_sock *tp = tcp_sk(sk); 2686 struct tcp_sock *tp = tcp_sk(sk);
2725 2687
2726 /* Do not moderate cwnd if it's already undone in cwr or recovery. */ 2688 tp->high_seq = tp->snd_nxt;
2727 if (tp->undo_marker) { 2689 tp->bytes_acked = 0;
2728 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { 2690 tp->snd_cwnd_cnt = 0;
2729 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); 2691 tp->prior_cwnd = tp->snd_cwnd;
2730 tp->snd_cwnd_stamp = tcp_time_stamp; 2692 tp->prr_delivered = 0;
2731 } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) { 2693 tp->prr_out = 0;
2732 /* PRR algorithm. */ 2694 if (set_ssthresh)
2733 tp->snd_cwnd = tp->snd_ssthresh; 2695 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2734 tp->snd_cwnd_stamp = tcp_time_stamp; 2696 TCP_ECN_queue_cwr(tp);
2735 } 2697}
2698
2699static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
2700 int fast_rexmit)
2701{
2702 struct tcp_sock *tp = tcp_sk(sk);
2703 int sndcnt = 0;
2704 int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2705
2706 tp->prr_delivered += newly_acked_sacked;
2707 if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
2708 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2709 tp->prior_cwnd - 1;
2710 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2711 } else {
2712 sndcnt = min_t(int, delta,
2713 max_t(int, tp->prr_delivered - tp->prr_out,
2714 newly_acked_sacked) + 1);
2715 }
2716
2717 sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
2718 tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
2719}
2720
2721static inline void tcp_end_cwnd_reduction(struct sock *sk)
2722{
2723 struct tcp_sock *tp = tcp_sk(sk);
2724
2725 /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
2726 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
2727 (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
2728 tp->snd_cwnd = tp->snd_ssthresh;
2729 tp->snd_cwnd_stamp = tcp_time_stamp;
2736 } 2730 }
2737 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 2731 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2738} 2732}
2739 2733
2734/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
2735void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
2736{
2737 struct tcp_sock *tp = tcp_sk(sk);
2738
2739 tp->prior_ssthresh = 0;
2740 tp->bytes_acked = 0;
2741 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2742 tp->undo_marker = 0;
2743 tcp_init_cwnd_reduction(sk, set_ssthresh);
2744 tcp_set_ca_state(sk, TCP_CA_CWR);
2745 }
2746}
2747
2740static void tcp_try_keep_open(struct sock *sk) 2748static void tcp_try_keep_open(struct sock *sk)
2741{ 2749{
2742 struct tcp_sock *tp = tcp_sk(sk); 2750 struct tcp_sock *tp = tcp_sk(sk);
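
With numbers plugged in, the proportional branch of tcp_cwnd_reduction() above shrinks cwnd smoothly toward ssthresh: sndcnt = ceil(ssthresh * prr_delivered / prior_cwnd) - prr_out, then cwnd = in_flight + sndcnt (the max against fast_rexmit is dropped here for brevity). A worked instance with made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int prior_cwnd = 20;	/* cwnd when the reduction started */
	unsigned int ssthresh = 10;	/* target: halve over one RTT */
	unsigned int prr_delivered = 6;	/* newly delivered so far */
	unsigned int prr_out = 2;	/* sent during the reduction so far */
	unsigned int in_flight = 14;	/* > ssthresh: proportional phase */

	/* sndcnt = ceil(ssthresh * prr_delivered / prior_cwnd) - prr_out */
	uint64_t dividend = (uint64_t)ssthresh * prr_delivered + prior_cwnd - 1;
	int sndcnt = (int)(dividend / prior_cwnd) - (int)prr_out;

	if (sndcnt < 0)
		sndcnt = 0;
	printf("cwnd = %u\n", in_flight + sndcnt);	/* 14 + 1 = 15 */
	return 0;
}
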
@@ -2751,7 +2759,7 @@ static void tcp_try_keep_open(struct sock *sk)
2751 } 2759 }
2752} 2760}
2753 2761
2754static void tcp_try_to_open(struct sock *sk, int flag) 2762static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
2755{ 2763{
2756 struct tcp_sock *tp = tcp_sk(sk); 2764 struct tcp_sock *tp = tcp_sk(sk);
2757 2765
@@ -2768,7 +2776,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
2768 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) 2776 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
2769 tcp_moderate_cwnd(tp); 2777 tcp_moderate_cwnd(tp);
2770 } else { 2778 } else {
2771 tcp_cwnd_down(sk, flag); 2779 tcp_cwnd_reduction(sk, newly_acked_sacked, 0);
2772 } 2780 }
2773} 2781}
2774 2782
@@ -2850,38 +2858,6 @@ void tcp_simple_retransmit(struct sock *sk)
2850} 2858}
2851EXPORT_SYMBOL(tcp_simple_retransmit); 2859EXPORT_SYMBOL(tcp_simple_retransmit);
2852 2860
2853/* This function implements the PRR algorithm, specifically the PRR-SSRB
2854 * (proportional rate reduction with slow start reduction bound) as described in
2855 * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
2856 * It computes the number of packets to send (sndcnt) based on packets newly
2857 * delivered:
2858 * 1) If the packets in flight is larger than ssthresh, PRR spreads the
2859 * cwnd reductions across a full RTT.
2860 * 2) If packets in flight is lower than ssthresh (such as due to excess
2861 * losses and/or application stalls), do not perform any further cwnd
2862 * reductions, but instead slow start up to ssthresh.
2863 */
2864static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
2865 int fast_rexmit, int flag)
2866{
2867 struct tcp_sock *tp = tcp_sk(sk);
2868 int sndcnt = 0;
2869 int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2870
2871 if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
2872 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2873 tp->prior_cwnd - 1;
2874 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2875 } else {
2876 sndcnt = min_t(int, delta,
2877 max_t(int, tp->prr_delivered - tp->prr_out,
2878 newly_acked_sacked) + 1);
2879 }
2880
2881 sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
2882 tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
2883}
2884
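For reference, the helper removed here (reintroduced above as tcp_cwnd_reduction()) boils down to a few lines of arithmetic. Below is a minimal user-space model of that computation; struct prr_state and the sample values are hypothetical stand-ins for the tcp_sock fields, not kernel code.

#include <stdio.h>

/* Hypothetical stand-in for the tcp_sock fields PRR reads. */
struct prr_state {
        unsigned int snd_ssthresh;  /* target cwnd after the reduction */
        unsigned int prior_cwnd;    /* cwnd when the reduction started */
        unsigned int prr_delivered; /* packets delivered during recovery */
        unsigned int prr_out;       /* packets sent during recovery */
};

static int prr_sndcnt(const struct prr_state *s, unsigned int in_flight,
                      int newly_acked_sacked, int fast_rexmit)
{
        int sndcnt;

        if (in_flight > s->snd_ssthresh) {
                /* Proportional phase: ceil(ssthresh * delivered / prior_cwnd)
                 * minus what was already sent, spreading the cut over an RTT.
                 */
                unsigned long long dividend =
                        (unsigned long long)s->snd_ssthresh * s->prr_delivered +
                        s->prior_cwnd - 1;

                sndcnt = (int)(dividend / s->prior_cwnd) - (int)s->prr_out;
        } else {
                /* Slow-start reduction bound: climb back toward ssthresh. */
                int delta = (int)(s->snd_ssthresh - in_flight);
                int catchup = (int)(s->prr_delivered - s->prr_out);

                if (catchup < newly_acked_sacked)
                        catchup = newly_acked_sacked;
                sndcnt = catchup + 1;
                if (sndcnt > delta)
                        sndcnt = delta;
        }
        if (sndcnt < (fast_rexmit ? 1 : 0))
                sndcnt = fast_rexmit ? 1 : 0;
        return sndcnt;
}

int main(void)
{
        struct prr_state s = { .snd_ssthresh = 5, .prior_cwnd = 10,
                               .prr_delivered = 4, .prr_out = 1 };

        /* 8 packets in flight, 2 newly delivered by this ACK:
         * ceil(5 * 4 / 10) - 1 = 1, so cwnd steps down to 8 + 1 = 9.
         */
        printf("sndcnt = %d\n", prr_sndcnt(&s, 8, 2, 0));
        return 0;
}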
2885static void tcp_enter_recovery(struct sock *sk, bool ece_ack) 2861static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2886{ 2862{
2887 struct tcp_sock *tp = tcp_sk(sk); 2863 struct tcp_sock *tp = tcp_sk(sk);
@@ -2894,7 +2870,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2894 2870
2895 NET_INC_STATS_BH(sock_net(sk), mib_idx); 2871 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2896 2872
2897 tp->high_seq = tp->snd_nxt;
2898 tp->prior_ssthresh = 0; 2873 tp->prior_ssthresh = 0;
2899 tp->undo_marker = tp->snd_una; 2874 tp->undo_marker = tp->snd_una;
2900 tp->undo_retrans = tp->retrans_out; 2875 tp->undo_retrans = tp->retrans_out;
@@ -2902,15 +2877,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2902 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 2877 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2903 if (!ece_ack) 2878 if (!ece_ack)
2904 tp->prior_ssthresh = tcp_current_ssthresh(sk); 2879 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2905 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); 2880 tcp_init_cwnd_reduction(sk, true);
2906 TCP_ECN_queue_cwr(tp);
2907 } 2881 }
2908
2909 tp->bytes_acked = 0;
2910 tp->snd_cwnd_cnt = 0;
2911 tp->prior_cwnd = tp->snd_cwnd;
2912 tp->prr_delivered = 0;
2913 tp->prr_out = 0;
2914 tcp_set_ca_state(sk, TCP_CA_Recovery); 2882 tcp_set_ca_state(sk, TCP_CA_Recovery);
2915} 2883}
2916 2884
@@ -2970,7 +2938,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2970 /* CWR must be held until something *above* high_seq 2938 /* CWR must be held until something *above* high_seq
2971 * is ACKed for the CWR bit to reach the receiver. */ 2939 * is ACKed for the CWR bit to reach the receiver. */
2972 if (tp->snd_una != tp->high_seq) { 2940 if (tp->snd_una != tp->high_seq) {
2973 tcp_complete_cwr(sk); 2941 tcp_end_cwnd_reduction(sk);
2974 tcp_set_ca_state(sk, TCP_CA_Open); 2942 tcp_set_ca_state(sk, TCP_CA_Open);
2975 } 2943 }
2976 break; 2944 break;
@@ -2980,7 +2948,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2980 tcp_reset_reno_sack(tp); 2948 tcp_reset_reno_sack(tp);
2981 if (tcp_try_undo_recovery(sk)) 2949 if (tcp_try_undo_recovery(sk))
2982 return; 2950 return;
2983 tcp_complete_cwr(sk); 2951 tcp_end_cwnd_reduction(sk);
2984 break; 2952 break;
2985 } 2953 }
2986 } 2954 }
@@ -3021,7 +2989,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3021 tcp_try_undo_dsack(sk); 2989 tcp_try_undo_dsack(sk);
3022 2990
3023 if (!tcp_time_to_recover(sk, flag)) { 2991 if (!tcp_time_to_recover(sk, flag)) {
3024 tcp_try_to_open(sk, flag); 2992 tcp_try_to_open(sk, flag, newly_acked_sacked);
3025 return; 2993 return;
3026 } 2994 }
3027 2995
@@ -3043,8 +3011,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3043 3011
3044 if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) 3012 if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
3045 tcp_update_scoreboard(sk, fast_rexmit); 3013 tcp_update_scoreboard(sk, fast_rexmit);
3046 tp->prr_delivered += newly_acked_sacked; 3014 tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit);
3047 tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
3048 tcp_xmit_retransmit_queue(sk); 3015 tcp_xmit_retransmit_queue(sk);
3049} 3016}
3050 3017
@@ -3123,6 +3090,12 @@ void tcp_rearm_rto(struct sock *sk)
3123{ 3090{
3124 struct tcp_sock *tp = tcp_sk(sk); 3091 struct tcp_sock *tp = tcp_sk(sk);
3125 3092
3093 /* If the retrans timer is currently being used by Fast Open
3094 * for SYN-ACK retransmission purposes, stay put.
3095 */
3096 if (tp->fastopen_rsk)
3097 return;
3098
3126 if (!tp->packets_out) { 3099 if (!tp->packets_out) {
3127 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 3100 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3128 } else { 3101 } else {
@@ -3384,7 +3357,7 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3384{ 3357{
3385 const struct tcp_sock *tp = tcp_sk(sk); 3358 const struct tcp_sock *tp = tcp_sk(sk);
3386 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && 3359 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
3387 !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); 3360 !tcp_in_cwnd_reduction(sk);
3388} 3361}
3389 3362
3390/* Check that window update is acceptable. 3363/* Check that window update is acceptable.
@@ -3452,9 +3425,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
3452} 3425}
3453 3426
3454/* A conservative spurious RTO response algorithm: reduce cwnd using 3427/* A conservative spurious RTO response algorithm: reduce cwnd using
3455 * rate halving and continue in congestion avoidance. 3428 * PRR and continue in congestion avoidance.
3456 */ 3429 */
3457static void tcp_ratehalving_spur_to_response(struct sock *sk) 3430static void tcp_cwr_spur_to_response(struct sock *sk)
3458{ 3431{
3459 tcp_enter_cwr(sk, 0); 3432 tcp_enter_cwr(sk, 0);
3460} 3433}
@@ -3462,7 +3435,7 @@ static void tcp_ratehalving_spur_to_response(struct sock *sk)
3462static void tcp_undo_spur_to_response(struct sock *sk, int flag) 3435static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3463{ 3436{
3464 if (flag & FLAG_ECE) 3437 if (flag & FLAG_ECE)
3465 tcp_ratehalving_spur_to_response(sk); 3438 tcp_cwr_spur_to_response(sk);
3466 else 3439 else
3467 tcp_undo_cwr(sk, true); 3440 tcp_undo_cwr(sk, true);
3468} 3441}
@@ -3569,7 +3542,7 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3569 tcp_conservative_spur_to_response(tp); 3542 tcp_conservative_spur_to_response(tp);
3570 break; 3543 break;
3571 default: 3544 default:
3572 tcp_ratehalving_spur_to_response(sk); 3545 tcp_cwr_spur_to_response(sk);
3573 break; 3546 break;
3574 } 3547 }
3575 tp->frto_counter = 0; 3548 tp->frto_counter = 0;
@@ -4034,7 +4007,7 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
4034} 4007}
4035 4008
4036/* When we get a reset we do this. */ 4009/* When we get a reset we do this. */
4037static void tcp_reset(struct sock *sk) 4010void tcp_reset(struct sock *sk)
4038{ 4011{
4039 /* We want the right error as BSD sees it (and indeed as we do). */ 4012 /* We want the right error as BSD sees it (and indeed as we do). */
4040 switch (sk->sk_state) { 4013 switch (sk->sk_state) {
@@ -5740,7 +5713,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5740 5713
5741 TCP_ECN_rcv_synack(tp, th); 5714 TCP_ECN_rcv_synack(tp, th);
5742 5715
5743 tp->snd_wl1 = TCP_SKB_CB(skb)->seq; 5716 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5744 tcp_ack(sk, skb, FLAG_SLOWPATH); 5717 tcp_ack(sk, skb, FLAG_SLOWPATH);
5745 5718
5746 /* Ok.. it's good. Set up sequence numbers and 5719 /* Ok.. it's good. Set up sequence numbers and
@@ -5753,7 +5726,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5753 * never scaled. 5726 * never scaled.
5754 */ 5727 */
5755 tp->snd_wnd = ntohs(th->window); 5728 tp->snd_wnd = ntohs(th->window);
5756 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5757 5729
5758 if (!tp->rx_opt.wscale_ok) { 5730 if (!tp->rx_opt.wscale_ok) {
5759 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; 5731 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5891,7 +5863,9 @@ discard:
5891 tcp_send_synack(sk); 5863 tcp_send_synack(sk);
5892#if 0 5864#if 0
5893 /* Note, we could accept data and URG from this segment. 5865 /* Note, we could accept data and URG from this segment.
5894 * There are no obstacles to doing this. 5866 * There are no obstacles to doing this (except that we must
5867 * either change tcp_recvmsg() to prevent it from returning data
5868 * before the 3WHS completes per RFC793, or employ TCP Fast Open).
5895 * 5869 *
5896 * However, if we ignore data in ACKless segments sometimes, 5870 * However, if we ignore data in ACKless segments sometimes,
5897 * we have no reasons to accept it sometimes. 5871 * we have no reasons to accept it sometimes.
@@ -5931,6 +5905,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5931{ 5905{
5932 struct tcp_sock *tp = tcp_sk(sk); 5906 struct tcp_sock *tp = tcp_sk(sk);
5933 struct inet_connection_sock *icsk = inet_csk(sk); 5907 struct inet_connection_sock *icsk = inet_csk(sk);
5908 struct request_sock *req;
5934 int queued = 0; 5909 int queued = 0;
5935 5910
5936 tp->rx_opt.saw_tstamp = 0; 5911 tp->rx_opt.saw_tstamp = 0;
@@ -5986,6 +5961,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5986 return 0; 5961 return 0;
5987 } 5962 }
5988 5963
5964 req = tp->fastopen_rsk;
5965 if (req != NULL) {
5966 BUG_ON(sk->sk_state != TCP_SYN_RECV &&
5967 sk->sk_state != TCP_FIN_WAIT1);
5968
5969 if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
5970 goto discard;
5971 }
5989 if (!tcp_validate_incoming(sk, skb, th, 0)) 5972 if (!tcp_validate_incoming(sk, skb, th, 0))
5990 return 0; 5973 return 0;
5991 5974
@@ -5996,7 +5979,25 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5996 switch (sk->sk_state) { 5979 switch (sk->sk_state) {
5997 case TCP_SYN_RECV: 5980 case TCP_SYN_RECV:
5998 if (acceptable) { 5981 if (acceptable) {
5999 tp->copied_seq = tp->rcv_nxt; 5982 /* Once we leave TCP_SYN_RECV, we no longer
5983 * need req so release it.
5984 */
5985 if (req) {
5986 tcp_synack_rtt_meas(sk, req);
5987 tp->total_retrans = req->retrans;
5988
5989 reqsk_fastopen_remove(sk, req, false);
5990 } else {
5991 /* Make sure socket is routed, for
5992 * correct metrics.
5993 */
5994 icsk->icsk_af_ops->rebuild_header(sk);
5995 tcp_init_congestion_control(sk);
5996
5997 tcp_mtup_init(sk);
5998 tcp_init_buffer_space(sk);
5999 tp->copied_seq = tp->rcv_nxt;
6000 }
6000 smp_mb(); 6001 smp_mb();
6001 tcp_set_state(sk, TCP_ESTABLISHED); 6002 tcp_set_state(sk, TCP_ESTABLISHED);
6002 sk->sk_state_change(sk); 6003 sk->sk_state_change(sk);
@@ -6018,23 +6019,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6018 if (tp->rx_opt.tstamp_ok) 6019 if (tp->rx_opt.tstamp_ok)
6019 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 6020 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
6020 6021
6021 /* Make sure socket is routed, for 6022 if (req) {
6022 * correct metrics. 6023 /* Re-arm the timer because data may
6023 */ 6024 * have been sent out. This is similar
6024 icsk->icsk_af_ops->rebuild_header(sk); 6025 * to the regular data transmission case
6025 6026 * when new data has just been ack'ed.
6026 tcp_init_metrics(sk); 6027 *
6027 6028 * (TFO) - we could try to be more
6028 tcp_init_congestion_control(sk); 6029 * aggressive and retransmit any data
6030 * sooner based on when it was sent
6031 * out.
6032 */
6033 tcp_rearm_rto(sk);
6034 } else
6035 tcp_init_metrics(sk);
6029 6036
6030 /* Prevent spurious tcp_cwnd_restart() on 6037 /* Prevent spurious tcp_cwnd_restart() on
6031 * first data packet. 6038 * first data packet.
6032 */ 6039 */
6033 tp->lsndtime = tcp_time_stamp; 6040 tp->lsndtime = tcp_time_stamp;
6034 6041
6035 tcp_mtup_init(sk);
6036 tcp_initialize_rcv_mss(sk); 6042 tcp_initialize_rcv_mss(sk);
6037 tcp_init_buffer_space(sk);
6038 tcp_fast_path_on(tp); 6043 tcp_fast_path_on(tp);
6039 } else { 6044 } else {
6040 return 1; 6045 return 1;
@@ -6042,6 +6047,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6042 break; 6047 break;
6043 6048
6044 case TCP_FIN_WAIT1: 6049 case TCP_FIN_WAIT1:
6050 /* If we enter the TCP_FIN_WAIT1 state and we are a
6051 * Fast Open socket and this is the first acceptable
6052 * ACK we have received, this would have acknowledged
6053 * our SYNACK so stop the SYNACK timer.
6054 */
6055 if (acceptable && req != NULL) {
6056 /* We no longer need the request sock. */
6057 reqsk_fastopen_remove(sk, req, false);
6058 tcp_rearm_rto(sk);
6059 }
6045 if (tp->snd_una == tp->write_seq) { 6060 if (tp->snd_una == tp->write_seq) {
6046 struct dst_entry *dst; 6061 struct dst_entry *dst;
6047 6062
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index be23a0b7b89e..75735c9a6a9d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -352,6 +352,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
352 const int code = icmp_hdr(icmp_skb)->code; 352 const int code = icmp_hdr(icmp_skb)->code;
353 struct sock *sk; 353 struct sock *sk;
354 struct sk_buff *skb; 354 struct sk_buff *skb;
355 struct request_sock *req;
355 __u32 seq; 356 __u32 seq;
356 __u32 remaining; 357 __u32 remaining;
357 int err; 358 int err;
@@ -394,9 +395,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
394 395
395 icsk = inet_csk(sk); 396 icsk = inet_csk(sk);
396 tp = tcp_sk(sk); 397 tp = tcp_sk(sk);
398 req = tp->fastopen_rsk;
397 seq = ntohl(th->seq); 399 seq = ntohl(th->seq);
398 if (sk->sk_state != TCP_LISTEN && 400 if (sk->sk_state != TCP_LISTEN &&
399 !between(seq, tp->snd_una, tp->snd_nxt)) { 401 !between(seq, tp->snd_una, tp->snd_nxt) &&
402 (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
403 /* For a Fast Open socket, allow seq to be snt_isn. */
400 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); 404 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
401 goto out; 405 goto out;
402 } 406 }
@@ -435,6 +439,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
435 !icsk->icsk_backoff) 439 !icsk->icsk_backoff)
436 break; 440 break;
437 441
442 /* XXX (TFO) - revisit the following logic for TFO */
443
438 if (sock_owned_by_user(sk)) 444 if (sock_owned_by_user(sk))
439 break; 445 break;
440 446
@@ -466,6 +472,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
466 goto out; 472 goto out;
467 } 473 }
468 474
475 /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
476 * than following the TCP_SYN_RECV case and closing the socket,
477 * we ignore the ICMP error and keep trying like a fully established
478 * socket. Is this the right thing to do?
479 */
480 if (req && req->sk == NULL)
481 goto out;
482
469 switch (sk->sk_state) { 483 switch (sk->sk_state) {
470 struct request_sock *req, **prev; 484 struct request_sock *req, **prev;
471 case TCP_LISTEN: 485 case TCP_LISTEN:
@@ -498,7 +512,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
498 512
499 case TCP_SYN_SENT: 513 case TCP_SYN_SENT:
500 case TCP_SYN_RECV: /* Cannot happen. 514 case TCP_SYN_RECV: /* Cannot happen.
501 It can f.e. if SYNs crossed. 515 It can f.e. if SYNs crossed,
516 or Fast Open.
502 */ 517 */
503 if (!sock_owned_by_user(sk)) { 518 if (!sock_owned_by_user(sk)) {
504 sk->sk_err = err; 519 sk->sk_err = err;
@@ -809,8 +824,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
809static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, 824static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
810 struct request_sock *req) 825 struct request_sock *req)
811{ 826{
812 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, 827 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
813 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, 828 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
829 */
830 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
831 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
832 tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
814 req->ts_recent, 833 req->ts_recent,
815 0, 834 0,
816 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, 835 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -839,7 +858,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
839 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 858 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
840 return -1; 859 return -1;
841 860
842 skb = tcp_make_synack(sk, dst, req, rvp); 861 skb = tcp_make_synack(sk, dst, req, rvp, NULL);
843 862
844 if (skb) { 863 if (skb) {
845 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); 864 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -849,6 +868,8 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
849 ireq->rmt_addr, 868 ireq->rmt_addr,
850 ireq->opt); 869 ireq->opt);
851 err = net_xmit_eval(err); 870 err = net_xmit_eval(err);
871 if (!tcp_rsk(req)->snt_synack && !err)
872 tcp_rsk(req)->snt_synack = tcp_time_stamp;
852 } 873 }
853 874
854 return err; 875 return err;
@@ -904,8 +925,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
904/* 925/*
905 * Save and compile IPv4 options into the request_sock if needed. 926 * Save and compile IPv4 options into the request_sock if needed.
906 */ 927 */
907static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk, 928static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
908 struct sk_buff *skb)
909{ 929{
910 const struct ip_options *opt = &(IPCB(skb)->opt); 930 const struct ip_options *opt = &(IPCB(skb)->opt);
911 struct ip_options_rcu *dopt = NULL; 931 struct ip_options_rcu *dopt = NULL;
@@ -1272,6 +1292,182 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1272}; 1292};
1273#endif 1293#endif
1274 1294
1295static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1296 struct request_sock *req,
1297 struct tcp_fastopen_cookie *foc,
1298 struct tcp_fastopen_cookie *valid_foc)
1299{
1300 bool skip_cookie = false;
1301 struct fastopen_queue *fastopenq;
1302
1303 if (likely(!fastopen_cookie_present(foc))) {
1304 /* See include/net/tcp.h for the meaning of these knobs */
1305 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1306 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1307 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1308 skip_cookie = true; /* no cookie to validate */
1309 else
1310 return false;
1311 }
1312 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1313 /* A FO option is present; bump the counter. */
1314 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1315
1316 /* Make sure the listener has enabled fastopen, and we don't
1317 * exceed the max # of pending TFO requests allowed before trying
1318 * to validating the cookie in order to avoid burning CPU cycles
1319 * unnecessarily.
1320 *
1321 * XXX (TFO) - The implication of checking the max_qlen before
1322 * processing a cookie request is that clients can't differentiate
1323 * between qlen overflow causing Fast Open to be disabled
1324 * temporarily vs a server not supporting Fast Open at all.
1325 */
1326 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1327 fastopenq == NULL || fastopenq->max_qlen == 0)
1328 return false;
1329
1330 if (fastopenq->qlen >= fastopenq->max_qlen) {
1331 struct request_sock *req1;
1332 spin_lock(&fastopenq->lock);
1333 req1 = fastopenq->rskq_rst_head;
1334 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1335 spin_unlock(&fastopenq->lock);
1336 NET_INC_STATS_BH(sock_net(sk),
1337 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1338 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL */
1339 foc->len = -1;
1340 return false;
1341 }
1342 fastopenq->rskq_rst_head = req1->dl_next;
1343 fastopenq->qlen--;
1344 spin_unlock(&fastopenq->lock);
1345 reqsk_free(req1);
1346 }
1347 if (skip_cookie) {
1348 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1349 return true;
1350 }
1351 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1352 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1353 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1354 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1355 memcmp(&foc->val[0], &valid_foc->val[0],
1356 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1357 return false;
1358 valid_foc->len = -1;
1359 }
1360 /* Acknowledge the data received from the peer. */
1361 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1362 return true;
1363 } else if (foc->len == 0) { /* Client requesting a cookie */
1364 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1365 NET_INC_STATS_BH(sock_net(sk),
1366 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1367 } else {
1368 /* Client sent a cookie with wrong size. Treat it
1369 * the same as invalid and return a valid one.
1370 */
1371 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1372 }
1373 return false;
1374}
1375
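Note the gating above: unless the listener has a fastopenq with a nonzero max_qlen (and the server bit is set in the tcp_fastopen sysctl), every cookie request is refused before any cookie crypto is done. A plausible user-space counterpart is sketched below; tfo_listener() is a hypothetical helper and TCP_FASTOPEN is the socket option added by this series.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef TCP_FASTOPEN
#define TCP_FASTOPEN 23 /* socket option added by this series */
#endif

/* Hypothetical helper: create a TFO-enabled IPv4 listener on @port. */
static int tfo_listener(uint16_t port)
{
        struct sockaddr_in a;
        int qlen = 16; /* becomes fastopenq->max_qlen on the listener */
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        memset(&a, 0, sizeof(a));
        a.sin_family = AF_INET;
        a.sin_addr.s_addr = htonl(INADDR_ANY);
        a.sin_port = htons(port);
        if (bind(fd, (struct sockaddr *)&a, sizeof(a)) < 0 ||
            /* Without this, tcp_fastopen_check() fails its max_qlen test. */
            setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0 ||
            listen(fd, 128) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}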
1376static int tcp_v4_conn_req_fastopen(struct sock *sk,
1377 struct sk_buff *skb,
1378 struct sk_buff *skb_synack,
1379 struct request_sock *req,
1380 struct request_values *rvp)
1381{
1382 struct tcp_sock *tp = tcp_sk(sk);
1383 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1384 const struct inet_request_sock *ireq = inet_rsk(req);
1385 struct sock *child;
1386 int err;
1387
1388 req->retrans = 0;
1389 req->sk = NULL;
1390
1391 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1392 if (child == NULL) {
1393 NET_INC_STATS_BH(sock_net(sk),
1394 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1395 kfree_skb(skb_synack);
1396 return -1;
1397 }
1398 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1399 ireq->rmt_addr, ireq->opt);
1400 err = net_xmit_eval(err);
1401 if (!err)
1402 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1403 /* XXX (TFO) - is it ok to ignore error and continue? */
1404
1405 spin_lock(&queue->fastopenq->lock);
1406 queue->fastopenq->qlen++;
1407 spin_unlock(&queue->fastopenq->lock);
1408
1409 /* Initialize the child socket. Have to fix some values to take
1410 * into account that the child is a Fast Open socket and is created
1411 * only out of the bits carried in the SYN packet.
1412 */
1413 tp = tcp_sk(child);
1414
1415 tp->fastopen_rsk = req;
1416 /* Do a hold on the listener sk so that if the listener is being
1417 * closed, the child that has been accepted can live on and still
1418 * access listen_lock.
1419 */
1420 sock_hold(sk);
1421 tcp_rsk(req)->listener = sk;
1422
1423 /* RFC1323: The window in SYN & SYN/ACK segments is never
1424 * scaled. So correct it appropriately.
1425 */
1426 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1427
1428 /* Activate the retrans timer so that SYNACK can be retransmitted.
1429 * The request socket is not added to the SYN table of the parent
1430 * because it's been added to the accept queue directly.
1431 */
1432 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1433 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1434
1435 /* Add the child socket directly into the accept queue */
1436 inet_csk_reqsk_queue_add(sk, req, child);
1437
1438 /* Now finish processing the fastopen child socket. */
1439 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1440 tcp_init_congestion_control(child);
1441 tcp_mtup_init(child);
1442 tcp_init_buffer_space(child);
1443 tcp_init_metrics(child);
1444
1445 /* Queue the data carried in the SYN packet. We need to first
1446 * bump skb's refcnt because the caller will attempt to free it.
1447 *
1448 * XXX (TFO) - we honor a zero-payload TFO request for now.
1449 * (Any reason not to?)
1450 */
1451 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1452 /* Don't queue the skb if there is no payload in SYN.
1453 * XXX (TFO) - How about SYN+FIN?
1454 */
1455 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1456 } else {
1457 skb = skb_get(skb);
1458 skb_dst_drop(skb);
1459 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1460 skb_set_owner_r(skb, child);
1461 __skb_queue_tail(&child->sk_receive_queue, skb);
1462 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1463 }
1464 sk->sk_data_ready(sk, 0);
1465 bh_unlock_sock(child);
1466 sock_put(child);
1467 WARN_ON(req->sk == NULL);
1468 return 0;
1469}
1470
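The payload queued onto child->sk_receive_queue above is whatever the client carried in its SYN. For illustration, a client-side sketch using the MSG_FASTOPEN sendto() flag (the client half of TFO, not part of this patch; tfo_send() is a hypothetical helper):

#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000 /* send data in the SYN */
#endif

/* connect() is implied: sendto() with MSG_FASTOPEN performs the
 * handshake and carries @buf in the SYN (plus a cookie, if cached).
 */
static ssize_t tfo_send(int fd, const void *buf, size_t len,
                        const struct sockaddr_in *dst)
{
        return sendto(fd, buf, len, MSG_FASTOPEN,
                      (const struct sockaddr *)dst, sizeof(*dst));
}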
1275int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1471int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1276{ 1472{
1277 struct tcp_extend_values tmp_ext; 1473 struct tcp_extend_values tmp_ext;
@@ -1285,6 +1481,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1285 __be32 daddr = ip_hdr(skb)->daddr; 1481 __be32 daddr = ip_hdr(skb)->daddr;
1286 __u32 isn = TCP_SKB_CB(skb)->when; 1482 __u32 isn = TCP_SKB_CB(skb)->when;
1287 bool want_cookie = false; 1483 bool want_cookie = false;
1484 struct flowi4 fl4;
1485 struct tcp_fastopen_cookie foc = { .len = -1 };
1486 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1487 struct sk_buff *skb_synack;
1488 int do_fastopen;
1288 1489
1289 /* Never answer to SYNs send to broadcast or multicast */ 1490 /* Never answer to SYNs send to broadcast or multicast */
1290 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1491 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1319,7 +1520,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1319 tcp_clear_options(&tmp_opt); 1520 tcp_clear_options(&tmp_opt);
1320 tmp_opt.mss_clamp = TCP_MSS_DEFAULT; 1521 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1321 tmp_opt.user_mss = tp->rx_opt.user_mss; 1522 tmp_opt.user_mss = tp->rx_opt.user_mss;
1322 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 1523 tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1524 want_cookie ? NULL : &foc);
1323 1525
1324 if (tmp_opt.cookie_plus > 0 && 1526 if (tmp_opt.cookie_plus > 0 &&
1325 tmp_opt.saw_tstamp && 1527 tmp_opt.saw_tstamp &&
@@ -1365,7 +1567,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1365 ireq->loc_addr = daddr; 1567 ireq->loc_addr = daddr;
1366 ireq->rmt_addr = saddr; 1568 ireq->rmt_addr = saddr;
1367 ireq->no_srccheck = inet_sk(sk)->transparent; 1569 ireq->no_srccheck = inet_sk(sk)->transparent;
1368 ireq->opt = tcp_v4_save_options(sk, skb); 1570 ireq->opt = tcp_v4_save_options(skb);
1369 1571
1370 if (security_inet_conn_request(sk, skb, req)) 1572 if (security_inet_conn_request(sk, skb, req))
1371 goto drop_and_free; 1573 goto drop_and_free;
@@ -1377,8 +1579,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1377 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1579 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1378 req->cookie_ts = tmp_opt.tstamp_ok; 1580 req->cookie_ts = tmp_opt.tstamp_ok;
1379 } else if (!isn) { 1581 } else if (!isn) {
1380 struct flowi4 fl4;
1381
1382 /* VJ's idea. We save last timestamp seen 1582 /* VJ's idea. We save last timestamp seen
1383 * from the destination in peer table, when entering 1583 * from the destination in peer table, when entering
1384 * state TIME-WAIT, and check against it before 1584 * state TIME-WAIT, and check against it before
@@ -1417,16 +1617,54 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1417 isn = tcp_v4_init_sequence(skb); 1617 isn = tcp_v4_init_sequence(skb);
1418 } 1618 }
1419 tcp_rsk(req)->snt_isn = isn; 1619 tcp_rsk(req)->snt_isn = isn;
1420 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1421 1620
1422 if (tcp_v4_send_synack(sk, dst, req, 1621 if (dst == NULL) {
1423 (struct request_values *)&tmp_ext, 1622 dst = inet_csk_route_req(sk, &fl4, req);
1424 skb_get_queue_mapping(skb), 1623 if (dst == NULL)
1425 want_cookie) || 1624 goto drop_and_free;
1426 want_cookie) 1625 }
1626 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1627
1628 /* We don't call tcp_v4_send_synack() directly because we need
1629 * to make sure a child socket can be created successfully before
1630 * sending back synack!
1631 *
1632 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1633 * (or better yet, call tcp_send_synack() in the child context
1634 * directly, but will have to fix a bunch of other code first)
1635 * after syn_recv_sock() except one will need to first fix the
1636 * latter to remove its dependency on the current implementation
1637 * of tcp_v4_send_synack()->tcp_select_initial_window().
1638 */
1639 skb_synack = tcp_make_synack(sk, dst, req,
1640 (struct request_values *)&tmp_ext,
1641 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1642
1643 if (skb_synack) {
1644 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1645 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1646 } else
1647 goto drop_and_free;
1648
1649 if (likely(!do_fastopen)) {
1650 int err;
1651 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1652 ireq->rmt_addr, ireq->opt);
1653 err = net_xmit_eval(err);
1654 if (err || want_cookie)
1655 goto drop_and_free;
1656
1657 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1658 tcp_rsk(req)->listener = NULL;
1659 /* Add the request_sock to the SYN table */
1660 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1661 if (fastopen_cookie_present(&foc) && foc.len != 0)
1662 NET_INC_STATS_BH(sock_net(sk),
1663 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1664 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1665 (struct request_values *)&tmp_ext))
1427 goto drop_and_free; 1666 goto drop_and_free;
1428 1667
1429 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1430 return 0; 1668 return 0;
1431 1669
1432drop_and_release: 1670drop_and_release:
@@ -1500,9 +1738,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1500 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1738 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1501 1739
1502 tcp_initialize_rcv_mss(newsk); 1740 tcp_initialize_rcv_mss(newsk);
1503 if (tcp_rsk(req)->snt_synack) 1741 tcp_synack_rtt_meas(newsk, req);
1504 tcp_valid_rtt_meas(newsk,
1505 tcp_time_stamp - tcp_rsk(req)->snt_synack);
1506 newtp->total_retrans = req->retrans; 1742 newtp->total_retrans = req->retrans;
1507 1743
1508#ifdef CONFIG_TCP_MD5SIG 1744#ifdef CONFIG_TCP_MD5SIG
@@ -1554,7 +1790,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1554 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, 1790 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1555 iph->saddr, iph->daddr); 1791 iph->saddr, iph->daddr);
1556 if (req) 1792 if (req)
1557 return tcp_check_req(sk, skb, req, prev); 1793 return tcp_check_req(sk, skb, req, prev, false);
1558 1794
1559 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, 1795 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1560 th->source, iph->daddr, th->dest, inet_iif(skb)); 1796 th->source, iph->daddr, th->dest, inet_iif(skb));
@@ -1963,20 +2199,13 @@ void tcp_v4_destroy_sock(struct sock *sk)
1963 if (inet_csk(sk)->icsk_bind_hash) 2199 if (inet_csk(sk)->icsk_bind_hash)
1964 inet_put_port(sk); 2200 inet_put_port(sk);
1965 2201
1966 /*
1967 * If sendmsg cached page exists, toss it.
1968 */
1969 if (sk->sk_sndmsg_page) {
1970 __free_page(sk->sk_sndmsg_page);
1971 sk->sk_sndmsg_page = NULL;
1972 }
1973
1974 /* TCP Cookie Transactions */ 2202 /* TCP Cookie Transactions */
1975 if (tp->cookie_values != NULL) { 2203 if (tp->cookie_values != NULL) {
1976 kref_put(&tp->cookie_values->kref, 2204 kref_put(&tp->cookie_values->kref,
1977 tcp_cookie_values_release); 2205 tcp_cookie_values_release);
1978 tp->cookie_values = NULL; 2206 tp->cookie_values = NULL;
1979 } 2207 }
2208 BUG_ON(tp->fastopen_rsk != NULL);
1980 2209
1981 /* If socket is aborted during connect operation */ 2210 /* If socket is aborted during connect operation */
1982 tcp_free_fastopen_req(tp); 2211 tcp_free_fastopen_req(tp);
@@ -2396,7 +2625,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2396 struct seq_file *f, int i, kuid_t uid, int *len) 2625 struct seq_file *f, int i, kuid_t uid, int *len)
2397{ 2626{
2398 const struct inet_request_sock *ireq = inet_rsk(req); 2627 const struct inet_request_sock *ireq = inet_rsk(req);
2399 int ttd = req->expires - jiffies; 2628 long delta = req->expires - jiffies;
2400 2629
2401 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2630 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2402 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", 2631 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
@@ -2408,7 +2637,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2408 TCP_SYN_RECV, 2637 TCP_SYN_RECV,
2409 0, 0, /* could print option size, but that is af dependent. */ 2638 0, 0, /* could print option size, but that is af dependent. */
2410 1, /* timers active (only the expire timer) */ 2639 1, /* timers active (only the expire timer) */
2411 jiffies_to_clock_t(ttd), 2640 jiffies_delta_to_clock_t(delta),
2412 req->retrans, 2641 req->retrans,
2413 from_kuid_munged(seq_user_ns(f), uid), 2642 from_kuid_munged(seq_user_ns(f), uid),
2414 0, /* non standard timer */ 2643 0, /* non standard timer */
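The switch from int ttd to long delta matters because req->expires - jiffies goes negative once the request has expired, and jiffies_delta_to_clock_t() clamps that case instead of emitting a bogus huge value. A rough user-space model of the behaviour (the HZ and USER_HZ values are illustrative; the real helper does a proper jiffies-to-clock_t conversion):

#include <stdio.h>

#define HZ      1000 /* hypothetical kernel tick rate */
#define USER_HZ 100  /* clock_t rate exposed to user space */

/* Model of jiffies_delta_to_clock_t(): clamp expired (negative)
 * deltas to zero, then scale kernel ticks to USER_HZ ticks.
 */
static long delta_to_clock_t(long delta)
{
        if (delta < 0)
                delta = 0;
        return delta / (HZ / USER_HZ);
}

int main(void)
{
        printf("%ld\n", delta_to_clock_t(-42)); /* expired: prints 0 */
        printf("%ld\n", delta_to_clock_t(500)); /* 0.5 s -> 50 ticks */
        return 0;
}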
@@ -2425,6 +2654,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2425 const struct tcp_sock *tp = tcp_sk(sk); 2654 const struct tcp_sock *tp = tcp_sk(sk);
2426 const struct inet_connection_sock *icsk = inet_csk(sk); 2655 const struct inet_connection_sock *icsk = inet_csk(sk);
2427 const struct inet_sock *inet = inet_sk(sk); 2656 const struct inet_sock *inet = inet_sk(sk);
2657 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2428 __be32 dest = inet->inet_daddr; 2658 __be32 dest = inet->inet_daddr;
2429 __be32 src = inet->inet_rcv_saddr; 2659 __be32 src = inet->inet_rcv_saddr;
2430 __u16 destp = ntohs(inet->inet_dport); 2660 __u16 destp = ntohs(inet->inet_dport);
@@ -2459,7 +2689,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2459 tp->write_seq - tp->snd_una, 2689 tp->write_seq - tp->snd_una,
2460 rx_queue, 2690 rx_queue,
2461 timer_active, 2691 timer_active,
2462 jiffies_to_clock_t(timer_expires - jiffies), 2692 jiffies_delta_to_clock_t(timer_expires - jiffies),
2463 icsk->icsk_retransmits, 2693 icsk->icsk_retransmits,
2464 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2694 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2465 icsk->icsk_probes_out, 2695 icsk->icsk_probes_out,
@@ -2469,7 +2699,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2469 jiffies_to_clock_t(icsk->icsk_ack.ato), 2699 jiffies_to_clock_t(icsk->icsk_ack.ato),
2470 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2700 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2471 tp->snd_cwnd, 2701 tp->snd_cwnd,
2472 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, 2702 sk->sk_state == TCP_LISTEN ?
2703 (fastopenq ? fastopenq->max_qlen : 0) :
2704 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2473 len); 2705 len);
2474} 2706}
2475 2707
@@ -2478,10 +2710,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2478{ 2710{
2479 __be32 dest, src; 2711 __be32 dest, src;
2480 __u16 destp, srcp; 2712 __u16 destp, srcp;
2481 int ttd = tw->tw_ttd - jiffies; 2713 long delta = tw->tw_ttd - jiffies;
2482
2483 if (ttd < 0)
2484 ttd = 0;
2485 2714
2486 dest = tw->tw_daddr; 2715 dest = tw->tw_daddr;
2487 src = tw->tw_rcv_saddr; 2716 src = tw->tw_rcv_saddr;
@@ -2491,7 +2720,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2491 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2720 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2492 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", 2721 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2493 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2722 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2494 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, 2723 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2495 atomic_read(&tw->tw_refcnt), tw, len); 2724 atomic_read(&tw->tw_refcnt), tw, len);
2496} 2725}
2497 2726
@@ -2574,6 +2803,8 @@ void tcp4_proc_exit(void)
2574struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) 2803struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2575{ 2804{
2576 const struct iphdr *iph = skb_gro_network_header(skb); 2805 const struct iphdr *iph = skb_gro_network_header(skb);
2806 __wsum wsum;
2807 __sum16 sum;
2577 2808
2578 switch (skb->ip_summed) { 2809 switch (skb->ip_summed) {
2579 case CHECKSUM_COMPLETE: 2810 case CHECKSUM_COMPLETE:
@@ -2582,11 +2813,22 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2582 skb->ip_summed = CHECKSUM_UNNECESSARY; 2813 skb->ip_summed = CHECKSUM_UNNECESSARY;
2583 break; 2814 break;
2584 } 2815 }
2585 2816flush:
2586 /* fall through */
2587 case CHECKSUM_NONE:
2588 NAPI_GRO_CB(skb)->flush = 1; 2817 NAPI_GRO_CB(skb)->flush = 1;
2589 return NULL; 2818 return NULL;
2819
2820 case CHECKSUM_NONE:
2821 wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2822 skb_gro_len(skb), IPPROTO_TCP, 0);
2823 sum = csum_fold(skb_checksum(skb,
2824 skb_gro_offset(skb),
2825 skb_gro_len(skb),
2826 wsum));
2827 if (sum)
2828 goto flush;
2829
2830 skb->ip_summed = CHECKSUM_UNNECESSARY;
2831 break;
2590 } 2832 }
2591 2833
2592 return tcp_gro_receive(head, skb); 2834 return tcp_gro_receive(head, skb);
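In the CHECKSUM_NONE branch above, the stack now verifies the TCP checksum in software before letting GRO aggregate the segment: csum_tcpudp_nofold() seeds a pseudo-header sum, skb_checksum() accumulates the segment, and csum_fold() reduces the result. The fold is plain ones'-complement arithmetic; a user-space sketch (a model, not the kernel implementation):

#include <stdint.h>

/* Fold a 32-bit ones'-complement accumulator to 16 bits and invert,
 * as csum_fold() does; a result of 0 means the checksum verified.
 */
static uint16_t csum_fold32(uint32_t sum)
{
        sum = (sum & 0xffff) + (sum >> 16); /* fold the high half */
        sum = (sum & 0xffff) + (sum >> 16); /* absorb any new carry */
        return (uint16_t)~sum;
}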
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 0abe67bb4d3a..4c752a6e0bcd 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -8,6 +8,7 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/tcp.h> 9#include <linux/tcp.h>
10#include <linux/hash.h> 10#include <linux/hash.h>
11#include <linux/tcp_metrics.h>
11 12
12#include <net/inet_connection_sock.h> 13#include <net/inet_connection_sock.h>
13#include <net/net_namespace.h> 14#include <net/net_namespace.h>
@@ -17,20 +18,10 @@
17#include <net/ipv6.h> 18#include <net/ipv6.h>
18#include <net/dst.h> 19#include <net/dst.h>
19#include <net/tcp.h> 20#include <net/tcp.h>
21#include <net/genetlink.h>
20 22
21int sysctl_tcp_nometrics_save __read_mostly; 23int sysctl_tcp_nometrics_save __read_mostly;
22 24
23enum tcp_metric_index {
24 TCP_METRIC_RTT,
25 TCP_METRIC_RTTVAR,
26 TCP_METRIC_SSTHRESH,
27 TCP_METRIC_CWND,
28 TCP_METRIC_REORDERING,
29
30 /* Always last. */
31 TCP_METRIC_MAX,
32};
33
34struct tcp_fastopen_metrics { 25struct tcp_fastopen_metrics {
35 u16 mss; 26 u16 mss;
36 u16 syn_loss:10; /* Recurring Fast Open SYN losses */ 27 u16 syn_loss:10; /* Recurring Fast Open SYN losses */
@@ -45,8 +36,10 @@ struct tcp_metrics_block {
45 u32 tcpm_ts; 36 u32 tcpm_ts;
46 u32 tcpm_ts_stamp; 37 u32 tcpm_ts_stamp;
47 u32 tcpm_lock; 38 u32 tcpm_lock;
48 u32 tcpm_vals[TCP_METRIC_MAX]; 39 u32 tcpm_vals[TCP_METRIC_MAX + 1];
49 struct tcp_fastopen_metrics tcpm_fastopen; 40 struct tcp_fastopen_metrics tcpm_fastopen;
41
42 struct rcu_head rcu_head;
50}; 43};
51 44
52static bool tcp_metric_locked(struct tcp_metrics_block *tm, 45static bool tcp_metric_locked(struct tcp_metrics_block *tm,
@@ -690,6 +683,325 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
690 rcu_read_unlock(); 683 rcu_read_unlock();
691} 684}
692 685
686static struct genl_family tcp_metrics_nl_family = {
687 .id = GENL_ID_GENERATE,
688 .hdrsize = 0,
689 .name = TCP_METRICS_GENL_NAME,
690 .version = TCP_METRICS_GENL_VERSION,
691 .maxattr = TCP_METRICS_ATTR_MAX,
692 .netnsok = true,
693};
694
695static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
696 [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
697 [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY,
698 .len = sizeof(struct in6_addr), },
699 /* Following attributes are not received for GET/DEL,
700 * we keep them for reference
701 */
702#if 0
703 [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, },
704 [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, },
705 [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, },
706 [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, },
707 [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, },
708 [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, },
709 [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, },
710 [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
711 .len = TCP_FASTOPEN_COOKIE_MAX, },
712#endif
713};
714
715/* Add attributes, caller cancels its header on failure */
716static int tcp_metrics_fill_info(struct sk_buff *msg,
717 struct tcp_metrics_block *tm)
718{
719 struct nlattr *nest;
720 int i;
721
722 switch (tm->tcpm_addr.family) {
723 case AF_INET:
724 if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
725 tm->tcpm_addr.addr.a4) < 0)
726 goto nla_put_failure;
727 break;
728 case AF_INET6:
729 if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
730 tm->tcpm_addr.addr.a6) < 0)
731 goto nla_put_failure;
732 break;
733 default:
734 return -EAFNOSUPPORT;
735 }
736
737 if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
738 jiffies - tm->tcpm_stamp) < 0)
739 goto nla_put_failure;
740 if (tm->tcpm_ts_stamp) {
741 if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
742 (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
743 goto nla_put_failure;
744 if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
745 tm->tcpm_ts) < 0)
746 goto nla_put_failure;
747 }
748
749 {
750 int n = 0;
751
752 nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
753 if (!nest)
754 goto nla_put_failure;
755 for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
756 if (!tm->tcpm_vals[i])
757 continue;
758 if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
759 goto nla_put_failure;
760 n++;
761 }
762 if (n)
763 nla_nest_end(msg, nest);
764 else
765 nla_nest_cancel(msg, nest);
766 }
767
768 {
769 struct tcp_fastopen_metrics tfom_copy[1], *tfom;
770 unsigned int seq;
771
772 do {
773 seq = read_seqbegin(&fastopen_seqlock);
774 tfom_copy[0] = tm->tcpm_fastopen;
775 } while (read_seqretry(&fastopen_seqlock, seq));
776
777 tfom = tfom_copy;
778 if (tfom->mss &&
779 nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
780 tfom->mss) < 0)
781 goto nla_put_failure;
782 if (tfom->syn_loss &&
783 (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
784 tfom->syn_loss) < 0 ||
785 nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
786 jiffies - tfom->last_syn_loss) < 0))
787 goto nla_put_failure;
788 if (tfom->cookie.len > 0 &&
789 nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
790 tfom->cookie.len, tfom->cookie.val) < 0)
791 goto nla_put_failure;
792 }
793
794 return 0;
795
796nla_put_failure:
797 return -EMSGSIZE;
798}
799
800static int tcp_metrics_dump_info(struct sk_buff *skb,
801 struct netlink_callback *cb,
802 struct tcp_metrics_block *tm)
803{
804 void *hdr;
805
806 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
807 &tcp_metrics_nl_family, NLM_F_MULTI,
808 TCP_METRICS_CMD_GET);
809 if (!hdr)
810 return -EMSGSIZE;
811
812 if (tcp_metrics_fill_info(skb, tm) < 0)
813 goto nla_put_failure;
814
815 return genlmsg_end(skb, hdr);
816
817nla_put_failure:
818 genlmsg_cancel(skb, hdr);
819 return -EMSGSIZE;
820}
821
822static int tcp_metrics_nl_dump(struct sk_buff *skb,
823 struct netlink_callback *cb)
824{
825 struct net *net = sock_net(skb->sk);
826 unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
827 unsigned int row, s_row = cb->args[0];
828 int s_col = cb->args[1], col = s_col;
829
830 for (row = s_row; row < max_rows; row++, s_col = 0) {
831 struct tcp_metrics_block *tm;
832 struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;
833
834 rcu_read_lock();
835 for (col = 0, tm = rcu_dereference(hb->chain); tm;
836 tm = rcu_dereference(tm->tcpm_next), col++) {
837 if (col < s_col)
838 continue;
839 if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
840 rcu_read_unlock();
841 goto done;
842 }
843 }
844 rcu_read_unlock();
845 }
846
847done:
848 cb->args[0] = row;
849 cb->args[1] = col;
850 return skb->len;
851}
852
853static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
854 unsigned int *hash, int optional)
855{
856 struct nlattr *a;
857
858 a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4];
859 if (a) {
860 addr->family = AF_INET;
861 addr->addr.a4 = nla_get_be32(a);
862 *hash = (__force unsigned int) addr->addr.a4;
863 return 0;
864 }
865 a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6];
866 if (a) {
867 if (nla_len(a) != sizeof(struct in6_addr))
868 return -EINVAL;
869 addr->family = AF_INET6;
870 memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
871 *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
872 return 0;
873 }
874 return optional ? 1 : -EAFNOSUPPORT;
875}
876
877static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
878{
879 struct tcp_metrics_block *tm;
880 struct inetpeer_addr addr;
881 unsigned int hash;
882 struct sk_buff *msg;
883 struct net *net = genl_info_net(info);
884 void *reply;
885 int ret;
886
887 ret = parse_nl_addr(info, &addr, &hash, 0);
888 if (ret < 0)
889 return ret;
890
891 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
892 if (!msg)
893 return -ENOMEM;
894
895 reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
896 info->genlhdr->cmd);
897 if (!reply)
898 goto nla_put_failure;
899
900 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
901 ret = -ESRCH;
902 rcu_read_lock();
903 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
904 tm = rcu_dereference(tm->tcpm_next)) {
905 if (addr_same(&tm->tcpm_addr, &addr)) {
906 ret = tcp_metrics_fill_info(msg, tm);
907 break;
908 }
909 }
910 rcu_read_unlock();
911 if (ret < 0)
912 goto out_free;
913
914 genlmsg_end(msg, reply);
915 return genlmsg_reply(msg, info);
916
917nla_put_failure:
918 ret = -EMSGSIZE;
919
920out_free:
921 nlmsg_free(msg);
922 return ret;
923}
924
925#define deref_locked_genl(p) \
926 rcu_dereference_protected(p, lockdep_genl_is_held() && \
927 lockdep_is_held(&tcp_metrics_lock))
928
929#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held())
930
931static int tcp_metrics_flush_all(struct net *net)
932{
933 unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
934 struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
935 struct tcp_metrics_block *tm;
936 unsigned int row;
937
938 for (row = 0; row < max_rows; row++, hb++) {
939 spin_lock_bh(&tcp_metrics_lock);
940 tm = deref_locked_genl(hb->chain);
941 if (tm)
942 hb->chain = NULL;
943 spin_unlock_bh(&tcp_metrics_lock);
944 while (tm) {
945 struct tcp_metrics_block *next;
946
947 next = deref_genl(tm->tcpm_next);
948 kfree_rcu(tm, rcu_head);
949 tm = next;
950 }
951 }
952 return 0;
953}
954
955static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
956{
957 struct tcpm_hash_bucket *hb;
958 struct tcp_metrics_block *tm;
959 struct tcp_metrics_block __rcu **pp;
960 struct inetpeer_addr addr;
961 unsigned int hash;
962 struct net *net = genl_info_net(info);
963 int ret;
964
965 ret = parse_nl_addr(info, &addr, &hash, 1);
966 if (ret < 0)
967 return ret;
968 if (ret > 0)
969 return tcp_metrics_flush_all(net);
970
971 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
972 hb = net->ipv4.tcp_metrics_hash + hash;
973 pp = &hb->chain;
974 spin_lock_bh(&tcp_metrics_lock);
975 for (tm = deref_locked_genl(*pp); tm;
976 pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) {
977 if (addr_same(&tm->tcpm_addr, &addr)) {
978 *pp = tm->tcpm_next;
979 break;
980 }
981 }
982 spin_unlock_bh(&tcp_metrics_lock);
983 if (!tm)
984 return -ESRCH;
985 kfree_rcu(tm, rcu_head);
986 return 0;
987}
988
989static struct genl_ops tcp_metrics_nl_ops[] = {
990 {
991 .cmd = TCP_METRICS_CMD_GET,
992 .doit = tcp_metrics_nl_cmd_get,
993 .dumpit = tcp_metrics_nl_dump,
994 .policy = tcp_metrics_nl_policy,
995 .flags = GENL_ADMIN_PERM,
996 },
997 {
998 .cmd = TCP_METRICS_CMD_DEL,
999 .doit = tcp_metrics_nl_cmd_del,
1000 .policy = tcp_metrics_nl_policy,
1001 .flags = GENL_ADMIN_PERM,
1002 },
1003};
1004
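With the family registered, the metrics cache becomes reachable from user space over generic netlink. A rough libnl-3 client issuing TCP_METRICS_CMD_GET for one IPv4 destination might look like the sketch below; it is an untested illustration, reply parsing is omitted, and tcp_metrics_get() is a hypothetical helper.

#include <stdint.h>
#include <linux/tcp_metrics.h>
#include <netlink/genl/ctrl.h>
#include <netlink/genl/genl.h>
#include <netlink/netlink.h>

/* @daddr is an IPv4 address already in network byte order. */
static int tcp_metrics_get(uint32_t daddr)
{
        struct nl_sock *sk = nl_socket_alloc();
        struct nl_msg *msg = NULL;
        int family, err = -1;

        if (!sk || genl_connect(sk) < 0)
                goto out;
        family = genl_ctrl_resolve(sk, TCP_METRICS_GENL_NAME);
        if (family < 0)
                goto out;
        msg = nlmsg_alloc();
        if (!msg)
                goto out;
        if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
                         TCP_METRICS_CMD_GET, TCP_METRICS_GENL_VERSION))
                goto out;
        if (nla_put_u32(msg, TCP_METRICS_ATTR_ADDR_IPV4, daddr) < 0)
                goto out;
        err = nl_send_auto(sk, msg); /* reply would arrive via nl_recvmsgs() */
out:
        nlmsg_free(msg);
        nl_socket_free(sk);
        return err < 0 ? -1 : 0;
}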
693static unsigned int tcpmhash_entries; 1005static unsigned int tcpmhash_entries;
694static int __init set_tcpmhash_entries(char *str) 1006static int __init set_tcpmhash_entries(char *str)
695{ 1007{
@@ -753,5 +1065,21 @@ static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
753 1065
754void __init tcp_metrics_init(void) 1066void __init tcp_metrics_init(void)
755{ 1067{
756 register_pernet_subsys(&tcp_net_metrics_ops); 1068 int ret;
1069
1070 ret = register_pernet_subsys(&tcp_net_metrics_ops);
1071 if (ret < 0)
1072 goto cleanup;
1073 ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
1074 tcp_metrics_nl_ops,
1075 ARRAY_SIZE(tcp_metrics_nl_ops));
1076 if (ret < 0)
1077 goto cleanup_subsys;
1078 return;
1079
1080cleanup_subsys:
1081 unregister_pernet_subsys(&tcp_net_metrics_ops);
1082
1083cleanup:
1084 return;
757} 1085}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6ff7f10dce9d..27536ba16c9d 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -85,6 +85,8 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
85 * spinlock it. I do not want! Well, probability of misbehaviour 85 * spinlock it. I do not want! Well, probability of misbehaviour
86 * is ridiculously low and, seems, we could use some mb() tricks 86 * is ridiculously low and, seems, we could use some mb() tricks
87 * to avoid misread sequence numbers, states etc. --ANK 87 * to avoid misread sequence numbers, states etc. --ANK
88 *
89 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
88 */ 90 */
89enum tcp_tw_status 91enum tcp_tw_status
90tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, 92tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
@@ -507,6 +509,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
507 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 509 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
508 newtp->rx_opt.mss_clamp = req->mss; 510 newtp->rx_opt.mss_clamp = req->mss;
509 TCP_ECN_openreq_child(newtp, req); 511 TCP_ECN_openreq_child(newtp, req);
512 newtp->fastopen_rsk = NULL;
510 513
511 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); 514 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
512 } 515 }
@@ -515,13 +518,20 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
515EXPORT_SYMBOL(tcp_create_openreq_child); 518EXPORT_SYMBOL(tcp_create_openreq_child);
516 519
517/* 520/*
518 * Process an incoming packet for SYN_RECV sockets represented 521 * Process an incoming packet for SYN_RECV sockets represented as a
519 * as a request_sock. 522 * request_sock. Normally sk is the listener socket but for TFO it
523 * points to the child socket.
524 *
525 * XXX (TFO) - The current impl contains a special check for ack
526 * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
527 *
528 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
520 */ 529 */
521 530
522struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 531struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
523 struct request_sock *req, 532 struct request_sock *req,
524 struct request_sock **prev) 533 struct request_sock **prev,
534 bool fastopen)
525{ 535{
526 struct tcp_options_received tmp_opt; 536 struct tcp_options_received tmp_opt;
527 const u8 *hash_location; 537 const u8 *hash_location;
@@ -530,6 +540,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
530 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 540 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
531 bool paws_reject = false; 541 bool paws_reject = false;
532 542
543 BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
544
533 tmp_opt.saw_tstamp = 0; 545 tmp_opt.saw_tstamp = 0;
534 if (th->doff > (sizeof(struct tcphdr)>>2)) { 546 if (th->doff > (sizeof(struct tcphdr)>>2)) {
535 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 547 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
@@ -565,6 +577,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
565 * 577 *
566 * Enforce "SYN-ACK" according to figure 8, figure 6 578 * Enforce "SYN-ACK" according to figure 8, figure 6
567 * of RFC793, fixed by RFC1122. 579 * of RFC793, fixed by RFC1122.
580 *
581 * Note that even if there is new data in the SYN packet
582 * they will be thrown away too.
568 */ 583 */
569 req->rsk_ops->rtx_syn_ack(sk, req, NULL); 584 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
570 return NULL; 585 return NULL;
@@ -622,9 +637,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
622 * sent (the segment carries an unacceptable ACK) ... 637 * sent (the segment carries an unacceptable ACK) ...
623 * a reset is sent." 638 * a reset is sent."
624 * 639 *
625 * Invalid ACK: reset will be sent by listening socket 640 * Invalid ACK: reset will be sent by listening socket.
641 * Note that the ACK validity check for a Fast Open socket is done
642 * elsewhere and is checked directly against the child socket rather
643 * than req because user data may have been sent out.
626 */ 644 */
627 if ((flg & TCP_FLAG_ACK) && 645 if ((flg & TCP_FLAG_ACK) && !fastopen &&
628 (TCP_SKB_CB(skb)->ack_seq != 646 (TCP_SKB_CB(skb)->ack_seq !=
629 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) 647 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
630 return sk; 648 return sk;
@@ -637,7 +655,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
637 /* RFC793: "first check sequence number". */ 655 /* RFC793: "first check sequence number". */
638 656
639 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 657 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
640 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { 658 tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
641 /* Out of window: send ACK and drop. */ 659 /* Out of window: send ACK and drop. */
642 if (!(flg & TCP_FLAG_RST)) 660 if (!(flg & TCP_FLAG_RST))
643 req->rsk_ops->send_ack(sk, skb, req); 661 req->rsk_ops->send_ack(sk, skb, req);
@@ -648,7 +666,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
648 666
649 /* In sequence, PAWS is OK. */ 667 /* In sequence, PAWS is OK. */
650 668
651 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) 669 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
652 req->ts_recent = tmp_opt.rcv_tsval; 670 req->ts_recent = tmp_opt.rcv_tsval;
653 671
654 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { 672 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -667,10 +685,25 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
667 685
668 /* ACK sequence verified above, just make sure ACK is 686 /* ACK sequence verified above, just make sure ACK is
669 * set. If ACK not set, just silently drop the packet. 687 * set. If ACK not set, just silently drop the packet.
688 *
689 * XXX (TFO) - if we ever allow "data after SYN", the
690 * following check needs to be removed.
670 */ 691 */
671 if (!(flg & TCP_FLAG_ACK)) 692 if (!(flg & TCP_FLAG_ACK))
672 return NULL; 693 return NULL;
673 694
695 /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
696 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
697 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
698 else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
699 tcp_rsk(req)->snt_synack = 0;
700
701 /* For Fast Open no more processing is needed (sk is the
702 * child socket).
703 */
704 if (fastopen)
705 return sk;
706
674 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ 707 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
675 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 708 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
676 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 709 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
@@ -678,10 +711,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
678 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); 711 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
679 return NULL; 712 return NULL;
680 } 713 }
681 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
682 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
683 else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
684 tcp_rsk(req)->snt_synack = 0;
685 714
686 /* OK, ACK is valid, create big socket and 715 /* OK, ACK is valid, create big socket and
687 * feed this segment to it. It will repeat all 716 * feed this segment to it. It will repeat all
@@ -706,11 +735,21 @@ listen_overflow:
706 } 735 }
707 736
708embryonic_reset: 737embryonic_reset:
709 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); 738 if (!(flg & TCP_FLAG_RST)) {
710 if (!(flg & TCP_FLAG_RST)) 739 /* Received a bad SYN pkt - for TFO we try not to reset
740 * the local connection unless it's really necessary, to
741 * avoid becoming vulnerable to an outside attack aimed at
742 * resetting legit local connections.
743 */
711 req->rsk_ops->send_reset(sk, skb); 744 req->rsk_ops->send_reset(sk, skb);
712 745 } else if (fastopen) { /* received a valid RST pkt */
713 inet_csk_reqsk_queue_drop(sk, req, prev); 746 reqsk_fastopen_remove(sk, req, true);
747 tcp_reset(sk);
748 }
749 if (!fastopen) {
750 inet_csk_reqsk_queue_drop(sk, req, prev);
751 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
752 }
714 return NULL; 753 return NULL;
715} 754}
716EXPORT_SYMBOL(tcp_check_req); 755EXPORT_SYMBOL(tcp_check_req);
@@ -719,6 +758,12 @@ EXPORT_SYMBOL(tcp_check_req);
719 * Queue segment on the new socket if the new socket is active, 758 * Queue segment on the new socket if the new socket is active,
720 * otherwise we just shortcircuit this and continue with 759 * otherwise we just shortcircuit this and continue with
721 * the new socket. 760 * the new socket.
761 *
762 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
763 * when entering. But other states are possible due to a race condition
764 * where after __inet_lookup_established() fails but before the listener
765 * locked is obtained, other packets cause the same connection to
766 * be created.
722 */ 767 */
723 768
724int tcp_child_process(struct sock *parent, struct sock *child, 769int tcp_child_process(struct sock *parent, struct sock *child,
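The ts_recent update above now compares the incoming sequence number against tcp_rsk(req)->rcv_nxt instead of rcv_isn + 1, so a Fast Open request whose SYN carried data (advancing rcv_nxt past the ISN) still refreshes the PAWS timestamp. The after() test it relies on is ordinary wraparound-safe serial-number arithmetic; a minimal userspace sketch of the same comparison (helper name and demo values are ours, not the kernel's):

	#include <stdint.h>
	#include <stdio.h>

	/* Wraparound-safe "seq1 strictly after seq2", in the style of the
	 * kernel's after()/before() macros: the signed 32-bit difference
	 * treats the sequence space as a circle. */
	static int seq_after(uint32_t seq1, uint32_t seq2)
	{
		return (int32_t)(seq1 - seq2) > 0;
	}

	int main(void)
	{
		printf("%d\n", seq_after(10, 5));          /* 1 */
		printf("%d\n", seq_after(5, 0xfffffff0u)); /* 1: wrapped */
		printf("%d\n", seq_after(0xfffffff0u, 5)); /* 0 */
		return 0;
	}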
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d04632673a9e..cfe6ffe1c177 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk,
 				   unsigned int mss, struct sk_buff *skb,
 				   struct tcp_out_options *opts,
 				   struct tcp_md5sig_key **md5,
-				   struct tcp_extend_values *xvp)
+				   struct tcp_extend_values *xvp,
+				   struct tcp_fastopen_cookie *foc)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk,
 		if (unlikely(!ireq->tstamp_ok))
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
-
+	if (foc != NULL) {
+		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+		need = (need + 3) & ~3U;  /* Align to 32 bits */
+		if (remaining >= need) {
+			opts->options |= OPTION_FAST_OPEN_COOKIE;
+			opts->fastopen_cookie = foc;
+			remaining -= need;
+		}
+	}
 	/* Similar rationale to tcp_syn_options() applies here, too.
 	 * If the <SYN> options fit, the same options should fit now!
 	 */
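The foc != NULL branch above charges the experimental Fast Open option against whatever TCP option space remains, rounding its length up to a 32-bit boundary via (need + 3) & ~3U and silently omitting the cookie if it no longer fits. A self-contained sketch of that accounting (the 40-byte option budget and 4-byte experimental-option header are our reading of the macros, not taken from this diff):

	#include <stdint.h>
	#include <stdio.h>

	#define MAX_TCP_OPTION_SPACE      40 /* 60-byte max header - 20 fixed */
	#define TCPOLEN_EXP_FASTOPEN_BASE  4 /* kind, len, 2-byte experiment ID */

	/* Bytes consumed by an experimental Fast Open option carrying
	 * cookie_len cookie bytes, padded to a 32-bit boundary as in
	 * tcp_synack_options(); 0 if it no longer fits in `remaining`. */
	static unsigned int fastopen_opt_space(unsigned int remaining,
					       unsigned int cookie_len)
	{
		uint32_t need = TCPOLEN_EXP_FASTOPEN_BASE + cookie_len;

		need = (need + 3) & ~3U; /* align to 32 bits */
		return remaining >= need ? need : 0;
	}

	int main(void)
	{
		/* An 8-byte cookie needs 12 bytes (4 + 8, already aligned). */
		printf("%u\n", fastopen_opt_space(MAX_TCP_OPTION_SPACE, 8));
		return 0;
	}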
@@ -2028,10 +2037,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		if (push_one)
 			break;
 	}
-	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
-		tp->prr_out += sent_pkts;
 
 	if (likely(sent_pkts)) {
+		if (tcp_in_cwnd_reduction(sk))
+			tp->prr_out += sent_pkts;
 		tcp_cwnd_validate(sk);
 		return false;
 	}
@@ -2533,7 +2542,7 @@ begin_fwd:
 		}
 		NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
-		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
+		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += tcp_skb_pcount(skb);
 
 		if (skb == tcp_write_queue_head(sk))
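This hunk and the tcp_write_xmit() one above swap the explicit TCP_CA_Recovery comparison for tcp_in_cwnd_reduction(), so the proportional-rate-reduction counter prr_out is also maintained while the sender is in CWR. A rough standalone model of the predicate (our simplification of the inline helper, not the kernel's exact definition):

	#include <stdio.h>

	/* Congestion-avoidance machine states, mirroring the kernel's
	 * tcp_ca_state values. */
	enum tcp_ca_state { TCP_CA_Open, TCP_CA_Disorder, TCP_CA_CWR,
			    TCP_CA_Recovery, TCP_CA_Loss };

	/* PRR bookkeeping runs whenever cwnd is being reduced: in loss
	 * Recovery, as before, but now also in CWR (e.g. after an ECN mark). */
	static int in_cwnd_reduction(enum tcp_ca_state state)
	{
		return state == TCP_CA_CWR || state == TCP_CA_Recovery;
	}

	int main(void)
	{
		printf("CWR: %d, Recovery: %d, Open: %d\n",
		       in_cwnd_reduction(TCP_CA_CWR),
		       in_cwnd_reduction(TCP_CA_Recovery),
		       in_cwnd_reduction(TCP_CA_Open));
		return 0;
	}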
@@ -2658,7 +2667,8 @@ int tcp_send_synack(struct sock *sk)
  */
 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
-				struct request_values *rvp)
+				struct request_values *rvp,
+				struct tcp_fastopen_cookie *foc)
 {
 	struct tcp_out_options opts;
 	struct tcp_extend_values *xvp = tcp_xv(rvp);
@@ -2718,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 #endif
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	tcp_header_size = tcp_synack_options(sk, req, mss,
-					     skb, &opts, &md5, xvp)
+					     skb, &opts, &md5, xvp, foc)
 			+ sizeof(*th);
 
 	skb_push(skb, tcp_header_size);
@@ -2772,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	}
 
 	th->seq = htonl(TCP_SKB_CB(skb)->seq);
-	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
+	/* XXX data is queued and acked as is. No buffer/window check */
+	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rcv_wnd, 65535U));
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b774a03bd1dc..fc04711e80c8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -305,6 +305,35 @@ static void tcp_probe_timer(struct sock *sk)
 }
 
 /*
+ * Timer for Fast Open socket to retransmit SYNACK. Note that the
+ * sk here is the child socket, not the parent (listener) socket.
+ */
+static void tcp_fastopen_synack_timer(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int max_retries = icsk->icsk_syn_retries ? :
+	    sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+	struct request_sock *req;
+
+	req = tcp_sk(sk)->fastopen_rsk;
+	req->rsk_ops->syn_ack_timeout(sk, req);
+
+	if (req->retrans >= max_retries) {
+		tcp_write_err(sk);
+		return;
+	}
+	/* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
+	 * returned from rtx_syn_ack() to make it more persistent like
+	 * regular retransmit because if the child socket has been accepted
+	 * it's not good to give up too easily.
+	 */
+	req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+	req->retrans++;
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+			  TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX);
+}
+
+/*
  * The TCP retransmit timer.
  */
 
@@ -317,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk)
 		tcp_resume_early_retransmit(sk);
 		return;
 	}
-
+	if (tp->fastopen_rsk) {
+		BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+		       sk->sk_state != TCP_FIN_WAIT1);
+		tcp_fastopen_synack_timer(sk);
+		/* Before we receive ACK to our SYN-ACK don't retransmit
+		 * anything else (e.g., data or FIN segments).
+		 */
+		return;
+	}
 	if (!tp->packets_out)
 		goto out;
 
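The rearm at the bottom of tcp_fastopen_synack_timer() backs off exponentially, TCP_TIMEOUT_INIT << req->retrans, and inet_csk_reset_xmit_timer() clamps the result at TCP_RTO_MAX; note the extra retry allowed over plain SYN-ACKs. A toy sketch of the resulting schedule (seconds instead of jiffies; the 1 s initial RTO and 120 s cap are assumed values, not read from this diff):

	#include <stdio.h>

	#define TCP_TIMEOUT_INIT   1 /* seconds; the kernel stores jiffies */
	#define TCP_RTO_MAX      120 /* seconds */

	int main(void)
	{
		/* Timeout armed after the Nth SYN-ACK retransmission. */
		for (int retrans = 1; retrans <= 7; retrans++) {
			unsigned int t = TCP_TIMEOUT_INIT << retrans;

			if (t > TCP_RTO_MAX)
				t = TCP_RTO_MAX;
			printf("retrans %d: rearm in %us\n", retrans, t);
		}
		return 0;
	}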
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index d2f336ea82ca..505b30ad9182 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -26,7 +26,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
 
 	return inet_sk_diag_fill(sk, NULL, skb, req,
 			sk_user_ns(NETLINK_CB(cb->skb).ssk),
-			NETLINK_CB(cb->skb).pid,
+			NETLINK_CB(cb->skb).portid,
 			cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
@@ -72,14 +72,14 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 
 	err = inet_sk_diag_fill(sk, NULL, rep, req,
 			   sk_user_ns(NETLINK_CB(in_skb).ssk),
-			   NETLINK_CB(in_skb).pid,
+			   NETLINK_CB(in_skb).portid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(rep);
 		goto out;
 	}
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;