author		Pablo Neira Ayuso <pablo@netfilter.org>	2012-09-03 09:28:30 -0400
committer	Pablo Neira Ayuso <pablo@netfilter.org>	2012-09-03 09:34:51 -0400
commit		ace1fe1231bdfffd60b5e703aa5b7283fbf98dbd (patch)
tree		06c7492a8f3cc65f916768616ca24c6bc7171761 /net/ipv4
parent		ce9f3f31efb88841e4df98794b13dbac8c4901da (diff)
parent		a2dc375e12334b3d8f787a48b2fb6172ccfb80ae (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
This merges (3f509c6 "netfilter: nf_nat_sip: fix incorrect handling of EBUSY for RTCP expectation") into Patrick McHardy's IPv6 NAT changes.
Diffstat (limited to 'net/ipv4')
-rw-r--r--	net/ipv4/af_inet.c		|  28
-rw-r--r--	net/ipv4/devinet.c		|   6
-rw-r--r--	net/ipv4/fib_frontend.c		|   7
-rw-r--r--	net/ipv4/inet_connection_sock.c	|  57
-rw-r--r--	net/ipv4/inet_diag.c		|  21
-rw-r--r--	net/ipv4/ipmr.c			|  14
-rw-r--r--	net/ipv4/ping.c			|  22
-rw-r--r--	net/ipv4/proc.c			|   4
-rw-r--r--	net/ipv4/raw.c			|   4
-rw-r--r--	net/ipv4/route.c		|  11
-rw-r--r--	net/ipv4/syncookies.c		|   1
-rw-r--r--	net/ipv4/sysctl_net_ipv4.c	|  87
-rw-r--r--	net/ipv4/tcp.c			|  49
-rw-r--r--	net/ipv4/tcp_fastopen.c		|  83
-rw-r--r--	net/ipv4/tcp_input.c		|  90
-rw-r--r--	net/ipv4/tcp_ipv4.c		| 275
-rw-r--r--	net/ipv4/tcp_minisocks.c	|  61
-rw-r--r--	net/ipv4/tcp_output.c		|  21
-rw-r--r--	net/ipv4/tcp_timer.c		|  39
-rw-r--r--	net/ipv4/udp.c			|   4
-rw-r--r--	net/ipv4/udp_diag.c		|   5
21 files changed, 760 insertions(+), 129 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6681ccf5c3ee..4f70ef0b946d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -149,6 +149,11 @@ void inet_sock_destruct(struct sock *sk)
 		pr_err("Attempt to release alive inet socket %p\n", sk);
 		return;
 	}
+	if (sk->sk_type == SOCK_STREAM) {
+		struct fastopen_queue *fastopenq =
+			inet_csk(sk)->icsk_accept_queue.fastopenq;
+		kfree(fastopenq);
+	}
 
 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
@@ -212,6 +217,26 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
+		/* Check special setups for testing purpose to enable TFO w/o
+		 * requiring TCP_FASTOPEN sockopt.
+		 * Note that only TCP sockets (SOCK_STREAM) will reach here.
+		 * Also fastopenq may already been allocated because this
+		 * socket was in TCP_LISTEN state previously but was
+		 * shutdown() (rather than close()).
+		 */
+		if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+		    inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+			if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+				err = fastopen_init_queue(sk, backlog);
+			else if ((sysctl_tcp_fastopen &
+				  TFO_SERVER_WO_SOCKOPT2) != 0)
+				err = fastopen_init_queue(sk,
+				    ((uint)sysctl_tcp_fastopen) >> 16);
+			else
+				err = 0;
+			if (err)
+				goto out;
+		}
 		err = inet_csk_listen_start(sk, backlog);
 		if (err)
 			goto out;
@@ -701,7 +726,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	sock_rps_record_flow(sk2);
 	WARN_ON(!((1 << sk2->sk_state) &
-		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+		  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+		  TCPF_CLOSE_WAIT | TCPF_CLOSE)));
 
 	sock_graft(sk2, newsock);
 
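The inet_listen() hunk above gives a server two ways into TFO: the sysctl test knobs (TFO_SERVER_WO_SOCKOPT1/2) or the TCP_FASTOPEN socket option handled in the tcp.c hunk further down; both end up allocating the fastopen queue via fastopen_init_queue(). A minimal userspace sketch of the sockopt path, assuming a kernel with this series; error handling is omitted and the fallback #define mirrors the value used by the series:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

#ifndef TCP_FASTOPEN
#define TCP_FASTOPEN	23	/* sockopt value added by this series */
#endif

int tfo_server_socket(unsigned short port)
{
	int qlen = 16;	/* cap on pending SYN+data (fastopenq->max_qlen) */
	struct sockaddr_in a;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&a, 0, sizeof(a));
	a.sin_family = AF_INET;
	a.sin_port = htons(port);
	bind(fd, (struct sockaddr *)&a, sizeof(a));
	/* Allowed while the socket is closed or listening (see the
	 * TCP_FASTOPEN case in do_tcp_setsockopt() below); allocates
	 * the fastopen queue just like the sysctl-only paths above. */
	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
	listen(fd, 128);
	return fd;
}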
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 6a5e6e4b142c..adf273f8ad2e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1147,12 +1147,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 			 void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct in_device *in_dev;
-
-	if (event == NETDEV_UNREGISTER_FINAL)
-		goto out;
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
 
-	in_dev = __in_dev_get_rtnl(dev);
 	ASSERT_RTNL();
 
 	if (!in_dev) {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index fd7d9ae64f16..acdee325d972 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1050,9 +1050,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
 		return NOTIFY_DONE;
 	}
 
-	if (event == NETDEV_UNREGISTER_FINAL)
-		return NOTIFY_DONE;
-
 	in_dev = __in_dev_get_rtnl(dev);
 
 	switch (event) {
@@ -1064,14 +1061,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
 		fib_sync_up(dev);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
-		rt_cache_flush(dev_net(dev), -1);
+		rt_cache_flush(net, -1);
 		break;
 	case NETDEV_DOWN:
 		fib_disable_ip(dev, 0, 0);
 		break;
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGE:
-		rt_cache_flush(dev_net(dev), 0);
+		rt_cache_flush(net, 0);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7f75f21d7b83..8464b79c493f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct sock *newsk;
+	struct request_sock *req;
 	int error;
 
 	lock_sock(sk);
@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		goto out_err;
 
 	/* Find already established connection */
-	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+	if (reqsk_queue_empty(queue)) {
 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 
 		/* If this is a non blocking socket don't sleep */
@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		if (error)
 			goto out_err;
 	}
-
-	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
-	WARN_ON(newsk->sk_state == TCP_SYN_RECV);
+	req = reqsk_queue_remove(queue);
+	newsk = req->sk;
+
+	sk_acceptq_removed(sk);
+	if (sk->sk_type == SOCK_STREAM && queue->fastopenq != NULL) {
+		spin_lock_bh(&queue->fastopenq->lock);
+		if (tcp_rsk(req)->listener) {
+			/* We are still waiting for the final ACK from 3WHS
+			 * so can't free req now. Instead, we set req->sk to
+			 * NULL to signify that the child socket is taken
+			 * so reqsk_fastopen_remove() will free the req
+			 * when 3WHS finishes (or is aborted).
+			 */
+			req->sk = NULL;
+			req = NULL;
+		}
+		spin_unlock_bh(&queue->fastopenq->lock);
+	}
 out:
 	release_sock(sk);
+	if (req)
+		__reqsk_free(req);
 	return newsk;
 out_err:
 	newsk = NULL;
+	req = NULL;
 	*err = error;
 	goto out;
 }
@@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 void inet_csk_listen_stop(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct request_sock *acc_req;
 	struct request_sock *req;
 
 	inet_csk_delete_keepalive_timer(sk);
 
 	/* make all the listen_opt local to us */
-	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+	acc_req = reqsk_queue_yank_acceptq(queue);
 
 	/* Following specs, it would be better either to send FIN
 	 * (and enter FIN-WAIT-1, it is normal close)
@@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk)
 	 * To be honest, we are not able to make either
 	 * of the variants now.			--ANK
 	 */
-	reqsk_queue_destroy(&icsk->icsk_accept_queue);
+	reqsk_queue_destroy(queue);
 
 	while ((req = acc_req) != NULL) {
 		struct sock *child = req->sk;
@@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		percpu_counter_inc(sk->sk_prot->orphan_count);
 
+		if (sk->sk_type == SOCK_STREAM && tcp_rsk(req)->listener) {
+			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+			BUG_ON(sk != tcp_rsk(req)->listener);
+
+			/* Paranoid, to prevent race condition if
+			 * an inbound pkt destined for child is
+			 * blocked by sock lock in tcp_v4_rcv().
+			 * Also to satisfy an assertion in
+			 * tcp_v4_destroy_sock().
+			 */
+			tcp_sk(child)->fastopen_rsk = NULL;
+			sock_put(sk);
+		}
 		inet_csk_destroy_sock(child);
 
 		bh_unlock_sock(child);
@@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk)
 		sk_acceptq_removed(sk);
 		__reqsk_free(req);
 	}
+	if (queue->fastopenq != NULL) {
+		/* Free all the reqs queued in rskq_rst_head. */
+		spin_lock_bh(&queue->fastopenq->lock);
+		acc_req = queue->fastopenq->rskq_rst_head;
+		queue->fastopenq->rskq_rst_head = NULL;
+		spin_unlock_bh(&queue->fastopenq->lock);
+		while ((req = acc_req) != NULL) {
+			acc_req = req->dl_next;
+			__reqsk_free(req);
+		}
+	}
 	WARN_ON(sk->sk_ack_backlog);
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
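The inet_csk_accept() hunk implements a small ownership hand-off: while the 3WHS is still pending the acceptor must not free the request sock, so it clears req->sk under the fastopen queue lock and lets reqsk_fastopen_remove() free it once the handshake finishes or aborts. A distilled userspace sketch of the same pattern, with illustrative names and a pthread mutex standing in for the kernel spinlock:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct fo_req {
	void *sk;		/* child socket; NULL once taken */
	bool three_whs_pending;	/* final ACK not yet received */
	pthread_mutex_t lock;	/* stands in for fastopenq->lock */
};

static void *fo_accept(struct fo_req *req)
{
	void *child = req->sk;
	pthread_mutex_t *lock = &req->lock;

	pthread_mutex_lock(lock);
	if (req->three_whs_pending) {
		req->sk = NULL;	/* tells the ACK path the child is taken */
		req = NULL;	/* ...and that it now owns the req */
	}
	pthread_mutex_unlock(lock);

	free(req);		/* no-op when the ACK path owns it */
	return child;
}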
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 570e61f9611f..8bc005b1435f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -69,6 +69,7 @@ static inline void inet_diag_unlock_handler(
 
 int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		      struct sk_buff *skb, struct inet_diag_req_v2 *req,
+		      struct user_namespace *user_ns,
 		      u32 pid, u32 seq, u16 nlmsg_flags,
 		      const struct nlmsghdr *unlh)
 {
@@ -124,7 +125,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	}
 #endif
 
-	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
 	r->idiag_inode = sock_i_ino(sk);
 
 	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -199,11 +200,12 @@ EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
 
 static int inet_csk_diag_fill(struct sock *sk,
 			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
+			      struct user_namespace *user_ns,
 			      u32 pid, u32 seq, u16 nlmsg_flags,
 			      const struct nlmsghdr *unlh)
 {
 	return inet_sk_diag_fill(sk, inet_csk(sk),
-			skb, req, pid, seq, nlmsg_flags, unlh);
+			skb, req, user_ns, pid, seq, nlmsg_flags, unlh);
 }
 
 static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
@@ -256,14 +258,16 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 }
 
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
-			struct inet_diag_req_v2 *r, u32 pid, u32 seq, u16 nlmsg_flags,
+			struct inet_diag_req_v2 *r,
+			struct user_namespace *user_ns,
+			u32 pid, u32 seq, u16 nlmsg_flags,
 			const struct nlmsghdr *unlh)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
 		return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
 					   skb, r, pid, seq, nlmsg_flags,
 					   unlh);
-	return inet_csk_diag_fill(sk, skb, r, pid, seq, nlmsg_flags, unlh);
+	return inet_csk_diag_fill(sk, skb, r, user_ns, pid, seq, nlmsg_flags, unlh);
 }
 
 int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
@@ -311,6 +315,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
 	}
 
 	err = sk_diag_fill(sk, rep, req,
+			   sk_user_ns(NETLINK_CB(in_skb).ssk),
 			   NETLINK_CB(in_skb).pid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
@@ -551,6 +556,7 @@ static int inet_csk_diag_dump(struct sock *sk,
 		return 0;
 
 	return inet_csk_diag_fill(sk, skb, r,
+				  sk_user_ns(NETLINK_CB(cb->skb).ssk),
 				  NETLINK_CB(cb->skb).pid,
 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
@@ -591,7 +597,9 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
 }
 
 static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
-			      struct request_sock *req, u32 pid, u32 seq,
+			      struct request_sock *req,
+			      struct user_namespace *user_ns,
+			      u32 pid, u32 seq,
 			      const struct nlmsghdr *unlh)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -625,7 +633,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	r->idiag_expires = jiffies_to_msecs(tmo);
 	r->idiag_rqueue = 0;
 	r->idiag_wqueue = 0;
-	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
 	r->idiag_inode = 0;
 #if IS_ENABLED(CONFIG_IPV6)
 	if (r->idiag_family == AF_INET6) {
@@ -702,6 +710,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 		}
 
 		err = inet_diag_fill_req(skb, sk, req,
+					 sk_user_ns(NETLINK_CB(cb->skb).ssk),
 					 NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, cb->nlh);
 		if (err < 0) {
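The inet_diag changes above thread the requesting socket's user namespace (sk_user_ns(NETLINK_CB(...).ssk)) down to the points that fill in idiag_uid, so UIDs are reported as the requester would see them. For reference, a sketch of the conversion involved (from_kuid_munged() is the real API from <linux/uidgid.h>; the wrapper name is illustrative):

#include <linux/uidgid.h>

/* from_kuid_munged() never fails: a kuid with no mapping in 'ns' comes
 * back as the overflow uid rather than an error, which makes it suitable
 * for read-only reporting paths like inet_diag. */
static uid_t uid_as_seen_by(struct user_namespace *ns, kuid_t kuid)
{
	return from_kuid_munged(ns, kuid);
}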
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3a57570c8ee5..8aa7a4cf9139 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -124,6 +124,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
 static struct kmem_cache *mrt_cachep __read_mostly;
 
 static struct mr_table *ipmr_new_table(struct net *net, u32 id);
+static void ipmr_free_table(struct mr_table *mrt);
+
 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 			 struct sk_buff *skb, struct mfc_cache *cache,
 			 int local);
@@ -131,6 +133,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
 			   struct sk_buff *pkt, vifi_t vifi, int assert);
 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 			      struct mfc_cache *c, struct rtmsg *rtm);
+static void mroute_clean_tables(struct mr_table *mrt);
 static void ipmr_expire_process(unsigned long arg);
 
 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -271,7 +274,7 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 
 	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
 		list_del(&mrt->list);
-		kfree(mrt);
+		ipmr_free_table(mrt);
 	}
 	fib_rules_unregister(net->ipv4.mr_rules_ops);
 }
@@ -299,7 +302,7 @@ static int __net_init ipmr_rules_init(struct net *net)
 
 static void __net_exit ipmr_rules_exit(struct net *net)
 {
-	kfree(net->ipv4.mrt);
+	ipmr_free_table(net->ipv4.mrt);
 }
 #endif
 
@@ -336,6 +339,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 	return mrt;
 }
 
+static void ipmr_free_table(struct mr_table *mrt)
+{
+	del_timer_sync(&mrt->ipmr_expire_timer);
+	mroute_clean_tables(mrt);
+	kfree(mrt);
+}
+
 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
 
 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 6232d476f37e..8f3d05424a3e 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -185,10 +185,10 @@ exit:
 	return sk;
 }
 
-static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
-					  gid_t *high)
+static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
+					  kgid_t *high)
 {
-	gid_t *data = net->ipv4.sysctl_ping_group_range;
+	kgid_t *data = net->ipv4.sysctl_ping_group_range;
 	unsigned int seq;
 
 	do {
@@ -203,19 +203,13 @@ static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
 static int ping_init_sock(struct sock *sk)
 {
 	struct net *net = sock_net(sk);
-	gid_t group = current_egid();
-	gid_t range[2];
+	kgid_t group = current_egid();
 	struct group_info *group_info = get_current_groups();
 	int i, j, count = group_info->ngroups;
 	kgid_t low, high;
 
-	inet_get_ping_group_range_net(net, range, range+1);
-	low = make_kgid(&init_user_ns, range[0]);
-	high = make_kgid(&init_user_ns, range[1]);
-	if (!gid_valid(low) || !gid_valid(high) || gid_lt(high, low))
-		return -EACCES;
-
-	if (range[0] <= group && group <= range[1])
+	inet_get_ping_group_range_net(net, &low, &high);
+	if (gid_lte(low, group) && gid_lte(group, high))
 		return 0;
 
 	for (i = 0; i < group_info->nblocks; i++) {
@@ -845,7 +839,9 @@ static void ping_format_sock(struct sock *sp, struct seq_file *f,
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
-		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		0, 0L, 0,
+		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp,
 		atomic_read(&sp->sk_drops), len);
 }
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 957acd12250b..8de53e1ddd54 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -263,6 +263,10 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
 	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
+	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
+	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
+	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ff0f071969ea..f2425785d40a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -992,7 +992,9 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
 		i, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
-		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		0, 0L, 0,
+		from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 50f6d3adb474..dc9549b5eb1c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -934,12 +934,14 @@ static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	if (mtu < ip_rt_min_pmtu)
 		mtu = ip_rt_min_pmtu;
 
+	rcu_read_lock();
 	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
 
 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 				      jiffies + ip_rt_mtu_expires);
 	}
+	rcu_read_unlock();
 	return mtu;
 }
 
@@ -956,7 +958,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 		dst->obsolete = DST_OBSOLETE_KILL;
 	} else {
 		rt->rt_pmtu = mtu;
-		dst_set_expires(&rt->dst, ip_rt_mtu_expires);
+		rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
 	}
 }
 
@@ -1132,10 +1134,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	const struct rtable *rt = (const struct rtable *) dst;
 	unsigned int mtu = rt->rt_pmtu;
 
-	if (mtu && time_after_eq(jiffies, rt->dst.expires))
-		mtu = 0;
-
-	if (!mtu)
+	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
 		mtu = dst_metric_raw(dst, RTAX_MTU);
 
 	if (mtu && rt_is_output_route(rt))
@@ -1263,7 +1262,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 {
 	struct rtable *rt = (struct rtable *) dst;
 
-	if (dst->flags & DST_NOCACHE) {
+	if (!list_empty(&rt->rt_uncached)) {
 		spin_lock_bh(&rt_uncached_lock);
 		list_del(&rt->rt_uncached);
 		spin_unlock_bh(&rt_uncached_lock);
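Two details in the route.c hunks are easy to miss: ipv4_mtu() now treats a zero rt_pmtu and an expired one identically, and __ip_rt_update_pmtu() forces a non-zero expiry with max(1UL, ...) because dst.expires == 0 means "no expiry". Both rely on the usual wrap-safe jiffies comparison; for reference, this is how <linux/jiffies.h> defines it:

/* True when a is at or after b, correct across jiffies wraparound
 * because the subtraction is evaluated in signed arithmetic. */
#define time_after_eq(a, b)			\
	(typecheck(unsigned long, a) &&		\
	 typecheck(unsigned long, b) &&		\
	 ((long)((a) - (b)) >= 0))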
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650e1528e1e6..ba48e799b031 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->tstamp_ok = tcp_opt.saw_tstamp;
 	req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+	treq->listener = NULL;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1b5ce96707a3..9205e492dc9d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -76,9 +76,9 @@ static int ipv4_local_port_range(ctl_table *table, int write,
 }
 
 
-static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
+static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
 {
-	gid_t *data = table->data;
+	kgid_t *data = table->data;
 	unsigned int seq;
 	do {
 		seq = read_seqbegin(&sysctl_local_ports.lock);
@@ -89,12 +89,12 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low,
 }
 
 /* Update system visible IP port range */
-static void set_ping_group_range(struct ctl_table *table, gid_t range[2])
+static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
 {
-	gid_t *data = table->data;
+	kgid_t *data = table->data;
 	write_seqlock(&sysctl_local_ports.lock);
-	data[0] = range[0];
-	data[1] = range[1];
+	data[0] = low;
+	data[1] = high;
 	write_sequnlock(&sysctl_local_ports.lock);
 }
 
@@ -103,21 +103,33 @@ static int ipv4_ping_group_range(ctl_table *table, int write,
 				 void __user *buffer,
 				 size_t *lenp, loff_t *ppos)
 {
+	struct user_namespace *user_ns = current_user_ns();
 	int ret;
-	gid_t range[2];
+	gid_t urange[2];
+	kgid_t low, high;
 	ctl_table tmp = {
-		.data = &range,
-		.maxlen = sizeof(range),
+		.data = &urange,
+		.maxlen = sizeof(urange),
 		.mode = table->mode,
 		.extra1 = &ip_ping_group_range_min,
 		.extra2 = &ip_ping_group_range_max,
 	};
 
-	inet_get_ping_group_range_table(table, range, range + 1);
+	inet_get_ping_group_range_table(table, &low, &high);
+	urange[0] = from_kgid_munged(user_ns, low);
+	urange[1] = from_kgid_munged(user_ns, high);
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
-	if (write && ret == 0)
-		set_ping_group_range(table, range);
+	if (write && ret == 0) {
+		low = make_kgid(user_ns, urange[0]);
+		high = make_kgid(user_ns, urange[1]);
+		if (!gid_valid(low) || !gid_valid(high) ||
+		    (urange[1] < urange[0]) || gid_lt(high, low)) {
+			low = make_kgid(&init_user_ns, 1);
+			high = make_kgid(&init_user_ns, 0);
+		}
+		set_ping_group_range(table, low, high);
+	}
 
 	return ret;
 }
@@ -220,6 +232,45 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
 	return 0;
 }
 
+int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
+			  size_t *lenp, loff_t *ppos)
+{
+	ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
+	struct tcp_fastopen_context *ctxt;
+	int ret;
+	u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
+
+	tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
+	if (!tbl.data)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	ctxt = rcu_dereference(tcp_fastopen_ctx);
+	if (ctxt)
+		memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
+	rcu_read_unlock();
+
+	snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
+		 user_key[0], user_key[1], user_key[2], user_key[3]);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+
+	if (write && ret == 0) {
+		if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
+			   user_key + 2, user_key + 3) != 4) {
+			ret = -EINVAL;
+			goto bad_key;
+		}
+		tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
+	}
+
+bad_key:
+	pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
+		 user_key[0], user_key[1], user_key[2], user_key[3],
+		 (char *)tbl.data, ret);
+	kfree(tbl.data);
+	return ret;
+}
+
 static struct ctl_table ipv4_table[] = {
 	{
 		.procname = "tcp_timestamps",
@@ -374,6 +425,12 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler = proc_dointvec,
 	},
 	{
+		.procname = "tcp_fastopen_key",
+		.mode = 0600,
+		.maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+		.proc_handler = proc_tcp_fastopen_key,
+	},
+	{
 		.procname = "tcp_tw_recycle",
 		.data = &tcp_death_row.sysctl_tw_recycle,
 		.maxlen = sizeof(int),
@@ -786,7 +843,7 @@ static struct ctl_table ipv4_net_table[] = {
 	{
 		.procname = "ping_group_range",
 		.data = &init_net.ipv4.sysctl_ping_group_range,
-		.maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range),
+		.maxlen = sizeof(gid_t)*2,
 		.mode = 0644,
 		.proc_handler = ipv4_ping_group_range,
 	},
@@ -830,8 +887,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 	 * Sane defaults - nobody may create ping sockets.
 	 * Boot scripts should set this to distro-specific group.
 	 */
-	net->ipv4.sysctl_ping_group_range[0] = 1;
-	net->ipv4.sysctl_ping_group_range[1] = 0;
+	net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1);
+	net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0);
 
 	tcp_init_mem(net);
 
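proc_tcp_fastopen_key() above exchanges the key with userspace as four dash-separated 32-bit hex words, registered under the ipv4 sysctl table. A sketch of reading and replacing it from userspace, assuming a kernel with this patch (the proc path follows from the "tcp_fastopen_key" ctl_table entry):

#include <stdio.h>
#include <stdint.h>

static const char *tfo_key_path = "/proc/sys/net/ipv4/tcp_fastopen_key";

int read_tfo_key(uint32_t key[4])
{
	FILE *f = fopen(tfo_key_path, "r");
	int n;

	if (!f)
		return -1;
	/* Matches the "%08x-%08x-%08x-%08x" format the handler writes. */
	n = fscanf(f, "%x-%x-%x-%x", &key[0], &key[1], &key[2], &key[3]);
	fclose(f);
	return n == 4 ? 0 : -1;
}

int write_tfo_key(const uint32_t key[4])
{
	FILE *f = fopen(tfo_key_path, "w");

	if (!f)
		return -1;
	fprintf(f, "%08x-%08x-%08x-%08x", key[0], key[1], key[2], key[3]);
	return fclose(f);
}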
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2109ff4a1daf..df83d744e380 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
 
-	/* Connected? */
-	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+	/* Connected or passive Fast Open socket? */
+	if (sk->sk_state != TCP_SYN_SENT &&
+	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
 		int target = sock_rcvlowat(sk, 0, INT_MAX);
 
 		if (tp->urg_seq == tp->copied_seq &&
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset
 	ssize_t copied;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-	/* Wait for a connection to finish. */
-	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto out_err;
+	}
 
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-	/* Wait for a connection to finish. */
-	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto do_error;
+	}
 
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -2144,6 +2155,10 @@ void tcp_close(struct sock *sk, long timeout)
 		 * they look as CLOSING or LAST_ACK for Linux)
 		 * Probably, I missed some more holelets.
 		 * 						--ANK
+		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+		 * in a single packet! (May consider it later but will
+		 * probably need API support or TCP_CORK SYN-ACK until
+		 * data is written and socket is closed.)
 		 */
 		tcp_send_fin(sk);
 	}
@@ -2215,8 +2230,16 @@ adjudge_to_death:
 		}
 	}
 
-	if (sk->sk_state == TCP_CLOSE)
+	if (sk->sk_state == TCP_CLOSE) {
+		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+		/* We could get here with a non-NULL req if the socket is
+		 * aborted (e.g., closed with unread data) before 3WHS
+		 * finishes.
+		 */
+		if (req != NULL)
+			reqsk_fastopen_remove(sk, req, false);
 		inet_csk_destroy_sock(sk);
+	}
 	/* Otherwise, socket is reprieved until protocol close. */
 
 out:
@@ -2688,6 +2711,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			icsk->icsk_user_timeout = msecs_to_jiffies(val);
 		break;
+
+	case TCP_FASTOPEN:
+		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+		    TCPF_LISTEN)))
+			err = fastopen_init_queue(sk, val);
+		else
+			err = -EINVAL;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -3501,11 +3532,15 @@ EXPORT_SYMBOL(tcp_cookie_generator);
 
 void tcp_done(struct sock *sk)
 {
+	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+
 	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 
 	tcp_set_state(sk, TCP_CLOSE);
 	tcp_clear_xmit_timers(sk);
+	if (req != NULL)
+		reqsk_fastopen_remove(sk, req, false);
 
 	sk->sk_shutdown = SHUTDOWN_MASK;
 
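The tcp_sendmsg()/do_tcp_sendpages() relaxation above is what lets a passive Fast Open child carry data while still in TCP_SYN_RECV. On the active side, which was merged before this series, the client sends data in the SYN with sendto() and MSG_FASTOPEN; a hedged sketch (the fallback #define mirrors the flag value from the client-side series):

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN	0x20000000	/* from the client-side TFO series */
#endif

/* Sends 'len' bytes in the SYN when a valid cookie is cached, or falls
 * back to a regular handshake followed by the data otherwise. */
ssize_t tfo_connect_send(int fd, const struct sockaddr_in *dst,
			 const void *buf, size_t len)
{
	return sendto(fd, buf, len, MSG_FASTOPEN,
		      (const struct sockaddr *)dst, sizeof(*dst));
}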
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index a7f729c409d7..8f7ef0ad80e5 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,10 +1,91 @@
+#include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/tcp.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <net/inetpeer.h>
+#include <net/tcp.h>
 
-int sysctl_tcp_fastopen;
+int sysctl_tcp_fastopen __read_mostly;
+
+struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
+
+static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
+
+static void tcp_fastopen_ctx_free(struct rcu_head *head)
+{
+	struct tcp_fastopen_context *ctx =
+	    container_of(head, struct tcp_fastopen_context, rcu);
+	crypto_free_cipher(ctx->tfm);
+	kfree(ctx);
+}
+
+int tcp_fastopen_reset_cipher(void *key, unsigned int len)
+{
+	int err;
+	struct tcp_fastopen_context *ctx, *octx;
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
+
+	if (IS_ERR(ctx->tfm)) {
+		err = PTR_ERR(ctx->tfm);
+error:		kfree(ctx);
+		pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
+		return err;
+	}
+	err = crypto_cipher_setkey(ctx->tfm, key, len);
+	if (err) {
+		pr_err("TCP: TFO cipher key error: %d\n", err);
+		crypto_free_cipher(ctx->tfm);
+		goto error;
+	}
+	memcpy(ctx->key, key, len);
+
+	spin_lock(&tcp_fastopen_ctx_lock);
+
+	octx = rcu_dereference_protected(tcp_fastopen_ctx,
+			lockdep_is_held(&tcp_fastopen_ctx_lock));
+	rcu_assign_pointer(tcp_fastopen_ctx, ctx);
+	spin_unlock(&tcp_fastopen_ctx_lock);
+
+	if (octx)
+		call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+	return err;
+}
+
+/* Computes the fastopen cookie for the peer.
+ * The peer address is a 128 bits long (pad with zeros for IPv4).
+ *
+ * The caller must check foc->len to determine if a valid cookie
+ * has been generated successfully.
+*/
+void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc)
+{
+	__be32 peer_addr[4] = { addr, 0, 0, 0 };
+	struct tcp_fastopen_context *ctx;
+
+	rcu_read_lock();
+	ctx = rcu_dereference(tcp_fastopen_ctx);
+	if (ctx) {
+		crypto_cipher_encrypt_one(ctx->tfm,
+					  foc->val,
+					  (__u8 *)peer_addr);
+		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+	}
+	rcu_read_unlock();
+}
 
 static int __init tcp_fastopen_init(void)
 {
+	__u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+	get_random_bytes(key, sizeof(key));
+	tcp_fastopen_reset_cipher(key, sizeof(key));
 	return 0;
 }
 
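tcp_fastopen_cookie_gen() above is one AES block encryption of the zero-padded peer address under the server key. A userspace sketch of the same derivation, with OpenSSL standing in for the kernel crypto API; note the kernel advertises only foc->len = TCP_FASTOPEN_COOKIE_SIZE bytes of the block as the cookie:

#include <openssl/aes.h>
#include <stdint.h>
#include <string.h>

/* Mirrors tcp_fastopen_cookie_gen(): one AES-128 block over the peer
 * address padded to 128 bits (IPv4 case). */
void tfo_cookie_block(const uint8_t key[16], uint32_t peer_addr_be,
		      uint8_t out[16])
{
	uint8_t block[16] = { 0 };
	AES_KEY aes;

	memcpy(block, &peer_addr_be, sizeof(peer_addr_be));
	AES_set_encrypt_key(key, 128, &aes);
	AES_encrypt(block, out, &aes);
}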
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bcfccc5cb8d0..8c304a400798 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -378,7 +378,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 /* 4. Try to fixup all. It is made immediately after connection enters
  *    established state.
  */
-static void tcp_init_buffer_space(struct sock *sk)
+void tcp_init_buffer_space(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int maxwin;
@@ -2930,13 +2930,14 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 * tcp_xmit_retransmit_queue().
 */
 static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
-				  int newly_acked_sacked, bool is_dupack,
+				  int prior_sacked, bool is_dupack,
 				  int flag)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
 				    (tcp_fackets_out(tp) > tp->reordering));
+	int newly_acked_sacked = 0;
 	int fast_rexmit = 0;
 
 	if (WARN_ON(!tp->packets_out && tp->sacked_out))
@@ -2996,6 +2997,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 				tcp_add_reno_sack(sk);
 		} else
 			do_lost = tcp_try_undo_partial(sk, pkts_acked);
+		newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
 		break;
 	case TCP_CA_Loss:
 		if (flag & FLAG_DATA_ACKED)
@@ -3017,6 +3019,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 			if (is_dupack)
 				tcp_add_reno_sack(sk);
 		}
+		newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
 
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk);
@@ -3124,6 +3127,12 @@ void tcp_rearm_rto(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	/* If the retrans timer is currently being used by Fast Open
+	 * for SYN-ACK retrans purpose, stay put.
+	 */
+	if (tp->fastopen_rsk)
+		return;
+
 	if (!tp->packets_out) {
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
@@ -3594,7 +3603,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int prior_packets;
 	int prior_sacked = tp->sacked_out;
 	int pkts_acked = 0;
-	int newly_acked_sacked = 0;
 	bool frto_cwnd = false;
 
 	/* If the ack is older than previous acks
@@ -3670,8 +3678,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
 
 	pkts_acked = prior_packets - tp->packets_out;
-	newly_acked_sacked = (prior_packets - prior_sacked) -
-			     (tp->packets_out - tp->sacked_out);
 
 	if (tp->frto_counter)
 		frto_cwnd = tcp_process_frto(sk, flag);
@@ -3685,7 +3691,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	    tcp_may_raise_cwnd(sk, flag))
 		tcp_cong_avoid(sk, ack, prior_in_flight);
 	is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
-	tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
+	tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
 			      is_dupack, flag);
 	} else {
 		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
@@ -3702,7 +3708,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 no_queue:
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
 	if (flag & FLAG_DSACKING_ACK)
-		tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
+		tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
 				      is_dupack, flag);
 	/* If this ack opens up a zero window, clear backoff. It was
 	 * being used to time the probes, and is probably far higher than
@@ -3722,8 +3728,7 @@ old_ack:
 	 */
 	if (TCP_SKB_CB(skb)->sacked) {
 		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
-		newly_acked_sacked = tp->sacked_out - prior_sacked;
-		tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
+		tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
 				      is_dupack, flag);
 	}
 
@@ -4039,7 +4044,7 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
 }
 
 /* When we get a reset we do this. */
-static void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk)
 {
 	/* We want the right error as BSD sees it (and indeed as we do). */
 	switch (sk->sk_state) {
@@ -5896,7 +5901,9 @@ discard:
 		tcp_send_synack(sk);
 #if 0
 		/* Note, we could accept data and URG from this segment.
-		 * There are no obstacles to make this.
+		 * There are no obstacles to make this (except that we must
+		 * either change tcp_recvmsg() to prevent it from returning data
+		 * before 3WHS completes per RFC793, or employ TCP Fast Open).
 		 *
 		 * However, if we ignore data in ACKless segments sometimes,
 		 * we have no reasons to accept it sometimes.
@@ -5936,6 +5943,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock *req;
 	int queued = 0;
 
 	tp->rx_opt.saw_tstamp = 0;
@@ -5991,7 +5999,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		return 0;
 	}
 
-	if (!tcp_validate_incoming(sk, skb, th, 0))
+	req = tp->fastopen_rsk;
+	if (req != NULL) {
+		BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+		       sk->sk_state != TCP_FIN_WAIT1);
+
+		if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
+			goto discard;
+	} else if (!tcp_validate_incoming(sk, skb, th, 0))
 		return 0;
 
 	/* step 5: check the ACK field */
@@ -6001,7 +6016,22 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	switch (sk->sk_state) {
 	case TCP_SYN_RECV:
 		if (acceptable) {
-			tp->copied_seq = tp->rcv_nxt;
+			/* Once we leave TCP_SYN_RECV, we no longer
+			 * need req so release it.
+			 */
+			if (req) {
+				reqsk_fastopen_remove(sk, req, false);
+			} else {
+				/* Make sure socket is routed, for
+				 * correct metrics.
+				 */
+				icsk->icsk_af_ops->rebuild_header(sk);
+				tcp_init_congestion_control(sk);
+
+				tcp_mtup_init(sk);
+				tcp_init_buffer_space(sk);
+				tp->copied_seq = tp->rcv_nxt;
+			}
 			smp_mb();
 			tcp_set_state(sk, TCP_ESTABLISHED);
 			sk->sk_state_change(sk);
@@ -6023,23 +6053,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			if (tp->rx_opt.tstamp_ok)
 				tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
 
-			/* Make sure socket is routed, for
-			 * correct metrics.
-			 */
-			icsk->icsk_af_ops->rebuild_header(sk);
-
-			tcp_init_metrics(sk);
-
-			tcp_init_congestion_control(sk);
+			if (req) {
+				/* Re-arm the timer because data may
+				 * have been sent out. This is similar
+				 * to the regular data transmission case
+				 * when new data has just been ack'ed.
+				 *
+				 * (TFO) - we could try to be more
+				 * aggressive and retranmitting any data
+				 * sooner based on when they were sent
+				 * out.
+				 */
+				tcp_rearm_rto(sk);
+			} else
+				tcp_init_metrics(sk);
 
 			/* Prevent spurious tcp_cwnd_restart() on
 			 * first data packet.
 			 */
 			tp->lsndtime = tcp_time_stamp;
 
-			tcp_mtup_init(sk);
 			tcp_initialize_rcv_mss(sk);
-			tcp_init_buffer_space(sk);
 			tcp_fast_path_on(tp);
 		} else {
 			return 1;
@@ -6047,6 +6081,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		break;
 
 	case TCP_FIN_WAIT1:
+		/* If we enter the TCP_FIN_WAIT1 state and we are a
+		 * Fast Open socket and this is the first acceptable
+		 * ACK we have received, this would have acknowledged
+		 * our SYNACK so stop the SYNACK timer.
+		 */
+		if (acceptable && req != NULL) {
+			/* We no longer need the request sock. */
+			reqsk_fastopen_remove(sk, req, false);
+			tcp_rearm_rto(sk);
+		}
 		if (tp->snd_una == tp->write_seq) {
 			struct dst_entry *dst;
 
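Aside from the Fast Open plumbing, the tcp_input.c hunks move the newly-acked-or-SACKed accounting from tcp_ack() into tcp_fastretrans_alert(), computing it only once tp->sacked_out is final for this ACK. Isolated, the quantity is just a delta against pre-ACK snapshots (helper name is illustrative, not from the patch):

/* Packets newly cumulatively ACKed plus the change in SACKed packets,
 * computed from snapshots taken before this ACK was processed. */
static inline int newly_acked_sacked(int pkts_acked,
				     int sacked_out_now, int prior_sacked)
{
	return pkts_acked + sacked_out_now - prior_sacked;
}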
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1e15c5be04e7..e64abed249cc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -352,6 +352,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
352 const int code = icmp_hdr(icmp_skb)->code; 352 const int code = icmp_hdr(icmp_skb)->code;
353 struct sock *sk; 353 struct sock *sk;
354 struct sk_buff *skb; 354 struct sk_buff *skb;
355 struct request_sock *req;
355 __u32 seq; 356 __u32 seq;
356 __u32 remaining; 357 __u32 remaining;
357 int err; 358 int err;
@@ -394,9 +395,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
394 395
395 icsk = inet_csk(sk); 396 icsk = inet_csk(sk);
396 tp = tcp_sk(sk); 397 tp = tcp_sk(sk);
398 req = tp->fastopen_rsk;
397 seq = ntohl(th->seq); 399 seq = ntohl(th->seq);
398 if (sk->sk_state != TCP_LISTEN && 400 if (sk->sk_state != TCP_LISTEN &&
399 !between(seq, tp->snd_una, tp->snd_nxt)) { 401 !between(seq, tp->snd_una, tp->snd_nxt) &&
402 (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
403 /* For a Fast Open socket, allow seq to be snt_isn. */
400 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); 404 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
401 goto out; 405 goto out;
402 } 406 }
@@ -435,6 +439,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
435 !icsk->icsk_backoff) 439 !icsk->icsk_backoff)
436 break; 440 break;
437 441
442 /* XXX (TFO) - revisit the following logic for TFO */
443
438 if (sock_owned_by_user(sk)) 444 if (sock_owned_by_user(sk))
439 break; 445 break;
440 446
@@ -466,6 +472,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
466 goto out; 472 goto out;
467 } 473 }
468 474
475 /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
476 * than following the TCP_SYN_RECV case and closing the socket,
477 * we ignore the ICMP error and keep trying like a fully established
478 * socket. Is this the right thing to do?
479 */
480 if (req && req->sk == NULL)
481 goto out;
482
469 switch (sk->sk_state) { 483 switch (sk->sk_state) {
470 struct request_sock *req, **prev; 484 struct request_sock *req, **prev;
471 case TCP_LISTEN: 485 case TCP_LISTEN:
@@ -498,7 +512,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
498 512
499 case TCP_SYN_SENT: 513 case TCP_SYN_SENT:
500 case TCP_SYN_RECV: /* Cannot happen. 514 case TCP_SYN_RECV: /* Cannot happen.
501 It can f.e. if SYNs crossed. 515 It can f.e. if SYNs crossed,
516 or Fast Open.
502 */ 517 */
503 if (!sock_owned_by_user(sk)) { 518 if (!sock_owned_by_user(sk)) {
504 sk->sk_err = err; 519 sk->sk_err = err;
@@ -809,8 +824,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
809static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, 824static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
810 struct request_sock *req) 825 struct request_sock *req)
811{ 826{
812 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, 827 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
813 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, 828 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
829 */
830 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
831 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
832 tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
814 req->ts_recent, 833 req->ts_recent,
815 0, 834 0,
816 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, 835 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -839,7 +858,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
839 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 858 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
840 return -1; 859 return -1;
841 860
842 skb = tcp_make_synack(sk, dst, req, rvp); 861 skb = tcp_make_synack(sk, dst, req, rvp, NULL);
843 862
844 if (skb) { 863 if (skb) {
845 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); 864 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -1272,6 +1291,178 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1272}; 1291};
1273#endif 1292#endif
1274 1293
1294static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1295 struct request_sock *req,
1296 struct tcp_fastopen_cookie *foc,
1297 struct tcp_fastopen_cookie *valid_foc)
1298{
1299 bool skip_cookie = false;
1300 struct fastopen_queue *fastopenq;
1301
1302 if (likely(!fastopen_cookie_present(foc))) {
1303 /* See include/net/tcp.h for the meaning of these knobs */
1304 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1305 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1306 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1307 skip_cookie = true; /* no cookie to validate */
1308 else
1309 return false;
1310 }
1311 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1312 /* A FO option is present; bump the counter. */
1313 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1314
1315 /* Make sure the listener has enabled fastopen, and we don't
1316 * exceed the max # of pending TFO requests allowed before trying
1317 * to validating the cookie in order to avoid burning CPU cycles
1318 * unnecessarily.
1319 *
1320 * XXX (TFO) - The implication of checking the max_qlen before
1321 * processing a cookie request is that clients can't differentiate
1322 * between qlen overflow causing Fast Open to be disabled
1323 * temporarily vs a server not supporting Fast Open at all.
1324 */
1325 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1326 fastopenq == NULL || fastopenq->max_qlen == 0)
1327 return false;
1328
1329 if (fastopenq->qlen >= fastopenq->max_qlen) {
1330 struct request_sock *req1;
1331 spin_lock(&fastopenq->lock);
1332 req1 = fastopenq->rskq_rst_head;
1333 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1334 spin_unlock(&fastopenq->lock);
1335 NET_INC_STATS_BH(sock_net(sk),
1336 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1337 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1338 foc->len = -1;
1339 return false;
1340 }
1341 fastopenq->rskq_rst_head = req1->dl_next;
1342 fastopenq->qlen--;
1343 spin_unlock(&fastopenq->lock);
1344 reqsk_free(req1);
1345 }
1346 if (skip_cookie) {
1347 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1348 return true;
1349 }
1350 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1351 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1352 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1353 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1354 memcmp(&foc->val[0], &valid_foc->val[0],
1355 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1356 return false;
1357 valid_foc->len = -1;
1358 }
1359 /* Acknowledge the data received from the peer. */
1360 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1361 return true;
1362 } else if (foc->len == 0) { /* Client requesting a cookie */
1363 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1364 NET_INC_STATS_BH(sock_net(sk),
1365 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1366 } else {
1367		/* Client sent a cookie with the wrong size. Treat it
1368		 * the same as an invalid cookie and return a valid one.
1369 */
1370 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1371 }
1372 return false;
1373}
1374
1375static int tcp_v4_conn_req_fastopen(struct sock *sk,
1376 struct sk_buff *skb,
1377 struct sk_buff *skb_synack,
1378 struct request_sock *req,
1379 struct request_values *rvp)
1380{
1381 struct tcp_sock *tp = tcp_sk(sk);
1382 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1383 const struct inet_request_sock *ireq = inet_rsk(req);
1384 struct sock *child;
1385
1386 req->retrans = 0;
1387 req->sk = NULL;
1388
1389 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1390 if (child == NULL) {
1391 NET_INC_STATS_BH(sock_net(sk),
1392 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1393 kfree_skb(skb_synack);
1394 return -1;
1395 }
1396 ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1397 ireq->rmt_addr, ireq->opt);
1398 /* XXX (TFO) - is it ok to ignore error and continue? */
1399
1400 spin_lock(&queue->fastopenq->lock);
1401 queue->fastopenq->qlen++;
1402 spin_unlock(&queue->fastopenq->lock);
1403
1404	/* Initialize the child socket. Some values have to be fixed up
1405	 * to account for the child being a Fast Open socket created
1406	 * only from the bits carried in the SYN packet.
1407 */
1408 tp = tcp_sk(child);
1409
1410 tp->fastopen_rsk = req;
1411	/* Do a hold on the listener sk so that if the listener is being
1412 * closed, the child that has been accepted can live on and still
1413 * access listen_lock.
1414 */
1415 sock_hold(sk);
1416 tcp_rsk(req)->listener = sk;
1417
1418 /* RFC1323: The window in SYN & SYN/ACK segments is never
1419 * scaled. So correct it appropriately.
1420 */
1421 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1422
1423	/* Activate the retrans timer so that the SYNACK can be retransmitted.
1424 * The request socket is not added to the SYN table of the parent
1425 * because it's been added to the accept queue directly.
1426 */
1427 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1428 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1429
1430 /* Add the child socket directly into the accept queue */
1431 inet_csk_reqsk_queue_add(sk, req, child);
1432
1433 /* Now finish processing the fastopen child socket. */
1434 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1435 tcp_init_congestion_control(child);
1436 tcp_mtup_init(child);
1437 tcp_init_buffer_space(child);
1438 tcp_init_metrics(child);
1439
1440 /* Queue the data carried in the SYN packet. We need to first
1441 * bump skb's refcnt because the caller will attempt to free it.
1442 *
1443 * XXX (TFO) - we honor a zero-payload TFO request for now.
1444 * (Any reason not to?)
1445 */
1446 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1447		/* Don't queue the skb if there is no payload in the SYN.
1448 * XXX (TFO) - How about SYN+FIN?
1449 */
1450 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1451 } else {
1452 skb = skb_get(skb);
1453 skb_dst_drop(skb);
1454 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1455 skb_set_owner_r(skb, child);
1456 __skb_queue_tail(&child->sk_receive_queue, skb);
1457 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1458 }
1459 sk->sk_data_ready(sk, 0);
1460 bh_unlock_sock(child);
1461 sock_put(child);
1462 WARN_ON(req->sk == NULL);
1463 return 0;
1464}
1465
1275int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1466int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1276{ 1467{
1277 struct tcp_extend_values tmp_ext; 1468 struct tcp_extend_values tmp_ext;
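
For context: tcp_fastopen_check() above bails out unless the server bit is set in sysctl_tcp_fastopen and the listener carries a fastopenq with a non-zero max_qlen. A minimal userspace sketch of how a listener ends up in that state, assuming a kernel with this series applied and a libc that exposes the TCP_FASTOPEN socket option (whose value becomes max_qlen); port 8000 and qlen 16 are arbitrary:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int qlen = 16;	/* becomes fastopenq->max_qlen for this listener */
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8000);

	if (fd < 0 ||
	    bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0 ||
	    listen(fd, 128) < 0) {
		perror("tfo listener");
		return 1;
	}
	/* accept() now also hands out children created from data-bearing SYNs */
	close(accept(fd, NULL, NULL));
	close(fd);
	return 0;
}

net.ipv4.tcp_fastopen must additionally have TFO_SERVER_ENABLE set for the check above to pass.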
@@ -1285,6 +1476,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1285 __be32 daddr = ip_hdr(skb)->daddr; 1476 __be32 daddr = ip_hdr(skb)->daddr;
1286 __u32 isn = TCP_SKB_CB(skb)->when; 1477 __u32 isn = TCP_SKB_CB(skb)->when;
1287 bool want_cookie = false; 1478 bool want_cookie = false;
1479 struct flowi4 fl4;
1480 struct tcp_fastopen_cookie foc = { .len = -1 };
1481 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1482 struct sk_buff *skb_synack;
1483 int do_fastopen;
1288 1484
1289 /* Never answer to SYNs send to broadcast or multicast */ 1485 /* Never answer to SYNs send to broadcast or multicast */
1290 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1486 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1319,7 +1515,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1319 tcp_clear_options(&tmp_opt); 1515 tcp_clear_options(&tmp_opt);
1320 tmp_opt.mss_clamp = TCP_MSS_DEFAULT; 1516 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1321 tmp_opt.user_mss = tp->rx_opt.user_mss; 1517 tmp_opt.user_mss = tp->rx_opt.user_mss;
1322 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 1518 tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1519 want_cookie ? NULL : &foc);
1323 1520
1324 if (tmp_opt.cookie_plus > 0 && 1521 if (tmp_opt.cookie_plus > 0 &&
1325 tmp_opt.saw_tstamp && 1522 tmp_opt.saw_tstamp &&
@@ -1377,8 +1574,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1377 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1574 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1378 req->cookie_ts = tmp_opt.tstamp_ok; 1575 req->cookie_ts = tmp_opt.tstamp_ok;
1379 } else if (!isn) { 1576 } else if (!isn) {
1380 struct flowi4 fl4;
1381
1382 /* VJ's idea. We save last timestamp seen 1577 /* VJ's idea. We save last timestamp seen
1383 * from the destination in peer table, when entering 1578 * from the destination in peer table, when entering
1384 * state TIME-WAIT, and check against it before 1579 * state TIME-WAIT, and check against it before
@@ -1419,14 +1614,52 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1419 tcp_rsk(req)->snt_isn = isn; 1614 tcp_rsk(req)->snt_isn = isn;
1420 tcp_rsk(req)->snt_synack = tcp_time_stamp; 1615 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1421 1616
1422 if (tcp_v4_send_synack(sk, dst, req, 1617 if (dst == NULL) {
1423 (struct request_values *)&tmp_ext, 1618 dst = inet_csk_route_req(sk, &fl4, req);
1424 skb_get_queue_mapping(skb), 1619 if (dst == NULL)
1425 want_cookie) || 1620 goto drop_and_free;
1426 want_cookie) 1621 }
1622 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1623
1624 /* We don't call tcp_v4_send_synack() directly because we need
1625 * to make sure a child socket can be created successfully before
1626	 * sending back the synack!
1627 *
1628 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1629 * (or better yet, call tcp_send_synack() in the child context
1630	 * directly, but that requires fixing a bunch of other code first)
1631	 * after syn_recv_sock(), except one will need to first fix the
1632 * latter to remove its dependency on the current implementation
1633 * of tcp_v4_send_synack()->tcp_select_initial_window().
1634 */
1635 skb_synack = tcp_make_synack(sk, dst, req,
1636 (struct request_values *)&tmp_ext,
1637 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1638
1639 if (skb_synack) {
1640 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1641 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1642 } else
1643 goto drop_and_free;
1644
1645 if (likely(!do_fastopen)) {
1646 int err;
1647 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1648 ireq->rmt_addr, ireq->opt);
1649 err = net_xmit_eval(err);
1650 if (err || want_cookie)
1651 goto drop_and_free;
1652
1653 tcp_rsk(req)->listener = NULL;
1654 /* Add the request_sock to the SYN table */
1655 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1656 if (fastopen_cookie_present(&foc) && foc.len != 0)
1657 NET_INC_STATS_BH(sock_net(sk),
1658 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1659 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1660 (struct request_values *)&tmp_ext))
1427 goto drop_and_free; 1661 goto drop_and_free;
1428 1662
1429 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1430 return 0; 1663 return 0;
1431 1664
1432drop_and_release: 1665drop_and_release:
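
The request path above distinguishes three client behaviors: no cookie (optionally bypassed via the sysctl knobs), a valid TCP_FASTOPEN_COOKIE_SIZE cookie, and a cookie request (foc.len == 0). A hypothetical client-side sketch of the other end of that exchange, assuming a kernel new enough to accept MSG_FASTOPEN (0x20000000): the first call only requests and caches a cookie, later calls against the same destination may carry the payload in the SYN.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000
#endif

int main(void)
{
	const char req[] = "GET / HTTP/1.0\r\n\r\n";
	struct sockaddr_in srv;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	memset(&srv, 0, sizeof(srv));
	srv.sin_family = AF_INET;
	srv.sin_port = htons(8000);
	inet_pton(AF_INET, "127.0.0.1", &srv.sin_addr);

	/* connect() + write() folded into one call; data may ride the SYN */
	if (sendto(fd, req, sizeof(req) - 1, MSG_FASTOPEN,
		   (struct sockaddr *)&srv, sizeof(srv)) < 0) {
		perror("sendto(MSG_FASTOPEN)");
		return 1;
	}
	close(fd);
	return 0;
}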
@@ -1554,7 +1787,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1554 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, 1787 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1555 iph->saddr, iph->daddr); 1788 iph->saddr, iph->daddr);
1556 if (req) 1789 if (req)
1557 return tcp_check_req(sk, skb, req, prev); 1790 return tcp_check_req(sk, skb, req, prev, false);
1558 1791
1559 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, 1792 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1560 th->source, iph->daddr, th->dest, inet_iif(skb)); 1793 th->source, iph->daddr, th->dest, inet_iif(skb));
@@ -1977,6 +2210,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
1977 tcp_cookie_values_release); 2210 tcp_cookie_values_release);
1978 tp->cookie_values = NULL; 2211 tp->cookie_values = NULL;
1979 } 2212 }
2213 BUG_ON(tp->fastopen_rsk != NULL);
1980 2214
1981 /* If socket is aborted during connect operation */ 2215 /* If socket is aborted during connect operation */
1982 tcp_free_fastopen_req(tp); 2216 tcp_free_fastopen_req(tp);
@@ -2393,7 +2627,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2393EXPORT_SYMBOL(tcp_proc_unregister); 2627EXPORT_SYMBOL(tcp_proc_unregister);
2394 2628
2395static void get_openreq4(const struct sock *sk, const struct request_sock *req, 2629static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2396 struct seq_file *f, int i, int uid, int *len) 2630 struct seq_file *f, int i, kuid_t uid, int *len)
2397{ 2631{
2398 const struct inet_request_sock *ireq = inet_rsk(req); 2632 const struct inet_request_sock *ireq = inet_rsk(req);
2399 long delta = req->expires - jiffies; 2633 long delta = req->expires - jiffies;
@@ -2410,7 +2644,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2410 1, /* timers active (only the expire timer) */ 2644 1, /* timers active (only the expire timer) */
2411 jiffies_delta_to_clock_t(delta), 2645 jiffies_delta_to_clock_t(delta),
2412 req->retrans, 2646 req->retrans,
2413 uid, 2647 from_kuid_munged(seq_user_ns(f), uid),
2414 0, /* non standard timer */ 2648 0, /* non standard timer */
2415 0, /* open_requests have no inode */ 2649 0, /* open_requests have no inode */
2416 atomic_read(&sk->sk_refcnt), 2650 atomic_read(&sk->sk_refcnt),
@@ -2425,6 +2659,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2425 const struct tcp_sock *tp = tcp_sk(sk); 2659 const struct tcp_sock *tp = tcp_sk(sk);
2426 const struct inet_connection_sock *icsk = inet_csk(sk); 2660 const struct inet_connection_sock *icsk = inet_csk(sk);
2427 const struct inet_sock *inet = inet_sk(sk); 2661 const struct inet_sock *inet = inet_sk(sk);
2662 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2428 __be32 dest = inet->inet_daddr; 2663 __be32 dest = inet->inet_daddr;
2429 __be32 src = inet->inet_rcv_saddr; 2664 __be32 src = inet->inet_rcv_saddr;
2430 __u16 destp = ntohs(inet->inet_dport); 2665 __u16 destp = ntohs(inet->inet_dport);
@@ -2461,7 +2696,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2461 timer_active, 2696 timer_active,
2462 jiffies_delta_to_clock_t(timer_expires - jiffies), 2697 jiffies_delta_to_clock_t(timer_expires - jiffies),
2463 icsk->icsk_retransmits, 2698 icsk->icsk_retransmits,
2464 sock_i_uid(sk), 2699 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2465 icsk->icsk_probes_out, 2700 icsk->icsk_probes_out,
2466 sock_i_ino(sk), 2701 sock_i_ino(sk),
2467 atomic_read(&sk->sk_refcnt), sk, 2702 atomic_read(&sk->sk_refcnt), sk,
@@ -2469,7 +2704,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2469 jiffies_to_clock_t(icsk->icsk_ack.ato), 2704 jiffies_to_clock_t(icsk->icsk_ack.ato),
2470 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2705 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2471 tp->snd_cwnd, 2706 tp->snd_cwnd,
2472 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, 2707 sk->sk_state == TCP_LISTEN ?
2708 (fastopenq ? fastopenq->max_qlen : 0) :
2709 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2473 len); 2710 len);
2474} 2711}
2475 2712
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6ff7f10dce9d..e965319d610b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
507 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 507 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
508 newtp->rx_opt.mss_clamp = req->mss; 508 newtp->rx_opt.mss_clamp = req->mss;
509 TCP_ECN_openreq_child(newtp, req); 509 TCP_ECN_openreq_child(newtp, req);
510 newtp->fastopen_rsk = NULL;
510 511
511 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); 512 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
512 } 513 }
@@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
515EXPORT_SYMBOL(tcp_create_openreq_child); 516EXPORT_SYMBOL(tcp_create_openreq_child);
516 517
517/* 518/*
518 * Process an incoming packet for SYN_RECV sockets represented 519 * Process an incoming packet for SYN_RECV sockets represented as a
519 * as a request_sock. 520 * request_sock. Normally sk is the listener socket but for TFO it
521 * points to the child socket.
522 *
523 * XXX (TFO) - The current impl contains a special check for ack
524 * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
520 */ 525 */
521 526
522struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 527struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
523 struct request_sock *req, 528 struct request_sock *req,
524 struct request_sock **prev) 529 struct request_sock **prev,
530 bool fastopen)
525{ 531{
526 struct tcp_options_received tmp_opt; 532 struct tcp_options_received tmp_opt;
527 const u8 *hash_location; 533 const u8 *hash_location;
@@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
530 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 536 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
531 bool paws_reject = false; 537 bool paws_reject = false;
532 538
539 BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
540
533 tmp_opt.saw_tstamp = 0; 541 tmp_opt.saw_tstamp = 0;
534 if (th->doff > (sizeof(struct tcphdr)>>2)) { 542 if (th->doff > (sizeof(struct tcphdr)>>2)) {
535 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 543 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
@@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
565 * 573 *
566 * Enforce "SYN-ACK" according to figure 8, figure 6 574 * Enforce "SYN-ACK" according to figure 8, figure 6
567 * of RFC793, fixed by RFC1122. 575 * of RFC793, fixed by RFC1122.
576 *
577 * Note that even if there is new data in the SYN packet
578	 * it will be thrown away too.
568 */ 579 */
569 req->rsk_ops->rtx_syn_ack(sk, req, NULL); 580 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
570 return NULL; 581 return NULL;
@@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
622 * sent (the segment carries an unacceptable ACK) ... 633 * sent (the segment carries an unacceptable ACK) ...
623 * a reset is sent." 634 * a reset is sent."
624 * 635 *
625 * Invalid ACK: reset will be sent by listening socket 636 * Invalid ACK: reset will be sent by listening socket.
637 * Note that the ACK validity check for a Fast Open socket is done
638 * elsewhere and is checked directly against the child socket rather
639 * than req because user data may have been sent out.
626 */ 640 */
627 if ((flg & TCP_FLAG_ACK) && 641 if ((flg & TCP_FLAG_ACK) && !fastopen &&
628 (TCP_SKB_CB(skb)->ack_seq != 642 (TCP_SKB_CB(skb)->ack_seq !=
629 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) 643 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
630 return sk; 644 return sk;
@@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
637 /* RFC793: "first check sequence number". */ 651 /* RFC793: "first check sequence number". */
638 652
639 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 653 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
640 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { 654 tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
641 /* Out of window: send ACK and drop. */ 655 /* Out of window: send ACK and drop. */
642 if (!(flg & TCP_FLAG_RST)) 656 if (!(flg & TCP_FLAG_RST))
643 req->rsk_ops->send_ack(sk, skb, req); 657 req->rsk_ops->send_ack(sk, skb, req);
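
The switch from rcv_isn + 1 to rcv_nxt matters once a SYN can carry data: the receive window must anchor at the sequence number the server actually expects next. For reference, a self-contained model of the wrap-safe comparison tcp_in_window() performs; the before()/after() helpers follow the kernel's signed-subtraction idiom, reproduced here on uint32_t:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;	/* works across 32-bit wraparound */
}
#define after(a, b) before(b, a)

/* Accept segments overlapping [s_win, e_win], mirroring tcp_in_window() */
static bool in_window(uint32_t seq, uint32_t end_seq,
		      uint32_t s_win, uint32_t e_win)
{
	if (seq == s_win)
		return true;
	if (after(end_seq, s_win) && before(seq, e_win))
		return true;
	return seq == e_win && seq == end_seq;
}

int main(void)
{
	/* After a TFO SYN with data, rcv_nxt already covers the payload,
	 * so the window anchors there rather than at isn + 1. */
	uint32_t rcv_nxt = 0xfffffff0u, wnd = 14600;	/* near wraparound */

	assert(in_window(rcv_nxt, rcv_nxt, rcv_nxt, rcv_nxt + wnd));
	assert(in_window(rcv_nxt + 5, rcv_nxt + 100, rcv_nxt, rcv_nxt + wnd));
	assert(!in_window(rcv_nxt - 2, rcv_nxt - 1, rcv_nxt, rcv_nxt + wnd));
	return 0;
}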
@@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
648 662
649 /* In sequence, PAWS is OK. */ 663 /* In sequence, PAWS is OK. */
650 664
651 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) 665 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
652 req->ts_recent = tmp_opt.rcv_tsval; 666 req->ts_recent = tmp_opt.rcv_tsval;
653 667
654 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { 668 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
667 681
668 /* ACK sequence verified above, just make sure ACK is 682 /* ACK sequence verified above, just make sure ACK is
669 * set. If ACK not set, just silently drop the packet. 683 * set. If ACK not set, just silently drop the packet.
684 *
685 * XXX (TFO) - if we ever allow "data after SYN", the
686 * following check needs to be removed.
670 */ 687 */
671 if (!(flg & TCP_FLAG_ACK)) 688 if (!(flg & TCP_FLAG_ACK))
672 return NULL; 689 return NULL;
673 690
691 /* For Fast Open no more processing is needed (sk is the
692 * child socket).
693 */
694 if (fastopen)
695 return sk;
696
674 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ 697 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
675 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 698 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
676 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 699 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
@@ -706,11 +729,21 @@ listen_overflow:
706 } 729 }
707 730
708embryonic_reset: 731embryonic_reset:
709 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); 732 if (!(flg & TCP_FLAG_RST)) {
710 	if (!(flg & TCP_FLAG_RST))		 733		/* Received a bad SYN pkt - for TFO we try not to reset
734		 * the local connection unless it's really necessary, to
735		 * avoid becoming vulnerable to an outside attack aimed at
736		 * resetting legitimate local connections.
737 */
711 req->rsk_ops->send_reset(sk, skb); 738 req->rsk_ops->send_reset(sk, skb);
712 739 } else if (fastopen) { /* received a valid RST pkt */
713 inet_csk_reqsk_queue_drop(sk, req, prev); 740 reqsk_fastopen_remove(sk, req, true);
741 tcp_reset(sk);
742 }
743 if (!fastopen) {
744 inet_csk_reqsk_queue_drop(sk, req, prev);
745 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
746 }
714 return NULL; 747 return NULL;
715} 748}
716EXPORT_SYMBOL(tcp_check_req); 749EXPORT_SYMBOL(tcp_check_req);
@@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req);
719 * Queue segment on the new socket if the new socket is active, 752 * Queue segment on the new socket if the new socket is active,
720 * otherwise we just shortcircuit this and continue with 753 * otherwise we just shortcircuit this and continue with
721 * the new socket. 754 * the new socket.
755 *
756 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
757 * when entering. But other states are possible due to a race condition
758 * where, after __inet_lookup_established() fails but before the
759 * listener lock is obtained, other packets cause the same
760 * connection to be created.
722 */ 761 */
723 762
724int tcp_child_process(struct sock *parent, struct sock *child, 763int tcp_child_process(struct sock *parent, struct sock *child,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d04632673a9e..9383b51f3efc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk,
702 unsigned int mss, struct sk_buff *skb, 702 unsigned int mss, struct sk_buff *skb,
703 struct tcp_out_options *opts, 703 struct tcp_out_options *opts,
704 struct tcp_md5sig_key **md5, 704 struct tcp_md5sig_key **md5,
705 struct tcp_extend_values *xvp) 705 struct tcp_extend_values *xvp,
706 struct tcp_fastopen_cookie *foc)
706{ 707{
707 struct inet_request_sock *ireq = inet_rsk(req); 708 struct inet_request_sock *ireq = inet_rsk(req);
708 unsigned int remaining = MAX_TCP_OPTION_SPACE; 709 unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk,
747 if (unlikely(!ireq->tstamp_ok)) 748 if (unlikely(!ireq->tstamp_ok))
748 remaining -= TCPOLEN_SACKPERM_ALIGNED; 749 remaining -= TCPOLEN_SACKPERM_ALIGNED;
749 } 750 }
750 751 if (foc != NULL) {
752 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
753 need = (need + 3) & ~3U; /* Align to 32 bits */
754 if (remaining >= need) {
755 opts->options |= OPTION_FAST_OPEN_COOKIE;
756 opts->fastopen_cookie = foc;
757 remaining -= need;
758 }
759 }
751 /* Similar rationale to tcp_syn_options() applies here, too. 760 /* Similar rationale to tcp_syn_options() applies here, too.
752 * If the <SYN> options fit, the same options should fit now! 761 * If the <SYN> options fit, the same options should fit now!
753 */ 762 */
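
The need = (need + 3) & ~3U step reserves option space in 32-bit units. Below is a hypothetical standalone encoder for the experimental Fast Open option as assumed here: kind 254, a 16-bit 0xF989 magic, then the cookie, NOP-padded up to the reserved size. The in-kernel option writer lays the bytes out differently; this only models the wire format and the space accounting mirrored from tcp_synack_options().

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TCPOPT_EXP			254	/* shared experimental kind */
#define TCPOPT_NOP			1
#define TCPOLEN_EXP_FASTOPEN_BASE	4	/* kind + len + 2-byte magic */
#define TCP_FASTOPEN_MAGIC		0xF989

static unsigned int write_fastopen_opt(uint8_t *p, const uint8_t *cookie,
				       uint8_t len)
{
	unsigned int need = (TCPOLEN_EXP_FASTOPEN_BASE + len + 3) & ~3U;
	unsigned int i;

	p[0] = TCPOPT_EXP;
	p[1] = TCPOLEN_EXP_FASTOPEN_BASE + len;	/* real option length */
	p[2] = TCP_FASTOPEN_MAGIC >> 8;
	p[3] = TCP_FASTOPEN_MAGIC & 0xff;
	memcpy(p + 4, cookie, len);
	for (i = TCPOLEN_EXP_FASTOPEN_BASE + len; i < need; i++)
		p[i] = TCPOPT_NOP;		/* pad to a 32-bit unit */
	return need;
}

int main(void)
{
	uint8_t buf[16], cookie[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	unsigned int n = write_fastopen_opt(buf, cookie, sizeof(cookie));

	printf("consumed %u bytes of option space\n", n);	/* 12 */
	return 0;
}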
@@ -2658,7 +2667,8 @@ int tcp_send_synack(struct sock *sk)
2658 */ 2667 */
2659struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2668struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2660 struct request_sock *req, 2669 struct request_sock *req,
2661 struct request_values *rvp) 2670 struct request_values *rvp,
2671 struct tcp_fastopen_cookie *foc)
2662{ 2672{
2663 struct tcp_out_options opts; 2673 struct tcp_out_options opts;
2664 struct tcp_extend_values *xvp = tcp_xv(rvp); 2674 struct tcp_extend_values *xvp = tcp_xv(rvp);
@@ -2718,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2718#endif 2728#endif
2719 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2729 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2720 tcp_header_size = tcp_synack_options(sk, req, mss, 2730 tcp_header_size = tcp_synack_options(sk, req, mss,
2721 skb, &opts, &md5, xvp) 2731 skb, &opts, &md5, xvp, foc)
2722 + sizeof(*th); 2732 + sizeof(*th);
2723 2733
2724 skb_push(skb, tcp_header_size); 2734 skb_push(skb, tcp_header_size);
@@ -2772,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2772 } 2782 }
2773 2783
2774 th->seq = htonl(TCP_SKB_CB(skb)->seq); 2784 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2775 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); 2785 /* XXX data is queued and acked as is. No buffer/window check */
2786 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
2776 2787
2777 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 2788 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2778 th->window = htons(min(req->rcv_wnd, 65535U)); 2789 th->window = htons(min(req->rcv_wnd, 65535U));
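
The ack_seq change is the visible effect of the rcv_nxt accounting: a SYN consumes one sequence number for the flag plus one per payload byte, so the SYN-ACK for a data-bearing SYN must acknowledge past the payload. A tiny model of that arithmetic:

#include <assert.h>
#include <stdint.h>

static uint32_t syn_end_seq(uint32_t seq, uint32_t payload_len)
{
	return seq + 1 + payload_len;	/* modular, wraps like the kernel's */
}

int main(void)
{
	uint32_t isn = 1000;

	assert(syn_end_seq(isn, 0) == isn + 1);		/* plain SYN: old isn + 1 */
	assert(syn_end_seq(isn, 100) == isn + 101);	/* TFO SYN with data */
	return 0;
}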
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b774a03bd1dc..fc04711e80c8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -305,6 +305,35 @@ static void tcp_probe_timer(struct sock *sk)
305} 305}
306 306
307/* 307/*
308 * Timer for a Fast Open socket to retransmit the SYNACK. Note that the
309 * sk here is the child socket, not the parent (listener) socket.
310 */
311static void tcp_fastopen_synack_timer(struct sock *sk)
312{
313 struct inet_connection_sock *icsk = inet_csk(sk);
314 int max_retries = icsk->icsk_syn_retries ? :
315 sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
316 struct request_sock *req;
317
318 req = tcp_sk(sk)->fastopen_rsk;
319 req->rsk_ops->syn_ack_timeout(sk, req);
320
321 if (req->retrans >= max_retries) {
322 tcp_write_err(sk);
323 return;
324 }
325	/* XXX (TFO) - Unlike a regular SYN-ACK retransmit, we ignore the
326	 * error returned from rtx_syn_ack() so this path stays as persistent
327	 * as a regular retransmit: if the child socket has already been
328	 * accepted it's not good to give up too easily.
329 */
330 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
331 req->retrans++;
332 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
333 TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX);
334}
335
336/*
308 * The TCP retransmit timer. 337 * The TCP retransmit timer.
309 */ 338 */
310 339
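
tcp_fastopen_synack_timer() re-arms itself with TCP_TIMEOUT_INIT << req->retrans, and inet_csk_reset_xmit_timer() clamps the result to TCP_RTO_MAX. A sketch of the resulting schedule, in milliseconds rather than jiffies and assuming the 1 s TCP_TIMEOUT_INIT / 120 s TCP_RTO_MAX of kernels from this era; note req->retrans has already been incremented when the timer is re-armed, so the first re-arm uses a shift of 1:

#include <stdio.h>

#define TCP_TIMEOUT_INIT_MS	1000u
#define TCP_RTO_MAX_MS		120000u

int main(void)
{
	unsigned int retrans;

	for (retrans = 1; retrans <= 8; retrans++) {
		unsigned int rto = TCP_TIMEOUT_INIT_MS << retrans;

		if (rto > TCP_RTO_MAX_MS)	/* clamp, as the kernel does */
			rto = TCP_RTO_MAX_MS;
		printf("retrans %u -> next SYN-ACK in %u ms\n", retrans, rto);
	}
	return 0;
}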
@@ -317,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk)
317 tcp_resume_early_retransmit(sk); 346 tcp_resume_early_retransmit(sk);
318 return; 347 return;
319 } 348 }
320 349 if (tp->fastopen_rsk) {
350 BUG_ON(sk->sk_state != TCP_SYN_RECV &&
351 sk->sk_state != TCP_FIN_WAIT1);
352 tcp_fastopen_synack_timer(sk);
353 /* Before we receive ACK to our SYN-ACK don't retransmit
354 * anything else (e.g., data or FIN segments).
355 */
356 return;
357 }
321 if (!tp->packets_out) 358 if (!tp->packets_out)
322 goto out; 359 goto out;
323 360
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6f6d1aca3c3d..c4e64328d8ba 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2110,7 +2110,9 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
2110 bucket, src, srcp, dest, destp, sp->sk_state, 2110 bucket, src, srcp, dest, destp, sp->sk_state,
2111 sk_wmem_alloc_get(sp), 2111 sk_wmem_alloc_get(sp),
2112 sk_rmem_alloc_get(sp), 2112 sk_rmem_alloc_get(sp),
2113 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), 2113 0, 0L, 0,
2114 from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
2115 0, sock_i_ino(sp),
2114 atomic_read(&sp->sk_refcnt), sp, 2116 atomic_read(&sp->sk_refcnt), sp,
2115 atomic_read(&sp->sk_drops), len); 2117 atomic_read(&sp->sk_drops), len);
2116} 2118}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 16d0960062be..d2f336ea82ca 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -24,7 +24,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
24 if (!inet_diag_bc_sk(bc, sk)) 24 if (!inet_diag_bc_sk(bc, sk))
25 return 0; 25 return 0;
26 26
27 return inet_sk_diag_fill(sk, NULL, skb, req, NETLINK_CB(cb->skb).pid, 27 return inet_sk_diag_fill(sk, NULL, skb, req,
28 sk_user_ns(NETLINK_CB(cb->skb).ssk),
29 NETLINK_CB(cb->skb).pid,
28 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 30 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
29} 31}
30 32
@@ -69,6 +71,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
69 goto out; 71 goto out;
70 72
71 err = inet_sk_diag_fill(sk, NULL, rep, req, 73 err = inet_sk_diag_fill(sk, NULL, rep, req,
74 sk_user_ns(NETLINK_CB(in_skb).ssk),
72 NETLINK_CB(in_skb).pid, 75 NETLINK_CB(in_skb).pid,
73 nlh->nlmsg_seq, 0, nlh); 76 nlh->nlmsg_seq, 0, nlh);
74 if (err < 0) { 77 if (err < 0) {
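
The udp_diag hunks thread the requesting socket's user namespace into inet_sk_diag_fill(), so the idiag_uid reported over sock_diag is munged the same way as the procfs uid columns earlier in this diff. A hypothetical sock_diag client exercising that path, assuming the inet_diag_req_v2 layout from <linux/inet_diag.h> of a v3.3+ kernel:

#include <arpa/inet.h>
#include <linux/inet_diag.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req_v2 req;
	} msg;
	char buf[8192];
	int len;

	if (fd < 0)
		return 1;
	memset(&msg, 0, sizeof(msg));
	msg.nlh.nlmsg_len = sizeof(msg);
	msg.nlh.nlmsg_type = SOCK_DIAG_BY_FAMILY;
	msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	msg.req.sdiag_family = AF_INET;
	msg.req.sdiag_protocol = IPPROTO_UDP;
	msg.req.idiag_states = 0xffffffff;	/* dump every socket state */

	if (send(fd, &msg, sizeof(msg), 0) < 0)
		return 1;
	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *h = (struct nlmsghdr *)buf;

		for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
			struct inet_diag_msg *r = NLMSG_DATA(h);

			if (h->nlmsg_type == NLMSG_DONE ||
			    h->nlmsg_type == NLMSG_ERROR) {
				close(fd);
				return h->nlmsg_type == NLMSG_ERROR;
			}
			/* idiag_uid arrives mapped into our user namespace */
			printf("udp sport %u uid %u\n",
			       ntohs(r->id.idiag_sport), r->idiag_uid);
		}
	}
	close(fd);
	return 0;
}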