1 files changed, 641 insertions, 0 deletions
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 000000000000..fe3c6d3d0c91
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,641 @@
+/*
+ * INET         An implementation of the TCP/IP protocol suite for the LINUX
+ *              operating system.  INET is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              Support for INET connection oriented protocols.
+ *
+ * Authors:     See the TCP sources
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/jhash.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+#ifdef INET_CSK_DEBUG
+/* Diagnostic message string; defined and exported only when INET_CSK_DEBUG is set. */
+const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
+EXPORT_SYMBOL(inet_csk_timer_bug_msg);
+#endif
+/*
+ * This array holds the first and last local port number: element 0 is
+ * the low bound and element 1 the high bound of the range that
+ * inet_csk_get_port() walks when it is asked to pick a port itself.
+ * For high-usage systems, use sysctl to change this to
+ * 32768-61000
+ */
+int sysctl_local_port_range[2] = { 1024, 4999 };
+/*
+ * Decide whether binding @sk to the port tracked by bucket @tb clashes with
+ * one of the sockets that already own that port.
+ *
+ * An owner is skipped when it is @sk itself, when it is flagged IPv6-only,
+ * when both sockets are bound to (different) devices, or when both sockets
+ * allow address reuse and the owner is not listening.  Any remaining owner
+ * conflicts if either receive address is a wildcard (zero) or both match.
+ *
+ * Returns non-zero when a conflicting owner exists, zero otherwise.
+ */
+static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
+{
+        const u32 my_addr = inet_rcv_saddr(sk);
+        const int reuse = sk->sk_reuse;
+        struct hlist_node *node;
+        struct sock *sk2;
+        sk_for_each_bound(sk2, node, &tb->owners) {
+                u32 other_addr;
+                /* Never conflict with ourselves or with IPv6-only owners. */
+                if (sk == sk2 || inet_v6_ipv6only(sk2))
+                        continue;
+                /* Sockets pinned to two different devices cannot collide. */
+                if (sk->sk_bound_dev_if && sk2->sk_bound_dev_if &&
+                    sk->sk_bound_dev_if != sk2->sk_bound_dev_if)
+                        continue;
+                /* Mutual reuse is honoured unless the owner is listening. */
+                if (reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN)
+                        continue;
+                other_addr = inet_rcv_saddr(sk2);
+                if (!other_addr || !my_addr || other_addr == my_addr)
+                        break;
+        }
+        /* The iterator is only non-NULL if we left the walk via 'break'. */
+        return node != NULL;
+}
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ *
+ * @hashinfo: hash tables holding the bind buckets and the port rover
+ * @sk:       socket that wants the port
+ * @snum:     requested port, or zero to pick one from
+ *            sysctl_local_port_range
+ *
+ * Runs with bottom halves disabled for its whole duration.
+ * Returns 0 on success and 1 on failure (range exhausted, bind
+ * conflict, or bind bucket allocation failure).
+ */
+int inet_csk_get_port(struct inet_hashinfo *hashinfo,
+                      struct sock *sk, unsigned short snum)
+{
+        struct inet_bind_hashbucket *head;
+        struct hlist_node *node;
+        struct inet_bind_bucket *tb;
+        int ret;
+        local_bh_disable();
+        if (!snum) {
+                int low = sysctl_local_port_range[0];
+                int high = sysctl_local_port_range[1];
+                int remaining = (high - low) + 1;
+                int rover;
+                /* portalloc_lock serialises use of the shared port rover. */
+                spin_lock(&hashinfo->portalloc_lock);
+                if (hashinfo->port_rover < low)
+                        rover = low;
+                else
+                        rover = hashinfo->port_rover;
+                /* Try each port in the range at most once, starting just
+                 * after the rover and wrapping from 'high' back to 'low'.
+                 */
+                do {
+                        rover++;
+                        if (rover > high)
+                                rover = low;
+                        head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
+                        spin_lock(&head->lock);
+                        inet_bind_bucket_for_each(tb, node, &head->chain)
+                                if (tb->port == rover)
+                                        goto next;
+                        /* No bucket for this port: take it, keeping head->lock held. */
+                        break;
+                next:
+                        spin_unlock(&head->lock);
+                } while (--remaining > 0);
+                hashinfo->port_rover = rover;
+                spin_unlock(&hashinfo->portalloc_lock);
+                /* Exhausted local port range during search?  It is not
+                 * possible for us to be holding one of the bind hash
+                 * locks if this test triggers, because if 'remaining'
+                 * drops to zero, we broke out of the do/while loop at
+                 * the top level, not from the 'break;' statement.
+                 */
+                ret = 1;
+                if (remaining <= 0)
+                        goto fail;
+                /* OK, here is the one we will use.  HEAD is
+                 * non-NULL and we hold its lock.
+                 */
+                snum = rover;
+        } else {
+                /* Explicit port: look for an existing bucket for it. */
+                head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
+                spin_lock(&head->lock);
+                inet_bind_bucket_for_each(tb, node, &head->chain)
+                        if (tb->port == snum)
+                                goto tb_found;
+        }
+        /* Reached for a freshly chosen port, or an explicit port with no bucket. */
+        tb = NULL;
+        goto tb_not_found;
+tb_found:
+        if (!hlist_empty(&tb->owners)) {
+                /* sk_reuse above 1 skips the conflict check entirely. */
+                if (sk->sk_reuse > 1)
+                        goto success;
+                /* Fast path: the bucket is marked reusable and this socket
+                 * allows reuse and is not listening.
+                 */
+                if (tb->fastreuse > 0 &&
+                    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
+                        goto success;
+                } else {
+                        ret = 1;
+                        if (inet_csk_bind_conflict(sk, tb))
+                                goto fail_unlock;
+                }
+        }
+tb_not_found:
+        ret = 1;
+        if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
+                goto fail_unlock;
+        /* Maintain the bucket's fastreuse hint: set it from this socket when
+         * the bucket has no owners yet, and clear it as soon as a socket
+         * without reuse, or a listening socket, joins the bucket.
+         */
+        if (hlist_empty(&tb->owners)) {
+                if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+                        tb->fastreuse = 1;
+                else
+                        tb->fastreuse = 0;
+        } else if (tb->fastreuse &&
+                   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+                tb->fastreuse = 0;
+success:
+        /* Attach the socket to the bucket unless it is already bound. */
+        if (!inet_csk(sk)->icsk_bind_hash)
+                inet_bind_hash(sk, tb, snum);
+        BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
+        ret = 0;
+fail_unlock:
+        spin_unlock(&head->lock);
+fail:
+        local_bh_enable();
+        return ret;
+}
+EXPORT_SYMBOL_GPL(inet_csk_get_port);
+/*
+ * Wait for an incoming connection, avoid race conditions. This must be called
+ * with the socket locked.
+ *
+ * @sk:    listening socket whose accept queue is waited on
+ * @timeo: maximum time to sleep, as passed to schedule_timeout()
+ *
+ * The socket lock is dropped around each sleep and re-taken before the
+ * queue and socket state are examined again.
+ *
+ * Returns 0 once the accept queue is non-empty, -EINVAL if the socket is
+ * no longer in TCP_LISTEN, the value of sock_intr_errno() if a signal is
+ * pending, or -EAGAIN when the timeout has run out.
+ */
+static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
+{
+        struct inet_connection_sock *icsk = inet_csk(sk);
+        DEFINE_WAIT(wait);
+        int err;
+        /*
+         * True wake-one mechanism for incoming connections: only
+         * one process gets woken up, not the 'whole herd'.
+         * Since we do not 'race & poll' for established sockets
+         * anymore, the common case will execute the loop only once.
+         *
+         * Subtle issue: "add_wait_queue_exclusive()" will be added
+         * after any current non-exclusive waiters, and we know that
+         * it will always _stay_ after any new non-exclusive waiters
+         * because all non-exclusive waiters are added at the
+         * beginning of the wait-queue. As such, it's ok to "drop"
+         * our exclusiveness temporarily when we get woken up without
+         * having to remove and re-insert us on the wait queue.
+         */
+        for (;;) {
+                prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+                                          TASK_INTERRUPTIBLE);
+                release_sock(sk);
+                /* Only sleep if nothing arrived before we queued ourselves. */
+                if (reqsk_queue_empty(&icsk->icsk_accept_queue))
+                        timeo = schedule_timeout(timeo);
+                lock_sock(sk);
+                /* Each 'err' value below is the result if the check after it breaks. */
+                err = 0;
+                if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
+                        break;
+                err = -EINVAL;
+                if (sk->sk_state != TCP_LISTEN)
+                        break;
+                err = sock_intr_errno(timeo);
+                if (signal_pending(current))
+                        break;
+                err = -EAGAIN;
+                if (!timeo)
+                        break;
+        }
+        finish_wait(sk->sk_sleep, &wait);
+        return err;
+}
+/*
+ * Hand the next fully established child connection of listening socket
+ * @sk to the caller.
+ *
+ * @flags: O_NONBLOCK in here means an empty queue is not waited on
+ * @err:   receives the negative error code when NULL is returned; it is
+ *         left untouched on success
+ *
+ * Returns the child socket, or NULL with *@err set to -EINVAL (socket is
+ * not listening), -EAGAIN (nothing queued and no time to wait) or the
+ * error reported by inet_csk_wait_for_connect().
+ */
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+{
+        struct inet_connection_sock *icsk = inet_csk(sk);
+        struct sock *child;
+        int rc = -EINVAL;
+        lock_sock(sk);
+        /* Accepting only makes sense on a listening socket. */
+        if (sk->sk_state != TCP_LISTEN)
+                goto fail;
+        /* Nothing established yet: wait for it, unless told not to block. */
+        if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+                long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+                rc = -EAGAIN;
+                if (!timeo)
+                        goto fail;
+                rc = inet_csk_wait_for_connect(sk, timeo);
+                if (rc)
+                        goto fail;
+        }
+        child = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
+        BUG_TRAP(child->sk_state != TCP_SYN_RECV);
+        release_sock(sk);
+        return child;
+fail:
+        *err = rc;
+        release_sock(sk);
+        return NULL;
+}
+EXPORT_SYMBOL(inet_csk_accept);
+/*
+ * Set up the three per-socket timers: retransmit, delayed ACK and
+ * keepalive (the latter lives in sk->sk_timer).  Each timer gets its own
+ * handler and is handed the socket pointer as its data word.  The pending
+ * flags for the retransmit and ACK machinery are cleared as well.
+ *
+ * We may wish to use just one timer maintaining a list of expiry jiffies
+ * to optimize.
+ */
+void inet_csk_init_xmit_timers(struct sock *sk,
+                               void (*retransmit_handler)(unsigned long),
+                               void (*delack_handler)(unsigned long),
+                               void (*keepalive_handler)(unsigned long))
+{
+        struct inet_connection_sock *icsk = inet_csk(sk);
+        const unsigned long cookie = (unsigned long)sk;
+        /* Retransmit timer */
+        init_timer(&icsk->icsk_retransmit_timer);
+        icsk->icsk_retransmit_timer.function = retransmit_handler;
+        icsk->icsk_retransmit_timer.data     = cookie;
+        /* Delayed ACK timer */
+        init_timer(&icsk->icsk_delack_timer);
+        icsk->icsk_delack_timer.function = delack_handler;
+        icsk->icsk_delack_timer.data     = cookie;
+        /* Keepalive timer */
+        init_timer(&sk->sk_timer);
+        sk->sk_timer.function = keepalive_handler;
+        sk->sk_timer.data     = cookie;
+        icsk->icsk_ack.pending = 0;
+        icsk->icsk_pending     = 0;
+}
+EXPORT_SYMBOL(inet_csk_init_xmit_timers);
+/*
+ * Reset the pending/blocked bookkeeping and stop the retransmit,
+ * delayed ACK and keepalive timers of @sk.
+ */
+void inet_csk_clear_xmit_timers(struct sock *sk)
+{
+        struct inet_connection_sock *icsk = inet_csk(sk);
+        icsk->icsk_ack.blocked = 0;
+        icsk->icsk_ack.pending = 0;
+        icsk->icsk_pending     = 0;
+        sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+        sk_stop_timer(sk, &icsk->icsk_delack_timer);
+        sk_stop_timer(sk, &sk->sk_timer);
+}
+EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
+/* Stop the keepalive timer (sk->sk_timer) of @sk. */
+void inet_csk_delete_keepalive_timer(struct sock *sk)
+{
+        struct timer_list *keepalive = &sk->sk_timer;
+        sk_stop_timer(sk, keepalive);
+}
+EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
+/* Re-arm the keepalive timer (sk->sk_timer) of @sk to fire @len jiffies from now. */
+void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+        struct timer_list *keepalive = &sk->sk_timer;
+        sk_reset_timer(sk, keepalive, jiffies + len);
+}
+EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
+/*
+ * Look up the output route for answering connection request @req that
+ * arrived on listening socket @sk.
+ *
+ * The destination is the request's remote address, unless the request
+ * carries IP options with source routing, in which case the options'
+ * first-hop address is used.  With strict routing requested, a route
+ * whose gateway differs from its destination is rejected.
+ *
+ * Returns the route's dst entry, or NULL (after counting an
+ * OUTNOROUTES event) when no acceptable route exists.
+ */
+struct dst_entry* inet_csk_route_req(struct sock *sk,
+                                     const struct request_sock *req)
+{
+        const struct inet_request_sock *ireq = inet_rsk(req);
+        struct ip_options *opt = ireq->opt;
+        const u32 daddr = (opt && opt->srr) ? opt->faddr : ireq->rmt_addr;
+        struct rtable *rt;
+        struct flowi fl = {
+                .oif = sk->sk_bound_dev_if,
+                .nl_u = { .ip4_u = { .daddr = daddr,
+                                     .saddr = ireq->loc_addr,
+                                     .tos = RT_CONN_FLAGS(sk) } },
+                .proto = sk->sk_protocol,
+                .uli_u = { .ports = { .sport = inet_sk(sk)->sport,
+                                      .dport = ireq->rmt_port } },
+        };
+        if (ip_route_output_flow(&rt, &fl, sk, 0))
+                goto no_route;
+        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+                goto put_route;
+        return &rt->u.dst;
+put_route:
+        ip_rt_put(rt);
+no_route:
+        IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+        return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_route_req);
+/*
+ * Map a remote address/port pair to a slot of the SYN table.  @rnd seeds
+ * the hash; @synq_hsize is the table size, and the result is masked with
+ * @synq_hsize - 1.
+ */
+static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
+                                 const u32 rnd, const u16 synq_hsize)
+{
+        const u32 mask = synq_hsize - 1;
+        const u32 hash = jhash_2words(raddr, (u32)rport, rnd);
+        return hash & mask;
+}
+/*
+ * AF_INET_FAMILY(fam) tests whether @fam is AF_INET.  When IPv6 is neither
+ * built in nor built as a module the test collapses to the constant 1.
+ */
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
+#else
+#define AF_INET_FAMILY(fam) 1
+#endif
+/*
+ * Find the pending connection request on listening socket @sk that
+ * matches the given remote port/address and local address.
+ *
+ * On a hit, *@prevp is set to the link that points at the request;
+ * on a miss *@prevp is left untouched.
+ *
+ * Returns the matching request, or NULL if there is none.
+ */
+struct request_sock *inet_csk_search_req(const struct sock *sk,
+                                         struct request_sock ***prevp,
+                                         const __u16 rport, const __u32 raddr,
+                                         const __u32 laddr)
+{
+        const struct inet_connection_sock *icsk = inet_csk(sk);
+        struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+        const u32 slot = inet_synq_hash(raddr, rport, lopt->hash_rnd,
+                                        lopt->nr_table_entries);
+        struct request_sock **link = &lopt->syn_table[slot];
+        struct request_sock *req;
+        /* Walk the hash chain, remembering the link that leads to each entry. */
+        while ((req = *link) != NULL) {
+                const struct inet_request_sock *ireq = inet_rsk(req);
+                if (ireq->rmt_port == rport &&
+                    ireq->rmt_addr == raddr &&
+                    ireq->loc_addr == laddr &&
+                    AF_INET_FAMILY(req->rsk_ops->family)) {
+                        BUG_TRAP(!req->sk);
+                        *prevp = link;
+                        return req;
+                }
+                link = &req->dl_next;
+        }
+        return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_search_req);
+/*
+ * Insert connection request @req into the SYN table of listening socket
+ * @sk, hashed on the request's remote address and port, and account for
+ * the new entry.  @timeout is forwarded both to the hash insertion and
+ * to the queue accounting.
+ */
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+                                   const unsigned timeout)
+{
+        struct inet_connection_sock *icsk = inet_csk(sk);
+        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+        const struct listen_sock *lopt = queue->listen_opt;
+        const struct inet_request_sock *ireq = inet_rsk(req);
+        const u32 slot = inet_synq_hash(ireq->rmt_addr, ireq->rmt_port,
+                                        lopt->hash_rnd, lopt->nr_table_entries);
+        reqsk_queue_hash_req(queue, slot, req, timeout);
+        inet_csk_reqsk_queue_added(sk, timeout);
+}
+/* Only thing we need from tcp.h */
+extern int sysctl_tcp_synack_retries;
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+/*
+ * Walk part of the SYN table of listening socket @parent, retransmitting
+ * the SYN-ACK for expired requests that may still be retried and dropping
+ * the ones that may not.
+ *
+ * @interval: delay used to re-arm the keepalive timer while requests remain
+ * @timeout:  base retransmission timeout; shifted left by the retransmit count
+ * @max_rto:  upper bound for the computed per-request timeout
+ *
+ * Only a budgeted number of hash slots is visited per call; the position
+ * is remembered in lopt->clock_hand for the next call.
+ */
+void inet_csk_reqsk_queue_prune(struct sock *parent,
+                                const unsigned long interval,
+                                const unsigned long timeout,
+                                const unsigned long max_rto)
+{
+        struct inet_connection_sock *icsk = inet_csk(parent);
+        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+        struct listen_sock *lopt = queue->listen_opt;
+        /* Per-socket retry limit if set, otherwise the global sysctl. */
+        int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+        int thresh = max_retries;
+        unsigned long now = jiffies;
+        struct request_sock **reqp, *req;
+        int i, budget;
+        if (lopt == NULL || lopt->qlen == 0)
+                return;
+        /* Normally all the openreqs are young and become mature
+         * (i.e. converted to established socket) for first timeout.
+         * If synack was not acknowledged for 3 seconds, it means
+         * one of the following things: synack was lost, ack was lost,
+         * rtt is high or nobody planned to ack (i.e. synflood).
+         * When server is a bit loaded, queue is populated with old
+         * open requests, reducing effective size of queue.
+         * When server is well loaded, queue size reduces to zero
+         * after several minutes of work. It is not synflood,
+         * it is normal operation. The solution is pruning
+         * too old entries overriding normal timeout, when
+         * situation becomes dangerous.
+         *
+         * Essentially, we reserve half of room for young
+         * embryos; and abort old ones without pity, if old
+         * ones are about to clog our table.
+         */
+        /* Non-zero once qlen has reached 2^(max_qlen_log - 1). */
+        if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+                int young = (lopt->qlen_young<<1);
+                /* Lower the retry threshold (never below 2) for as long as
+                 * qlen is at least the doubling multiple of the young count.
+                 */
+                while (thresh > 2) {
+                        if (lopt->qlen < young)
+                                break;
+                        thresh--;
+                        young <<= 1;
+                }
+        }
+        if (queue->rskq_defer_accept)
+                max_retries = queue->rskq_defer_accept;
+        /* NOTE(review): divides by (timeout / interval), so this relies on
+         * callers passing timeout >= interval and interval != 0 - confirm.
+         */
+        budget = 2 * (lopt->nr_table_entries / (timeout / interval));
+        i = lopt->clock_hand;
+        do {
+                reqp=&lopt->syn_table[i];
+                while ((req = *reqp) != NULL) {
+                        if (time_after_eq(now, req->expires)) {
+                                /* Retry while under the threshold (or, for
+                                 * acked requests, under max_retries) and the
+                                 * SYN-ACK retransmission returns zero.
+                                 */
+                                if ((req->retrans < thresh ||
+                                     (inet_rsk(req)->acked && req->retrans < max_retries))
+                                    && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
+                                        unsigned long timeo;
+                                        /* First retransmit: no longer counted as young. */
+                                        if (req->retrans++ == 0)
+                                                lopt->qlen_young--;
+                                        timeo = min((timeout << req->retrans), max_rto);
+                                        req->expires = now + timeo;
+                                        reqp = &req->dl_next;
+                                        continue;
+                                }
+                                /* Drop this request */
+                                inet_csk_reqsk_queue_unlink(parent, req, reqp);
+                                reqsk_queue_removed(queue, req);
+                                reqsk_free(req);
+                                /* reqp is not advanced here; the loop re-reads *reqp. */
+                                continue;
+                        }
+                        reqp = &req->dl_next;
+                }
+                /* Advance to the next slot, wrapping with a mask. */
+                i = (i + 1) & (lopt->nr_table_entries - 1);
+        } while (--budget > 0);
+        lopt->clock_hand = i;
+        /* Keep the timer running while requests remain queued. */
+        if (lopt->qlen)
+                inet_csk_reset_keepalive_timer(parent, interval);
+}
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
+/*
+ * Create the child socket for connection request @req by cloning
+ * listening socket @sk with allocation flags @priority.
+ *
+ * The child starts in TCP_SYN_RECV, is not attached to any bind bucket,
+ * takes its destination port from the request, and has its retransmit,
+ * backoff and probe counters reset.  Its accept queue is zeroed.
+ *
+ * Returns the new socket, or NULL if sk_clone() failed.
+ */
+struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
+                            const unsigned int __nocast priority)
+{
+        struct sock *newsk = sk_clone(sk, priority);
+        struct inet_connection_sock *newicsk;
+        if (newsk == NULL)
+                return NULL;
+        newicsk = inet_csk(newsk);
+        newsk->sk_state = TCP_SYN_RECV;
+        newicsk->icsk_bind_hash = NULL;
+        inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
+        newsk->sk_write_space = sk_stream_write_space;
+        newicsk->icsk_retransmits = 0;
+        newicsk->icsk_backoff = 0;
+        newicsk->icsk_probes_out = 0;
+        /* Deinitialize accept_queue to trap illegal accesses. */
+        memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+        return newsk;
+}
+EXPORT_SYMBOL_GPL(inet_csk_clone);
+/*
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all.  Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ *
+ * Checks that @sk is closed, dead and unhashed, then runs the protocol's
+ * destroy hook, purges the socket's queues, frees its xfrm policy,
+ * decrements the protocol's orphan count and drops one reference.
+ */
+void inet_csk_destroy_sock(struct sock *sk)
+{
+        BUG_TRAP(sk->sk_state == TCP_CLOSE);
+        BUG_TRAP(sock_flag(sk, SOCK_DEAD));
+        /* It cannot be in hash table! */
+        BUG_TRAP(sk_unhashed(sk));
+        /* If it has not 0 inet_sk(sk)->num, it must be bound */
+        BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);
+        /* Protocol-specific teardown comes first. */
+        sk->sk_prot->destroy(sk);
+        sk_stream_kill_queues(sk);
+        xfrm_sk_free_policy(sk);
+        sk_refcnt_debug_release(sk);
+        atomic_dec(sk->sk_prot->orphan_count);
+        /* Drop a reference on the socket. */
+        sock_put(sk);
+}
+EXPORT_SYMBOL(inet_csk_destroy_sock);
+/*
+ * Switch @sk into the listening state.
+ *
+ * Allocates the request queue with @nr_table_entries slots, resets the
+ * backlog counters and delayed-ACK state, marks the socket TCP_LISTEN and
+ * then asks the protocol to (re)validate the local port.  Only after that
+ * succeeds is the socket hashed.
+ *
+ * Returns 0 on success, the error from reqsk_queue_alloc() if the queue
+ * cannot be allocated, or -EADDRINUSE if the port cannot be obtained (the
+ * socket is put back into TCP_CLOSE and the queue is destroyed).
+ */
+int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+{
+        struct inet_connection_sock *icsk = inet_csk(sk);
+        struct inet_sock *inet = inet_sk(sk);
+        int rc;
+        rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+        if (rc != 0)
+                return rc;
+        sk->sk_max_ack_backlog = 0;
+        sk->sk_ack_backlog = 0;
+        inet_csk_delack_init(sk);
+        /* There is race window here: we announce ourselves listening,
+         * but this transition is still not validated by get_port().
+         * It is OK, because this socket enters to hash table only
+         * after validation is complete.
+         */
+        sk->sk_state = TCP_LISTEN;
+        if (sk->sk_prot->get_port(sk, inet->num)) {
+                /* Port validation failed: undo the state change and the queue. */
+                sk->sk_state = TCP_CLOSE;
+                __reqsk_queue_destroy(&icsk->icsk_accept_queue);
+                return -EADDRINUSE;
+        }
+        inet->sport = htons(inet->num);
+        sk_dst_reset(sk);
+        sk->sk_prot->hash(sk);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+/*
+ *      This routine closes sockets which have been at least partially
+ *      opened, but not yet accepted.
+ *
+ *      It stops the keepalive timer of @sk, detaches the list of children
+ *      waiting to be accepted, destroys the request queue, and then
+ *      disconnects, orphans and destroys each detached child in turn.
+ */
+void inet_csk_listen_stop(struct sock *sk)
+{
+        struct inet_connection_sock *icsk = inet_csk(sk);
+        struct request_sock *acc_req;
+        struct request_sock *req;
+        inet_csk_delete_keepalive_timer(sk);
+        /* make all the listen_opt local to us */
+        acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+        /* Following specs, it would be better either to send FIN
+         * (and enter FIN-WAIT-1, it is normal close)
+         * or to send active reset (abort).
+         * Certainly, it is pretty dangerous while synflood, but it is
+         * bad justification for our negligence 8)
+         * To be honest, we are not able to make either
+         * of the variants now.                 --ANK
+         */
+        reqsk_queue_destroy(&icsk->icsk_accept_queue);
+        while ((req = acc_req) != NULL) {
+                struct sock *child = req->sk;
+                /* Fetch the successor now; req is freed at the end of this pass. */
+                acc_req = req->dl_next;
+                local_bh_disable();
+                bh_lock_sock(child);
+                BUG_TRAP(!sock_owned_by_user(child));
+                /* Extra reference keeps child valid until the sock_put() below. */
+                sock_hold(child);
+                sk->sk_prot->disconnect(child, O_NONBLOCK);
+                sock_orphan(child);
+                /* inet_csk_destroy_sock() decrements orphan_count again. */
+                atomic_inc(sk->sk_prot->orphan_count);
+                inet_csk_destroy_sock(child);
+                bh_unlock_sock(child);
+                local_bh_enable();
+                sock_put(child);
+                sk_acceptq_removed(sk);
+                __reqsk_free(req);
+        }
+        /* Every queued child has been accounted for by now. */
+        BUG_TRAP(!sk->sk_ack_backlog);
+}
+EXPORT_SYMBOL_GPL(inet_csk_listen_stop);

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 000000000000..fe3c6d3d0c91
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,641 @@
	1	/*
	2	* INET An implementation of the TCP/IP protocol suite for the LINUX
	3	* operating system. INET is implemented using the BSD Socket
	4	* interface as the means of communication with the user level.
	5	*
	6	* Support for INET connection oriented protocols.
	7	*
	8	* Authors: See the TCP sources
	9	*
	10	* This program is free software; you can redistribute it and/or
	11	* modify it under the terms of the GNU General Public License
	12	* as published by the Free Software Foundation; either version
	13	* 2 of the License, or(at your option) any later version.
	14	*/
	15
	16	#include <linux/config.h>
	17	#include <linux/module.h>
	18	#include <linux/jhash.h>
	19
	20	#include <net/inet_connection_sock.h>
	21	#include <net/inet_hashtables.h>
	22	#include <net/inet_timewait_sock.h>
	23	#include <net/ip.h>
	24	#include <net/route.h>
	25	#include <net/tcp_states.h>
	26	#include <net/xfrm.h>
	27
	28	#ifdef INET_CSK_DEBUG
	29	const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
	30	EXPORT_SYMBOL(inet_csk_timer_bug_msg);
	31	#endif
	32
	33	/*
	34	* This array holds the first and last local port number.
	35	* For high-usage systems, use sysctl to change this to
	36	* 32768-61000
	37	*/
	38	int sysctl_local_port_range[2] = { 1024, 4999 };
	39
	40	static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
	41	{
	42	const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
	43	struct sock *sk2;
	44	struct hlist_node *node;
	45	int reuse = sk->sk_reuse;
	46
	47	sk_for_each_bound(sk2, node, &tb->owners) {
	48	if (sk != sk2 &&
	49	!inet_v6_ipv6only(sk2) &&
	50	(!sk->sk_bound_dev_if ||
	51	!sk2->sk_bound_dev_if ||
	52	sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
	53	if (!reuse || !sk2->sk_reuse ||
	54	sk2->sk_state == TCP_LISTEN) {
	55	const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
	56	if (!sk2_rcv_saddr || !sk_rcv_saddr ||
	57	sk2_rcv_saddr == sk_rcv_saddr)
	58	break;
	59	}
	60	}
	61	}
	62	return node != NULL;
	63	}
	64
	65	/* Obtain a reference to a local port for the given sock,
	66	* if snum is zero it means select any available local port.
	67	*/
	68	int inet_csk_get_port(struct inet_hashinfo *hashinfo,
	69	struct sock *sk, unsigned short snum)
	70	{
	71	struct inet_bind_hashbucket *head;
	72	struct hlist_node *node;
	73	struct inet_bind_bucket *tb;
	74	int ret;
	75
	76	local_bh_disable();
	77	if (!snum) {
	78	int low = sysctl_local_port_range[0];
	79	int high = sysctl_local_port_range[1];
	80	int remaining = (high - low) + 1;
	81	int rover;
	82
	83	spin_lock(&hashinfo->portalloc_lock);
	84	if (hashinfo->port_rover < low)
	85	rover = low;
	86	else
	87	rover = hashinfo->port_rover;
	88	do {
	89	rover++;
	90	if (rover > high)
	91	rover = low;
	92	head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
	93	spin_lock(&head->lock);
	94	inet_bind_bucket_for_each(tb, node, &head->chain)
	95	if (tb->port == rover)
	96	goto next;
	97	break;
	98	next:
	99	spin_unlock(&head->lock);
	100	} while (--remaining > 0);
	101	hashinfo->port_rover = rover;
	102	spin_unlock(&hashinfo->portalloc_lock);
	103
	104	/* Exhausted local port range during search? It is not
	105	* possible for us to be holding one of the bind hash
	106	* locks if this test triggers, because if 'remaining'
	107	* drops to zero, we broke out of the do/while loop at
	108	* the top level, not from the 'break;' statement.
	109	*/
	110	ret = 1;
	111	if (remaining <= 0)
	112	goto fail;
	113
	114	/* OK, here is the one we will use. HEAD is
	115	* non-NULL and we hold it's mutex.
	116	*/
	117	snum = rover;
	118	} else {
	119	head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
	120	spin_lock(&head->lock);
	121	inet_bind_bucket_for_each(tb, node, &head->chain)
	122	if (tb->port == snum)
	123	goto tb_found;
	124	}
	125	tb = NULL;
	126	goto tb_not_found;
	127	tb_found:
	128	if (!hlist_empty(&tb->owners)) {
	129	if (sk->sk_reuse > 1)
	130	goto success;
	131	if (tb->fastreuse > 0 &&
	132	sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
	133	goto success;
	134	} else {
	135	ret = 1;
	136	if (inet_csk_bind_conflict(sk, tb))
	137	goto fail_unlock;
	138	}
	139	}
	140	tb_not_found:
	141	ret = 1;
	142	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
	143	goto fail_unlock;
	144	if (hlist_empty(&tb->owners)) {
	145	if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
	146	tb->fastreuse = 1;
	147	else
	148	tb->fastreuse = 0;
	149	} else if (tb->fastreuse &&
	150	(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
	151	tb->fastreuse = 0;
	152	success:
	153	if (!inet_csk(sk)->icsk_bind_hash)
	154	inet_bind_hash(sk, tb, snum);
	155	BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
	156	ret = 0;
	157
	158	fail_unlock:
	159	spin_unlock(&head->lock);
	160	fail:
	161	local_bh_enable();
	162	return ret;
	163	}
	164
	165	EXPORT_SYMBOL_GPL(inet_csk_get_port);
	166
	167	/*
	168	* Wait for an incoming connection, avoid race conditions. This must be called
	169	* with the socket locked.
	170	*/
	171	static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
	172	{
	173	struct inet_connection_sock *icsk = inet_csk(sk);
	174	DEFINE_WAIT(wait);
	175	int err;
	176
	177	/*
	178	* True wake-one mechanism for incoming connections: only
	179	* one process gets woken up, not the 'whole herd'.
	180	* Since we do not 'race & poll' for established sockets
	181	* anymore, the common case will execute the loop only once.
	182	*
	183	* Subtle issue: "add_wait_queue_exclusive()" will be added
	184	* after any current non-exclusive waiters, and we know that
	185	* it will always _stay_ after any new non-exclusive waiters
	186	* because all non-exclusive waiters are added at the
	187	* beginning of the wait-queue. As such, it's ok to "drop"
	188	* our exclusiveness temporarily when we get woken up without
	189	* having to remove and re-insert us on the wait queue.
	190	*/
	191	for (;;) {
	192	prepare_to_wait_exclusive(sk->sk_sleep, &wait,
	193	TASK_INTERRUPTIBLE);
	194	release_sock(sk);
	195	if (reqsk_queue_empty(&icsk->icsk_accept_queue))
	196	timeo = schedule_timeout(timeo);
	197	lock_sock(sk);
	198	err = 0;
	199	if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
	200	break;
	201	err = -EINVAL;
	202	if (sk->sk_state != TCP_LISTEN)
	203	break;
	204	err = sock_intr_errno(timeo);
	205	if (signal_pending(current))
	206	break;
	207	err = -EAGAIN;
	208	if (!timeo)
	209	break;
	210	}
	211	finish_wait(sk->sk_sleep, &wait);
	212	return err;
	213	}
	214
	215	/*
	216	* This will accept the next outstanding connection.
	217	*/
	218	struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
	219	{
	220	struct inet_connection_sock *icsk = inet_csk(sk);
	221	struct sock *newsk;
	222	int error;
	223
	224	lock_sock(sk);
	225
	226	/* We need to make sure that this socket is listening,
	227	* and that it has something pending.
	228	*/
	229	error = -EINVAL;
	230	if (sk->sk_state != TCP_LISTEN)
	231	goto out_err;
	232
	233	/* Find already established connection */
	234	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
	235	long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	236
	237	/* If this is a non blocking socket don't sleep */
	238	error = -EAGAIN;
	239	if (!timeo)
	240	goto out_err;
	241
	242	error = inet_csk_wait_for_connect(sk, timeo);
	243	if (error)
	244	goto out_err;
	245	}
	246
	247	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
	248	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
	249	out:
	250	release_sock(sk);
	251	return newsk;
	252	out_err:
	253	newsk = NULL;
	254	*err = error;
	255	goto out;
	256	}
	257
	258	EXPORT_SYMBOL(inet_csk_accept);
	259
	260	/*
	261	* Using different timers for retransmit, delayed acks and probes
	262	* We may wish use just one timer maintaining a list of expire jiffies
	263	* to optimize.
	264	*/
	265	void inet_csk_init_xmit_timers(struct sock *sk,
	266	void (*retransmit_handler)(unsigned long),
	267	void (*delack_handler)(unsigned long),
	268	void (*keepalive_handler)(unsigned long))
	269	{
	270	struct inet_connection_sock *icsk = inet_csk(sk);
	271
	272	init_timer(&icsk->icsk_retransmit_timer);
	273	init_timer(&icsk->icsk_delack_timer);
	274	init_timer(&sk->sk_timer);
	275
	276	icsk->icsk_retransmit_timer.function = retransmit_handler;
	277	icsk->icsk_delack_timer.function = delack_handler;
	278	sk->sk_timer.function = keepalive_handler;
	279
	280	icsk->icsk_retransmit_timer.data =
	281	icsk->icsk_delack_timer.data =
	282	sk->sk_timer.data = (unsigned long)sk;
	283
	284	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
	285	}
	286
	287	EXPORT_SYMBOL(inet_csk_init_xmit_timers);
	288
	289	void inet_csk_clear_xmit_timers(struct sock *sk)
	290	{
	291	struct inet_connection_sock *icsk = inet_csk(sk);
	292
	293	icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
	294
	295	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	296	sk_stop_timer(sk, &icsk->icsk_delack_timer);
	297	sk_stop_timer(sk, &sk->sk_timer);
	298	}
	299
	300	EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
	301
	302	void inet_csk_delete_keepalive_timer(struct sock *sk)
	303	{
	304	sk_stop_timer(sk, &sk->sk_timer);
	305	}
	306
	307	EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
	308
	309	void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
	310	{
	311	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
	312	}
	313
	314	EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
	315
	316	struct dst_entry* inet_csk_route_req(struct sock *sk,
	317	const struct request_sock *req)
	318	{
	319	struct rtable *rt;
	320	const struct inet_request_sock *ireq = inet_rsk(req);
	321	struct ip_options *opt = inet_rsk(req)->opt;
	322	struct flowi fl = { .oif = sk->sk_bound_dev_if,
	323	.nl_u = { .ip4_u =
	324	{ .daddr = ((opt && opt->srr) ?
	325	opt->faddr :
	326	ireq->rmt_addr),
	327	.saddr = ireq->loc_addr,
	328	.tos = RT_CONN_FLAGS(sk) } },
	329	.proto = sk->sk_protocol,
	330	.uli_u = { .ports =
	331	{ .sport = inet_sk(sk)->sport,
	332	.dport = ireq->rmt_port } } };
	333
	334	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
	335	IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	336	return NULL;
	337	}
	338	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
	339	ip_rt_put(rt);
	340	IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	341	return NULL;
	342	}
	343	return &rt->u.dst;
	344	}
	345
	346	EXPORT_SYMBOL_GPL(inet_csk_route_req);
	347
	348	static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
	349	const u32 rnd, const u16 synq_hsize)
	350	{
	351	return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
	352	}
	353
	354	#if defined(CONFIG_IPV6) \|\| defined(CONFIG_IPV6_MODULE)
	355	#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
	356	#else
	357	#define AF_INET_FAMILY(fam) 1
	358	#endif
	359
	360	struct request_sock inet_csk_search_req(const struct sock sk,
	361	struct request_sock ***prevp,
	362	const __u16 rport, const __u32 raddr,
	363	const __u32 laddr)
	364	{
	365	const struct inet_connection_sock *icsk = inet_csk(sk);
	366	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	367	struct request_sock req, *prev;
	368
	369	for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
	370	lopt->nr_table_entries)];
	371	(req = *prev) != NULL;
	372	prev = &req->dl_next) {
	373	const struct inet_request_sock *ireq = inet_rsk(req);
	374
	375	if (ireq->rmt_port == rport &&
	376	ireq->rmt_addr == raddr &&
	377	ireq->loc_addr == laddr &&
	378	AF_INET_FAMILY(req->rsk_ops->family)) {
	379	BUG_TRAP(!req->sk);
	380	*prevp = prev;
	381	break;
	382	}
	383	}
	384
	385	return req;
	386	}
	387
	388	EXPORT_SYMBOL_GPL(inet_csk_search_req);
	389
	390	void inet_csk_reqsk_queue_hash_add(struct sock sk, struct request_sock req,
	391	const unsigned timeout)
	392	{
	393	struct inet_connection_sock *icsk = inet_csk(sk);
	394	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	395	const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
	396	lopt->hash_rnd, lopt->nr_table_entries);
	397
	398	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
	399	inet_csk_reqsk_queue_added(sk, timeout);
	400	}
	401
	402	/* Only thing we need from tcp.h */
	403	extern int sysctl_tcp_synack_retries;
	404
	405	EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
	406
	407	void inet_csk_reqsk_queue_prune(struct sock *parent,
	408	const unsigned long interval,
	409	const unsigned long timeout,
	410	const unsigned long max_rto)
	411	{
	412	struct inet_connection_sock *icsk = inet_csk(parent);
	413	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	414	struct listen_sock *lopt = queue->listen_opt;
	415	int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
	416	int thresh = max_retries;
	417	unsigned long now = jiffies;
	418	struct request_sock *reqp, req;
	419	int i, budget;
	420
	421	if (lopt == NULL \|\| lopt->qlen == 0)
	422	return;
	423
	424	/* Normally all the openreqs are young and become mature
	425	* (i.e. converted to established socket) for first timeout.
	426	* If synack was not acknowledged for 3 seconds, it means
	427	* one of the following things: synack was lost, ack was lost,
	428	* rtt is high or nobody planned to ack (i.e. synflood).
	429	* When server is a bit loaded, queue is populated with old
	430	* open requests, reducing effective size of queue.
	431	* When server is well loaded, queue size reduces to zero
	432	* after several minutes of work. It is not synflood,
	433	* it is normal operation. The solution is pruning
	434	* too old entries overriding normal timeout, when
	435	* situation becomes dangerous.
	436	*
	437	* Essentially, we reserve half of room for young
	438	* embrions; and abort old ones without pity, if old
	439	* ones are about to clog our table.
	440	*/
	441	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
	442	int young = (lopt->qlen_young<<1);
	443
	444	while (thresh > 2) {
	445	if (lopt->qlen < young)
	446	break;
	447	thresh--;
	448	young <<= 1;
	449	}
	450	}
	451
	452	if (queue->rskq_defer_accept)
	453	max_retries = queue->rskq_defer_accept;
	454
	455	budget = 2 * (lopt->nr_table_entries / (timeout / interval));
	456	i = lopt->clock_hand;
	457
	458	do {
	459	reqp=&lopt->syn_table[i];
	460	while ((req = *reqp) != NULL) {
	461	if (time_after_eq(now, req->expires)) {
	462	if ((req->retrans < thresh \|\|
	463	(inet_rsk(req)->acked && req->retrans < max_retries))
	464	&& !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
	465	unsigned long timeo;
	466
	467	if (req->retrans++ == 0)
	468	lopt->qlen_young--;
	469	timeo = min((timeout << req->retrans), max_rto);
	470	req->expires = now + timeo;
	471	reqp = &req->dl_next;
	472	continue;
	473	}
	474
	475	/* Drop this request */
	476	inet_csk_reqsk_queue_unlink(parent, req, reqp);
	477	reqsk_queue_removed(queue, req);
	478	reqsk_free(req);
	479	continue;
	480	}
	481	reqp = &req->dl_next;
	482	}
	483
	484	i = (i + 1) & (lopt->nr_table_entries - 1);
	485
	486	} while (--budget > 0);
	487
	488	lopt->clock_hand = i;
	489
	490	if (lopt->qlen)
	491	inet_csk_reset_keepalive_timer(parent, interval);
	492	}
	493
	494	EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
	495
	496	struct sock inet_csk_clone(struct sock sk, const struct request_sock *req,
	497	const unsigned int __nocast priority)
	498	{
	499	struct sock *newsk = sk_clone(sk, priority);
	500
	501	if (newsk != NULL) {
	502	struct inet_connection_sock *newicsk = inet_csk(newsk);
	503
	504	newsk->sk_state = TCP_SYN_RECV;
	505	newicsk->icsk_bind_hash = NULL;
	506
	507	inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
	508	newsk->sk_write_space = sk_stream_write_space;
	509
	510	newicsk->icsk_retransmits = 0;
	511	newicsk->icsk_backoff = 0;
	512	newicsk->icsk_probes_out = 0;
	513
	514	/* Deinitialize accept_queue to trap illegal accesses. */
	515	memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
	516	}
	517	return newsk;
	518	}
	519
	520	EXPORT_SYMBOL_GPL(inet_csk_clone);
	521
	522	/*
	523	* At this point, there should be no process reference to this
	524	* socket, and thus no user references at all. Therefore we
	525	* can assume the socket waitqueue is inactive and nobody will
	526	* try to jump onto it.
	527	*/
	528	void inet_csk_destroy_sock(struct sock *sk)
	529	{
	530	BUG_TRAP(sk->sk_state == TCP_CLOSE);
	531	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
	532
	533	/* It cannot be in hash table! */
	534	BUG_TRAP(sk_unhashed(sk));
	535
	536	/* If it has not 0 inet_sk(sk)->num, it must be bound */
	537	BUG_TRAP(!inet_sk(sk)->num \|\| inet_csk(sk)->icsk_bind_hash);
	538
	539	sk->sk_prot->destroy(sk);
	540
	541	sk_stream_kill_queues(sk);
	542
	543	xfrm_sk_free_policy(sk);
	544
	545	sk_refcnt_debug_release(sk);
	546
	547	atomic_dec(sk->sk_prot->orphan_count);
	548	sock_put(sk);
	549	}
	550
	551	EXPORT_SYMBOL(inet_csk_destroy_sock);
	552
	553	int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
	554	{
	555	struct inet_sock *inet = inet_sk(sk);
	556	struct inet_connection_sock *icsk = inet_csk(sk);
	557	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
	558
	559	if (rc != 0)
	560	return rc;
	561
	562	sk->sk_max_ack_backlog = 0;
	563	sk->sk_ack_backlog = 0;
	564	inet_csk_delack_init(sk);
	565
	566	/* There is race window here: we announce ourselves listening,
	567	* but this transition is still not validated by get_port().
	568	* It is OK, because this socket enters to hash table only
	569	* after validation is complete.
	570	*/
	571	sk->sk_state = TCP_LISTEN;
	572	if (!sk->sk_prot->get_port(sk, inet->num)) {
	573	inet->sport = htons(inet->num);
	574
	575	sk_dst_reset(sk);
	576	sk->sk_prot->hash(sk);
	577
	578	return 0;
	579	}
	580
	581	sk->sk_state = TCP_CLOSE;
	582	__reqsk_queue_destroy(&icsk->icsk_accept_queue);
	583	return -EADDRINUSE;
	584	}
	585
	586	EXPORT_SYMBOL_GPL(inet_csk_listen_start);
	587
	588	/*
	589	* This routine closes sockets which have been at least partially
	590	* opened, but not yet accepted.
	591	*/
	592	void inet_csk_listen_stop(struct sock *sk)
	593	{
	594	struct inet_connection_sock *icsk = inet_csk(sk);
	595	struct request_sock *acc_req;
	596	struct request_sock *req;
	597
	598	inet_csk_delete_keepalive_timer(sk);
	599
	600	/* make all the listen_opt local to us */
	601	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
	602
	603	/* Following specs, it would be better either to send FIN
	604	* (and enter FIN-WAIT-1, it is normal close)
	605	* or to send active reset (abort).
	606	* Certainly, it is pretty dangerous while synflood, but it is
	607	* bad justification for our negligence 8)
	608	* To be honest, we are not able to make either
	609	* of the variants now. --ANK
	610	*/
	611	reqsk_queue_destroy(&icsk->icsk_accept_queue);
	612
	613	while ((req = acc_req) != NULL) {
	614	struct sock *child = req->sk;
	615
	616	acc_req = req->dl_next;
	617
	618	local_bh_disable();
	619	bh_lock_sock(child);
	620	BUG_TRAP(!sock_owned_by_user(child));
	621	sock_hold(child);
	622
	623	sk->sk_prot->disconnect(child, O_NONBLOCK);
	624
	625	sock_orphan(child);
	626
	627	atomic_inc(sk->sk_prot->orphan_count);
	628
	629	inet_csk_destroy_sock(child);
	630
	631	bh_unlock_sock(child);
	632	local_bh_enable();
	633	sock_put(child);
	634
	635	sk_acceptq_removed(sk);
	636	__reqsk_free(req);
	637	}
	638	BUG_TRAP(!sk->sk_ack_backlog);
	639	}
	640
	641	EXPORT_SYMBOL_GPL(inet_csk_listen_stop);