Diffstat (limited to 'net/ipv4')
-rw-r--r--   net/ipv4/ip_input.c    |  14
-rw-r--r--   net/ipv4/ip_output.c   |   6
-rw-r--r--   net/ipv4/tcp.c         |  14
-rw-r--r--   net/ipv4/tcp_input.c   |  30
-rw-r--r--   net/ipv4/tcp_output.c  |   8
-rw-r--r--   net/ipv4/udp.c         | 171
6 files changed, 162 insertions, 81 deletions
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 324e7e0fdb2a..97069399d864 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -329,6 +329,7 @@ drop:
 static inline int ip_rcv_finish(struct sk_buff *skb)
 {
 	const struct iphdr *iph = ip_hdr(skb);
+	struct rtable *rt;
 
 	/*
 	 * Initialise the virtual path cache for the packet. It describes
@@ -340,6 +341,8 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
 		if (unlikely(err)) {
 			if (err == -EHOSTUNREACH)
 				IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
+			else if (err == -ENETUNREACH)
+				IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
 			goto drop;
 		}
 	}
@@ -358,6 +361,12 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
 	if (iph->ihl > 5 && ip_rcv_options(skb))
 		goto drop;
 
+	rt = (struct rtable*)skb->dst;
+	if (rt->rt_type == RTN_MULTICAST)
+		IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
+	else if (rt->rt_type == RTN_BROADCAST)
+		IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);
+
 	return dst_input(skb);
 
 drop:
@@ -414,7 +423,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 		goto inhdr_error;
 
 	len = ntohs(iph->tot_len);
-	if (skb->len < len || len < (iph->ihl*4))
+	if (skb->len < len) {
+		IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
+		goto drop;
+	} else if (len < (iph->ihl*4))
 		goto inhdr_error;
 
 	/* Our transport medium may have padded the buffer out. Now we know it
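The counters touched in these hunks (InNoRoutes, InTruncatedPkts, InMcastPkts, InBcastPkts, plus the output-side equivalents below) belong to the extended IP MIB, which kernels of this vintage export in the two "IpExt:" lines of /proc/net/netstat. A minimal user-space sketch for dumping them, assuming that two-line name/value layout (the exact field list varies by kernel version):

/* Dump the IpExt extended IP counters from /proc/net/netstat. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/net/netstat", "r");
	char names[1024], values[1024];
	char *n, *v, *np, *vp;

	if (!f) {
		perror("/proc/net/netstat");
		return 1;
	}
	while (fgets(names, sizeof(names), f)) {
		if (strncmp(names, "IpExt:", 6) != 0)
			continue;
		if (!fgets(values, sizeof(values), f))
			break;
		/* first IpExt: line carries names, the second the values */
		n = strtok_r(names + 6, " \n", &np);
		v = strtok_r(values + 6, " \n", &vp);
		while (n && v) {
			printf("%s = %s\n", n, v);
			n = strtok_r(NULL, " \n", &np);
			v = strtok_r(NULL, " \n", &vp);
		}
		break;
	}
	fclose(f);
	return 0;
}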
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 534650cad3a8..d6427d918512 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -160,9 +160,15 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 static inline int ip_finish_output2(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb->dst;
+	struct rtable *rt = (struct rtable *)dst;
 	struct net_device *dev = dst->dev;
 	int hh_len = LL_RESERVED_SPACE(dev);
 
+	if (rt->rt_type == RTN_MULTICAST)
+		IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
+	else if (rt->rt_type == RTN_BROADCAST)
+		IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
+
 	/* Be paranoid, rather than too clever. */
 	if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 		struct sk_buff *skb2;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2cf9a898ce50..d6e488668171 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1573,14 +1573,12 @@ void tcp_close(struct sock *sk, long timeout)
 
 	sk_stream_mem_reclaim(sk);
 
-	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
-	 * 3.10, we send a RST here because data was lost. To
-	 * witness the awful effects of the old behavior of always
-	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
-	 * a bulk GET in an FTP client, suspend the process, wait
-	 * for the client to advertise a zero window, then kill -9
-	 * the FTP client, wheee... Note: timeout is always zero
-	 * in such a case.
+	/* As outlined in RFC 2525, section 2.17, we send a RST here because
+	 * data was lost. To witness the awful effects of the old behavior of
+	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
+	 * GET in an FTP client, suspend the process, wait for the client to
+	 * advertise a zero window, then kill -9 the FTP client, wheee...
+	 * Note: timeout is always zero in such a case.
 	 */
 	if (data_was_unread) {
 		/* Unread data was tossed, zap the connection. */
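The behavior the rewritten comment describes is easy to observe from user space: calling close() on a TCP socket that still has unread receive data makes the kernel send a RST rather than a FIN, exactly as RFC 2525, section 2.17 recommends. A small demonstration sketch; the port number and the sleep are arbitrary choices, and error handling is omitted:

/* Connect with e.g. netcat, send a line, and the close() below resets
 * the connection instead of half-closing it, because the queued data
 * was never read. */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int srv = socket(AF_INET, SOCK_STREAM, 0), one = 1, conn;
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(5555);	/* arbitrary test port */
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	setsockopt(srv, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
	bind(srv, (struct sockaddr *)&addr, sizeof(addr));
	listen(srv, 1);
	conn = accept(srv, NULL, NULL);

	sleep(5);	/* let the peer queue some data we never read */
	close(conn);	/* unread data -> kernel emits RST, not FIN */
	close(srv);
	return 0;
}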
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 051f0f815f17..7641b2761a14 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1265,20 +1265,15 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	return flag;
 }
 
-/* F-RTO can only be used if these conditions are satisfied:
- *  - there must be some unsent new data
- *  - the advertised window should allow sending it
- *  - TCP has never retransmitted anything other than head (SACK enhanced
- *    variant from Appendix B of RFC4138 is more robust here)
+/* F-RTO can only be used if TCP has never retransmitted anything other than
+ * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
  */
 int tcp_use_frto(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 
-	if (!sysctl_tcp_frto || !tcp_send_head(sk) ||
-	    after(TCP_SKB_CB(tcp_send_head(sk))->end_seq,
-		  tp->snd_una + tp->snd_wnd))
+	if (!sysctl_tcp_frto)
 		return 0;
 
 	if (IsSackFrto())
@@ -2642,7 +2637,9 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag)
  * algorithm is not part of the F-RTO detection algorithm
  * given in RFC4138 but can be selected separately).
  * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
- * and TCP falls back to conventional RTO recovery.
+ * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
+ * of Nagle, this is done using frto_counter states 2 and 3, when a new data
+ * segment of any size sent during F-RTO, state 2 is upgraded to 3.
  *
  * Rationale: if the RTO was spurious, new ACKs should arrive from the
  * original window even after we transmit two new data segments.
@@ -2671,7 +2668,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
 	inet_csk(sk)->icsk_retransmits = 0;
 
 	if (!before(tp->snd_una, tp->frto_highmark)) {
-		tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag);
+		tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
 		return 1;
 	}
 
@@ -2697,7 +2694,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
 		return 1;
 	}
 
-	if ((tp->frto_counter == 2) &&
+	if ((tp->frto_counter >= 2) &&
 	    (!(flag&FLAG_FORWARD_PROGRESS) ||
 	     ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) {
 		/* RFC4138 shortcoming (see comment above) */
@@ -2710,10 +2707,19 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
 	}
 
 	if (tp->frto_counter == 1) {
+		/* Sending of the next skb must be allowed or no FRTO */
+		if (!tcp_send_head(sk) ||
+		    after(TCP_SKB_CB(tcp_send_head(sk))->end_seq,
+			  tp->snd_una + tp->snd_wnd)) {
+			tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3),
+					    flag);
+			return 1;
+		}
+
 		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
 		tp->frto_counter = 2;
 		return 1;
-	} else /* frto_counter == 2 */ {
+	} else {
 		switch (sysctl_tcp_frto_response) {
 		case 2:
 			tcp_undo_spur_to_response(sk, flag);
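These hunks turn frto_counter into a small state machine: 1 after the timeout, 2 once the first ACK arrives and the advertised window allows sending a new probe segment, and 3 (per the new comment) once new data has actually gone out during F-RTO. A toy model of that bookkeeping, with invented helper and flag names, purely to make the transitions explicit; the real decisions live in tcp_process_frto() and tcp_enter_frto_loss():

#include <stdio.h>

/*
 *   1: RTO expired, waiting for the first ACK of the original data
 *   2: that ACK arrived and a new, never-retransmitted probe segment may
 *      be sent (tcp_nagle_test() bypasses Nagle in this state)
 *   3: new data has actually been sent during F-RTO
 */
enum frto_counter { FRTO_FIRST_ACK = 1, FRTO_PROBE = 2, FRTO_DATA_SENT = 3 };

/* Returns 1 when the model falls back to conventional RTO recovery. */
static int frto_model_ack(enum frto_counter *counter,
			  int probe_fits_window,   /* send_head within snd_wnd */
			  int forward_progress,    /* ACK advanced snd_una */
			  int non_original_sacked) /* SACK hit retransmitted data */
{
	if (*counter >= FRTO_PROBE &&
	    (!forward_progress || non_original_sacked))
		return 1;	/* RFC4138 shortcoming: treat the RTO as real loss */

	if (*counter == FRTO_FIRST_ACK) {
		if (!probe_fits_window)
			return 1;	/* cannot send the probe, give up on F-RTO */
		*counter = FRTO_PROBE;	/* the kernel also sets cwnd = in_flight + 2 */
		return 0;
	}

	/* Counter 2 or 3 with forward progress: the RTO was spurious and the
	 * response selected by sysctl_tcp_frto_response is applied. */
	return 0;
}

int main(void)
{
	enum frto_counter c = FRTO_FIRST_ACK;
	int fb;

	fb = frto_model_ack(&c, 1, 1, 0);	/* probe allowed */
	printf("after 1st ACK: fallback=%d counter=%d\n", fb, c);
	fb = frto_model_ack(&c, 1, 1, 0);	/* original window still advancing */
	printf("after 2nd ACK: fallback=%d (spurious RTO)\n", fb);
	return 0;
}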
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e70a6840cb64..0faacf9c419d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1035,8 +1035,10 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
 	if (nonagle & TCP_NAGLE_PUSH)
 		return 1;
 
-	/* Don't use the nagle rule for urgent data (or for the final FIN). */
-	if (tp->urg_mode ||
+	/* Don't use the nagle rule for urgent data (or for the final FIN).
+	 * Nagle can be ignored during F-RTO too (see RFC4138).
+	 */
+	if (tp->urg_mode || (tp->frto_counter == 2) ||
 	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
 		return 1;
 
@@ -2035,7 +2037,7 @@ void tcp_send_fin(struct sock *sk)
 /* We get here when a process closes a file descriptor (either due to
  * an explicit close() or as a byproduct of exit()'ing) and there
  * was unread data in the receive queue. This behavior is recommended
- * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
+ * by RFC 2525, section 2.17. -DaveM
  */
 void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 {
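The extra (tp->frto_counter == 2) test exempts the F-RTO probe segment from Nagle: right after the timeout, previously unsent data has to go out even if it is smaller than an MSS, otherwise the algorithm could not tell a spurious RTO from a genuine loss. A simplified sketch of the resulting decision, not the kernel's tcp_nagle_test() itself (corking, TSO deferral and the Minshall variant are omitted):

#include <stdio.h>

static int nagle_allows_send(int full_sized_segment, int fin_or_urgent,
			     int nonagle_push, int frto_probe_pending,
			     int unacked_in_flight)
{
	if (nonagle_push || fin_or_urgent || full_sized_segment)
		return 1;
	if (frto_probe_pending)		/* frto_counter == 2: let the probe out */
		return 1;
	return !unacked_in_flight;	/* classic Nagle: one small segment at a time */
}

int main(void)
{
	/* A sub-MSS segment with data in flight is normally held back ... */
	printf("normal: %d\n", nagle_allows_send(0, 0, 0, 0, 1));
	/* ... but is released while F-RTO is waiting for its probe segment. */
	printf("F-RTO:  %d\n", nagle_allows_send(0, 0, 0, 1, 1));
	return 0;
}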
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cec0f2cc49b7..144970704c2c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -114,14 +114,33 @@ DEFINE_RWLOCK(udp_hash_lock);
 
 static int udp_port_rover;
 
-static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[])
+/*
+ * Note about this hash function :
+ * Typical use is probably daddr = 0, only dport is going to vary hash
+ */
+static inline unsigned int hash_port_and_addr(__u16 port, __be32 addr)
+{
+	addr ^= addr >> 16;
+	addr ^= addr >> 8;
+	return port ^ addr;
+}
+
+static inline int __udp_lib_port_inuse(unsigned int hash, int port,
+	__be32 daddr, struct hlist_head udptable[])
 {
 	struct sock *sk;
 	struct hlist_node *node;
+	struct inet_sock *inet;
 
-	sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)])
-		if (sk->sk_hash == num)
+	sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) {
+		if (sk->sk_hash != hash)
+			continue;
+		inet = inet_sk(sk);
+		if (inet->num != port)
+			continue;
+		if (inet->rcv_saddr == daddr)
 			return 1;
+	}
 	return 0;
 }
 
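The replacement hash keys the UDP table on local port and bound address together instead of the port alone. A small stand-alone sketch of hash_port_and_addr(), assuming the usual 128-entry UDP_HTABLE_SIZE and a few sample ports and addresses, to show how the two inputs combine into a slot index:

#include <stdio.h>
#include <stdint.h>

#define UDP_HTABLE_SIZE 128	/* assumed table size, as in this kernel */

/* Same mixing as the hash_port_and_addr() introduced above: fold the
 * address down and xor it with the (host-order) local port. */
static unsigned int hash_port_and_addr(uint16_t port, uint32_t addr)
{
	addr ^= addr >> 16;
	addr ^= addr >> 8;
	return port ^ addr;
}

int main(void)
{
	/* Sample values only: 0 is the wildcard address used for unbound
	 * sockets, the others are arbitrary 32-bit address patterns. */
	uint32_t addrs[] = { 0x00000000, 0x0a000001, 0x0a000002 };
	uint16_t ports[] = { 53, 5353, 32768 };
	unsigned int i, j;

	for (i = 0; i < 3; i++)
		for (j = 0; j < 3; j++)
			printf("port %5u addr 0x%08x -> slot %3u\n",
			       ports[j], addrs[i],
			       hash_port_and_addr(ports[j], addrs[i]) &
					(UDP_HTABLE_SIZE - 1));
	return 0;
}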
@@ -142,6 +161,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
 	struct hlist_node *node;
 	struct hlist_head *head;
 	struct sock *sk2;
+	unsigned int hash;
 	int error = 1;
 
 	write_lock_bh(&udp_hash_lock);
@@ -156,7 +176,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
 		for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
 			int size;
 
-			head = &udptable[result & (UDP_HTABLE_SIZE - 1)];
+			hash = hash_port_and_addr(result,
+						  inet_sk(sk)->rcv_saddr);
+			head = &udptable[hash & (UDP_HTABLE_SIZE - 1)];
 			if (hlist_empty(head)) {
 				if (result > sysctl_local_port_range[1])
 					result = sysctl_local_port_range[0] +
@@ -181,7 +203,10 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
 			result = sysctl_local_port_range[0]
 				+ ((result - sysctl_local_port_range[0]) &
 				   (UDP_HTABLE_SIZE - 1));
-			if (! __udp_lib_lport_inuse(result, udptable))
+			hash = hash_port_and_addr(result,
+						  inet_sk(sk)->rcv_saddr);
+			if (! __udp_lib_port_inuse(hash, result,
+						   inet_sk(sk)->rcv_saddr, udptable))
 				break;
 		}
 		if (i >= (1 << 16) / UDP_HTABLE_SIZE)
@@ -189,11 +214,13 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
 gotit:
 		*port_rover = snum = result;
 	} else {
-		head = &udptable[snum & (UDP_HTABLE_SIZE - 1)];
+		hash = hash_port_and_addr(snum, inet_sk(sk)->rcv_saddr);
+		head = &udptable[hash & (UDP_HTABLE_SIZE - 1)];
 
 		sk_for_each(sk2, node, head)
-			if (sk2->sk_hash == snum &&
+			if (sk2->sk_hash == hash &&
 			    sk2 != sk &&
+			    inet_sk(sk2)->num == snum &&
 			    (!sk2->sk_reuse || !sk->sk_reuse) &&
 			    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
 			     || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
@@ -201,9 +228,9 @@ gotit:
 				goto fail;
 	}
 	inet_sk(sk)->num = snum;
-	sk->sk_hash = snum;
+	sk->sk_hash = hash;
 	if (sk_unhashed(sk)) {
-		head = &udptable[snum & (UDP_HTABLE_SIZE - 1)];
+		head = &udptable[hash & (UDP_HTABLE_SIZE - 1)];
 		sk_add_node(sk, head);
 		sock_prot_inc_use(sk->sk_prot);
 	}
@@ -242,63 +269,78 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport,
 {
 	struct sock *sk, *result = NULL;
 	struct hlist_node *node;
-	unsigned short hnum = ntohs(dport);
-	int badness = -1;
+	unsigned int hash, hashwild;
+	int score, best = -1;
+
+	hash = hash_port_and_addr(ntohs(dport), daddr);
+	hashwild = hash_port_and_addr(ntohs(dport), 0);
 
 	read_lock(&udp_hash_lock);
-	sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) {
+
+lookup:
+
+	sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) {
 		struct inet_sock *inet = inet_sk(sk);
 
-		if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) {
-			int score = (sk->sk_family == PF_INET ? 1 : 0);
-			if (inet->rcv_saddr) {
-				if (inet->rcv_saddr != daddr)
-					continue;
-				score+=2;
-			}
-			if (inet->daddr) {
-				if (inet->daddr != saddr)
-					continue;
-				score+=2;
-			}
-			if (inet->dport) {
-				if (inet->dport != sport)
-					continue;
-				score+=2;
-			}
-			if (sk->sk_bound_dev_if) {
-				if (sk->sk_bound_dev_if != dif)
-					continue;
-				score+=2;
-			}
-			if (score == 9) {
-				result = sk;
-				break;
-			} else if (score > badness) {
-				result = sk;
-				badness = score;
-			}
+		if (sk->sk_hash != hash || ipv6_only_sock(sk) ||
+		    inet->num != dport)
+			continue;
+
+		score = (sk->sk_family == PF_INET ? 1 : 0);
+		if (inet->rcv_saddr) {
+			if (inet->rcv_saddr != daddr)
+				continue;
+			score+=2;
+		}
+		if (inet->daddr) {
+			if (inet->daddr != saddr)
+				continue;
+			score+=2;
+		}
+		if (inet->dport) {
+			if (inet->dport != sport)
+				continue;
+			score+=2;
+		}
+		if (sk->sk_bound_dev_if) {
+			if (sk->sk_bound_dev_if != dif)
+				continue;
+			score+=2;
+		}
+		if (score == 9) {
+			result = sk;
+			goto found;
+		} else if (score > best) {
+			result = sk;
+			best = score;
 		}
 	}
+
+	if (hash != hashwild) {
+		hash = hashwild;
+		goto lookup;
+	}
+found:
 	if (result)
 		sock_hold(result);
 	read_unlock(&udp_hash_lock);
 	return result;
 }
 
-static inline struct sock *udp_v4_mcast_next(struct sock *sk,
-					     __be16 loc_port, __be32 loc_addr,
-					     __be16 rmt_port, __be32 rmt_addr,
-					     int dif)
+static inline struct sock *udp_v4_mcast_next(
+		struct sock *sk,
+		unsigned int hnum, __be16 loc_port, __be32 loc_addr,
+		__be16 rmt_port, __be32 rmt_addr,
+		int dif)
 {
 	struct hlist_node *node;
 	struct sock *s = sk;
-	unsigned short hnum = ntohs(loc_port);
 
 	sk_for_each_from(s, node) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (s->sk_hash != hnum ||
+		    inet->num != loc_port ||
 		    (inet->daddr && inet->daddr != rmt_addr) ||
 		    (inet->dport != rmt_port && inet->dport) ||
 		    (inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
@@ -1129,29 +1171,44 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb,
 				    __be32 saddr, __be32 daddr,
 				    struct hlist_head udptable[])
 {
-	struct sock *sk;
+	struct sock *sk, *skw, *sknext;
 	int dif;
+	unsigned int hash = hash_port_and_addr(ntohs(uh->dest), daddr);
+	unsigned int hashwild = hash_port_and_addr(ntohs(uh->dest), 0);
 
-	read_lock(&udp_hash_lock);
-	sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
 	dif = skb->dev->ifindex;
-	sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
-	if (sk) {
-		struct sock *sknext = NULL;
 
+	read_lock(&udp_hash_lock);
+
+	sk = sk_head(&udptable[hash & (UDP_HTABLE_SIZE - 1)]);
+	skw = sk_head(&udptable[hashwild & (UDP_HTABLE_SIZE - 1)]);
+
+	sk = udp_v4_mcast_next(sk, hash, uh->dest, daddr, uh->source, saddr, dif);
+	if (!sk) {
+		hash = hashwild;
+		sk = udp_v4_mcast_next(skw, hash, uh->dest, daddr, uh->source,
+			saddr, dif);
+	}
+	if (sk) {
 		do {
 			struct sk_buff *skb1 = skb;
-
-			sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
-						   uh->source, saddr, dif);
+			sknext = udp_v4_mcast_next(sk_next(sk), hash, uh->dest,
+				daddr, uh->source, saddr, dif);
+			if (!sknext && hash != hashwild) {
+				hash = hashwild;
+				sknext = udp_v4_mcast_next(skw, hash, uh->dest,
+					daddr, uh->source, saddr, dif);
+			}
 			if (sknext)
 				skb1 = skb_clone(skb, GFP_ATOMIC);
 
 			if (skb1) {
 				int ret = udp_queue_rcv_skb(sk, skb1);
 				if (ret > 0)
-					/* we should probably re-process instead
-					 * of dropping packets here. */
+					/*
+					 * we should probably re-process
+					 * instead of dropping packets here.
+					 */
 					kfree_skb(skb1);
 			}
 			sk = sknext;