From c8ac37746489f05a32a958b048f29ae45487e81e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 16 Aug 2005 20:43:40 -0700 Subject: [TCP]: Fix bug #5070: kernel BUG at net/ipv4/tcp_output.c:864 1) We send out a normal sized packet with TSO on to start off. 2) ICMP is received indicating a smaller MTU. 3) We send the current sk_send_head which needs to be fragmented since it was created before the ICMP event. The first fragment is then sent out. At this point the remaining fragment is allocated by tcp_fragment. However, its size is padded to fit the L1 cache-line size therefore creating tail-room up to 124 bytes long. This fragment will also be sitting at sk_send_head. 4) tcp_sendmsg is called again and it stores data in the tail-room of of the fragment. 5) tcp_push_one is called by tcp_sendmsg which then calls tso_fragment since the packet as a whole exceeds the MTU. At this point we have a packet that has data in the head area being fed to tso_fragment which bombs out. My take on this is that we shouldn't ever call tcp_fragment on a TSO socket for a packet that is yet to be transmitted since this creates a packet on sk_send_head that cannot be extended. So here is a patch to change it so that tso_fragment is always used in this case. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3ed6fc15815b..566045e58437 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -861,7 +861,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, u16 flags; /* All of a TSO frame must be composed of paged data. */ - BUG_ON(skb->len != skb->data_len); + if (skb->len != skb->data_len) + return tcp_fragment(sk, skb, len, mss_now); buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC); if (unlikely(buff == NULL)) @@ -974,6 +975,8 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) sent_pkts = 0; while ((skb = sk->sk_send_head)) { + unsigned int limit; + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); @@ -994,9 +997,10 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) break; } + limit = mss_now; if (tso_segs > 1) { - u32 limit = tcp_window_allows(tp, skb, - mss_now, cwnd_quota); + limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); if (skb->len < limit) { unsigned int trim = skb->len % mss_now; @@ -1004,15 +1008,12 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) if (trim) limit = skb->len - trim; } - if (skb->len > limit) { - if (tso_fragment(sk, skb, limit, mss_now)) - break; - } - } else if (unlikely(skb->len > mss_now)) { - if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now))) - break; } + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now))) + break; + TCP_SKB_CB(skb)->when = tcp_time_stamp; if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) @@ -1064,11 +1065,14 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH); if (likely(cwnd_quota)) { + unsigned int limit; + BUG_ON(!tso_segs); + limit = mss_now; if (tso_segs > 1) { - u32 limit = tcp_window_allows(tp, skb, - mss_now, cwnd_quota); + limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); if (skb->len < limit) { unsigned int trim = skb->len % mss_now; @@ -1076,15 +1080,12 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) if (trim) limit = skb->len - trim; } - if (skb->len > limit) { - if (unlikely(tso_fragment(sk, skb, limit, mss_now))) - return; - } - } else if (unlikely(skb->len > mss_now)) { - if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now))) - return; } + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now))) + return; + /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; -- cgit v1.2.2 From cb94c62c252796f42bb83fe40960d12f3ea5a82a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 18 Aug 2005 14:05:44 -0700 Subject: [IPV4]: Fix DST leak in icmp_push_reply() Based upon a bug report and initial patch by Ollie Wild. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 3d78464f64ea..badfc5849973 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -349,12 +349,12 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, { struct sk_buff *skb; - ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, - icmp_param->data_len+icmp_param->head_len, - icmp_param->head_len, - ipc, rt, MSG_DONTWAIT); - - if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { + if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, + icmp_param->data_len+icmp_param->head_len, + icmp_param->head_len, + ipc, rt, MSG_DONTWAIT) < 0) + ip_flush_pending_frames(icmp_socket->sk); + else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { struct icmphdr *icmph = skb->h.icmph; unsigned int csum = 0; struct sk_buff *skb1; -- cgit v1.2.2 From 6fc8b9e7c60d4a3d4d7f1189f74e37651f5610e6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 18 Aug 2005 14:36:59 -0700 Subject: [IPCOMP]: Fix false smp_processor_id warning This patch fixes a false-positive from debug_smp_processor_id(). The processor ID is only used to look up crypto_tfm objects. Any processor ID is acceptable here as long as it is one that is iterated on by for_each_cpu(). Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ipcomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 2065944fd9e5..7ded6e60f43a 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -358,7 +358,7 @@ static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name) int cpu; /* This can be any valid CPU ID so we don't need locking. */ - cpu = smp_processor_id(); + cpu = raw_smp_processor_id(); list_for_each_entry(pos, &ipcomp_tfms_list, list) { struct crypto_tfm *tfm; -- cgit v1.2.2 From fd841326d73096ad79be9c3fa348f9ad04541cc2 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 20 Aug 2005 17:38:40 -0700 Subject: [NETFILTER]: Fix ECN target TCP marking An incorrect check made it bail out before doing anything. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_ECN.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index ada9911118e9..d3250a3a5db4 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -61,10 +61,10 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) if (!tcph) return 0; - if (!(einfo->operation & IPT_ECN_OP_SET_ECE - || tcph->ece == einfo->proto.tcp.ece) - && (!(einfo->operation & IPT_ECN_OP_SET_CWR - || tcph->cwr == einfo->proto.tcp.cwr))) + if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) || + tcph->ece == einfo->proto.tcp.ece) && + ((!(einfo->operation & IPT_ECN_OP_SET_CWR) || + tcph->cwr == einfo->proto.tcp.cwr))) return 1; if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) -- cgit v1.2.2 From f93592ff4fa4a55aa7640d435fa93338e190294d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 20 Aug 2005 17:39:15 -0700 Subject: [NETFILTER]: Fix HW checksum handling in ECN target Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_ECN.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index d3250a3a5db4..94a0ce1c1c9d 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -71,6 +71,10 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) return 0; tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; + if ((*pskb)->ip_summed == CHECKSUM_HW && + skb_checksum_help(*pskb, inward)) + return 0; + diffs[0] = ((u_int16_t *)tcph)[6]; if (einfo->operation & IPT_ECN_OP_SET_ECE) tcph->ece = einfo->proto.tcp.ece; @@ -79,13 +83,10 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) diffs[1] = ((u_int16_t *)tcph)[6]; diffs[0] = diffs[0] ^ 0xFFFF; - if ((*pskb)->ip_summed != CHECKSUM_HW) + if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) tcph->check = csum_fold(csum_partial((char *)diffs, sizeof(diffs), tcph->check^0xFFFF)); - else - if (skb_checksum_help(*pskb, inward)) - return 0; (*pskb)->nfcache |= NFC_ALTERED; return 1; } -- cgit v1.2.2 From 7e71af49d46e4c25f17a2c8f53d62ffd14f01007 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 20 Aug 2005 17:40:41 -0700 Subject: [NETFILTER]: Fix HW checksum handling in TCPMSS target Most importantly, remove bogus BUG() in receive path. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_TCPMSS.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c index 1049050b2bfb..7b84a254440e 100644 --- a/net/ipv4/netfilter/ipt_TCPMSS.c +++ b/net/ipv4/netfilter/ipt_TCPMSS.c @@ -61,6 +61,10 @@ ipt_tcpmss_target(struct sk_buff **pskb, if (!skb_ip_make_writable(pskb, (*pskb)->len)) return NF_DROP; + if ((*pskb)->ip_summed == CHECKSUM_HW && + skb_checksum_help(*pskb, out == NULL)) + return NF_DROP; + iph = (*pskb)->nh.iph; tcplen = (*pskb)->len - iph->ihl*4; @@ -186,9 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb, newmss); retmodified: - /* We never hw checksum SYN packets. */ - BUG_ON((*pskb)->ip_summed == CHECKSUM_HW); - (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; return IPT_CONTINUE; } -- cgit v1.2.2 From 14869c388673e8db3348ab3706fa6485d0f0cf95 Mon Sep 17 00:00:00 2001 From: Dmitry Yusupov Date: Tue, 23 Aug 2005 10:09:27 -0700 Subject: [TCP]: Do TSO deferral even if tail SKB can go out now. If the tail SKB fits into the window, it is still benefitical to defer until the goal percentage of the window is available. This give the application time to feed more data into the send queue and thus results in larger TSO frames going out. Patch from Dmitry Yusupov . Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 566045e58437..dd30dd137b74 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -925,10 +925,6 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ limit = min(send_win, cong_win); - /* If sk_send_head can be sent fully now, just do it. */ - if (skb->len <= limit) - return 0; - if (sysctl_tcp_tso_win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); -- cgit v1.2.2 From 1344a41637114485fac7afa1505bce2ff862807a Mon Sep 17 00:00:00 2001 From: Dave Johnson Date: Tue, 23 Aug 2005 10:10:15 -0700 Subject: [IPV4]: Fix negative timer loop with lots of ipv4 peers. From: Dave Johnson Found this bug while doing some scaling testing that created 500K inet peers. peer_check_expire() in net/ipv4/inetpeer.c isn't using inet_peer_gc_mintime correctly and will end up creating an expire timer with less than the minimum duration, and even zero/negative if enough active peers are present. If >65K peers, the timer will be less than inet_peer_gc_mintime, and with >70K peers, the timer duration will reach zero and go negative. The timer handler will continue to schedule another zero/negative timer in a loop until peers can be aged. This can continue for at least a few minutes or even longer if the peers remain active due to arriving packets while the loop is occurring. Bug is present in both 2.4 and 2.6. Same patch will apply to both just fine. Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv4/inetpeer.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 95473953c406..ab18a853d7ce 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -450,10 +450,13 @@ static void peer_check_expire(unsigned long dummy) /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime * interval depending on the total number of entries (more entries, * less interval). */ - peer_periodic_timer.expires = jiffies - + inet_peer_gc_maxtime - - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * - peer_total / inet_peer_threshold * HZ; + if (peer_total >= inet_peer_threshold) + peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; + else + peer_periodic_timer.expires = jiffies + + inet_peer_gc_maxtime + - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * + peer_total / inet_peer_threshold * HZ; add_timer(&peer_periodic_timer); } -- cgit v1.2.2 From 66a79a19a7c582efd99bb143c3a59fbda006eb39 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 23 Aug 2005 10:10:35 -0700 Subject: [NETFILTER]: Fix HW checksum handling in ip_queue/ip6_queue The checksum needs to be filled in on output, after mangling a packet ip_summed needs to be reset. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_queue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index eda1fba431a4..c6baa8174389 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -214,6 +214,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) break; case IPQ_COPY_PACKET: + if (entry->skb->ip_summed == CHECKSUM_HW && + (*errp = skb_checksum_help(entry->skb, + entry->info->outdev == NULL))) { + read_unlock_bh(&queue_lock); + return NULL; + } if (copy_range == 0 || copy_range > entry->skb->len) data_len = entry->skb->len; else @@ -385,6 +391,7 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) if (!skb_ip_make_writable(&e->skb, v->data_len)) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); + e->skb->ip_summed = CHECKSUM_NONE; e->skb->nfcache |= NFC_ALTERED; /* -- cgit v1.2.2 From 89ebd197eb2cd31d6187db344d5117064e19fdde Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Aug 2005 10:13:06 -0700 Subject: [TCP]: Unconditionally clear TCP_NAGLE_PUSH in skb_entail(). Intention of this bit is to force pushing of the existing send queue when TCP_CORK or TCP_NODELAY state changes via setsockopt(). But it's easy to create a situation where the bit never clears. For example, if the send queue starts empty: 1) set TCP_NODELAY 2) clear TCP_NODELAY 3) set TCP_CORK 4) do small write() The current code will leave TCP_NAGLE_PUSH set after that sequence. Unconditionally clearing the bit when new data is added via skb_entail() solves the problem. Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ddb6ce4ecff2..69b1fcf70077 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -584,7 +584,7 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp, sk_charge_skb(sk, skb); if (!sk->sk_send_head) sk->sk_send_head = skb; - else if (tp->nonagle&TCP_NAGLE_PUSH) + if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; } -- cgit v1.2.2 From d5d283751ef3c05b6766501a46800cbee84959d6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Aug 2005 10:49:54 -0700 Subject: [TCP]: Document non-trivial locking path in tcp_v{4,6}_get_port(). This trips up a lot of folks reading this code. Put an unlikely() around the port-exhaustion test for good measure. Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5d91213d34c0..67c670886c1f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -242,9 +242,14 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) tcp_port_rover = rover; spin_unlock(&tcp_portalloc_lock); - /* Exhausted local port range during search? */ + /* Exhausted local port range during search? It is not + * possible for us to be holding one of the bind hash + * locks if this test triggers, because if 'remaining' + * drops to zero, we broke out of the do/while loop at + * the top level, not from the 'break;' statement. + */ ret = 1; - if (remaining <= 0) + if (unlikely(remaining <= 0)) goto fail; /* OK, here is the one we will use. HEAD is -- cgit v1.2.2