From b2532eb9abd88384aa586169b54a3e53574f29f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Oct 2014 18:06:35 -0700 Subject: tcp: fix ooo_okay setting vs Small Queues TCP Small Queues (tcp_tsq_handler()) can hold one reference on sk->sk_wmem_alloc, preventing skb->ooo_okay being set. We should relax test done to set skb->ooo_okay to take care of this extra reference. Minimal truesize of skb containing one byte of payload is SKB_TRUESIZE(1) Without this fix, we have more chance locking flows into the wrong transmit queue. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8d4eac793700..0a5d97c20aa9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -914,9 +914,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_ca_event(sk, CA_EVENT_TX_START); /* if no packet is in qdisc/device queue, then allow XPS to select - * another queue. + * another queue. We can be called from tcp_tsq_handler() + * which holds one reference to sk_wmem_alloc. + * + * TODO: Ideally, in-flight pure ACK packets should not matter here. + * One way to get this would be to set skb->truesize = 2 on them. */ - skb->ooo_okay = sk_wmem_alloc_get(sk) == 0; + skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); -- cgit v1.2.2 From ad971f616aa98ea2503f1a1064637bfb4ef7b21e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 11 Oct 2014 15:17:29 -0700 Subject: tcp: fix tcp_ack() performance problem We worked hard to improve tcp_ack() performance, by not accessing skb_shinfo() in fast path (cd7d8498c9a5 tcp: change tcp_skb_pcount() location) We still have one spurious access because of ACK timestamping, added in commit e1c8a607b281 ("net-timestamp: ACK timestamp for bytestreams") By checking if sk_tsflags has SOF_TIMESTAMPING_TX_ACK set, we can avoid two cache line misses for the common case. While we are at it, add two prefetchw() : One in tcp_ack() to bring skb at the head of write queue. One in tcp_clean_rtx_queue() loop to bring following skb, as we will delete skb from the write queue and dirty skb->next->prev. Add a couple of [un]likely() clauses. After this patch, tcp_ack() is no longer the most consuming function in tcp stack. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Van Jacobson Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 00a41499d52c..a12b455928e5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -3029,6 +3030,21 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) return packets_acked; } +static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, + u32 prior_snd_una) +{ + const struct skb_shared_info *shinfo; + + /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */ + if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK))) + return; + + shinfo = skb_shinfo(skb); + if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) && + between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1)) + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); +} + /* Remove acknowledged frames from the retransmission queue. If our packet * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. @@ -3052,14 +3068,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt.v64 = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { - struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u8 sacked = scb->sacked; u32 acked_pcount; - if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) && - between(shinfo->tskey, prior_snd_una, tp->snd_una - 1)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + tcp_ack_tstamp(sk, skb, prior_snd_una); /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { @@ -3073,10 +3086,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, fully_acked = false; } else { + /* Speedup tcp_unlink_write_queue() and next loop */ + prefetchw(skb->next); acked_pcount = tcp_skb_pcount(skb); } - if (sacked & TCPCB_RETRANS) { + if (unlikely(sacked & TCPCB_RETRANS)) { if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; @@ -3107,7 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ - if (!(scb->tcp_flags & TCPHDR_SYN)) { + if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { flag |= FLAG_DATA_ACKED; } else { flag |= FLAG_SYN_ACKED; @@ -3119,9 +3134,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - if (skb == tp->retransmit_skb_hint) + if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; - if (skb == tp->lost_skb_hint) + if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; } @@ -3132,7 +3147,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_SACK_RENEGING; skb_mstamp_get(&now); - if (first_ackt.v64) { + if (likely(first_ackt.v64)) { seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); } @@ -3394,6 +3409,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int acked = 0; /* Number of packets newly acked */ long sack_rtt_us = -1L; + /* We very likely will need to access write queue head. */ + prefetchw(sk->sk_write_queue.next); + /* If the ack is older than previous acks * then we can probably ignore it. */ -- cgit v1.2.2 From f76936d07c4eeb36d8dbb64ebd30ab46ff85d9f7 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 13 Oct 2014 16:34:10 +0200 Subject: ipv4: fix nexthop attlen check in fib_nh_match fib_nh_match does not match nexthops correctly. Example: ip route add 172.16.10/24 nexthop via 192.168.122.12 dev eth0 \ nexthop via 192.168.122.13 dev eth0 ip route del 172.16.10/24 nexthop via 192.168.122.14 dev eth0 \ nexthop via 192.168.122.15 dev eth0 Del command is successful and route is removed. After this patch applied, the route is correctly matched and result is: RTNETLINK answers: No such process Please consider this for stable trees as well. Fixes: 4e902c57417c4 ("[IPv4]: FIB configuration using struct fib_config") Signed-off-by: Jiri Pirko Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 5b6efb3d2308..f99f41bd15b8 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -537,7 +537,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) return 1; attrlen = rtnh_attrlen(rtnh); - if (attrlen < 0) { + if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); -- cgit v1.2.2 From 9b462d02d6dd671a9ebdc45caed6fe98a53c0ebe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Oct 2014 06:27:47 -0700 Subject: tcp: TCP Small Queues and strange attractors TCP Small queues tries to keep number of packets in qdisc as small as possible, and depends on a tasklet to feed following packets at TX completion time. Choice of tasklet was driven by latencies requirements. Then, TCP stack tries to avoid reorders, by locking flows with outstanding packets in qdisc in a given TX queue. What can happen is that many flows get attracted by a low performing TX queue, and cpu servicing TX completion has to feed packets for all of them, making this cpu 100% busy in softirq mode. This became particularly visible with latest skb->xmit_more support Strategy adopted in this patch is to detect when tcp_wfree() is called from ksoftirqd and let the outstanding queue for this flow being drained before feeding additional packets, so that skb->ooo_okay can be set to allow select_queue() to select the optimal queue : Incoming ACKS are normally handled by different cpus, so this patch gives more chance for these cpus to take over the burden of feeding qdisc with future packets. Tested: lpaa23:~# ./super_netperf 1400 --google-pacing-rate 3028000 -H lpaa24 -l 3600 & lpaa23:~# sar -n DEV 1 10 | grep eth1 06:16:18 AM eth1 595448.00 1190564.00 38381.09 1760253.12 0.00 0.00 1.00 06:16:19 AM eth1 594858.00 1189686.00 38340.76 1758952.72 0.00 0.00 0.00 06:16:20 AM eth1 597017.00 1194019.00 38480.79 1765370.29 0.00 0.00 1.00 06:16:21 AM eth1 595450.00 1190936.00 38380.19 1760805.05 0.00 0.00 0.00 06:16:22 AM eth1 596385.00 1193096.00 38442.56 1763976.29 0.00 0.00 1.00 06:16:23 AM eth1 598155.00 1195978.00 38552.97 1768264.60 0.00 0.00 0.00 06:16:24 AM eth1 594405.00 1188643.00 38312.57 1757414.89 0.00 0.00 1.00 06:16:25 AM eth1 593366.00 1187154.00 38252.16 1755195.83 0.00 0.00 0.00 06:16:26 AM eth1 593188.00 1186118.00 38232.88 1753682.57 0.00 0.00 1.00 06:16:27 AM eth1 596301.00 1192241.00 38440.94 1762733.09 0.00 0.00 0.00 Average: eth1 595457.30 1190843.50 38381.69 1760664.84 0.00 0.00 0.50 lpaa23:~# ./tc -s -d qd sh dev eth1 | grep backlog backlog 7606336b 2513p requeues 167982 backlog 224072b 74p requeues 566 backlog 581376b 192p requeues 5598 backlog 181680b 60p requeues 1070 backlog 5305056b 1753p requeues 110166 // Here, this TX queue is attracting flows backlog 157456b 52p requeues 1758 backlog 672216b 222p requeues 3025 backlog 60560b 20p requeues 24541 backlog 448144b 148p requeues 21258 lpaa23:~# echo 1 >/proc/sys/net/ipv4/tcp_tsq_enable_tcp_wfree_ksoftirqd_detect Immediate jump to full bandwidth, and traffic is properly shard on all tx queues. lpaa23:~# sar -n DEV 1 10 | grep eth1 06:16:46 AM eth1 1397632.00 2795397.00 90081.87 4133031.26 0.00 0.00 1.00 06:16:47 AM eth1 1396874.00 2793614.00 90032.99 4130385.46 0.00 0.00 0.00 06:16:48 AM eth1 1395842.00 2791600.00 89966.46 4127409.67 0.00 0.00 1.00 06:16:49 AM eth1 1395528.00 2791017.00 89946.17 4126551.24 0.00 0.00 0.00 06:16:50 AM eth1 1397891.00 2795716.00 90098.74 4133497.39 0.00 0.00 1.00 06:16:51 AM eth1 1394951.00 2789984.00 89908.96 4125022.51 0.00 0.00 0.00 06:16:52 AM eth1 1394608.00 2789190.00 89886.90 4123851.36 0.00 0.00 1.00 06:16:53 AM eth1 1395314.00 2790653.00 89934.33 4125983.09 0.00 0.00 0.00 06:16:54 AM eth1 1396115.00 2792276.00 89984.25 4128411.21 0.00 0.00 1.00 06:16:55 AM eth1 1396829.00 2793523.00 90030.19 4130250.28 0.00 0.00 0.00 Average: eth1 1396158.40 2792297.00 89987.09 4128439.35 0.00 0.00 0.50 lpaa23:~# tc -s -d qd sh dev eth1 | grep backlog backlog 7900052b 2609p requeues 173287 backlog 878120b 290p requeues 589 backlog 1068884b 354p requeues 5621 backlog 996212b 329p requeues 1088 backlog 984100b 325p requeues 115316 backlog 956848b 316p requeues 1781 backlog 1080996b 357p requeues 3047 backlog 975016b 322p requeues 24571 backlog 990156b 327p requeues 21274 (All 8 TX queues get a fair share of the traffic) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0a5d97c20aa9..e13d77857225 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -839,26 +839,38 @@ void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); + int wmem; + + /* Keep one reference on sk_wmem_alloc. + * Will be released by sk_free() from here or tcp_tasklet_func() + */ + wmem = atomic_sub_return(skb->truesize - 1, &sk->sk_wmem_alloc); + + /* If this softirq is serviced by ksoftirqd, we are likely under stress. + * Wait until our queues (qdisc + devices) are drained. + * This gives : + * - less callbacks to tcp_write_xmit(), reducing stress (batches) + * - chance for incoming ACK (processed by another cpu maybe) + * to migrate this flow (skb->ooo_okay will be eventually set) + */ + if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) + goto out; if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { unsigned long flags; struct tsq_tasklet *tsq; - /* Keep a ref on socket. - * This last ref will be released in tcp_tasklet_func() - */ - atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); - /* queue this socket to tasklet queue */ local_irq_save(flags); tsq = &__get_cpu_var(tsq_tasklet); list_add(&tp->tsq_node, &tsq->head); tasklet_schedule(&tsq->tasklet); local_irq_restore(flags); - } else { - sock_wfree(skb); + return; } +out: + sk_free(sk); } /* This routine actually transmits TCP packets queued in by -- cgit v1.2.2 From 2077eebf7d8bf20b36524de45851e28111a60c52 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Oct 2014 14:33:20 -0700 Subject: ipv4: call __ip_options_echo() in cookie_v4_check() commit 971f10eca186cab238c49da ("tcp: better TCP_SKB_CB layout to reduce cache line misses") missed that cookie_v4_check() still calls ip_options_echo() which uses IPCB(). It should use TCPCB() at TCP layer, so call __ip_options_echo() instead. Fixes: commit 971f10eca186cab238c49da ("tcp: better TCP_SKB_CB layout to reduce cache line misses") Cc: Krzysztof Kolasa Cc: Eric Dumazet Reported-by: Krzysztof Kolasa Tested-by: Krzysztof Kolasa Signed-off-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 0431a8f3c8f4..7e7401cdb9d7 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -321,7 +321,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, int opt_size = sizeof(struct ip_options_rcu) + opt->optlen; ireq->opt = kmalloc(opt_size, GFP_ATOMIC); - if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) { + if (ireq->opt != NULL && __ip_options_echo(&ireq->opt->opt, skb, opt)) { kfree(ireq->opt); ireq->opt = NULL; } -- cgit v1.2.2 From e25f866fbc8a4bf387b5dbe8e25aa5b07e55c74f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Oct 2014 14:33:21 -0700 Subject: ipv4: share tcp_v4_save_options() with cookie_v4_check() cookie_v4_check() allocates ip_options_rcu in the same way with tcp_v4_save_options(), we can just make it a helper function. Cc: Krzysztof Kolasa Cc: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 10 +--------- net/ipv4/tcp_ipv4.c | 20 -------------------- 2 files changed, 1 insertion(+), 29 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 7e7401cdb9d7..c68d0a1a468e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -317,15 +317,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ - if (opt && opt->optlen) { - int opt_size = sizeof(struct ip_options_rcu) + opt->optlen; - - ireq->opt = kmalloc(opt_size, GFP_ATOMIC); - if (ireq->opt != NULL && __ip_options_echo(&ireq->opt->opt, skb, opt)) { - kfree(ireq->opt); - ireq->opt = NULL; - } - } + ireq->opt = tcp_v4_save_options(skb); if (security_inet_conn_request(sk, skb, req)) { reqsk_free(req); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 552e87e3c269..6a2a7d659a7a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -880,26 +880,6 @@ bool tcp_syn_flood_action(struct sock *sk, } EXPORT_SYMBOL(tcp_syn_flood_action); -/* - * Save and compile IPv4 options into the request_sock if needed. - */ -static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) -{ - const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; - struct ip_options_rcu *dopt = NULL; - - if (opt && opt->optlen) { - int opt_size = sizeof(*dopt) + opt->optlen; - - dopt = kmalloc(opt_size, GFP_ATOMIC); - if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { - kfree(dopt); - dopt = NULL; - } - } - return dopt; -} - #ifdef CONFIG_TCP_MD5SIG /* * RFC2385 MD5 checksumming requires a mapping of -- cgit v1.2.2 From 461b74c391c4ec9c766794e158508c357d8952e6 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Oct 2014 14:33:22 -0700 Subject: ipv4: clean up cookie_v4_check() We can retrieve opt from skb, no need to pass it as a parameter. And opt should always be non-NULL, no need to check. Cc: Krzysztof Kolasa Cc: Eric Dumazet Tested-by: Krzysztof Kolasa Signed-off-by: Cong Wang Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 6 +++--- net/ipv4/tcp_ipv4.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c68d0a1a468e..d346303fac77 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -255,9 +255,9 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, } EXPORT_SYMBOL(cookie_check_timestamp); -struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, - struct ip_options *opt) +struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { + struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct tcp_options_received tcp_opt; struct inet_request_sock *ireq; struct tcp_request_sock *treq; @@ -336,7 +336,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), - (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, + opt->srr ? opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6a2a7d659a7a..94d1a7757ff7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1408,7 +1408,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_SYN_COOKIES if (!th->syn) - sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt); + sk = cookie_v4_check(sk, skb); #endif return sk; } -- cgit v1.2.2 From 4062090e3e5caaf55bed4523a69f26c3265cc1d2 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 15 Oct 2014 16:24:02 +0400 Subject: ipv4: dst_entry leak in ip_send_unicast_reply() ip_setup_cork() called inside ip_append_data() steals dst entry from rt to cork and in case errors in __ip_append_data() nobody frees stolen dst entry Fixes: 2e77d89b2fa8 ("net: avoid a pair of dst_hold()/dst_release() in ip_append_data()") Signed-off-by: Vasily Averin Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e35b71289156..88e5ef2c7f51 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1535,6 +1535,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, struct sk_buff *nskb; struct sock *sk; struct inet_sock *inet; + int err; if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) return; @@ -1574,8 +1575,13 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, sock_net_set(sk, net); __skb_queue_head_init(&sk->sk_write_queue); sk->sk_sndbuf = sysctl_wmem_default; - ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, - &ipc, &rt, MSG_DONTWAIT); + err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, + len, 0, &ipc, &rt, MSG_DONTWAIT); + if (unlikely(err)) { + ip_flush_pending_frames(sk); + goto out; + } + nskb = skb_peek(&sk->sk_write_queue); if (nskb) { if (arg->csumoffset >= 0) @@ -1587,7 +1593,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } - +out: put_cpu_var(unicast_sock); ip_rt_put(rt); -- cgit v1.2.2 From 1245dfc8cadb258386fcd27df38215a0eccb1f17 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 17 Oct 2014 16:53:23 +0800 Subject: ipv4: fix a potential use after free in ip_tunnel_core.c pskb_may_pull() maybe change skb->data and make eth pointer oboslete, so set eth after pskb_may_pull() Fixes:3d7b46cd("ip_tunnel: push generic protocol handling to ip_tunnel module") Cc: Pravin B Shelar Signed-off-by: Li RongQing Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index f4c987bb7e94..88c386cf7d85 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -91,11 +91,12 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) skb_pull_rcsum(skb, hdr_len); if (inner_proto == htons(ETH_P_TEB)) { - struct ethhdr *eh = (struct ethhdr *)skb->data; + struct ethhdr *eh; if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) return -ENOMEM; + eh = (struct ethhdr *)skb->data; if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN)) skb->protocol = eh->h_proto; else -- cgit v1.2.2 From d8f00d27105a1553a13d4a96c3eb4544f70ca908 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 17 Oct 2014 16:53:47 +0800 Subject: ipv4: fix a potential use after free in fou.c pskb_may_pull() maybe change skb->data and make uh pointer oboslete, so reload uh and guehdr Fixes: 37dd0247 ("gue: Receive side for Generic UDP Encapsulation") Cc: Tom Herbert Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/ipv4/fou.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index efa70ad44906..32e78924e246 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -87,6 +87,9 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) if (!pskb_may_pull(skb, len)) goto drop; + uh = udp_hdr(skb); + guehdr = (struct guehdr *)&uh[1]; + if (guehdr->version != 0) goto drop; -- cgit v1.2.2