From 5ffd5cddd4d353fe18c01b89397dcad02035bb95 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 5 Sep 2013 17:48:47 +0200 Subject: net: netlink: filter particular protocols from analyzers Fix finer-grained control and let only a whitelist of allowed netlink protocols pass, in our case related to networking. If later on, other subsystems decide they want to add their protocol as well to the list of allowed protocols they shall simply add it. While at it, we also need to tell what protocol is in use otherwise BPF_S_ANC_PROTOCOL can not pick it up (as it's not filled out). Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a17dda1bbee0..8df7f64c6db3 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -168,16 +168,43 @@ int netlink_remove_tap(struct netlink_tap *nt) } EXPORT_SYMBOL_GPL(netlink_remove_tap); +static bool netlink_filter_tap(const struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + bool pass = false; + + /* We take the more conservative approach and + * whitelist socket protocols that may pass. + */ + switch (sk->sk_protocol) { + case NETLINK_ROUTE: + case NETLINK_USERSOCK: + case NETLINK_SOCK_DIAG: + case NETLINK_NFLOG: + case NETLINK_XFRM: + case NETLINK_FIB_LOOKUP: + case NETLINK_NETFILTER: + case NETLINK_GENERIC: + pass = true; + break; + } + + return pass; +} + static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *nskb; + struct sock *sk = skb->sk; int ret = -ENOMEM; dev_hold(dev); nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; + nskb->protocol = htons((u16) sk->sk_protocol); + ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) ret = net_xmit_errno(ret); @@ -192,6 +219,9 @@ static void __netlink_deliver_tap(struct sk_buff *skb) int ret; struct netlink_tap *tmp; + if (!netlink_filter_tap(skb)) + return; + list_for_each_entry_rcu(tmp, &netlink_tap_all, list) { ret = __netlink_deliver_tap_skb(skb, tmp->dev); if (unlikely(ret)) -- cgit v1.2.2 From 16edfe7ee02dd7fcc13855ea19158333529533b2 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 5 Sep 2013 15:36:09 -0700 Subject: tcp: fix no cwnd growth after timeout In commit 0f7cc9a3 "tcp: increase throughput when reordering is high", it only allows cwnd to increase in Open state. This mistakenly disables slow start after timeout (CA_Loss). Moreover cwnd won't grow if the state moves from Disorder to Open later in tcp_fastretrans_alert(). Therefore the correct logic should be to allow cwnd to grow as long as the data is received in order in Open, Loss, or even Disorder state. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1969e16d936d..894bc174f472 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3162,16 +3162,14 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) /* If reordering is high then always grow cwnd whenever data is * delivered regardless of its ordering. Otherwise stay conservative - * and only grow cwnd on in-order delivery in Open state, and retain - * cwnd in Disordered state (RFC5681). A stretched ACK with + * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ * new SACK or ECE mark may first advance cwnd here and later reduce * cwnd in tcp_fastretrans_alert() based on more states. */ if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) return flag & FLAG_FORWARD_PROGRESS; - return inet_csk(sk)->icsk_ca_state == TCP_CA_Open && - flag & FLAG_DATA_ACKED; + return flag & FLAG_DATA_ACKED; } /* Check that window update is acceptable. -- cgit v1.2.2 From 4e4f1fc226816905c937f9b29dabe351075dfe0f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Sep 2013 10:35:58 -0700 Subject: tcp: properly increase rcv_ssthresh for ofo packets TCP receive window handling is multi staged. A socket has a memory budget, static or dynamic, in sk_rcvbuf. Because we do not really know how this memory budget translates to a TCP window (payload), TCP announces a small initial window (about 20 MSS). When a packet is received, we increase TCP rcv_win depending on the payload/truesize ratio of this packet. Good citizen packets give a hint that it's reasonable to have rcv_win = sk_rcvbuf/2 This heuristic takes place in tcp_grow_window() Problem is : We currently call tcp_grow_window() only for in-order packets. This means that reorders or packet losses stop proper grow of rcv_win, and senders are unable to benefit from fast recovery, or proper reordering level detection. Really, a packet being stored in OFO queue is not a bad citizen. It should be part of the game as in-order packets. In our traces, we very often see sender is limited by linux small receive windows, even if linux hosts use autotuning (DRS) and should allow rcv_win to grow to ~3MB. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 894bc174f472..25a89eaa669d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4139,6 +4139,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { __skb_queue_after(&tp->out_of_order_queue, skb1, skb); } else { + tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); skb = NULL; } @@ -4214,8 +4215,10 @@ add_sack: if (tcp_is_sack(tp)) tcp_sack_new_ofo_skb(sk, seq, end_seq); end: - if (skb) + if (skb) { + tcp_grow_window(sk, skb); skb_set_owner_r(skb, sk); + } } static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, -- cgit v1.2.2