From 4648dc97af9d496218a05353b0e442b3dfa6aaab Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Mon, 5 Mar 2012 19:35:04 +0000
Subject: tcp: fix tcp_shift_skb_data() to not shift SACKed data below snd_una

This commit fixes tcp_shift_skb_data() so that it does not shift
SACKed data below snd_una.

This fixes an issue whose symptoms exactly match reports showing
tp->sacked_out going negative since 3.3.0-rc4 (see "WARNING: at
net/ipv4/tcp_input.c:3418" thread on netdev).

Since 2008 (832d11c5cd076abc0aa1eaf7be96c81d1a59ce41)
tcp_shift_skb_data() had been shifting SACKed ranges that were below
snd_una. It checked that the *end* of the skb it was about to shift
from was above snd_una, but did not check that the end of the actual
shifted range was above snd_una; this commit adds that check.

Shifting SACKed ranges below snd_una is problematic because for such
ranges tcp_sacktag_one() short-circuits: it does not declare anything
as SACKed and does not increase sacked_out.

Before the fixes in commits cc9a672ee522d4805495b98680f4a3db5d0a0af9
and daef52bab1fd26e24e8e9578f8fb33ba1d0cb412, shifting SACKed ranges
below snd_una happened to work because tcp_shifted_skb() was always
(incorrectly) passing in to tcp_sacktag_one() an skb whose end_seq
tcp_shift_skb_data() had already guaranteed was beyond snd_una. Hence
tcp_sacktag_one() never short-circuited and always increased
tp->sacked_out in this case.

After those two fixes, my testing has verified that shifting SACKed
ranges below snd_una could cause tp->sacked_out to go negative with
the following sequence of events:

(1) tcp_shift_skb_data() sees an skb whose end_seq is beyond snd_una,
    then shifts a prefix of that skb that is below snd_una

(2) tcp_shifted_skb() increments the packet count of the
    already-SACKed prev sk_buff

(3) tcp_sacktag_one() sees the end of the new SACKed range is below
    snd_una, so it short-circuits and doesn't increase tp->sacked_out

(5) tcp_clean_rtx_queue() sees the SACKed skb has been ACKed,
    decrements tp->sacked_out by this "inflated" pcount that was
    missing a matching increase in tp->sacked_out, and hence
    tp->sacked_out underflows to a u32 like 0xFFFFFFFF, which casted
    to s32 is negative.

(6) this leads to the warnings seen in the recent "WARNING: at
    net/ipv4/tcp_input.c:3418" thread on the netdev list; e.g.:
    tcp_input.c:3418  WARN_ON((int)tp->sacked_out < 0);

More generally, I think this bug can be tickled in some cases where
two or more ACKs from the receiver are lost and then a DSACK arrives
that is immediately above an existing SACKed skb in the write queue.

This fix changes tcp_shift_skb_data() to abort this sequence at step
(1) in the scenario above by noticing that the bytes are below snd_una
and not shifting them.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d9b83d198c3d..b5e315f13641 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1585,6 +1585,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
+	/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
+	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
+		goto fallback;
+
 	if (!skb_shift(prev, skb, len))
 		goto fallback;
 	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
-- 
cgit v1.2.2


From 848edc69192a38bf9d261032f248b14f47e6af8b Mon Sep 17 00:00:00 2001
From: Santosh Nayak <santoshprasadnayak@gmail.com>
Date: Tue, 6 Mar 2012 01:22:50 +0000
Subject: netfilter: ebtables: fix wrong name length while copying to
 user-space

user-space ebtables expects 32 bytes-long names, but xt_match names
use 29 bytes. We have to copy less 29 bytes and then, make sure we
fill the remaining bytes with zeroes.

Signed-off-by: Santosh Nayak <santoshprasadnayak@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/netfilter/ebtables.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 8aa4ad0e06af..15e9575f7207 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1335,7 +1335,12 @@ static inline int ebt_make_matchname(const struct ebt_entry_match *m,
     const char *base, char __user *ubase)
 {
 	char __user *hlp = ubase + ((char *)m - base);
-	if (copy_to_user(hlp, m->u.match->name, EBT_FUNCTION_MAXNAMELEN))
+	char name[EBT_FUNCTION_MAXNAMELEN] = {};
+
+	/* ebtables expects 32 bytes long names but xt_match names are 29 bytes
+	   long. Copy 29 bytes and fill remaining bytes with zeroes. */
+	strncpy(name, m->u.match->name, sizeof(name));
+	if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
 		return -EFAULT;
 	return 0;
 }
@@ -1344,7 +1349,10 @@ static inline int ebt_make_watchername(const struct ebt_entry_watcher *w,
     const char *base, char __user *ubase)
 {
 	char __user *hlp = ubase + ((char *)w - base);
-	if (copy_to_user(hlp , w->u.watcher->name, EBT_FUNCTION_MAXNAMELEN))
+	char name[EBT_FUNCTION_MAXNAMELEN] = {};
+
+	strncpy(name, w->u.watcher->name, sizeof(name));
+	if (copy_to_user(hlp , name, EBT_FUNCTION_MAXNAMELEN))
 		return -EFAULT;
 	return 0;
 }
@@ -1355,10 +1363,12 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase)
 	int ret;
 	char __user *hlp;
 	const struct ebt_entry_target *t;
+	char name[EBT_FUNCTION_MAXNAMELEN] = {};
 
 	if (e->bitmask == 0)
 		return 0;
 
+	strncpy(name, t->u.target->name, sizeof(name));
 	hlp = ubase + (((char *)e + e->target_offset) - base);
 	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
 
@@ -1368,7 +1378,7 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase)
 	ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase);
 	if (ret != 0)
 		return ret;
-	if (copy_to_user(hlp, t->u.target->name, EBT_FUNCTION_MAXNAMELEN))
+	if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
 		return -EFAULT;
 	return 0;
 }
-- 
cgit v1.2.2


From 8be619d1e430fd87a02587a2a6830b692cb91b84 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 6 Mar 2012 01:22:51 +0000
Subject: netfilter: ctnetlink: remove incorrect spin_[un]lock_bh on NAT module
 autoload

Since 7d367e0, ctnetlink_new_conntrack is called without holding
the nf_conntrack_lock spinlock. Thus, ctnetlink_parse_nat_setup
does not require to release that spinlock anymore in the NAT module
autoload case.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_netlink.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 30c9d4ca0218..10687692831e 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1041,16 +1041,13 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
 	if (!parse_nat_setup) {
 #ifdef CONFIG_MODULES
 		rcu_read_unlock();
-		spin_unlock_bh(&nf_conntrack_lock);
 		nfnl_unlock();
 		if (request_module("nf-nat-ipv4") < 0) {
 			nfnl_lock();
-			spin_lock_bh(&nf_conntrack_lock);
 			rcu_read_lock();
 			return -EOPNOTSUPP;
 		}
 		nfnl_lock();
-		spin_lock_bh(&nf_conntrack_lock);
 		rcu_read_lock();
 		if (nfnetlink_parse_nat_setup_hook)
 			return -EAGAIN;
-- 
cgit v1.2.2


From a157b9d5b5b626e46eba2ac4e342da8db25cabc4 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 6 Mar 2012 01:22:53 +0000
Subject: netfilter: bridge: fix wrong pointer dereference

In adf7ff8, a invalid dereference was added in ebt_make_names.

CC [M]  net/bridge/netfilter/ebtables.o
net/bridge/netfilter/ebtables.c: In function `ebt_make_names':
net/bridge/netfilter/ebtables.c:1371:20: warning: `t' may be used uninitialized in this function [-Wuninitialized]

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/netfilter/ebtables.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 15e9575f7207..5fe2ff3b01ef 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1368,7 +1368,6 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase)
 	if (e->bitmask == 0)
 		return 0;
 
-	strncpy(name, t->u.target->name, sizeof(name));
 	hlp = ubase + (((char *)e + e->target_offset) - base);
 	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
 
@@ -1378,6 +1377,7 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase)
 	ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase);
 	if (ret != 0)
 		return ret;
+	strncpy(name, t->u.target->name, sizeof(name));
 	if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
 		return -EFAULT;
 	return 0;
-- 
cgit v1.2.2


From 739e4505a0e8209622dc71743bfa1c804eacf7f4 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 6 Mar 2012 01:22:54 +0000
Subject: bridge: netfilter: don't call iptables on vlan packets if sysctl is
 off

When net.bridge.bridge-nf-filter-vlan-tagged is 0 (default), vlan packets
arriving should not be sent to ip(6)tables by bridge netfilter.

However, it turns out that we currently always send VLAN packets to
netfilter, if ..
a), CONFIG_VLAN_8021Q is enabled ; or
b), CONFIG_VLAN_8021Q is not set but rx vlan offload is enabled
   on the bridge port.

This is because bridge netfilter treats skb with
skb->protocol == ETH_P_IP{V6} as "non-vlan packet".

With rx vlan offload on or CONFIG_VLAN_8021Q=y, the vlan header has
already been removed here, and we cannot rely on skb->protocol alone.

Fix this by only using skb->protocol if the skb has no vlan tag,
or if a vlan tag is present and filter-vlan-tagged bridge netfilter
sysctl is enabled.

We cannot remove the skb->protocol == htons(ETH_P_8021Q) test
because the vlan tag is still around in the CONFIG_VLAN_8021Q=n &&
"ethtool -K $itf rxvlan off" case.

reproducer:
iptables -t raw -I PREROUTING -i br0
iptables -t raw -I PREROUTING -i br0.1

Then send packets to an ip address configured on br0.1 interface.
Even with net.bridge.bridge-nf-filter-vlan-tagged=0, the 1st rule
will match instead of the 2nd one.

With this patch applied, the 2nd rule will match instead.
In the non-local address case, netfilter won't be consulted after
this patch unless the sysctl is switched on.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netfilter.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 84122472656c..dec4f3817133 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -62,6 +62,15 @@ static int brnf_filter_pppoe_tagged __read_mostly = 0;
 #define brnf_filter_pppoe_tagged 0
 #endif
 
+#define IS_IP(skb) \
+	(!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
+
+#define IS_IPV6(skb) \
+	(!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6))
+
+#define IS_ARP(skb) \
+	(!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_ARP))
+
 static inline __be16 vlan_proto(const struct sk_buff *skb)
 {
 	if (vlan_tx_tag_present(skb))
@@ -639,8 +648,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
 		return NF_DROP;
 	br = p->br;
 
-	if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) ||
-	    IS_PPPOE_IPV6(skb)) {
+	if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
 		if (!brnf_call_ip6tables && !br->nf_call_ip6tables)
 			return NF_ACCEPT;
 
@@ -651,8 +659,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
 	if (!brnf_call_iptables && !br->nf_call_iptables)
 		return NF_ACCEPT;
 
-	if (skb->protocol != htons(ETH_P_IP) && !IS_VLAN_IP(skb) &&
-	    !IS_PPPOE_IP(skb))
+	if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
 		return NF_ACCEPT;
 
 	nf_bridge_pull_encap_header_rcsum(skb);
@@ -701,7 +708,7 @@ static int br_nf_forward_finish(struct sk_buff *skb)
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 	struct net_device *in;
 
-	if (skb->protocol != htons(ETH_P_ARP) && !IS_VLAN_ARP(skb)) {
+	if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
 		in = nf_bridge->physindev;
 		if (nf_bridge->mask & BRNF_PKT_TYPE) {
 			skb->pkt_type = PACKET_OTHERHOST;
@@ -718,6 +725,7 @@ static int br_nf_forward_finish(struct sk_buff *skb)
 	return 0;
 }
 
+
 /* This is the 'purely bridged' case.  For IP, we pass the packet to
  * netfilter with indev and outdev set to the bridge device,
  * but we are still able to filter on the 'real' indev/outdev
@@ -744,11 +752,9 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb,
 	if (!parent)
 		return NF_DROP;
 
-	if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb) ||
-	    IS_PPPOE_IP(skb))
+	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
 		pf = PF_INET;
-	else if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) ||
-		 IS_PPPOE_IPV6(skb))
+	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
 		pf = PF_INET6;
 	else
 		return NF_ACCEPT;
@@ -795,7 +801,7 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
 	if (!brnf_call_arptables && !br->nf_call_arptables)
 		return NF_ACCEPT;
 
-	if (skb->protocol != htons(ETH_P_ARP)) {
+	if (!IS_ARP(skb)) {
 		if (!IS_VLAN_ARP(skb))
 			return NF_ACCEPT;
 		nf_bridge_pull_encap_header(skb);
@@ -853,11 +859,9 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb,
 	if (!realoutdev)
 		return NF_DROP;
 
-	if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb) ||
-	    IS_PPPOE_IP(skb))
+	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
 		pf = PF_INET;
-	else if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) ||
-		 IS_PPPOE_IPV6(skb))
+	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
 		pf = PF_INET6;
 	else
 		return NF_ACCEPT;
-- 
cgit v1.2.2


From 741385119706d4370eb7899c5ca96ad125c520e5 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 6 Mar 2012 01:22:55 +0000
Subject: netfilter: nf_conntrack: fix early_drop with reliable event delivery

If reliable event delivery is enabled and ctnetlink fails to deliver
the destroy event in early_drop, the conntrack subsystem cannot
drop any the candidate flow that was planned to be evicted.

Reported-by: Kerin Millar <kerframil@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_core.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ed86a3be678e..fa4b82c8ae80 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -635,8 +635,12 @@ static noinline int early_drop(struct net *net, unsigned int hash)
 
 	if (del_timer(&ct->timeout)) {
 		death_by_timeout((unsigned long)ct);
-		dropped = 1;
-		NF_CT_STAT_INC_ATOMIC(net, early_drop);
+		/* Check if we indeed killed this entry. Reliable event
+		   delivery may have inserted it into the dying list. */
+		if (test_bit(IPS_DYING_BIT, &ct->status)) {
+			dropped = 1;
+			NF_CT_STAT_INC_ATOMIC(net, early_drop);
+		}
 	}
 	nf_ct_put(ct);
 	return dropped;
-- 
cgit v1.2.2


From d6ddef9e641d1229d4ec841dc75ae703171c3e92 Mon Sep 17 00:00:00 2001
From: Li Wei <lw@cn.fujitsu.com>
Date: Mon, 5 Mar 2012 14:45:17 +0000
Subject: IPv6: Fix not join all-router mcast group when forwarding set.

When forwarding was set and a new net device is register,
we need add this device to the all-router mcast group.

Signed-off-by: Li Wei <lw@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c02280a4d126..6b8ebc5da0e1 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -434,6 +434,10 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
 	/* Join all-node multicast group */
 	ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
 
+	/* Join all-router multicast group if forwarding is set */
+	if (ndev->cnf.forwarding && dev && (dev->flags & IFF_MULTICAST))
+		ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+
 	return ndev;
 }
 
-- 
cgit v1.2.2


From 651a68ea2ce9738b84e928836053b2e0fb5db2ba Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@nicira.com>
Date: Tue, 6 Mar 2012 15:04:04 -0800
Subject: openvswitch: Honor dp_ifindex, when specified, for vport lookup by
 name.

When OVS_VPORT_ATTR_NAME is specified and dp_ifindex is nonzero, the
logical behavior would be for the vport name lookup scope to be limited
to the specified datapath, but in fact the dp_ifindex value was ignored.
This commit causes the search scope to be honored.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 net/openvswitch/datapath.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index ce64c18b8c79..2c030505b335 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1521,6 +1521,9 @@ static struct vport *lookup_vport(struct ovs_header *ovs_header,
 		vport = ovs_vport_locate(nla_data(a[OVS_VPORT_ATTR_NAME]));
 		if (!vport)
 			return ERR_PTR(-ENODEV);
+		if (ovs_header->dp_ifindex &&
+		    ovs_header->dp_ifindex != get_dpifindex(vport->dp))
+			return ERR_PTR(-ENODEV);
 		return vport;
 	} else if (a[OVS_VPORT_ATTR_PORT_NO]) {
 		u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
-- 
cgit v1.2.2


From 81e5d41d7ed4f6c61ba3d2414f4f9ddf6d934ebb Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Tue, 6 Mar 2012 15:05:46 -0800
Subject: openvswitch: Fix checksum update for actions on UDP packets.

When modifying IP addresses or ports on a UDP packet we don't
correctly follow the rules for unchecksummed packets.  This meant
that packets without a checksum can be given a incorrect new checksum
and packets with a checksum can become marked as being unchecksummed.
This fixes it to handle those requirements.

Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 net/openvswitch/actions.c | 44 ++++++++++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 2725d1bdf291..48badffaafc1 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2012 Nicira Networks.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -145,9 +145,16 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
 			inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb,
 						 *addr, new_addr, 1);
 	} else if (nh->protocol == IPPROTO_UDP) {
-		if (likely(transport_len >= sizeof(struct udphdr)))
-			inet_proto_csum_replace4(&udp_hdr(skb)->check, skb,
-						 *addr, new_addr, 1);
+		if (likely(transport_len >= sizeof(struct udphdr))) {
+			struct udphdr *uh = udp_hdr(skb);
+
+			if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+				inet_proto_csum_replace4(&uh->check, skb,
+							 *addr, new_addr, 1);
+				if (!uh->check)
+					uh->check = CSUM_MANGLED_0;
+			}
+		}
 	}
 
 	csum_replace4(&nh->check, *addr, new_addr);
@@ -197,8 +204,22 @@ static void set_tp_port(struct sk_buff *skb, __be16 *port,
 	skb->rxhash = 0;
 }
 
-static int set_udp_port(struct sk_buff *skb,
-			const struct ovs_key_udp *udp_port_key)
+static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port)
+{
+	struct udphdr *uh = udp_hdr(skb);
+
+	if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) {
+		set_tp_port(skb, port, new_port, &uh->check);
+
+		if (!uh->check)
+			uh->check = CSUM_MANGLED_0;
+	} else {
+		*port = new_port;
+		skb->rxhash = 0;
+	}
+}
+
+static int set_udp(struct sk_buff *skb, const struct ovs_key_udp *udp_port_key)
 {
 	struct udphdr *uh;
 	int err;
@@ -210,16 +231,15 @@ static int set_udp_port(struct sk_buff *skb,
 
 	uh = udp_hdr(skb);
 	if (udp_port_key->udp_src != uh->source)
-		set_tp_port(skb, &uh->source, udp_port_key->udp_src, &uh->check);
+		set_udp_port(skb, &uh->source, udp_port_key->udp_src);
 
 	if (udp_port_key->udp_dst != uh->dest)
-		set_tp_port(skb, &uh->dest, udp_port_key->udp_dst, &uh->check);
+		set_udp_port(skb, &uh->dest, udp_port_key->udp_dst);
 
 	return 0;
 }
 
-static int set_tcp_port(struct sk_buff *skb,
-			const struct ovs_key_tcp *tcp_port_key)
+static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key)
 {
 	struct tcphdr *th;
 	int err;
@@ -328,11 +348,11 @@ static int execute_set_action(struct sk_buff *skb,
 		break;
 
 	case OVS_KEY_ATTR_TCP:
-		err = set_tcp_port(skb, nla_data(nested_attr));
+		err = set_tcp(skb, nla_data(nested_attr));
 		break;
 
 	case OVS_KEY_ATTR_UDP:
-		err = set_udp_port(skb, nla_data(nested_attr));
+		err = set_udp(skb, nla_data(nested_attr));
 		break;
 	}
 
-- 
cgit v1.2.2


From d9e179ecec0805c41b17f9a0c3b925d415677772 Mon Sep 17 00:00:00 2001
From: Paulius Zaleckas <paulius.zaleckas@gmail.com>
Date: Tue, 6 Mar 2012 22:25:14 +0000
Subject: bridge: br_log_state() s/entering/entered/

When br_log_state() is reporting state it should say "entered"
istead of "entering" since state at this point is already
changed.

Signed-off-by: Paulius Zaleckas <paulius.zaleckas@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_stp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 6751ed4e0c07..8c836d96ba76 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -31,7 +31,7 @@ static const char *const br_port_state_names[] = {
 
 void br_log_state(const struct net_bridge_port *p)
 {
-	br_info(p->br, "port %u(%s) entering %s state\n",
+	br_info(p->br, "port %u(%s) entered %s state\n",
 		(unsigned) p->port_no, p->dev->name,
 		br_port_state_names[p->state]);
 }
-- 
cgit v1.2.2


From 5200959b833ddacf28b6ffce8c331dfd6e0ca797 Mon Sep 17 00:00:00 2001
From: Paulius Zaleckas <paulius.zaleckas@gmail.com>
Date: Tue, 6 Mar 2012 22:25:22 +0000
Subject: bridge: fix state reporting when port is disabled

Now we have:
eth0: link *down*
br0: port 1(eth0) entered *forwarding* state

br_log_state(p) should be called *after* p->state is set
to BR_STATE_DISABLED.

Reported-by: Zilvinas Valinskas <zilvinas@wilibox.com>
Signed-off-by: Paulius Zaleckas <paulius.zaleckas@gmail.com>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_stp_if.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 19308e305d85..f494496373d6 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -98,14 +98,13 @@ void br_stp_disable_port(struct net_bridge_port *p)
 	struct net_bridge *br = p->br;
 	int wasroot;
 
-	br_log_state(p);
-
 	wasroot = br_is_root_bridge(br);
 	br_become_designated_port(p);
 	p->state = BR_STATE_DISABLED;
 	p->topology_change_ack = 0;
 	p->config_pending = 0;
 
+	br_log_state(p);
 	br_ifinfo_notify(RTM_NEWLINK, p);
 
 	del_timer(&p->message_age_timer);
-- 
cgit v1.2.2


From 5faa5df1fa2024bd750089ff21dcc4191798263d Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 6 Mar 2012 21:20:26 +0000
Subject: inetpeer: Invalidate the inetpeer tree along with the routing cache

We initialize the routing metrics with the values cached on the
inetpeer in rt_init_metrics(). So if we have the metrics cached on the
inetpeer, we ignore the user configured fib_metrics.

To fix this issue, we replace the old tree with a fresh initialized
inet_peer_base. The old tree is removed later with a delayed work queue.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inetpeer.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv4/route.c    |  1 +
 2 files changed, 80 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index bf4a9c4808e1..deea2e96b7f2 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/net.h>
+#include <linux/workqueue.h>
 #include <net/ip.h>
 #include <net/inetpeer.h>
 #include <net/secure_seq.h>
@@ -66,6 +67,11 @@
 
 static struct kmem_cache *peer_cachep __read_mostly;
 
+static LIST_HEAD(gc_list);
+static const int gc_delay = 60 * HZ;
+static struct delayed_work gc_work;
+static DEFINE_SPINLOCK(gc_lock);
+
 #define node_height(x) x->avl_height
 
 #define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
@@ -102,6 +108,50 @@ int inet_peer_threshold __read_mostly = 65536 + 128;	/* start to throw entries m
 int inet_peer_minttl __read_mostly = 120 * HZ;	/* TTL under high load: 120 sec */
 int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */
 
+static void inetpeer_gc_worker(struct work_struct *work)
+{
+	struct inet_peer *p, *n;
+	LIST_HEAD(list);
+
+	spin_lock_bh(&gc_lock);
+	list_replace_init(&gc_list, &list);
+	spin_unlock_bh(&gc_lock);
+
+	if (list_empty(&list))
+		return;
+
+	list_for_each_entry_safe(p, n, &list, gc_list) {
+
+		if(need_resched())
+			cond_resched();
+
+		if (p->avl_left != peer_avl_empty) {
+			list_add_tail(&p->avl_left->gc_list, &list);
+			p->avl_left = peer_avl_empty;
+		}
+
+		if (p->avl_right != peer_avl_empty) {
+			list_add_tail(&p->avl_right->gc_list, &list);
+			p->avl_right = peer_avl_empty;
+		}
+
+		n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
+
+		if (!atomic_read(&p->refcnt)) {
+			list_del(&p->gc_list);
+			kmem_cache_free(peer_cachep, p);
+		}
+	}
+
+	if (list_empty(&list))
+		return;
+
+	spin_lock_bh(&gc_lock);
+	list_splice(&list, &gc_list);
+	spin_unlock_bh(&gc_lock);
+
+	schedule_delayed_work(&gc_work, gc_delay);
+}
 
 /* Called from ip_output.c:ip_init  */
 void __init inet_initpeers(void)
@@ -126,6 +176,7 @@ void __init inet_initpeers(void)
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
 			NULL);
 
+	INIT_DELAYED_WORK_DEFERRABLE(&gc_work, inetpeer_gc_worker);
 }
 
 static int addr_compare(const struct inetpeer_addr *a,
@@ -449,7 +500,7 @@ relookup:
 		p->pmtu_orig = 0;
 		p->redirect_genid = 0;
 		memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
-
+		INIT_LIST_HEAD(&p->gc_list);
 
 		/* Link the node. */
 		link_to_pool(p, base);
@@ -509,3 +560,30 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
 	return rc;
 }
 EXPORT_SYMBOL(inet_peer_xrlim_allow);
+
+void inetpeer_invalidate_tree(int family)
+{
+	struct inet_peer *old, *new, *prev;
+	struct inet_peer_base *base = family_to_base(family);
+
+	write_seqlock_bh(&base->lock);
+
+	old = base->root;
+	if (old == peer_avl_empty_rcu)
+		goto out;
+
+	new = peer_avl_empty_rcu;
+
+	prev = cmpxchg(&base->root, old, new);
+	if (prev == old) {
+		base->total = 0;
+		spin_lock(&gc_lock);
+		list_add_tail(&prev->gc_list, &gc_list);
+		spin_unlock(&gc_lock);
+		schedule_delayed_work(&gc_work, gc_delay);
+	}
+
+out:
+	write_sequnlock_bh(&base->lock);
+}
+EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index bcacf54e5418..23ce0c1287ab 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -938,6 +938,7 @@ static void rt_cache_invalidate(struct net *net)
 	get_random_bytes(&shuffle, sizeof(shuffle));
 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 	redirect_genid++;
+	inetpeer_invalidate_tree(AF_INET);
 }
 
 /*
-- 
cgit v1.2.2


From ac3f48de09d8f4b73397047e413fadff7f65cfa7 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 6 Mar 2012 21:21:10 +0000
Subject: route: Remove redirect_genid

As we invalidate the inetpeer tree along with the routing cache now,
we don't need a genid to reset the redirect handling when the routing
cache is flushed.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inetpeer.c |  1 -
 net/ipv4/route.c    | 11 ++---------
 2 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index deea2e96b7f2..d4d61b694fab 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -498,7 +498,6 @@ relookup:
 		p->rate_last = 0;
 		p->pmtu_expires = 0;
 		p->pmtu_orig = 0;
-		p->redirect_genid = 0;
 		memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
 		INIT_LIST_HEAD(&p->gc_list);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 23ce0c1287ab..019774796174 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -132,7 +132,6 @@ static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly	= 256;
 static int rt_chain_length_max __read_mostly	= 20;
-static int redirect_genid;
 
 static struct delayed_work expires_work;
 static unsigned long expires_ljiffies;
@@ -937,7 +936,6 @@ static void rt_cache_invalidate(struct net *net)
 
 	get_random_bytes(&shuffle, sizeof(shuffle));
 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
-	redirect_genid++;
 	inetpeer_invalidate_tree(AF_INET);
 }
 
@@ -1486,10 +1484,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 
 				peer = rt->peer;
 				if (peer) {
-					if (peer->redirect_learned.a4 != new_gw ||
-					    peer->redirect_genid != redirect_genid) {
+					if (peer->redirect_learned.a4 != new_gw) {
 						peer->redirect_learned.a4 = new_gw;
-						peer->redirect_genid = redirect_genid;
 						atomic_inc(&__rt_peer_genid);
 					}
 					check_peer_redir(&rt->dst, peer);
@@ -1794,8 +1790,6 @@ static void ipv4_validate_peer(struct rtable *rt)
 		if (peer) {
 			check_peer_pmtu(&rt->dst, peer);
 
-			if (peer->redirect_genid != redirect_genid)
-				peer->redirect_learned.a4 = 0;
 			if (peer->redirect_learned.a4 &&
 			    peer->redirect_learned.a4 != rt->rt_gateway)
 				check_peer_redir(&rt->dst, peer);
@@ -1959,8 +1953,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 		dst_init_metrics(&rt->dst, peer->metrics, false);
 
 		check_peer_pmtu(&rt->dst, peer);
-		if (peer->redirect_genid != redirect_genid)
-			peer->redirect_learned.a4 = 0;
+
 		if (peer->redirect_learned.a4 &&
 		    peer->redirect_learned.a4 != rt->rt_gateway) {
 			rt->rt_gateway = peer->redirect_learned.a4;
-- 
cgit v1.2.2