From 3fff4c42bd0a89869a0eb1e7874cc06ffa4aa0f5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 22 Sep 2009 16:18:09 +0200
Subject: printk: Remove ratelimit.h from kernel.h

Decouple kernel.h from ratelimit.h: the global declaration of
printk's ratelimit_state is not needed, and it leads to messy
circular dependencies due to ratelimit.h's (new) adding of a
spinlock_types.h include.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David S. Miller <davem@davemloft.net>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 net/core/sysctl_net_core.c | 2 ++
 net/core/utils.c           | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'net/core')
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 7db1de0497c6..887c03c4e3c6 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -10,7 +10,9 @@
 #include <linux/module.h>
 #include <linux/socket.h>
 #include <linux/netdevice.h>
+#include <linux/ratelimit.h>
 #include <linux/init.h>
+
 #include <net/ip.h>
 #include <net/sock.h>
 
diff --git a/net/core/utils.c b/net/core/utils.c
index 83221aee7084..838250241d26 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -24,6 +24,8 @@
 #include <linux/types.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
+#include <linux/ratelimit.h>
+
 #include <net/sock.h>
 
 #include <asm/byteorder.h>
-- 
cgit v1.2.2


From a9828ec6bc0b7e19a65f7e13daa8bd35a926a753 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Thu, 1 Oct 2009 11:33:03 +0000
Subject: ethtool: Remove support for obsolete string query operations

The in-tree implementations have all been converted to
get_sset_count().

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 58 ++++++++++--------------------------------------------
 1 file changed, 10 insertions(+), 48 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 4c12ddb5f5ee..e1951084b973 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -198,13 +198,6 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
 		rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
 		if (rc >= 0)
 			info.n_priv_flags = rc;
-	} else {
-		/* code path for obsolete hooks */
-
-		if (ops->self_test_count)
-			info.testinfo_len = ops->self_test_count(dev);
-		if (ops->get_stats_count)
-			info.n_stats = ops->get_stats_count(dev);
 	}
 	if (ops->get_regs_len)
 		info.regdump_len = ops->get_regs_len(dev);
@@ -684,16 +677,10 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
 	u64 *data;
 	int ret, test_len;
 
-	if (!ops->self_test)
-		return -EOPNOTSUPP;
-	if (!ops->get_sset_count && !ops->self_test_count)
+	if (!ops->self_test || !ops->get_sset_count)
 		return -EOPNOTSUPP;
 
-	if (ops->get_sset_count)
-		test_len = ops->get_sset_count(dev, ETH_SS_TEST);
-	else
-		/* code path for obsolete hook */
-		test_len = ops->self_test_count(dev);
+	test_len = ops->get_sset_count(dev, ETH_SS_TEST);
 	if (test_len < 0)
 		return test_len;
 	WARN_ON(test_len == 0);
@@ -728,36 +715,17 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
 	u8 *data;
 	int ret;
 
-	if (!ops->get_strings)
+	if (!ops->get_strings || !ops->get_sset_count)
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
 		return -EFAULT;
 
-	if (ops->get_sset_count) {
-		ret = ops->get_sset_count(dev, gstrings.string_set);
-		if (ret < 0)
-			return ret;
-
-		gstrings.len = ret;
-	} else {
-		/* code path for obsolete hooks */
-
-		switch (gstrings.string_set) {
-		case ETH_SS_TEST:
-			if (!ops->self_test_count)
-				return -EOPNOTSUPP;
-			gstrings.len = ops->self_test_count(dev);
-			break;
-		case ETH_SS_STATS:
-			if (!ops->get_stats_count)
-				return -EOPNOTSUPP;
-			gstrings.len = ops->get_stats_count(dev);
-			break;
-		default:
-			return -EINVAL;
-		}
-	}
+	ret = ops->get_sset_count(dev, gstrings.string_set);
+	if (ret < 0)
+		return ret;
+
+	gstrings.len = ret;
 
 	data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
 	if (!data)
@@ -798,16 +766,10 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
 	u64 *data;
 	int ret, n_stats;
 
-	if (!ops->get_ethtool_stats)
-		return -EOPNOTSUPP;
-	if (!ops->get_sset_count && !ops->get_stats_count)
+	if (!ops->get_ethtool_stats || !ops->get_sset_count)
 		return -EOPNOTSUPP;
 
-	if (ops->get_sset_count)
-		n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
-	else
-		/* code path for obsolete hook */
-		n_stats = ops->get_stats_count(dev);
+	n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
 	if (n_stats < 0)
 		return n_stats;
 	WARN_ON(n_stats == 0);
-- 
cgit v1.2.2


From 0835acfe72e43b2f9bd46ec8c0d219e94c3525e0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 30 Sep 2009 13:03:33 +0000
Subject: pktgen: Avoid dirtying skb->users when txq is full

We can avoid two atomic ops on skb->users if packet is not going to be
sent to the device (because hardware txqueue is full)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index b69455217ed6..e856ab0d0745 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3441,12 +3441,14 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 	txq = netdev_get_tx_queue(odev, queue_map);
 
 	__netif_tx_lock_bh(txq);
-	atomic_inc(&(pkt_dev->skb->users));
 
-	if (unlikely(netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq)))
+	if (unlikely(netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq))) {
 		ret = NETDEV_TX_BUSY;
-	else
-		ret = (*xmit)(pkt_dev->skb, odev);
+		pkt_dev->last_ok = 0;
+		goto unlock;
+	}
+	atomic_inc(&(pkt_dev->skb->users));
+	ret = (*xmit)(pkt_dev->skb, odev);
 
 	switch (ret) {
 	case NETDEV_TX_OK:
@@ -3468,6 +3470,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 		atomic_dec(&(pkt_dev->skb->users));
 		pkt_dev->last_ok = 0;
 	}
+unlock:
 	__netif_tx_unlock_bh(txq);
 
 	/* If pkt_dev->count is zero, then run forever */
-- 
cgit v1.2.2


From 7ffbe3fdace0bdfcdab8dc6c77506feda0871f79 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 2 Oct 2009 05:15:27 +0000
Subject: net: introduce NETDEV_POST_INIT notifier

For various purposes including a wireless extensions
bugfix, we need to hook into the netdev creation before
before netdev_register_kobject(). This will also ease
doing the dev type assignment that Marcel was working
on for cfg80211 drivers w/o touching them all.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index b8f74cfb1bfd..a74c8fd69556 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4836,6 +4836,12 @@ int register_netdevice(struct net_device *dev)
 		dev->features |= NETIF_F_GSO;
 
 	netdev_initialize_kobject(dev);
+
+	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
+	ret = notifier_to_errno(ret);
+	if (ret)
+		goto err_uninit;
+
 	ret = netdev_register_kobject(dev);
 	if (ret)
 		goto err_uninit;
-- 
cgit v1.2.2


From d519e17e2d01a0ee9abe083019532061b4438065 Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <andy@greyhouse.net>
Date: Fri, 2 Oct 2009 09:26:12 +0000
Subject: net: export device speed and duplex via sysfs

This patch exports the link-speed (in Mbps) and duplex of an interface
via sysfs.  This eliminates the need to use ethtool just to check the
link-speed.  Not requiring 'ethtool' and not relying on the SIOCETHTOOL
ioctl should be helpful in an embedded environment where space is at a
premium as well.

NOTE: This patch also intentionally allows non-root users to check the link
speed and duplex -- something not possible with ethtool.

Here's some sample output:

# cat /sys/class/net/eth0/speed
100
# cat /sys/class/net/eth0/duplex
half
# ethtool eth0
Settings for eth0:
        Supported ports: [ TP ]
        Supported link modes:   10baseT/Half 10baseT/Full
                                100baseT/Half 100baseT/Full
                                1000baseT/Half 1000baseT/Full
        Supports auto-negotiation: Yes
        Advertised link modes:  Not reported
        Advertised auto-negotiation: No
        Speed: 100Mb/s
        Duplex: Half
        Port: Twisted Pair
        PHYAD: 1
        Transceiver: internal
        Auto-negotiation: off
        Supports Wake-on: g
        Wake-on: g
        Current message level: 0x000000ff (255)
        Link detected: yes

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'net/core')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 821d30918cfc..effb78410eb2 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -130,6 +130,44 @@ static ssize_t show_carrier(struct device *dev,
 	return -EINVAL;
 }
 
+static ssize_t show_speed(struct device *dev,
+			  struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	int ret = -EINVAL;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (netif_running(netdev) && netdev->ethtool_ops->get_settings) {
+		struct ethtool_cmd cmd = { ETHTOOL_GSET };
+
+		if (!netdev->ethtool_ops->get_settings(netdev, &cmd))
+			ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd));
+	}
+	rtnl_unlock();
+	return ret;
+}
+
+static ssize_t show_duplex(struct device *dev,
+			   struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	int ret = -EINVAL;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (netif_running(netdev) && netdev->ethtool_ops->get_settings) {
+		struct ethtool_cmd cmd = { ETHTOOL_GSET };
+
+		if (!netdev->ethtool_ops->get_settings(netdev, &cmd))
+			ret = sprintf(buf, "%s\n", cmd.duplex ? "full" : "half");
+	}
+	rtnl_unlock();
+	return ret;
+}
+
 static ssize_t show_dormant(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
@@ -259,6 +297,8 @@ static struct device_attribute net_class_attributes[] = {
 	__ATTR(address, S_IRUGO, show_address, NULL),
 	__ATTR(broadcast, S_IRUGO, show_broadcast, NULL),
 	__ATTR(carrier, S_IRUGO, show_carrier, NULL),
+	__ATTR(speed, S_IRUGO, show_speed, NULL),
+	__ATTR(duplex, S_IRUGO, show_duplex, NULL),
 	__ATTR(dormant, S_IRUGO, show_dormant, NULL),
 	__ATTR(operstate, S_IRUGO, show_operstate, NULL),
 	__ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu),
-- 
cgit v1.2.2


From d250a5f90e53f5e150618186230795352d154c88 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 2 Oct 2009 10:32:18 +0000
Subject: pkt_sched: gen_estimator: Dont report fake rate estimators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Jarek Poplawski a écrit :
>
>
> Hmm... So you made me to do some "real" work here, and guess what?:
> there is one serious checkpatch warning! ;-) Plus, this new parameter
> should be added to the function description. Otherwise:
> Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
>
> Thanks,
> Jarek P.
>
> PS: I guess full "Don't" would show we really mean it...

Okay :) Here is the last round, before the night !

Thanks again

[RFC] pkt_sched: gen_estimator: Don't report fake rate estimators

We currently send TCA_STATS_RATE_EST elements to netlink users, even if no estimator
is running.

# tc -s -d qdisc
qdisc pfifo_fast 0: dev eth0 root bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
 Sent 112833764978 bytes 1495081739 pkt (dropped 0, overlimits 0 requeues 0)
 rate 0bit 0pps backlog 0b 0p requeues 0

User has no way to tell if the "rate 0bit 0pps" is a real estimation, or a fake
one (because no estimator is active)

After this patch, tc command output is :
$ tc -s -d qdisc
qdisc pfifo_fast 0: dev eth0 root bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
 Sent 561075 bytes 1196 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0

We add a parameter to gnet_stats_copy_rate_est() function so that
it can use gen_estimator_active(bstats, r), as suggested by Jarek.

This parameter can be NULL if check is not necessary, (htb for
example has a mandatory rate estimator)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/gen_stats.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 8569310268ab..393b1d8618e2 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -127,6 +127,7 @@ gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b)
 /**
  * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
  * @d: dumping handle
+ * @b: basic statistics
  * @r: rate estimator statistics
  *
  * Appends the rate estimator statistics to the top level TLV created by
@@ -136,8 +137,13 @@ gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b)
  * if the room in the socket buffer was not sufficient.
  */
 int
-gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r)
+gnet_stats_copy_rate_est(struct gnet_dump *d,
+			 const struct gnet_stats_basic_packed *b,
+			 struct gnet_stats_rate_est *r)
 {
+	if (b && !gen_estimator_active(b, r))
+		return 0;
+
 	if (d->compat_tc_stats) {
 		d->tc_stats.bps = r->bps;
 		d->tc_stats.pps = r->pps;
-- 
cgit v1.2.2


From d73d3a8cb4723e161589864741d8528d70b350eb Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 5 Oct 2009 10:59:58 +0000
Subject: ethtool: Add reset operation

After updating firmware stored in flash, users may wish to reset the
relevant hardware and start the new firmware immediately.  This should
not be completely automatic as it may be disruptive.

A selective reset may also be useful for debugging or diagnostics.

This adds a separate reset operation which takes flags indicating the
components to be reset.  Drivers are allowed to reset only a subset of
those requested, and must indicate the actual subset.  This allows the
use of generic component masks and some future expansion.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e1951084b973..d8aee584e8d1 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -302,6 +302,26 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
 	return ret;
 }
 
+static int ethtool_reset(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value reset;
+	int ret;
+
+	if (!dev->ethtool_ops->reset)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&reset, useraddr, sizeof(reset)))
+		return -EFAULT;
+
+	ret = dev->ethtool_ops->reset(dev, &reset.data);
+	if (ret)
+		return ret;
+
+	if (copy_to_user(useraddr, &reset, sizeof(reset)))
+		return -EFAULT;
+	return 0;
+}
+
 static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
 {
 	struct ethtool_wolinfo wol = { ETHTOOL_GWOL };
@@ -1089,6 +1109,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_FLASHDEV:
 		rc = ethtool_flash_device(dev, useraddr);
 		break;
+	case ETHTOOL_RESET:
+		rc = ethtool_reset(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.2


From 3d23e349d807177eaf519d444677cee86b1a04cf Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 29 Sep 2009 23:27:28 +0200
Subject: wext: refactor

Refactor wext to
 * split out iwpriv handling
 * split out iwspy handling
 * split out procfs support
 * allow cfg80211 to have wireless extensions compat code
   w/o CONFIG_WIRELESS_EXT

After this, drivers need to
 - select WIRELESS_EXT	- for wext support
 - select WEXT_PRIV	- for iwpriv support
 - select WEXT_SPY	- for iwspy support

except cfg80211 -- which gets new hooks in wext-core.c
and can then get wext handlers without CONFIG_WIRELESS_EXT.

Wireless extensions procfs support is auto-selected
based on PROC_FS and anything that requires the wext core
(i.e. WIRELESS_EXT or CFG80211_WEXT).

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/core/net-sysfs.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index effb78410eb2..9b07535c2889 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -543,8 +543,12 @@ int netdev_register_kobject(struct net_device *net)
 	*groups++ = &netstat_group;
 
 #ifdef CONFIG_WIRELESS_EXT_SYSFS
-	if (net->wireless_handlers || net->ieee80211_ptr)
+	if (net->ieee80211_ptr)
 		*groups++ = &wireless_group;
+#ifdef CONFIG_WIRELESS_EXT
+	else if (net->wireless_handlers)
+		*groups++ = &wireless_group;
+#endif
 #endif
 #endif /* CONFIG_SYSFS */
 
-- 
cgit v1.2.2


From d9f5950f90292f7cc42834338dfd5f44dc4cc4ca Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <sri@us.ibm.com>
Date: Wed, 7 Oct 2009 12:24:25 +0000
Subject: net: Make UFO on master device independent of attached devices

Now that software UFO is supported, UFO can be enabled on master
devices like bridge, bond even though the attached device doesn't
support this feature in hardware.

This allows UFO to be used between KVM host and guest even when a
physical interface attached to the bridge doesn't support UFO.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index a74c8fd69556..510ff205d5db 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5489,7 +5489,7 @@ unsigned long netdev_increment_features(unsigned long all, unsigned long one,
 	one |= NETIF_F_ALL_CSUM;
 
 	one |= all & NETIF_F_ONE_FOR_ALL;
-	all &= one | NETIF_F_LLTX | NETIF_F_GSO;
+	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
 
 	return all;
-- 
cgit v1.2.2


From 3b885787ea4112eaa80945999ea0901bf742707f Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 12 Oct 2009 13:26:31 -0700
Subject: net: Generalize socket rx gap / receive queue overflow cmsg

Create a new socket level option to report number of queue overflows

Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames.  This value was
exported via a SOL_PACKET level cmsg.  AFter I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option.  As such I've created this patch, It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the nubmer of times the sk_receive_queue
overflowed between any two given frames.  It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count).  Tested
successfully by me.

Notes:

1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.

2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if thats
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me.  This also saves us having
to code in a per-protocol opt in mechanism.

3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 7626b6aacd68..43ca2c995393 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -276,6 +276,8 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int err = 0;
 	int skb_len;
+	unsigned long flags;
+	struct sk_buff_head *list = &sk->sk_receive_queue;
 
 	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
 	   number of warnings when compiling with -W --ANK
@@ -305,7 +307,10 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	 */
 	skb_len = skb->len;
 
-	skb_queue_tail(&sk->sk_receive_queue, skb);
+	spin_lock_irqsave(&list->lock, flags);
+	skb->dropcount = atomic_read(&sk->sk_drops);
+	__skb_queue_tail(list, skb);
+	spin_unlock_irqrestore(&list->lock, flags);
 
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_data_ready(sk, skb_len);
@@ -702,6 +707,12 @@ set_rcvbuf:
 
 		/* We implement the SO_SNDLOWAT etc to
 		   not be settable (1003.1g 5.3) */
+	case SO_RXQ_OVFL:
+		if (valbool)
+			sock_set_flag(sk, SOCK_RXQ_OVFL);
+		else
+			sock_reset_flag(sk, SOCK_RXQ_OVFL);
+		break;
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -901,6 +912,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sk->sk_mark;
 		break;
 
+	case SO_RXQ_OVFL:
+		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
cgit v1.2.2


From 89d71a66c40d629e3b1285def543ab1425558cd5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 13 Oct 2009 05:34:20 +0000
Subject: net: Use netdev_alloc_skb_ip_align()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 510ff205d5db..28b0b9e992a0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2604,20 +2604,13 @@ EXPORT_SYMBOL(napi_reuse_skb);
 
 struct sk_buff *napi_get_frags(struct napi_struct *napi)
 {
-	struct net_device *dev = napi->dev;
 	struct sk_buff *skb = napi->skb;
 
 	if (!skb) {
-		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
-		if (!skb)
-			goto out;
-
-		skb_reserve(skb, NET_IP_ALIGN);
-
-		napi->skb = skb;
+		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
+		if (skb)
+			napi->skb = skb;
 	}
-
-out:
 	return skb;
 }
 EXPORT_SYMBOL(napi_get_frags);
-- 
cgit v1.2.2


From 766e9037cc139ee25ed93ee5ad11e1450c4b99f6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 14 Oct 2009 20:40:11 -0700
Subject: net: sk_drops consolidation

sock_queue_rcv_skb() can update sk_drops itself, removing need for
callers to take care of it. This is more consistent since
sock_queue_rcv_skb() also reads sk_drops when queueing a skb.

This adds sk_drops managment to many protocols that not cared yet.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 43ca2c995393..38713aa3faf2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -274,7 +274,7 @@ static void sock_disable_timestamp(struct sock *sk, int flag)
 
 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
-	int err = 0;
+	int err;
 	int skb_len;
 	unsigned long flags;
 	struct sk_buff_head *list = &sk->sk_receive_queue;
@@ -284,17 +284,17 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	 */
 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 	    (unsigned)sk->sk_rcvbuf) {
-		err = -ENOMEM;
-		goto out;
+		atomic_inc(&sk->sk_drops);
+		return -ENOMEM;
 	}
 
 	err = sk_filter(sk, skb);
 	if (err)
-		goto out;
+		return err;
 
 	if (!sk_rmem_schedule(sk, skb->truesize)) {
-		err = -ENOBUFS;
-		goto out;
+		atomic_inc(&sk->sk_drops);
+		return -ENOBUFS;
 	}
 
 	skb->dev = NULL;
@@ -314,8 +314,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_data_ready(sk, skb_len);
-out:
-	return err;
+	return 0;
 }
 EXPORT_SYMBOL(sock_queue_rcv_skb);
 
-- 
cgit v1.2.2


From 8edf19c2fe028563fc6ea9cb1995b8ee4172d4b6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 15 Oct 2009 00:12:40 +0000
Subject: net: sk_drops consolidation part 2

- skb_kill_datagram() can increment sk->sk_drops itself, not callers.

- UDP on IPV4 & IPV6 dropped frames (because of bad checksum or policy checks) increment sk_drops

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/datagram.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index 1c6cf3a1a4f6..4d57f5e12b05 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -262,6 +262,7 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
 	}
 
 	kfree_skb(skb);
+	atomic_inc(&sk->sk_drops);
 	sk_mem_reclaim_partial(sk);
 
 	return err;
-- 
cgit v1.2.2


From 7e75f93eda027d9f9e0203ee6ffd210ea92e98f3 Mon Sep 17 00:00:00 2001
From: jamal <hadi@cyberus.ca>
Date: Mon, 19 Oct 2009 02:17:56 +0000
Subject: pkt_sched: ingress socket filter by mark

Allow bpf to set a filter to drop packets that dont
match a specific mark

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index d1d779ca096d..e3987e1d4839 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -303,6 +303,9 @@ load_b:
 		case SKF_AD_IFINDEX:
 			A = skb->dev->ifindex;
 			continue;
+		case SKF_AD_MARK:
+			A = skb->mark;
+			continue;
 		case SKF_AD_NLATTR: {
 			struct nlattr *nla;
 
-- 
cgit v1.2.2


From d19742fb1c68e6db83b76e06dea5a374c99e104f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 20 Oct 2009 01:06:22 -0700
Subject: filter: Add SKF_AD_QUEUE instruction

It can help being able to filter packets on their queue_mapping.

If filter performance is not good, we could add a "numqueue" field
in struct packet_type, so that netif_nit_deliver() and other functions
can directly ignore packets with not expected queue number.

Lets experiment this simple filter extension first.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index e3987e1d4839..08db7b9143a3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -306,6 +306,9 @@ load_b:
 		case SKF_AD_MARK:
 			A = skb->mark;
 			continue;
+		case SKF_AD_QUEUE:
+			A = skb->queue_mapping;
+			continue;
 		case SKF_AD_NLATTR: {
 			struct nlattr *nla;
 
-- 
cgit v1.2.2


From e022f0b4a03f4fff9323b509df023b8af635716e Mon Sep 17 00:00:00 2001
From: Krishna Kumar <krkumar2@in.ibm.com>
Date: Mon, 19 Oct 2009 23:46:20 +0000
Subject: net: Introduce sk_tx_queue_mapping

Introduce sk_tx_queue_mapping; and functions that set, test and
get this value. Reset sk_tx_queue_mapping to -1 whenever the dst
cache is set/reset, and in socket alloc. Setting txq to -1 and
using valid txq=<0 to n-1> allows the tx path to use the value
of sk_tx_queue_mapping directly instead of subtracting 1 on every
tx.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 38713aa3faf2..934d9673f084 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -357,6 +357,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 	struct dst_entry *dst = sk->sk_dst_cache;
 
 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+		sk_tx_queue_clear(sk);
 		sk->sk_dst_cache = NULL;
 		dst_release(dst);
 		return NULL;
@@ -953,7 +954,8 @@ static void sock_copy(struct sock *nsk, const struct sock *osk)
 	void *sptr = nsk->sk_security;
 #endif
 	BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
-		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
+		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) +
+		     sizeof(osk->sk_tx_queue_mapping));
 	memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
 #ifdef CONFIG_SECURITY_NETWORK
@@ -997,6 +999,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 
 		if (!try_module_get(prot->owner))
 			goto out_free_sec;
+		sk_tx_queue_clear(sk);
 	}
 
 	return sk;
-- 
cgit v1.2.2


From ea94ff3b55188df157a8740bdf3976a87563d705 Mon Sep 17 00:00:00 2001
From: Krishna Kumar <krkumar2@in.ibm.com>
Date: Mon, 19 Oct 2009 23:46:45 +0000
Subject: net: Fix for dst_negative_advice

dst_negative_advice() should check for changed dst and reset
sk_tx_queue_mapping accordingly. Pass sock to the callers of
dst_negative_advice.

(sk_reset_txq is defined just for use by dst_negative_advice. The
only way I could find to get around this is to move dst_negative_()
from dst.h to dst.c, include sock.h in dst.c, etc)

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 934d9673f084..5a51512f638a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -352,6 +352,12 @@ discard_and_relse:
 }
 EXPORT_SYMBOL(sk_receive_skb);
 
+void sk_reset_txq(struct sock *sk)
+{
+	sk_tx_queue_clear(sk);
+}
+EXPORT_SYMBOL(sk_reset_txq);
+
 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 {
 	struct dst_entry *dst = sk->sk_dst_cache;
-- 
cgit v1.2.2


From a4ee3ce3293dc931fab19beb472a8bde1295aebe Mon Sep 17 00:00:00 2001
From: Krishna Kumar <krkumar2@in.ibm.com>
Date: Mon, 19 Oct 2009 23:50:07 +0000
Subject: net: Use sk_tx_queue_mapping for connected sockets

For connected sockets, the first run of dev_pick_tx saves the
calculated txq in sk_tx_queue_mapping. This is not saved if
either the device has a queue select or the socket is not
connected. Next iterations of dev_pick_tx uses the cached value
of sk_tx_queue_mapping.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 28b0b9e992a0..fa88dcd476d6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1791,13 +1791,25 @@ EXPORT_SYMBOL(skb_tx_hash);
 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 					struct sk_buff *skb)
 {
-	const struct net_device_ops *ops = dev->netdev_ops;
-	u16 queue_index = 0;
+	u16 queue_index;
+	struct sock *sk = skb->sk;
+
+	if (sk_tx_queue_recorded(sk)) {
+		queue_index = sk_tx_queue_get(sk);
+	} else {
+		const struct net_device_ops *ops = dev->netdev_ops;
 
-	if (ops->ndo_select_queue)
-		queue_index = ops->ndo_select_queue(dev, skb);
-	else if (dev->real_num_tx_queues > 1)
-		queue_index = skb_tx_hash(dev, skb);
+		if (ops->ndo_select_queue) {
+			queue_index = ops->ndo_select_queue(dev, skb);
+		} else {
+			queue_index = 0;
+			if (dev->real_num_tx_queues > 1)
+				queue_index = skb_tx_hash(dev, skb);
+
+			if (sk && sk->sk_dst_cache)
+				sk_tx_queue_set(sk, queue_index);
+		}
+	}
 
 	skb_set_queue_mapping(skb, queue_index);
 	return netdev_get_tx_queue(dev, queue_index);
-- 
cgit v1.2.2


From a3d1289126e7b14307074b76bf1677015ea5036f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 21 Oct 2009 10:59:31 +0000
Subject: rtnetlink: rtnl_setlink() and rtnl_getlink() changes

rtnl_getlink() & rtnl_setlink() run with RTNL held, we can use
__dev_get_by_index() and __dev_get_by_name() variants and avoid
dev_hold()/dev_put()

Adds to rtnl_getlink() the capability to find a device by its name,
not only by its index.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index eb42873f2a3a..ba13b0974a7b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -910,9 +910,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	err = -EINVAL;
 	ifm = nlmsg_data(nlh);
 	if (ifm->ifi_index > 0)
-		dev = dev_get_by_index(net, ifm->ifi_index);
+		dev = __dev_get_by_index(net, ifm->ifi_index);
 	else if (tb[IFLA_IFNAME])
-		dev = dev_get_by_name(net, ifname);
+		dev = __dev_get_by_name(net, ifname);
 	else
 		goto errout;
 
@@ -922,11 +922,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	}
 
 	if ((err = validate_linkmsg(dev, tb)) < 0)
-		goto errout_dev;
+		goto errout;
 
 	err = do_setlink(dev, ifm, tb, ifname, 0);
-errout_dev:
-	dev_put(dev);
 errout:
 	return err;
 }
@@ -1154,6 +1152,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
 	struct net *net = sock_net(skb->sk);
 	struct ifinfomsg *ifm;
+	char ifname[IFNAMSIZ];
 	struct nlattr *tb[IFLA_MAX+1];
 	struct net_device *dev = NULL;
 	struct sk_buff *nskb;
@@ -1163,19 +1162,23 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	if (err < 0)
 		return err;
 
+	if (tb[IFLA_IFNAME])
+		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+
 	ifm = nlmsg_data(nlh);
-	if (ifm->ifi_index > 0) {
-		dev = dev_get_by_index(net, ifm->ifi_index);
-		if (dev == NULL)
-			return -ENODEV;
-	} else
+	if (ifm->ifi_index > 0)
+		dev = __dev_get_by_index(net, ifm->ifi_index);
+	else if (tb[IFLA_IFNAME])
+		dev = __dev_get_by_name(net, ifname);
+	else
 		return -EINVAL;
 
+	if (dev == NULL)
+		return -ENODEV;
+
 	nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
-	if (nskb == NULL) {
-		err = -ENOBUFS;
-		goto errout;
-	}
+	if (nskb == NULL)
+		return -ENOBUFS;
 
 	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid,
 			       nlh->nlmsg_seq, 0, 0);
@@ -1183,11 +1186,8 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		/* -EMSGSIZE implies BUG in if_nlmsg_size */
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(nskb);
-		goto errout;
-	}
-	err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid);
-errout:
-	dev_put(dev);
+	} else
+		err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid);
 
 	return err;
 }
-- 
cgit v1.2.2


From 7c28bd0b8ec4d128bd7660671d1b626b0abc471f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 24 Oct 2009 06:13:17 -0700
Subject: rtnetlink: speedup rtnl_dump_ifinfo()

When handling large number of netdevice, rtnl_dump_ifinfo()
is very slow because it has O(N^2) complexity.

Instead of scanning one single list, we can use the 256 sub lists
of the dev_index hash table.

This considerably speedups "ip link" operations

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       |  7 ++-----
 net/core/rtnetlink.c | 37 ++++++++++++++++++++++++-------------
 2 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index fa88dcd476d6..e7bada1d5ed9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -193,18 +193,15 @@ static struct list_head ptype_all __read_mostly;	/* Taps */
 DEFINE_RWLOCK(dev_base_lock);
 EXPORT_SYMBOL(dev_base_lock);
 
-#define NETDEV_HASHBITS	8
-#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
-
 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 {
 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
-	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
+	return &net->dev_name_head[hash & (NETDEV_HASHENTRIES - 1)];
 }
 
 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 {
-	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
+	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 }
 
 /* Device list insertion */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ba13b0974a7b..52ea418d5302 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -682,22 +682,33 @@ nla_put_failure:
 static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
-	int idx;
-	int s_idx = cb->args[0];
+	int h, s_h;
+	int idx = 0, s_idx;
 	struct net_device *dev;
-
-	idx = 0;
-	for_each_netdev(net, dev) {
-		if (idx < s_idx)
-			goto cont;
-		if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
-				     NETLINK_CB(cb->skb).pid,
-				     cb->nlh->nlmsg_seq, 0, NLM_F_MULTI) <= 0)
-			break;
+	struct hlist_head *head;
+	struct hlist_node *node;
+
+	s_h = cb->args[0];
+	s_idx = cb->args[1];
+
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+		idx = 0;
+		head = &net->dev_index_head[h];
+		hlist_for_each_entry(dev, node, head, index_hlist) {
+			if (idx < s_idx)
+				goto cont;
+			if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
+					     NETLINK_CB(cb->skb).pid,
+					     cb->nlh->nlmsg_seq, 0,
+					     NLM_F_MULTI) <= 0)
+				goto out;
 cont:
-		idx++;
+			idx++;
+		}
 	}
-	cb->args[0] = idx;
+out:
+	cb->args[1] = idx;
+	cb->args[0] = h;
 
 	return skb->len;
 }
-- 
cgit v1.2.2


From 05423b241311c9380b7280179295bac7794281b6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 26 Oct 2009 18:40:35 -0700
Subject: vlan: allow null VLAN ID to be used

We currently use a 16 bit field (vlan_tci) to store VLAN ID/PRIO on a skb.

Null value is used as a special value, meaning vlan tagging not enabled.
This forbids use of null vlan ID.

As pointed by David, some drivers use the 3 high order bits (PRIO)

As VLAN ID is 12 bits, we can use the remaining bit (CFI) as a flag, and
allow null VLAN ID.

In case future code really wants to use VLAN_CFI_MASK, we'll have to use
a bit outside of vlan_tci.

#define VLAN_PRIO_MASK         0xe000 /* Priority Code Point */
#define VLAN_PRIO_SHIFT        13
#define VLAN_CFI_MASK          0x1000 /* Canonical Format Indicator */
#define VLAN_TAG_PRESENT       VLAN_CFI_MASK
#define VLAN_VID_MASK          0x0fff /* VLAN Identifier */

Reported-by: Gertjan Hofman <gertjan_hofman@yahoo.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index e7bada1d5ed9..950c13fa60d2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2300,7 +2300,7 @@ int netif_receive_skb(struct sk_buff *skb)
 	if (!skb->tstamp.tv64)
 		net_timestamp(skb);
 
-	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
+	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
 		return NET_RX_SUCCESS;
 
 	/* if we've gotten here through NAPI, check netpoll */
-- 
cgit v1.2.2


From 44a0873d52282f24b1894c58c0f157e0f626ddc9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 27 Oct 2009 07:03:04 +0000
Subject: net: Introduce unregister_netdevice_queue()

This patchs adds an unreg_list anchor to struct net_device, and
introduces an unregister_netdevice_queue() function, able to queue
a net_device to a list instead of immediately unregister it.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 950c13fa60d2..ff94e2b8df7f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5245,25 +5245,31 @@ void synchronize_net(void)
 EXPORT_SYMBOL(synchronize_net);
 
 /**
- *	unregister_netdevice - remove device from the kernel
+ *	unregister_netdevice_queue - remove device from the kernel
  *	@dev: device
- *
+ *	@head: list
+
  *	This function shuts down a device interface and removes it
  *	from the kernel tables.
+ *	If head not NULL, device is queued to be unregistered later.
  *
  *	Callers must hold the rtnl semaphore.  You may want
  *	unregister_netdev() instead of this.
  */
 
-void unregister_netdevice(struct net_device *dev)
+void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
 {
 	ASSERT_RTNL();
 
-	rollback_registered(dev);
-	/* Finish processing unregister after unlock */
-	net_set_todo(dev);
+	if (head) {
+		list_add_tail(&dev->unreg_list, head);
+	} else {
+		rollback_registered(dev);
+		/* Finish processing unregister after unlock */
+		net_set_todo(dev);
+	}
 }
-EXPORT_SYMBOL(unregister_netdevice);
+EXPORT_SYMBOL(unregister_netdevice_queue);
 
 /**
  *	unregister_netdev - remove device from the kernel
-- 
cgit v1.2.2


From 9b5e383c11b08784eb0087617f880077982ef769 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 27 Oct 2009 07:04:19 +0000
Subject: net: Introduce unregister_netdevice_many()

Introduce rollback_registered_many() and unregister_netdevice_many()

rollback_registered_many() is able to perform necessary steps at device dismantle
time, factorizing two expensive synchronize_net() calls.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 97 +++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 65 insertions(+), 32 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index ff94e2b8df7f..04d3e3014020 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4637,59 +4637,76 @@ static void net_set_todo(struct net_device *dev)
 	list_add_tail(&dev->todo_list, &net_todo_list);
 }
 
-static void rollback_registered(struct net_device *dev)
+static void rollback_registered_many(struct list_head *head)
 {
+	struct net_device *dev;
+
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
 
-	/* Some devices call without registering for initialization unwind. */
-	if (dev->reg_state == NETREG_UNINITIALIZED) {
-		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
-				  "was registered\n", dev->name, dev);
+	list_for_each_entry(dev, head, unreg_list) {
+		/* Some devices call without registering
+		 * for initialization unwind.
+		 */
+		if (dev->reg_state == NETREG_UNINITIALIZED) {
+			pr_debug("unregister_netdevice: device %s/%p never "
+				 "was registered\n", dev->name, dev);
 
-		WARN_ON(1);
-		return;
-	}
+			WARN_ON(1);
+			return;
+		}
 
-	BUG_ON(dev->reg_state != NETREG_REGISTERED);
+		BUG_ON(dev->reg_state != NETREG_REGISTERED);
 
-	/* If device is running, close it first. */
-	dev_close(dev);
+		/* If device is running, close it first. */
+		dev_close(dev);
 
-	/* And unlink it from device chain. */
-	unlist_netdevice(dev);
+		/* And unlink it from device chain. */
+		unlist_netdevice(dev);
 
-	dev->reg_state = NETREG_UNREGISTERING;
+		dev->reg_state = NETREG_UNREGISTERING;
+	}
 
 	synchronize_net();
 
-	/* Shutdown queueing discipline. */
-	dev_shutdown(dev);
+	list_for_each_entry(dev, head, unreg_list) {
+		/* Shutdown queueing discipline. */
+		dev_shutdown(dev);
 
 
-	/* Notify protocols, that we are about to destroy
-	   this device. They should clean all the things.
-	*/
-	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+		/* Notify protocols, that we are about to destroy
+		   this device. They should clean all the things.
+		*/
+		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
-	/*
-	 *	Flush the unicast and multicast chains
-	 */
-	dev_unicast_flush(dev);
-	dev_addr_discard(dev);
+		/*
+		 *	Flush the unicast and multicast chains
+		 */
+		dev_unicast_flush(dev);
+		dev_addr_discard(dev);
 
-	if (dev->netdev_ops->ndo_uninit)
-		dev->netdev_ops->ndo_uninit(dev);
+		if (dev->netdev_ops->ndo_uninit)
+			dev->netdev_ops->ndo_uninit(dev);
 
-	/* Notifier chain MUST detach us from master device. */
-	WARN_ON(dev->master);
+		/* Notifier chain MUST detach us from master device. */
+		WARN_ON(dev->master);
 
-	/* Remove entries from kobject tree */
-	netdev_unregister_kobject(dev);
+		/* Remove entries from kobject tree */
+		netdev_unregister_kobject(dev);
+	}
 
 	synchronize_net();
 
-	dev_put(dev);
+	list_for_each_entry(dev, head, unreg_list)
+		dev_put(dev);
+}
+
+static void rollback_registered(struct net_device *dev)
+{
+	LIST_HEAD(single);
+
+	list_add(&dev->unreg_list, &single);
+	rollback_registered_many(&single);
 }
 
 static void __netdev_init_queue_locks_one(struct net_device *dev,
@@ -5271,6 +5288,22 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
 }
 EXPORT_SYMBOL(unregister_netdevice_queue);
 
+/**
+ *	unregister_netdevice_many - unregister many devices
+ *	@head: list of devices
+ *
+ */
+void unregister_netdevice_many(struct list_head *head)
+{
+	struct net_device *dev;
+
+	if (!list_empty(head)) {
+		rollback_registered_many(head);
+		list_for_each_entry(dev, head, unreg_list)
+			net_set_todo(dev);
+	}
+}
+
 /**
  *	unregister_netdev - remove device from the kernel
  *	@dev: device
-- 
cgit v1.2.2


From 23289a37e2b127dfc4de1313fba15bb4c9f0cd5b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 27 Oct 2009 07:06:36 +0000
Subject: net: add a list_head parameter to dellink() method

Adding a list_head parameter to rtnl_link_ops->dellink() methods
allow us to queue devices on a list, in order to dismantle
them all at once.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       |  2 +-
 net/core/rtnetlink.c | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 04d3e3014020..4513dfd5718e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5629,7 +5629,7 @@ restart:
 
 		/* Delete virtual devices */
 		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
-			dev->rtnl_link_ops->dellink(dev);
+			dev->rtnl_link_ops->dellink(dev, NULL);
 			goto restart;
 		}
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 52ea418d5302..391a62cd9df6 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -248,7 +248,7 @@ static LIST_HEAD(link_ops);
 int __rtnl_link_register(struct rtnl_link_ops *ops)
 {
 	if (!ops->dellink)
-		ops->dellink = unregister_netdevice;
+		ops->dellink = unregister_netdevice_queue;
 
 	list_add_tail(&ops->list, &link_ops);
 	return 0;
@@ -277,13 +277,13 @@ EXPORT_SYMBOL_GPL(rtnl_link_register);
 static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
 {
 	struct net_device *dev;
-restart:
+	LIST_HEAD(list_kill);
+
 	for_each_netdev(net, dev) {
-		if (dev->rtnl_link_ops == ops) {
-			ops->dellink(dev);
-			goto restart;
-		}
+		if (dev->rtnl_link_ops == ops)
+			ops->dellink(dev, &list_kill);
 	}
+	unregister_netdevice_many(&list_kill);
 }
 
 void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
@@ -972,7 +972,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	if (!ops)
 		return -EOPNOTSUPP;
 
-	ops->dellink(dev);
+	ops->dellink(dev, NULL);
 	return 0;
 }
 
-- 
cgit v1.2.2


From 63c8099d90096db56ee1c66c31f05d4fcfbc1c69 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 27 Oct 2009 07:06:49 +0000
Subject: vlan: Optimize multiple unregistration

Use unregister_netdevice_many() to speedup master device unregister.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 4513dfd5718e..09551cc143a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5303,6 +5303,7 @@ void unregister_netdevice_many(struct list_head *head)
 			net_set_todo(dev);
 	}
 }
+EXPORT_SYMBOL(unregister_netdevice_many);
 
 /**
  *	unregister_netdev - remove device from the kernel
-- 
cgit v1.2.2


From ac5e3af9996fb911d4fdff910a8ac3cb7fc63a94 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 26 Oct 2009 01:23:33 +0000
Subject: net: sysfs: ethtool_ops can be NULL

commit d519e17e2d01a0ee9abe083019532061b4438065
(net: export device speed and duplex via sysfs)
made the wrong assumption that netdev->ethtool_ops was always set.

This makes possible to crash kernel and let rtnl in locked state.

modprobe dummy
ip link set dummy0 up
(udev runs and crash)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 753c420060df..89de182353b0 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -139,7 +139,9 @@ static ssize_t show_speed(struct device *dev,
 	if (!rtnl_trylock())
 		return restart_syscall();
 
-	if (netif_running(netdev) && netdev->ethtool_ops->get_settings) {
+	if (netif_running(netdev) &&
+	    netdev->ethtool_ops &&
+	    netdev->ethtool_ops->get_settings) {
 		struct ethtool_cmd cmd = { ETHTOOL_GSET };
 
 		if (!netdev->ethtool_ops->get_settings(netdev, &cmd))
@@ -158,7 +160,9 @@ static ssize_t show_duplex(struct device *dev,
 	if (!rtnl_trylock())
 		return restart_syscall();
 
-	if (netif_running(netdev) && netdev->ethtool_ops->get_settings) {
+	if (netif_running(netdev) &&
+	    netdev->ethtool_ops &&
+	    netdev->ethtool_ops->get_settings) {
 		struct ethtool_cmd cmd = { ETHTOOL_GSET };
 
 		if (!netdev->ethtool_ops->get_settings(netdev, &cmd))
-- 
cgit v1.2.2


From fb699dfd426a189fe33b91586c15176a75c8aed0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 19 Oct 2009 19:18:49 +0000
Subject: net: Introduce dev_get_by_index_rcu()

Some workloads hit dev_base_lock rwlock pretty hard.
We can use RCU lookups to avoid touching this rwlock.

netdevices are already freed after a RCU grace period, so this patch
adds no penalty at device dismantle time.

dev_ifname() converted to dev_get_by_index_rcu()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 48 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 10 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 09551cc143a9..68a1bb68b5a8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -214,12 +214,15 @@ static int list_netdevice(struct net_device *dev)
 	write_lock_bh(&dev_base_lock);
 	list_add_tail(&dev->dev_list, &net->dev_base_head);
 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
-	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
+	hlist_add_head_rcu(&dev->index_hlist,
+			   dev_index_hash(net, dev->ifindex));
 	write_unlock_bh(&dev_base_lock);
 	return 0;
 }
 
-/* Device list removal */
+/* Device list removal
+ * caller must respect a RCU grace period before freeing/reusing dev
+ */
 static void unlist_netdevice(struct net_device *dev)
 {
 	ASSERT_RTNL();
@@ -228,7 +231,7 @@ static void unlist_netdevice(struct net_device *dev)
 	write_lock_bh(&dev_base_lock);
 	list_del(&dev->dev_list);
 	hlist_del(&dev->name_hlist);
-	hlist_del(&dev->index_hlist);
+	hlist_del_rcu(&dev->index_hlist);
 	write_unlock_bh(&dev_base_lock);
 }
 
@@ -646,6 +649,31 @@ struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 }
 EXPORT_SYMBOL(__dev_get_by_index);
 
+/**
+ *	dev_get_by_index_rcu - find a device by its ifindex
+ *	@net: the applicable net namespace
+ *	@ifindex: index of device
+ *
+ *	Search for an interface by index. Returns %NULL if the device
+ *	is not found or a pointer to the device. The device has not
+ *	had its reference counter increased so the caller must be careful
+ *	about locking. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
+{
+	struct hlist_node *p;
+	struct net_device *dev;
+	struct hlist_head *head = dev_index_hash(net, ifindex);
+
+	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
+		if (dev->ifindex == ifindex)
+			return dev;
+
+	return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_index_rcu);
+
 
 /**
  *	dev_get_by_index - find a device by its ifindex
@@ -662,11 +690,11 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
 {
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
-	dev = __dev_get_by_index(net, ifindex);
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifindex);
 	if (dev)
 		dev_hold(dev);
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return dev;
 }
 EXPORT_SYMBOL(dev_get_by_index);
@@ -2939,15 +2967,15 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg)
 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
 		return -EFAULT;
 
-	read_lock(&dev_base_lock);
-	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
 	if (!dev) {
-		read_unlock(&dev_base_lock);
+		rcu_read_unlock();
 		return -ENODEV;
 	}
 
 	strcpy(ifr.ifr_name, dev->name);
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 
 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
 		return -EFAULT;
-- 
cgit v1.2.2


From 5b252f0c2f98df21fadf0f6cf189b87a0b938228 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Thu, 29 Oct 2009 07:17:09 +0000
Subject: gro: Name the GRO result enumeration type

This clarifies which return and parameter types are GRO result codes
and not RX result codes.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 68a1bb68b5a8..1dc13744684c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2476,7 +2476,7 @@ void napi_gro_flush(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
-int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff **pp = NULL;
 	struct packet_type *ptype;
@@ -2484,7 +2484,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
 	int same_flow;
 	int mac_len;
-	int ret;
+	enum gro_result ret;
 
 	if (!(skb->dev->features & NETIF_F_GRO))
 		goto normal;
@@ -2568,7 +2568,8 @@ normal:
 }
 EXPORT_SYMBOL(dev_gro_receive);
 
-static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+static gro_result_t
+__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff *p;
 
@@ -2585,7 +2586,7 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	return dev_gro_receive(napi, skb);
 }
 
-int napi_skb_finish(int ret, struct sk_buff *skb)
+int napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 {
 	int err = NET_RX_SUCCESS;
 
@@ -2600,6 +2601,10 @@ int napi_skb_finish(int ret, struct sk_buff *skb)
 	case GRO_MERGED_FREE:
 		kfree_skb(skb);
 		break;
+
+	case GRO_HELD:
+	case GRO_MERGED:
+		break;
 	}
 
 	return err;
@@ -2652,7 +2657,8 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(napi_get_frags);
 
-int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
+int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
+		      gro_result_t ret)
 {
 	int err = NET_RX_SUCCESS;
 
@@ -2674,6 +2680,9 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
 	case GRO_MERGED_FREE:
 		napi_reuse_skb(napi, skb);
 		break;
+
+	case GRO_MERGED:
+		break;
 	}
 
 	return err;
-- 
cgit v1.2.2


From c7c4b3b6e976b95facbb723951bdcd554a3530a4 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Thu, 29 Oct 2009 21:36:53 -0700
Subject: gro: Change all receive functions to return GRO result codes

This will allow drivers to adjust their receive path dynamically
based on whether GRO is being applied successfully.

Currently all in-tree callers ignore the return values of these
functions and do not need to be changed.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 38 +++++++++++++++-----------------------
 1 file changed, 15 insertions(+), 23 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 1dc13744684c..631cc40da197 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2586,18 +2586,15 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	return dev_gro_receive(napi, skb);
 }
 
-int napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
+gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 {
-	int err = NET_RX_SUCCESS;
-
 	switch (ret) {
 	case GRO_NORMAL:
-		return netif_receive_skb(skb);
+		if (netif_receive_skb(skb))
+			ret = GRO_DROP;
+		break;
 
 	case GRO_DROP:
-		err = NET_RX_DROP;
-		/* fall through */
-
 	case GRO_MERGED_FREE:
 		kfree_skb(skb);
 		break;
@@ -2607,7 +2604,7 @@ int napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 		break;
 	}
 
-	return err;
+	return ret;
 }
 EXPORT_SYMBOL(napi_skb_finish);
 
@@ -2627,7 +2624,7 @@ void skb_gro_reset_offset(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(skb_gro_reset_offset);
 
-int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	skb_gro_reset_offset(skb);
 
@@ -2657,26 +2654,21 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(napi_get_frags);
 
-int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
-		      gro_result_t ret)
+gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
+			       gro_result_t ret)
 {
-	int err = NET_RX_SUCCESS;
-
 	switch (ret) {
 	case GRO_NORMAL:
 	case GRO_HELD:
 		skb->protocol = eth_type_trans(skb, napi->dev);
 
-		if (ret == GRO_NORMAL)
-			return netif_receive_skb(skb);
-
-		skb_gro_pull(skb, -ETH_HLEN);
+		if (ret == GRO_HELD)
+			skb_gro_pull(skb, -ETH_HLEN);
+		else if (netif_receive_skb(skb))
+			ret = GRO_DROP;
 		break;
 
 	case GRO_DROP:
-		err = NET_RX_DROP;
-		/* fall through */
-
 	case GRO_MERGED_FREE:
 		napi_reuse_skb(napi, skb);
 		break;
@@ -2685,7 +2677,7 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
 		break;
 	}
 
-	return err;
+	return ret;
 }
 EXPORT_SYMBOL(napi_frags_finish);
 
@@ -2726,12 +2718,12 @@ out:
 }
 EXPORT_SYMBOL(napi_frags_skb);
 
-int napi_gro_frags(struct napi_struct *napi)
+gro_result_t napi_gro_frags(struct napi_struct *napi)
 {
 	struct sk_buff *skb = napi_frags_skb(napi);
 
 	if (!skb)
-		return NET_RX_DROP;
+		return GRO_DROP;
 
 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
 }
-- 
cgit v1.2.2


From 0bd8d53656da72bc065766b5f2a05ca738020b8a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 30 Oct 2009 01:40:11 -0700
Subject: net: use hlist_for_each_entry()

Small cleanup of __dev_get_by_name() and __dev_get_by_index()
to use hlist_for_each_entry() : They'll look like their _rcu variant.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 631cc40da197..94f42a15fff1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -587,13 +587,13 @@ __setup("netdev=", netdev_boot_setup);
 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 {
 	struct hlist_node *p;
+	struct net_device *dev;
+	struct hlist_head *head = dev_name_hash(net, name);
 
-	hlist_for_each(p, dev_name_hash(net, name)) {
-		struct net_device *dev
-			= hlist_entry(p, struct net_device, name_hlist);
+	hlist_for_each_entry(dev, p, head, name_hlist)
 		if (!strncmp(dev->name, name, IFNAMSIZ))
 			return dev;
-	}
+
 	return NULL;
 }
 EXPORT_SYMBOL(__dev_get_by_name);
@@ -638,13 +638,13 @@ EXPORT_SYMBOL(dev_get_by_name);
 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 {
 	struct hlist_node *p;
+	struct net_device *dev;
+	struct hlist_head *head = dev_index_hash(net, ifindex);
 
-	hlist_for_each(p, dev_index_hash(net, ifindex)) {
-		struct net_device *dev
-			= hlist_entry(p, struct net_device, index_hlist);
+	hlist_for_each_entry(dev, p, head, index_hlist)
 		if (dev->ifindex == ifindex)
 			return dev;
-	}
+
 	return NULL;
 }
 EXPORT_SYMBOL(__dev_get_by_index);
-- 
cgit v1.2.2


From 0c509a6c9393b27a8c5a01acd4a72616206cfc24 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@aristanetworks.com>
Date: Thu, 29 Oct 2009 14:18:21 +0000
Subject: net: Allow devices to specify a device specific sysfs group.

This isn't beautifully abstracted, but it is simple,
simplifies uses and so far is only needed for the bonding driver.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 89de182353b0..157645c0da73 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -544,8 +544,11 @@ int netdev_register_kobject(struct net_device *net)
 	dev_set_name(dev, "%s", net->name);
 
 #ifdef CONFIG_SYSFS
-	*groups++ = &netstat_group;
+	/* Allow for a device specific group */
+	if (*groups)
+		groups++;
 
+	*groups++ = &netstat_group;
 #ifdef CONFIG_WIRELESS_EXT_SYSFS
 	if (net->ieee80211_ptr)
 		*groups++ = &wireless_group;
-- 
cgit v1.2.2


From 72c9528bab94cc052d00ce241b8e85f5d71e45f0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 30 Oct 2009 07:11:27 +0000
Subject: net: Introduce dev_get_by_name_rcu()

Some workloads hit dev_base_lock rwlock pretty hard.
We can use RCU lookups to avoid touching this rwlock
(and avoid touching netdevice refcount)

netdevices are already freed after a RCU grace period, so this patch
adds no penalty at device dismantle time.

However, it adds a synchronize_rcu() call in dev_change_name()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 49 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 9 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 94f42a15fff1..f54d8b8a434b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -213,7 +213,7 @@ static int list_netdevice(struct net_device *dev)
 
 	write_lock_bh(&dev_base_lock);
 	list_add_tail(&dev->dev_list, &net->dev_base_head);
-	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
+	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 	hlist_add_head_rcu(&dev->index_hlist,
 			   dev_index_hash(net, dev->ifindex));
 	write_unlock_bh(&dev_base_lock);
@@ -230,7 +230,7 @@ static void unlist_netdevice(struct net_device *dev)
 	/* Unlink dev from the device chain */
 	write_lock_bh(&dev_base_lock);
 	list_del(&dev->dev_list);
-	hlist_del(&dev->name_hlist);
+	hlist_del_rcu(&dev->name_hlist);
 	hlist_del_rcu(&dev->index_hlist);
 	write_unlock_bh(&dev_base_lock);
 }
@@ -598,6 +598,32 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name)
 }
 EXPORT_SYMBOL(__dev_get_by_name);
 
+/**
+ *	dev_get_by_name_rcu	- find a device by its name
+ *	@net: the applicable net namespace
+ *	@name: name to find
+ *
+ *	Find an interface by name.
+ *	If the name is found a pointer to the device is returned.
+ * 	If the name is not found then %NULL is returned.
+ *	The reference counters are not incremented so the caller must be
+ *	careful with locks. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
+{
+	struct hlist_node *p;
+	struct net_device *dev;
+	struct hlist_head *head = dev_name_hash(net, name);
+
+	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
+		if (!strncmp(dev->name, name, IFNAMSIZ))
+			return dev;
+
+	return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_name_rcu);
+
 /**
  *	dev_get_by_name		- find a device by its name
  *	@net: the applicable net namespace
@@ -614,11 +640,11 @@ struct net_device *dev_get_by_name(struct net *net, const char *name)
 {
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
-	dev = __dev_get_by_name(net, name);
+	rcu_read_lock();
+	dev = dev_get_by_name_rcu(net, name);
 	if (dev)
 		dev_hold(dev);
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return dev;
 }
 EXPORT_SYMBOL(dev_get_by_name);
@@ -960,7 +986,12 @@ rollback:
 
 	write_lock_bh(&dev_base_lock);
 	hlist_del(&dev->name_hlist);
-	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
+	write_unlock_bh(&dev_base_lock);
+
+	synchronize_rcu();
+
+	write_lock_bh(&dev_base_lock);
+	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 	write_unlock_bh(&dev_base_lock);
 
 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
@@ -1062,9 +1093,9 @@ void dev_load(struct net *net, const char *name)
 {
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
-	dev = __dev_get_by_name(net, name);
-	read_unlock(&dev_base_lock);
+	rcu_read_lock();
+	dev = dev_get_by_name_rcu(net, name);
+	rcu_read_unlock();
 
 	if (!dev && capable(CAP_NET_ADMIN))
 		request_module("%s", name);
-- 
cgit v1.2.2


From 9fdce099bb72df534daa6193318feaec177998fc Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 30 Oct 2009 14:51:13 +0000
Subject: veth: Fix unregister_netdevice_queue for veth

I tested the recent unregister many changes and got a weird,
nasty and seemingly unrelasted kernel oops. Changing
unregister_netdevice_queue to use list_move_tail fixes
the problem for me.

ip link add type veth
rmmod veth

ls /sys/class/net/
showed one of the veth devices still present.

A subsequent ip link oopsed the box.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index f54d8b8a434b..3c40d545a035 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5258,6 +5258,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
+	INIT_LIST_HEAD(&dev->unreg_list);
 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
 	setup(dev);
 	strcpy(dev->name, name);
@@ -5339,7 +5340,7 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
 	ASSERT_RTNL();
 
 	if (head) {
-		list_add_tail(&dev->unreg_list, head);
+		list_move_tail(&dev->unreg_list, head);
 	} else {
 		rollback_registered(dev);
 		/* Finish processing unregister after unlock */
-- 
cgit v1.2.2


From 3710becf8a58a5c6c4e797e3a3c968c161abdb41 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 1 Nov 2009 19:42:09 +0000
Subject: net: RCU locking for simple ioctl()

All ioctls() implemented by dev_ifsioc_locked() :
SIOCGIFFLAGS, SIOCGIFMETRIC, SIOCGIFMTU, SIOCGIFHWADDR,
SIOCGIFSLAVE, SIOCGIFMAP, SIOCGIFINDEX & SIOCGIFTXQLEN
can use RCU lock instead of dev_base_lock rwlock

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 3c40d545a035..76a1502efe67 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4315,12 +4315,12 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 EXPORT_SYMBOL(dev_set_mac_address);
 
 /*
- *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
+ *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
  */
 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
 {
 	int err;
-	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
 
 	if (!dev)
 		return -ENODEV;
@@ -4552,9 +4552,9 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	case SIOCGIFINDEX:
 	case SIOCGIFTXQLEN:
 		dev_load(net, ifr.ifr_name);
-		read_lock(&dev_base_lock);
+		rcu_read_lock();
 		ret = dev_ifsioc_locked(net, &ifr, cmd);
-		read_unlock(&dev_base_lock);
+		rcu_read_unlock();
 		if (!ret) {
 			if (colon)
 				*colon = ':';
-- 
cgit v1.2.2


From c6d14c84566d6b70ad9dc1618db0dec87cca9300 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 4 Nov 2009 05:43:23 -0800
Subject: net: Introduce for_each_netdev_rcu() iterator

Adds RCU management to the list of netdevices.

Convert some for_each_netdev() users to RCU version, if
it can avoid read_lock-ing dev_base_lock

Ie:
	read_lock(&dev_base_loack);
	for_each_netdev(net, dev)
		some_action();
	read_unlock(&dev_base_lock);

becomes :

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		some_action();
	rcu_read_unlock();


Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 76a1502efe67..bf629ac08b87 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -175,7 +175,7 @@ static struct list_head ptype_all __read_mostly;	/* Taps */
  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  * semaphore.
  *
- * Pure readers hold dev_base_lock for reading.
+ * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
  *
  * Writers must hold the rtnl semaphore while they loop through the
  * dev_base_head list, and hold dev_base_lock for writing when they do the
@@ -212,7 +212,7 @@ static int list_netdevice(struct net_device *dev)
 	ASSERT_RTNL();
 
 	write_lock_bh(&dev_base_lock);
-	list_add_tail(&dev->dev_list, &net->dev_base_head);
+	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 	hlist_add_head_rcu(&dev->index_hlist,
 			   dev_index_hash(net, dev->ifindex));
@@ -229,7 +229,7 @@ static void unlist_netdevice(struct net_device *dev)
 
 	/* Unlink dev from the device chain */
 	write_lock_bh(&dev_base_lock);
-	list_del(&dev->dev_list);
+	list_del_rcu(&dev->dev_list);
 	hlist_del_rcu(&dev->name_hlist);
 	hlist_del_rcu(&dev->index_hlist);
 	write_unlock_bh(&dev_base_lock);
@@ -799,15 +799,15 @@ struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 	struct net_device *dev, *ret;
 
 	ret = NULL;
-	read_lock(&dev_base_lock);
-	for_each_netdev(net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
 		if (((dev->flags ^ if_flags) & mask) == 0) {
 			dev_hold(dev);
 			ret = dev;
 			break;
 		}
 	}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return ret;
 }
 EXPORT_SYMBOL(dev_get_by_flags);
@@ -3077,18 +3077,18 @@ static int dev_ifconf(struct net *net, char __user *arg)
  *	in detail.
  */
 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(dev_base_lock)
+	__acquires(RCU)
 {
 	struct net *net = seq_file_net(seq);
 	loff_t off;
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
+	rcu_read_lock();
 	if (!*pos)
 		return SEQ_START_TOKEN;
 
 	off = 1;
-	for_each_netdev(net, dev)
+	for_each_netdev_rcu(net, dev)
 		if (off++ == *pos)
 			return dev;
 
@@ -3097,16 +3097,18 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos)
 
 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct net *net = seq_file_net(seq);
+	struct net_device *dev = (v == SEQ_START_TOKEN) ?
+				  first_net_device(seq_file_net(seq)) :
+				  next_net_device((struct net_device *)v);
+
 	++*pos;
-	return v == SEQ_START_TOKEN ?
-		first_net_device(net) : next_net_device((struct net_device *)v);
+	return rcu_dereference(dev);
 }
 
 void dev_seq_stop(struct seq_file *seq, void *v)
-	__releases(dev_base_lock)
+	__releases(RCU)
 {
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
-- 
cgit v1.2.2


From bf8e56bfc4fcfcef9f08e6233dc619706807893a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 5 Nov 2009 21:03:39 -0800
Subject: net: sock_bindtodevice() RCU-ification

Avoid dev_hold()/dev_put() in sock_bindtodevice()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 5a51512f638a..38820eaecd43 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -420,14 +420,16 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 	if (devname[0] == '\0') {
 		index = 0;
 	} else {
-		struct net_device *dev = dev_get_by_name(net, devname);
+		struct net_device *dev;
 
+		rcu_read_lock();
+		dev = dev_get_by_name_rcu(net, devname);
+		if (dev)
+			index = dev->ifindex;
+		rcu_read_unlock();
 		ret = -ENODEV;
 		if (!dev)
 			goto out;
-
-		index = dev->ifindex;
-		dev_put(dev);
 	}
 
 	lock_sock(sk);
-- 
cgit v1.2.2


From baac8564547ac7f944af1c2e8cc6fdd57f2836a4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 5 Nov 2009 21:04:32 -0800
Subject: pktgen: tx_bytes might be slightly wrong

cur_pkt_size can be changed in proc fs while pktgen is running,
we better use a private field to get precise tx-bytes counter.

Signed-off-by: Ben Greear <greearb@candelatech.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 5ce017bf4afa..d38470a32792 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -340,6 +340,7 @@ struct pktgen_dev {
 	__u16 cur_udp_src;
 	__u16 cur_queue_map;
 	__u32 cur_pkt_size;
+	__u32 last_pkt_size;
 
 	__u8 hh[14];
 	/* = {
@@ -3434,7 +3435,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 			pkt_dev->clone_count--;	/* back out increment, OOM */
 			return;
 		}
-
+		pkt_dev->last_pkt_size = pkt_dev->skb->len;
 		pkt_dev->allocated_skbs++;
 		pkt_dev->clone_count = 0;	/* reset counter */
 	}
@@ -3461,7 +3462,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 		pkt_dev->last_ok = 1;
 		pkt_dev->sofar++;
 		pkt_dev->seq_num++;
-		pkt_dev->tx_bytes += pkt_dev->cur_pkt_size;
+		pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
 		break;
 	default: /* Drivers are not supposed to return other values! */
 		if (net_ratelimit())
-- 
cgit v1.2.2


From 000ba2e43f33901859fd794bb33c885909d53b3b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 5 Nov 2009 22:37:11 -0800
Subject: net: Fix build warning in sock_bindtodevice().

net/core/sock.c: In function 'sock_setsockopt':
net/core/sock.c:396: warning: 'index' may be used uninitialized in this function
net/core/sock.c:396: note: 'index' was declared here

GCC can't see that all paths initialize index, so just
set it to the default (0) and eliminate the specific
code block that handles the null device name string.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 38820eaecd43..76ff58d43e26 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -417,9 +417,8 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 	if (copy_from_user(devname, optval, optlen))
 		goto out;
 
-	if (devname[0] == '\0') {
-		index = 0;
-	} else {
+	index = 0;
+	if (devname[0] != '\0') {
 		struct net_device *dev;
 
 		rcu_read_lock();
-- 
cgit v1.2.2


From d3bcfefaca27c1bfc8f740f5fff5b25d52ed1211 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 6 Nov 2009 22:17:25 -0800
Subject: net: Replace old style lock initializer

SPIN_LOCK_UNLOCKED is deprecated. Use DEFINE_SPINLOCK instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/drop_monitor.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 0a113f26bc9f..b8e9d3a86887 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -41,7 +41,7 @@ static void send_dm_alert(struct work_struct *unused);
  * netlink alerts
  */
 static int trace_state = TRACE_OFF;
-static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(trace_state_lock);
 
 struct per_cpu_dm_data {
 	struct work_struct dm_alert_work;
-- 
cgit v1.2.2


From e0d087af725b09358336098a6b57bb7f90f96175 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 7 Nov 2009 01:26:17 -0800
Subject: rtnetlink: Cleanups

Pure cleanups patch

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 54 ++++++++++++++++++++++------------------------------
 1 file changed, 23 insertions(+), 31 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 391a62cd9df6..e2f3317f290f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -38,7 +38,6 @@
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
-#include <asm/string.h>
 
 #include <linux/inet.h>
 #include <linux/netdevice.h>
@@ -53,8 +52,7 @@
 #include <net/rtnetlink.h>
 #include <net/net_namespace.h>
 
-struct rtnl_link
-{
+struct rtnl_link {
 	rtnl_doit_func		doit;
 	rtnl_dumpit_func	dumpit;
 };
@@ -65,6 +63,7 @@ void rtnl_lock(void)
 {
 	mutex_lock(&rtnl_mutex);
 }
+EXPORT_SYMBOL(rtnl_lock);
 
 void __rtnl_unlock(void)
 {
@@ -76,16 +75,19 @@ void rtnl_unlock(void)
 	/* This fellow will unlock it for us. */
 	netdev_run_todo();
 }
+EXPORT_SYMBOL(rtnl_unlock);
 
 int rtnl_trylock(void)
 {
 	return mutex_trylock(&rtnl_mutex);
 }
+EXPORT_SYMBOL(rtnl_trylock);
 
 int rtnl_is_locked(void)
 {
 	return mutex_is_locked(&rtnl_mutex);
 }
+EXPORT_SYMBOL(rtnl_is_locked);
 
 static struct rtnl_link *rtnl_msg_handlers[NPROTO];
 
@@ -168,7 +170,6 @@ int __rtnl_register(int protocol, int msgtype,
 
 	return 0;
 }
-
 EXPORT_SYMBOL_GPL(__rtnl_register);
 
 /**
@@ -188,7 +189,6 @@ void rtnl_register(int protocol, int msgtype,
 		      "protocol = %d, message type = %d\n",
 		      protocol, msgtype);
 }
-
 EXPORT_SYMBOL_GPL(rtnl_register);
 
 /**
@@ -213,7 +213,6 @@ int rtnl_unregister(int protocol, int msgtype)
 
 	return 0;
 }
-
 EXPORT_SYMBOL_GPL(rtnl_unregister);
 
 /**
@@ -230,7 +229,6 @@ void rtnl_unregister_all(int protocol)
 	kfree(rtnl_msg_handlers[protocol]);
 	rtnl_msg_handlers[protocol] = NULL;
 }
-
 EXPORT_SYMBOL_GPL(rtnl_unregister_all);
 
 static LIST_HEAD(link_ops);
@@ -253,7 +251,6 @@ int __rtnl_link_register(struct rtnl_link_ops *ops)
 	list_add_tail(&ops->list, &link_ops);
 	return 0;
 }
-
 EXPORT_SYMBOL_GPL(__rtnl_link_register);
 
 /**
@@ -271,7 +268,6 @@ int rtnl_link_register(struct rtnl_link_ops *ops)
 	rtnl_unlock();
 	return err;
 }
-
 EXPORT_SYMBOL_GPL(rtnl_link_register);
 
 static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
@@ -309,7 +305,6 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops)
 	}
 	list_del(&ops->list);
 }
-
 EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
 
 /**
@@ -322,7 +317,6 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops)
 	__rtnl_link_unregister(ops);
 	rtnl_unlock();
 }
-
 EXPORT_SYMBOL_GPL(rtnl_link_unregister);
 
 static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
@@ -427,12 +421,13 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data
 	struct rtattr *rta;
 	int size = RTA_LENGTH(attrlen);
 
-	rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size));
+	rta = (struct rtattr *)skb_put(skb, RTA_ALIGN(size));
 	rta->rta_type = attrtype;
 	rta->rta_len = size;
 	memcpy(RTA_DATA(rta), data, attrlen);
 	memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
 }
+EXPORT_SYMBOL(__rta_fill);
 
 int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo)
 {
@@ -454,6 +449,7 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
 
 	return nlmsg_unicast(rtnl, skb, pid);
 }
+EXPORT_SYMBOL(rtnl_unicast);
 
 void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
 		 struct nlmsghdr *nlh, gfp_t flags)
@@ -466,6 +462,7 @@ void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
 
 	nlmsg_notify(rtnl, skb, pid, group, report, flags);
 }
+EXPORT_SYMBOL(rtnl_notify);
 
 void rtnl_set_sk_err(struct net *net, u32 group, int error)
 {
@@ -473,6 +470,7 @@ void rtnl_set_sk_err(struct net *net, u32 group, int error)
 
 	netlink_set_err(rtnl, 0, group, error);
 }
+EXPORT_SYMBOL(rtnl_set_sk_err);
 
 int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
 {
@@ -501,6 +499,7 @@ nla_put_failure:
 	nla_nest_cancel(skb, mx);
 	return -EMSGSIZE;
 }
+EXPORT_SYMBOL(rtnetlink_put_metrics);
 
 int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
 		       u32 ts, u32 tsage, long expires, u32 error)
@@ -520,14 +519,13 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
 
 	return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
 }
-
 EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
 
 static void set_operstate(struct net_device *dev, unsigned char transition)
 {
 	unsigned char operstate = dev->operstate;
 
-	switch(transition) {
+	switch (transition) {
 	case IF_OPER_UP:
 		if ((operstate == IF_OPER_DORMANT ||
 		     operstate == IF_OPER_UNKNOWN) &&
@@ -728,6 +726,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
 	[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },
 };
+EXPORT_SYMBOL(ifla_policy);
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
 	[IFLA_INFO_KIND]	= { .type = NLA_STRING },
@@ -932,7 +931,8 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		goto errout;
 	}
 
-	if ((err = validate_linkmsg(dev, tb)) < 0)
+	err = validate_linkmsg(dev, tb);
+	if (err < 0)
 		goto errout;
 
 	err = do_setlink(dev, ifm, tb, ifname, 0);
@@ -985,7 +985,8 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname,
 	unsigned int real_num_queues = 1;
 
 	if (ops->get_tx_queues) {
-		err = ops->get_tx_queues(net, tb, &num_queues, &real_num_queues);
+		err = ops->get_tx_queues(net, tb, &num_queues,
+					 &real_num_queues);
 		if (err)
 			goto err;
 	}
@@ -1026,6 +1027,7 @@ err_free:
 err:
 	return ERR_PTR(err);
 }
+EXPORT_SYMBOL(rtnl_create_link);
 
 static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 {
@@ -1059,7 +1061,8 @@ replay:
 	else
 		dev = NULL;
 
-	if ((err = validate_linkmsg(dev, tb)) < 0)
+	err = validate_linkmsg(dev, tb);
+	if (err < 0)
 		return err;
 
 	if (tb[IFLA_LINKINFO]) {
@@ -1210,7 +1213,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 
 	if (s_idx == 0)
 		s_idx = 1;
-	for (idx=1; idx<NPROTO; idx++) {
+	for (idx = 1; idx < NPROTO; idx++) {
 		int type = cb->nlh->nlmsg_type-RTM_BASE;
 		if (idx < s_idx || idx == PF_PACKET)
 			continue;
@@ -1277,7 +1280,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg)))
 		return 0;
 
-	family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family;
+	family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family;
 	if (family >= NPROTO)
 		return -EAFNOSUPPORT;
 
@@ -1310,7 +1313,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	if (nlh->nlmsg_len > min_len) {
 		int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
-		struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len);
+		struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len);
 
 		while (RTA_OK(attr, attrlen)) {
 			unsigned flavor = attr->rta_type;
@@ -1416,14 +1419,3 @@ void __init rtnetlink_init(void)
 	rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all);
 }
 
-EXPORT_SYMBOL(__rta_fill);
-EXPORT_SYMBOL(rtnetlink_put_metrics);
-EXPORT_SYMBOL(rtnl_lock);
-EXPORT_SYMBOL(rtnl_trylock);
-EXPORT_SYMBOL(rtnl_unlock);
-EXPORT_SYMBOL(rtnl_is_locked);
-EXPORT_SYMBOL(rtnl_unicast);
-EXPORT_SYMBOL(rtnl_notify);
-EXPORT_SYMBOL(rtnl_set_sk_err);
-EXPORT_SYMBOL(rtnl_create_link);
-EXPORT_SYMBOL(ifla_policy);
-- 
cgit v1.2.2


From 81adee47dfb608df3ad0b91d230fb3cef75f0060 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@aristanetworks.com>
Date: Sun, 8 Nov 2009 00:53:51 -0800
Subject: net: Support specifying the network namespace upon device creation.

There is no good reason to not support userspace specifying the
network namespace during device creation, and it makes it easier
to create a network device and pass it to a child network namespace
with a well known name.

We have to be careful to ensure that the target network namespace
for the new device exists through the life of the call.  To keep
that logic clear I have factored out the network namespace grabbing
logic into rtnl_link_get_net.

In addtion we need to continue to pass the source network namespace
to the rtnl_link_ops.newlink method so that we can find the base
device source network namespace.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/core/rtnetlink.c | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e2f3317f290f..33148a568199 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -733,6 +733,20 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
 	[IFLA_INFO_DATA]	= { .type = NLA_NESTED },
 };
 
+struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+{
+	struct net *net;
+	/* Examine the link attributes and figure out which
+	 * network namespace we are talking about.
+	 */
+	if (tb[IFLA_NET_NS_PID])
+		net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
+	else
+		net = get_net(src_net);
+	return net;
+}
+EXPORT_SYMBOL(rtnl_link_get_net);
+
 static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
 {
 	if (dev) {
@@ -756,8 +770,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 	int err;
 
 	if (tb[IFLA_NET_NS_PID]) {
-		struct net *net;
-		net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
+		struct net *net = rtnl_link_get_net(dev_net(dev), tb);
 		if (IS_ERR(net)) {
 			err = PTR_ERR(net);
 			goto errout;
@@ -976,8 +989,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	return 0;
 }
 
-struct net_device *rtnl_create_link(struct net *net, char *ifname,
-		const struct rtnl_link_ops *ops, struct nlattr *tb[])
+struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
+	char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])
 {
 	int err;
 	struct net_device *dev;
@@ -985,7 +998,7 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname,
 	unsigned int real_num_queues = 1;
 
 	if (ops->get_tx_queues) {
-		err = ops->get_tx_queues(net, tb, &num_queues,
+		err = ops->get_tx_queues(src_net, tb, &num_queues,
 					 &real_num_queues);
 		if (err)
 			goto err;
@@ -995,16 +1008,16 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname,
 	if (!dev)
 		goto err;
 
+	dev_net_set(dev, net);
+	dev->rtnl_link_ops = ops;
 	dev->real_num_tx_queues = real_num_queues;
+
 	if (strchr(dev->name, '%')) {
 		err = dev_alloc_name(dev, dev->name);
 		if (err < 0)
 			goto err_free;
 	}
 
-	dev_net_set(dev, net);
-	dev->rtnl_link_ops = ops;
-
 	if (tb[IFLA_MTU])
 		dev->mtu = nla_get_u32(tb[IFLA_MTU]);
 	if (tb[IFLA_ADDRESS])
@@ -1083,6 +1096,7 @@ replay:
 
 	if (1) {
 		struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL;
+		struct net *dest_net;
 
 		if (ops) {
 			if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
@@ -1147,17 +1161,19 @@ replay:
 		if (!ifname[0])
 			snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
 
-		dev = rtnl_create_link(net, ifname, ops, tb);
+		dest_net = rtnl_link_get_net(net, tb);
+		dev = rtnl_create_link(net, dest_net, ifname, ops, tb);
 
 		if (IS_ERR(dev))
 			err = PTR_ERR(dev);
 		else if (ops->newlink)
-			err = ops->newlink(dev, tb, data);
+			err = ops->newlink(net, dev, tb, data);
 		else
 			err = register_netdevice(dev);
-
 		if (err < 0 && !IS_ERR(dev))
 			free_netdev(dev);
+
+		put_net(dest_net);
 		return err;
 	}
 }
-- 
cgit v1.2.2


From e84af6ddef0e447c56b256a2bb5a90af11bdac2e Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 10 Nov 2009 14:11:01 +0000
Subject: skbuff: Do not allow skb recycling with disabled IRQs

NAPI drivers try to recycle SKBs in their polling routine, but we
generally don't know the context in which the polling will be called,
and the skb recycling itself may require IRQs to be enabled.

This patch adds irqs_disabled() test to the skb_recycle_check()
routine, so that we'll not let the drivers hit the skb recycling
path with IRQs disabled.

As a side effect, this patch actually disables skb recycling for some
[broken] drivers. E.g. gianfar driver grabs an irqsave spinlock during
TX ring processing, and then tries to recycle an skb, and that caused
the following badness:

nf_conntrack version 0.5.0 (1008 buckets, 4032 max)
------------[ cut here ]------------
Badness at kernel/softirq.c:143
NIP: c003e3c4 LR: c423a528 CTR: c003e344
...
NIP [c003e3c4] local_bh_enable+0x80/0xc4
LR [c423a528] destroy_conntrack+0xd4/0x13c [nf_conntrack]
Call Trace:
[c15d1b60] [c003e32c] local_bh_disable+0x1c/0x34 (unreliable)
[c15d1b70] [c423a528] destroy_conntrack+0xd4/0x13c [nf_conntrack]
[c15d1b80] [c02c6370] nf_conntrack_destroy+0x3c/0x70

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 80a96166df39..941bac907484 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -493,6 +493,9 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
 {
 	struct skb_shared_info *shinfo;
 
+	if (irqs_disabled())
+		return 0;
+
 	if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE)
 		return 0;
 
-- 
cgit v1.2.2


From 08e9897d512fe7a67e46209543b3815b57a36dc7 Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Tue, 10 Nov 2009 07:20:34 +0000
Subject: netdev: fold name hash properly (v3)

The full_name_hash function does not produce well distributed values in
the lower bits, so most code uses hash_32() to fold it.  This is really
a bug introduced when name hashing was added, back in 2.5 when I added
name hashing.

hash_32 is all that is needed since full_name_hash returns unsigned int
which is only 32 bits on 64 bit platforms.

Also, there is no point in using hash_32 on ifindex, because the is naturally
sequential and usually well distributed.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index bf629ac08b87..ad8e320ceba7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -79,6 +79,7 @@
 #include <linux/cpu.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/hash.h>
 #include <linux/sched.h>
 #include <linux/mutex.h>
 #include <linux/string.h>
@@ -196,7 +197,7 @@ EXPORT_SYMBOL(dev_base_lock);
 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 {
 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
-	return &net->dev_name_head[hash & (NETDEV_HASHENTRIES - 1)];
+	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 }
 
 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
-- 
cgit v1.2.2


From f8572d8f2a2ba75408b97dc24ef47c83671795d7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 5 Nov 2009 13:32:03 -0800
Subject: sysctl net: Remove unused binary sysctl code

Now that sys_sysctl is a compatiblity wrapper around /proc/sys
all sysctl strategy routines, and all ctl_name and strategy
entries in the sysctl tables are unused, and can be
revmoed.

In addition neigh_sysctl_register has been modified to no longer
take a strategy argument and it's callers have been modified not
to pass one.

Cc: "David Miller" <davem@davemloft.net>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: netdev@vger.kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 net/core/neighbour.c       | 47 ++++++----------------------------------------
 net/core/sysctl_net_core.c | 21 ++++-----------------
 2 files changed, 10 insertions(+), 58 deletions(-)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index e587e6819698..2b54e6c6a7c8 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2566,21 +2566,18 @@ static struct neigh_sysctl_table {
 } neigh_sysctl_template __read_mostly = {
 	.neigh_vars = {
 		{
-			.ctl_name	= NET_NEIGH_MCAST_SOLICIT,
 			.procname	= "mcast_solicit",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
 		{
-			.ctl_name	= NET_NEIGH_UCAST_SOLICIT,
 			.procname	= "ucast_solicit",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
 		{
-			.ctl_name	= NET_NEIGH_APP_SOLICIT,
 			.procname	= "app_solicit",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
@@ -2593,38 +2590,30 @@ static struct neigh_sysctl_table {
 			.proc_handler	= proc_dointvec_userhz_jiffies,
 		},
 		{
-			.ctl_name	= NET_NEIGH_REACHABLE_TIME,
 			.procname	= "base_reachable_time",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_jiffies,
-			.strategy	= sysctl_jiffies,
 		},
 		{
-			.ctl_name	= NET_NEIGH_DELAY_PROBE_TIME,
 			.procname	= "delay_first_probe_time",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_jiffies,
-			.strategy	= sysctl_jiffies,
 		},
 		{
-			.ctl_name	= NET_NEIGH_GC_STALE_TIME,
 			.procname	= "gc_stale_time",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_jiffies,
-			.strategy	= sysctl_jiffies,
 		},
 		{
-			.ctl_name	= NET_NEIGH_UNRES_QLEN,
 			.procname	= "unres_qlen",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
 		{
-			.ctl_name	= NET_NEIGH_PROXY_QLEN,
 			.procname	= "proxy_qlen",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
@@ -2649,45 +2638,36 @@ static struct neigh_sysctl_table {
 			.proc_handler	= proc_dointvec_userhz_jiffies,
 		},
 		{
-			.ctl_name	= NET_NEIGH_RETRANS_TIME_MS,
 			.procname	= "retrans_time_ms",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_ms_jiffies,
-			.strategy	= sysctl_ms_jiffies,
 		},
 		{
-			.ctl_name	= NET_NEIGH_REACHABLE_TIME_MS,
 			.procname	= "base_reachable_time_ms",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_ms_jiffies,
-			.strategy	= sysctl_ms_jiffies,
 		},
 		{
-			.ctl_name	= NET_NEIGH_GC_INTERVAL,
 			.procname	= "gc_interval",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_jiffies,
-			.strategy	= sysctl_jiffies,
 		},
 		{
-			.ctl_name	= NET_NEIGH_GC_THRESH1,
 			.procname	= "gc_thresh1",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
 		{
-			.ctl_name	= NET_NEIGH_GC_THRESH2,
 			.procname	= "gc_thresh2",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
 		{
-			.ctl_name	= NET_NEIGH_GC_THRESH3,
 			.procname	= "gc_thresh3",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
@@ -2699,7 +2679,7 @@ static struct neigh_sysctl_table {
 
 int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 			  int p_id, int pdev_id, char *p_name,
-			  proc_handler *handler, ctl_handler *strategy)
+			  proc_handler *handler)
 {
 	struct neigh_sysctl_table *t;
 	const char *dev_name_source = NULL;
@@ -2710,10 +2690,10 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 #define NEIGH_CTL_PATH_DEV	3
 
 	struct ctl_path neigh_path[] = {
-		{ .procname = "net",	 .ctl_name = CTL_NET, },
-		{ .procname = "proto",	 .ctl_name = 0, },
-		{ .procname = "neigh",	 .ctl_name = 0, },
-		{ .procname = "default", .ctl_name = NET_PROTO_CONF_DEFAULT, },
+		{ .procname = "net",	 },
+		{ .procname = "proto",	 },
+		{ .procname = "neigh",	 },
+		{ .procname = "default", },
 		{ },
 	};
 
@@ -2738,7 +2718,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 
 	if (dev) {
 		dev_name_source = dev->name;
-		neigh_path[NEIGH_CTL_PATH_DEV].ctl_name = dev->ifindex;
 		/* Terminate the table early */
 		memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14]));
 	} else {
@@ -2750,31 +2729,19 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 	}
 
 
-	if (handler || strategy) {
+	if (handler) {
 		/* RetransTime */
 		t->neigh_vars[3].proc_handler = handler;
-		t->neigh_vars[3].strategy = strategy;
 		t->neigh_vars[3].extra1 = dev;
-		if (!strategy)
-			t->neigh_vars[3].ctl_name = CTL_UNNUMBERED;
 		/* ReachableTime */
 		t->neigh_vars[4].proc_handler = handler;
-		t->neigh_vars[4].strategy = strategy;
 		t->neigh_vars[4].extra1 = dev;
-		if (!strategy)
-			t->neigh_vars[4].ctl_name = CTL_UNNUMBERED;
 		/* RetransTime (in milliseconds)*/
 		t->neigh_vars[12].proc_handler = handler;
-		t->neigh_vars[12].strategy = strategy;
 		t->neigh_vars[12].extra1 = dev;
-		if (!strategy)
-			t->neigh_vars[12].ctl_name = CTL_UNNUMBERED;
 		/* ReachableTime (in milliseconds) */
 		t->neigh_vars[13].proc_handler = handler;
-		t->neigh_vars[13].strategy = strategy;
 		t->neigh_vars[13].extra1 = dev;
-		if (!strategy)
-			t->neigh_vars[13].ctl_name = CTL_UNNUMBERED;
 	}
 
 	t->dev_name = kstrdup(dev_name_source, GFP_KERNEL);
@@ -2782,9 +2749,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 		goto free;
 
 	neigh_path[NEIGH_CTL_PATH_DEV].procname = t->dev_name;
-	neigh_path[NEIGH_CTL_PATH_NEIGH].ctl_name = pdev_id;
 	neigh_path[NEIGH_CTL_PATH_PROTO].procname = p_name;
-	neigh_path[NEIGH_CTL_PATH_PROTO].ctl_name = p_id;
 
 	t->sysctl_header =
 		register_net_sysctl_table(neigh_parms_net(p), neigh_path, t->neigh_vars);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 7db1de0497c6..1ce4e6e85125 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -17,7 +17,6 @@
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
-		.ctl_name	= NET_CORE_WMEM_MAX,
 		.procname	= "wmem_max",
 		.data		= &sysctl_wmem_max,
 		.maxlen		= sizeof(int),
@@ -25,7 +24,6 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_RMEM_MAX,
 		.procname	= "rmem_max",
 		.data		= &sysctl_rmem_max,
 		.maxlen		= sizeof(int),
@@ -33,7 +31,6 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_WMEM_DEFAULT,
 		.procname	= "wmem_default",
 		.data		= &sysctl_wmem_default,
 		.maxlen		= sizeof(int),
@@ -41,7 +38,6 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_RMEM_DEFAULT,
 		.procname	= "rmem_default",
 		.data		= &sysctl_rmem_default,
 		.maxlen		= sizeof(int),
@@ -49,7 +45,6 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_DEV_WEIGHT,
 		.procname	= "dev_weight",
 		.data		= &weight_p,
 		.maxlen		= sizeof(int),
@@ -57,7 +52,6 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_MAX_BACKLOG,
 		.procname	= "netdev_max_backlog",
 		.data		= &netdev_max_backlog,
 		.maxlen		= sizeof(int),
@@ -65,16 +59,13 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_MSG_COST,
 		.procname	= "message_cost",
 		.data		= &net_ratelimit_state.interval,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
-		.strategy	= sysctl_jiffies,
 	},
 	{
-		.ctl_name	= NET_CORE_MSG_BURST,
 		.procname	= "message_burst",
 		.data		= &net_ratelimit_state.burst,
 		.maxlen		= sizeof(int),
@@ -82,7 +73,6 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= NET_CORE_OPTMEM_MAX,
 		.procname	= "optmem_max",
 		.data		= &sysctl_optmem_max,
 		.maxlen		= sizeof(int),
@@ -91,7 +81,6 @@ static struct ctl_table net_core_table[] = {
 	},
 #endif /* CONFIG_NET */
 	{
-		.ctl_name	= NET_CORE_BUDGET,
 		.procname	= "netdev_budget",
 		.data		= &netdev_budget,
 		.maxlen		= sizeof(int),
@@ -99,31 +88,29 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_WARNINGS,
 		.procname	= "warnings",
 		.data		= &net_msg_warn,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static struct ctl_table netns_core_table[] = {
 	{
-		.ctl_name	= NET_CORE_SOMAXCONN,
 		.procname	= "somaxconn",
 		.data		= &init_net.core.sysctl_somaxconn,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 __net_initdata struct ctl_path net_core_path[] = {
-	{ .procname = "net", .ctl_name = CTL_NET, },
-	{ .procname = "core", .ctl_name = NET_CORE, },
+	{ .procname = "net", },
+	{ .procname = "core", },
 	{ },
 };
 
-- 
cgit v1.2.2


From 572a9d7b6fc7f20f573664063324c086be310c42 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 10 Nov 2009 06:14:14 +0000
Subject: net: allow to propagate errors through ->ndo_hard_start_xmit()

Currently the ->ndo_hard_start_xmit() callbacks are only permitted to return
one of the NETDEV_TX codes. This prevents any kind of error propagation for
virtual devices, like queue congestion of the underlying device in case of
layered devices, or unreachability in case of tunnels.

This patches changes the NET_XMIT codes to avoid clashes with the NETDEV_TX
codes and changes the two callers of dev_hard_start_xmit() to expect either
errno codes, NET_XMIT codes or NETDEV_TX codes as return value.

In case of qdisc_restart(), all non NETDEV_TX codes are mapped to NETDEV_TX_OK
since no error propagation is possible when using qdiscs. In case of
dev_queue_xmit(), the error is propagated upwards.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index ad8e320ceba7..548340b57296 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1757,7 +1757,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			struct netdev_queue *txq)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
-	int rc;
+	int rc = NETDEV_TX_OK;
 
 	if (likely(!skb->next)) {
 		if (!list_empty(&ptype_all))
@@ -1805,6 +1805,8 @@ gso:
 		nskb->next = NULL;
 		rc = ops->ndo_start_xmit(nskb, dev);
 		if (unlikely(rc != NETDEV_TX_OK)) {
+			if (rc & ~NETDEV_TX_MASK)
+				goto out_kfree_gso_skb;
 			nskb->next = skb->next;
 			skb->next = nskb;
 			return rc;
@@ -1814,11 +1816,12 @@ gso:
 			return NETDEV_TX_BUSY;
 	} while (skb->next);
 
-	skb->destructor = DEV_GSO_CB(skb)->destructor;
-
+out_kfree_gso_skb:
+	if (likely(skb->next == NULL))
+		skb->destructor = DEV_GSO_CB(skb)->destructor;
 out_kfree_skb:
 	kfree_skb(skb);
-	return NETDEV_TX_OK;
+	return rc;
 }
 
 static u32 skb_tx_hashrnd;
@@ -1906,6 +1909,23 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	return rc;
 }
 
+static inline bool dev_xmit_complete(int rc)
+{
+	/* successful transmission */
+	if (rc == NETDEV_TX_OK)
+		return true;
+
+	/* error while transmitting, driver consumed skb */
+	if (rc < 0)
+		return true;
+
+	/* error while queueing to a different device, driver consumed skb */
+	if (rc & NET_XMIT_MASK)
+		return true;
+
+	return false;
+}
+
 /**
  *	dev_queue_xmit - transmit a buffer
  *	@skb: buffer to transmit
@@ -2003,8 +2023,8 @@ gso:
 			HARD_TX_LOCK(dev, txq, cpu);
 
 			if (!netif_tx_queue_stopped(txq)) {
-				rc = NET_XMIT_SUCCESS;
-				if (!dev_hard_start_xmit(skb, dev, txq)) {
+				rc = dev_hard_start_xmit(skb, dev, txq);
+				if (dev_xmit_complete(rc)) {
 					HARD_TX_UNLOCK(dev, txq);
 					goto out;
 				}
-- 
cgit v1.2.2


From ed04642f753b1240fc65c2978b7365e93209972a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 13 Nov 2009 21:54:04 +0000
Subject: net: check the return value of ndo_select_queue()

Check the return value of ndo_select_queue(). If the value isn't smaller
than the real_num_tx_queues, print a warning message, and reset it to zero.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
----
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 548340b57296..32045df1da5c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1848,6 +1848,20 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
 }
 EXPORT_SYMBOL(skb_tx_hash);
 
+static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
+{
+	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
+		if (net_ratelimit()) {
+			WARN(1, "%s selects TX queue %d, but "
+			     "real number of TX queues is %d\n",
+			     dev->name, queue_index,
+			     dev->real_num_tx_queues);
+		}
+		return 0;
+	}
+	return queue_index;
+}
+
 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 					struct sk_buff *skb)
 {
@@ -1861,6 +1875,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 
 		if (ops->ndo_select_queue) {
 			queue_index = ops->ndo_select_queue(dev, skb);
+			queue_index = dev_cap_txqueue(dev, queue_index);
 		} else {
 			queue_index = 0;
 			if (dev->real_num_tx_queues > 1)
-- 
cgit v1.2.2


From 9a1654ba0b50402a6bd03c7b0fe9b0200a5ea7b1 Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Sun, 15 Nov 2009 07:20:12 +0000
Subject: net: Optimize hard_start_xmit() return checking

Recent changes in the TX error propagation require additional checking
and masking of values returned from hard_start_xmit(), mainly to
separate cases where skb was consumed. This aim can be simplified by
changing the order of NETDEV_TX and NET_XMIT codes, because the latter
are treated similarly to negative (ERRNO) values.

After this change much simpler dev_xmit_complete() is also used in
sch_direct_xmit(), so it is moved to netdevice.h.

Additionally NET_RX definitions in netdevice.h are moved up from
between TX codes to avoid confusion while reading the TX comment.

Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 32045df1da5c..4b24d79414e3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1924,23 +1924,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	return rc;
 }
 
-static inline bool dev_xmit_complete(int rc)
-{
-	/* successful transmission */
-	if (rc == NETDEV_TX_OK)
-		return true;
-
-	/* error while transmitting, driver consumed skb */
-	if (rc < 0)
-		return true;
-
-	/* error while queueing to a different device, driver consumed skb */
-	if (rc & NET_XMIT_MASK)
-		return true;
-
-	return false;
-}
-
 /**
  *	dev_queue_xmit - transmit a buffer
  *	@skb: buffer to transmit
-- 
cgit v1.2.2


From d83345adf96bc13a5e360f4649a2e68ef968dec0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 16 Nov 2009 03:36:51 +0000
Subject: net: add dev_txq_stats_fold() helper

Some drivers ndo_get_stats() method need to perform txqueue stats folding.

Move folding from dev_get_stats() to a new dev_txq_stats_fold() function

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 48 +++++++++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 19 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index d867522290b9..c3e0578d29d1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5168,6 +5168,32 @@ void netdev_run_todo(void)
 	}
 }
 
+/**
+ *	dev_txq_stats_fold - fold tx_queues stats
+ *	@dev: device to get statistics from
+ *	@stats: struct net_device_stats to hold results
+ */
+void dev_txq_stats_fold(const struct net_device *dev,
+			struct net_device_stats *stats)
+{
+	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
+	unsigned int i;
+	struct netdev_queue *txq;
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		txq = netdev_get_tx_queue(dev, i);
+		tx_bytes   += txq->tx_bytes;
+		tx_packets += txq->tx_packets;
+		tx_dropped += txq->tx_dropped;
+	}
+	if (tx_bytes || tx_packets || tx_dropped) {
+		stats->tx_bytes   = tx_bytes;
+		stats->tx_packets = tx_packets;
+		stats->tx_dropped = tx_dropped;
+	}
+}
+EXPORT_SYMBOL(dev_txq_stats_fold);
+
 /**
  *	dev_get_stats	- get network device statistics
  *	@dev: device to get statistics from
@@ -5182,25 +5208,9 @@ const struct net_device_stats *dev_get_stats(struct net_device *dev)
 
 	if (ops->ndo_get_stats)
 		return ops->ndo_get_stats(dev);
-	else {
-		unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
-		struct net_device_stats *stats = &dev->stats;
-		unsigned int i;
-		struct netdev_queue *txq;
-
-		for (i = 0; i < dev->num_tx_queues; i++) {
-			txq = netdev_get_tx_queue(dev, i);
-			tx_bytes   += txq->tx_bytes;
-			tx_packets += txq->tx_packets;
-			tx_dropped += txq->tx_dropped;
-		}
-		if (tx_bytes || tx_packets || tx_dropped) {
-			stats->tx_bytes   = tx_bytes;
-			stats->tx_packets = tx_packets;
-			stats->tx_dropped = tx_dropped;
-		}
-		return stats;
-	}
+
+	dev_txq_stats_fold(dev, &dev->stats);
+	return &dev->stats;
 }
 EXPORT_SYMBOL(dev_get_stats);
 
-- 
cgit v1.2.2


From 395264d509aec45149745843d9a737140a1ece16 Mon Sep 17 00:00:00 2001
From: Octavian Purdila <opurdila@ixiacom.com>
Date: Mon, 16 Nov 2009 13:49:35 +0000
Subject: net: introduce NETDEV_UNREGISTER_PERNET

This new event is called once for each unique net namespace in batched
unregister operations (with the argument set to a random device from
that namespace) and once per device in non-batched unregister
operations.

It allows us to factorize some device unregister work such as clearing the
routing cache.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index c3e0578d29d1..e25fe5d9343b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1344,6 +1344,7 @@ rollback:
 				nb->notifier_call(nb, NETDEV_DOWN, dev);
 			}
 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
+			nb->notifier_call(nb, NETDEV_UNREGISTER_PERNET, dev);
 		}
 	}
 
@@ -4721,7 +4722,8 @@ static void net_set_todo(struct net_device *dev)
 
 static void rollback_registered_many(struct list_head *head)
 {
-	struct net_device *dev;
+	struct net_device *dev, *aux, *fdev;
+	LIST_HEAD(pernet_list);
 
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
@@ -4779,8 +4781,24 @@ static void rollback_registered_many(struct list_head *head)
 
 	synchronize_net();
 
-	list_for_each_entry(dev, head, unreg_list)
+	list_for_each_entry_safe(dev, aux, head, unreg_list) {
+		int new_net = 1;
+		list_for_each_entry(fdev, &pernet_list, unreg_list) {
+			if (dev_net(dev) == dev_net(fdev)) {
+				new_net = 0;
+				dev_put(dev);
+				break;
+			}
+		}
+		if (new_net)
+			list_move(&dev->unreg_list, &pernet_list);
+	}
+
+	list_for_each_entry_safe(dev, aux, &pernet_list, unreg_list) {
+		call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev);
+		list_move(&dev->unreg_list, head);
 		dev_put(dev);
+	}
 }
 
 static void rollback_registered(struct net_device *dev)
@@ -5074,6 +5092,8 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 			/* Rebroadcast unregister notification */
 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+			/* don't resend NETDEV_UNREGISTER_PERNET, _PERNET users
+			 * should have already handle it the first time */
 
 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
 				     &dev->state)) {
@@ -5385,6 +5405,10 @@ EXPORT_SYMBOL(unregister_netdevice_queue);
  *	unregister_netdevice_many - unregister many devices
  *	@head: list of devices
  *
+ *	WARNING: Calling this modifies the given list
+ *	(in rollback_registered_many). It may change the order of the elements
+ *	in the list. However, you can assume it does not add or delete elements
+ *	to/from the list.
  */
 void unregister_netdevice_many(struct list_head *head)
 {
@@ -5504,6 +5528,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	   this device. They should clean all the things.
 	*/
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+	call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev);
 
 	/*
 	 *	Flush the unicast and multicast chains
-- 
cgit v1.2.2


From e014debecd3ee3832e6476b3a9c948edfcfd1250 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 17 Nov 2009 05:59:21 +0000
Subject: linkwatch: linkwatch_forget_dev() to speedup device dismantle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Herbert Xu a écrit :
> On Tue, Nov 17, 2009 at 04:26:04AM -0800, David Miller wrote:
>> Really, the link watch stuff is just due for a redesign.  I don't
>> think a simple hack is going to cut it this time, sorry Eric :-)
>
> I have no objections against any redesigns, but since the only
> caller of linkwatch_forget_dev runs in process context with the
> RTNL, it could also legally emit those events.

Thanks guys, here an updated version then, before linkwatch surgery ?

In this version, I force the event to be sent synchronously.

[PATCH net-next-2.6] linkwatch: linkwatch_forget_dev() to speedup device dismantle

time ip link del eth3.103 ; time ip link del eth3.104 ; time ip link del eth3.105

real	0m0.266s
user	0m0.000s
sys	0m0.001s

real	0m0.770s
user	0m0.000s
sys	0m0.000s

real	0m1.022s
user	0m0.000s
sys	0m0.000s

One problem of current schem in vlan dismantle phase is the
holding of device done by following chain :

vlan_dev_stop() ->
	netif_carrier_off(dev) ->
		linkwatch_fire_event(dev) ->
			dev_hold() ...

And __linkwatch_run_queue() runs up to one second later...

A generic fix to this problem is to add a linkwatch_forget_dev() method
to unlink the device from the list of watched devices.

dev->link_watch_next becomes dev->link_watch_list (and use a bit more memory),
to be able to unlink device in O(1).

After patch :
time ip link del eth3.103 ; time ip link del eth3.104 ; time ip link del eth3.105

real    0m0.024s
user    0m0.000s
sys     0m0.000s

real    0m0.032s
user    0m0.000s
sys     0m0.001s

real    0m0.033s
user    0m0.000s
sys     0m0.000s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c        |  3 ++
 net/core/link_watch.c | 94 +++++++++++++++++++++++++++++++--------------------
 2 files changed, 60 insertions(+), 37 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index e25fe5d9343b..c128af708ebf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5085,6 +5085,8 @@ static void netdev_wait_allrefs(struct net_device *dev)
 {
 	unsigned long rebroadcast_time, warning_time;
 
+	linkwatch_forget_dev(dev);
+
 	rebroadcast_time = warning_time = jiffies;
 	while (atomic_read(&dev->refcnt) != 0) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
@@ -5311,6 +5313,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);
+	INIT_LIST_HEAD(&dev->link_watch_list);
 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
 	setup(dev);
 	strcpy(dev->name, name);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index bf8f7af699d7..5910b555a54a 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -35,7 +35,7 @@ static unsigned long linkwatch_nextevent;
 static void linkwatch_event(struct work_struct *dummy);
 static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
 
-static struct net_device *lweventlist;
+static LIST_HEAD(lweventlist);
 static DEFINE_SPINLOCK(lweventlist_lock);
 
 static unsigned char default_operstate(const struct net_device *dev)
@@ -89,8 +89,10 @@ static void linkwatch_add_event(struct net_device *dev)
 	unsigned long flags;
 
 	spin_lock_irqsave(&lweventlist_lock, flags);
-	dev->link_watch_next = lweventlist;
-	lweventlist = dev;
+	if (list_empty(&dev->link_watch_list)) {
+		list_add_tail(&dev->link_watch_list, &lweventlist);
+		dev_hold(dev);
+	}
 	spin_unlock_irqrestore(&lweventlist_lock, flags);
 }
 
@@ -133,9 +135,35 @@ static void linkwatch_schedule_work(int urgent)
 }
 
 
+static void linkwatch_do_dev(struct net_device *dev)
+{
+	/*
+	 * Make sure the above read is complete since it can be
+	 * rewritten as soon as we clear the bit below.
+	 */
+	smp_mb__before_clear_bit();
+
+	/* We are about to handle this device,
+	 * so new events can be accepted
+	 */
+	clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
+
+	rfc2863_policy(dev);
+	if (dev->flags & IFF_UP) {
+		if (netif_carrier_ok(dev))
+			dev_activate(dev);
+		else
+			dev_deactivate(dev);
+
+		netdev_state_change(dev);
+	}
+	dev_put(dev);
+}
+
 static void __linkwatch_run_queue(int urgent_only)
 {
-	struct net_device *next;
+	struct net_device *dev;
+	LIST_HEAD(wrk);
 
 	/*
 	 * Limit the number of linkwatch events to one
@@ -153,46 +181,40 @@ static void __linkwatch_run_queue(int urgent_only)
 	clear_bit(LW_URGENT, &linkwatch_flags);
 
 	spin_lock_irq(&lweventlist_lock);
-	next = lweventlist;
-	lweventlist = NULL;
-	spin_unlock_irq(&lweventlist_lock);
+	list_splice_init(&lweventlist, &wrk);
 
-	while (next) {
-		struct net_device *dev = next;
+	while (!list_empty(&wrk)) {
 
-		next = dev->link_watch_next;
+		dev = list_first_entry(&wrk, struct net_device, link_watch_list);
+		list_del_init(&dev->link_watch_list);
 
 		if (urgent_only && !linkwatch_urgent_event(dev)) {
-			linkwatch_add_event(dev);
+			list_add_tail(&dev->link_watch_list, &lweventlist);
 			continue;
 		}
-
-		/*
-		 * Make sure the above read is complete since it can be
-		 * rewritten as soon as we clear the bit below.
-		 */
-		smp_mb__before_clear_bit();
-
-		/* We are about to handle this device,
-		 * so new events can be accepted
-		 */
-		clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
-
-		rfc2863_policy(dev);
-		if (dev->flags & IFF_UP) {
-			if (netif_carrier_ok(dev))
-				dev_activate(dev);
-			else
-				dev_deactivate(dev);
-
-			netdev_state_change(dev);
-		}
-
-		dev_put(dev);
+		spin_unlock_irq(&lweventlist_lock);
+		linkwatch_do_dev(dev);
+		spin_lock_irq(&lweventlist_lock);
 	}
 
-	if (lweventlist)
+	if (!list_empty(&lweventlist))
 		linkwatch_schedule_work(0);
+	spin_unlock_irq(&lweventlist_lock);
+}
+
+void linkwatch_forget_dev(struct net_device *dev)
+{
+	unsigned long flags;
+	int clean = 0;
+
+	spin_lock_irqsave(&lweventlist_lock, flags);
+	if (!list_empty(&dev->link_watch_list)) {
+		list_del_init(&dev->link_watch_list);
+		clean = 1;
+	}
+	spin_unlock_irqrestore(&lweventlist_lock, flags);
+	if (clean)
+		linkwatch_do_dev(dev);
 }
 
 
@@ -216,8 +238,6 @@ void linkwatch_fire_event(struct net_device *dev)
 	bool urgent = linkwatch_urgent_event(dev);
 
 	if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
-		dev_hold(dev);
-
 		linkwatch_add_event(dev);
 	} else if (!urgent)
 		return;
-- 
cgit v1.2.2


From d90310243fd750240755e217c5faa13e24f41536 Mon Sep 17 00:00:00 2001
From: Octavian Purdila <opurdila@ixiacom.com>
Date: Wed, 18 Nov 2009 02:36:59 +0000
Subject: net: device name allocation cleanups

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 69 ++++++++++++++++++++--------------------------------------
 1 file changed, 24 insertions(+), 45 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index c128af708ebf..9977288583b8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -893,7 +893,8 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 		free_page((unsigned long) inuse);
 	}
 
-	snprintf(buf, IFNAMSIZ, name, i);
+	if (buf != name)
+		snprintf(buf, IFNAMSIZ, name, i);
 	if (!__dev_get_by_name(net, buf))
 		return i;
 
@@ -933,6 +934,21 @@ int dev_alloc_name(struct net_device *dev, const char *name)
 }
 EXPORT_SYMBOL(dev_alloc_name);
 
+static int dev_get_valid_name(struct net *net, const char *name, char *buf,
+			      bool fmt)
+{
+	if (!dev_valid_name(name))
+		return -EINVAL;
+
+	if (fmt && strchr(name, '%'))
+		return __dev_alloc_name(net, name, buf);
+	else if (__dev_get_by_name(net, name))
+		return -EEXIST;
+	else if (buf != name)
+		strlcpy(buf, name, IFNAMSIZ);
+
+	return 0;
+}
 
 /**
  *	dev_change_name - change name of a device
@@ -956,22 +972,14 @@ int dev_change_name(struct net_device *dev, const char *newname)
 	if (dev->flags & IFF_UP)
 		return -EBUSY;
 
-	if (!dev_valid_name(newname))
-		return -EINVAL;
-
 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 		return 0;
 
 	memcpy(oldname, dev->name, IFNAMSIZ);
 
-	if (strchr(newname, '%')) {
-		err = dev_alloc_name(dev, newname);
-		if (err < 0)
-			return err;
-	} else if (__dev_get_by_name(net, newname))
-		return -EEXIST;
-	else
-		strlcpy(dev->name, newname, IFNAMSIZ);
+	err = dev_get_valid_name(net, newname, dev->name, 1);
+	if (err < 0)
+		return err;
 
 rollback:
 	/* For now only devices in the initial network namespace
@@ -4883,8 +4891,6 @@ EXPORT_SYMBOL(netdev_fix_features);
 
 int register_netdevice(struct net_device *dev)
 {
-	struct hlist_head *head;
-	struct hlist_node *p;
 	int ret;
 	struct net *net = dev_net(dev);
 
@@ -4913,26 +4919,14 @@ int register_netdevice(struct net_device *dev)
 		}
 	}
 
-	if (!dev_valid_name(dev->name)) {
-		ret = -EINVAL;
+	ret = dev_get_valid_name(net, dev->name, dev->name, 0);
+	if (ret)
 		goto err_uninit;
-	}
 
 	dev->ifindex = dev_new_index(net);
 	if (dev->iflink == -1)
 		dev->iflink = dev->ifindex;
 
-	/* Check for existence of name */
-	head = dev_name_hash(net, dev->name);
-	hlist_for_each(p, head) {
-		struct net_device *d
-			= hlist_entry(p, struct net_device, name_hlist);
-		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
-			ret = -EEXIST;
-			goto err_uninit;
-		}
-	}
-
 	/* Fix illegal checksum combinations */
 	if ((dev->features & NETIF_F_HW_CSUM) &&
 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
@@ -5460,8 +5454,6 @@ EXPORT_SYMBOL(unregister_netdev);
 
 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
 {
-	char buf[IFNAMSIZ];
-	const char *destname;
 	int err;
 
 	ASSERT_RTNL();
@@ -5494,20 +5486,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	 * we can use it in the destination network namespace.
 	 */
 	err = -EEXIST;
-	destname = dev->name;
-	if (__dev_get_by_name(net, destname)) {
+	if (__dev_get_by_name(net, dev->name)) {
 		/* We get here if we can't use the current device name */
 		if (!pat)
 			goto out;
-		if (!dev_valid_name(pat))
-			goto out;
-		if (strchr(pat, '%')) {
-			if (__dev_alloc_name(net, pat, buf) < 0)
-				goto out;
-			destname = buf;
-		} else
-			destname = pat;
-		if (__dev_get_by_name(net, destname))
+		if (dev_get_valid_name(net, pat, dev->name, 1))
 			goto out;
 	}
 
@@ -5544,10 +5527,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	/* Actually switch the network namespace */
 	dev_net_set(dev, net);
 
-	/* Assign the new device name */
-	if (destname != dev->name)
-		strcpy(dev->name, destname);
-
 	/* If there is an ifindex conflict assign a new one */
 	if (__dev_get_by_index(net, dev->ifindex)) {
 		int iflink = (dev->iflink == dev->ifindex);
-- 
cgit v1.2.2


From 8964be4a9a5ca8cab1219bb046db2f6d1936227c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 20 Nov 2009 15:35:04 -0800
Subject: net: rename skb->iif to skb->skb_iif

To help grep games, rename iif to skb_iif

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c    | 6 +++---
 net/core/skbuff.c | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 9977288583b8..09f3d6b9c0c8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2287,7 +2287,7 @@ static int ing_filter(struct sk_buff *skb)
 	if (MAX_RED_LOOP < ttl++) {
 		printk(KERN_WARNING
 		       "Redir loop detected Dropping packet (%d->%d)\n",
-		       skb->iif, dev->ifindex);
+		       skb->skb_iif, dev->ifindex);
 		return TC_ACT_SHOT;
 	}
 
@@ -2395,8 +2395,8 @@ int netif_receive_skb(struct sk_buff *skb)
 	if (netpoll_receive_skb(skb))
 		return NET_RX_DROP;
 
-	if (!skb->iif)
-		skb->iif = skb->dev->ifindex;
+	if (!skb->skb_iif)
+		skb->skb_iif = skb->dev->ifindex;
 
 	null_or_orig = NULL;
 	orig_dev = skb->dev;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 739b8f4dd327..bfa3e7865a8c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -549,7 +549,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #endif
 	new->protocol		= old->protocol;
 	new->mark		= old->mark;
-	new->iif		= old->iif;
+	new->skb_iif		= old->skb_iif;
 	__nf_copy(new, old);
 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
-- 
cgit v1.2.2


From 6ebfbc065624790772398f5b327ac33a7ae3880b Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sun, 22 Nov 2009 20:43:13 -0800
Subject: net: Fix missing kernel-doc notation

Fix the following htmldocs warning:

  Warning(net/core/dev.c:5378): bad line:

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 09f3d6b9c0c8..ccefa2473c39 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5375,7 +5375,7 @@ EXPORT_SYMBOL(synchronize_net);
  *	unregister_netdevice_queue - remove device from the kernel
  *	@dev: device
  *	@head: list
-
+ *
  *	This function shuts down a device interface and removes it
  *	from the kernel tables.
  *	If head not NULL, device is queued to be unregistered later.
-- 
cgit v1.2.2


From 09ad9bc752519cc167d0a573e1acf69b5c707c67 Mon Sep 17 00:00:00 2001
From: Octavian Purdila <opurdila@ixiacom.com>
Date: Wed, 25 Nov 2009 15:14:13 -0800
Subject: net: use net_eq to compare nets

Generated with the following semantic patch

@@
struct net *n1;
struct net *n2;
@@
- n1 == n2
+ net_eq(n1, n2)

@@
struct net *n1;
struct net *n2;
@@
- n1 != n2
+ !net_eq(n1, n2)

applied over {include,net,drivers/net}.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c             | 4 ++--
 net/core/neighbour.c       | 2 +-
 net/core/net-sysfs.c       | 4 ++--
 net/core/net_namespace.c   | 2 +-
 net/core/sysctl_net_core.c | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index ccefa2473c39..e65af6041415 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -985,7 +985,7 @@ rollback:
 	/* For now only devices in the initial network namespace
 	 * are in sysfs.
 	 */
-	if (net == &init_net) {
+	if (net_eq(net, &init_net)) {
 		ret = device_rename(&dev->dev, dev->name);
 		if (ret) {
 			memcpy(dev->name, oldname, IFNAMSIZ);
@@ -4792,7 +4792,7 @@ static void rollback_registered_many(struct list_head *head)
 	list_for_each_entry_safe(dev, aux, head, unreg_list) {
 		int new_net = 1;
 		list_for_each_entry(fdev, &pernet_list, unreg_list) {
-			if (dev_net(dev) == dev_net(fdev)) {
+			if (net_eq(dev_net(dev), dev_net(fdev))) {
 				new_net = 0;
 				dev_put(dev);
 				break;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index e587e6819698..a08a35bf0a7b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2092,7 +2092,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 		if (h > s_h)
 			s_idx = 0;
 		for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) {
-			if (dev_net(n->dev) != net)
+			if (!net_eq(dev_net(n->dev), net))
 				continue;
 			if (idx < s_idx)
 				goto next;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 157645c0da73..fbc1c7472c5e 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -525,7 +525,7 @@ void netdev_unregister_kobject(struct net_device * net)
 
 	kobject_get(&dev->kobj);
 
-	if (dev_net(net) != &init_net)
+	if (!net_eq(dev_net(net), &init_net))
 		return;
 
 	device_del(dev);
@@ -559,7 +559,7 @@ int netdev_register_kobject(struct net_device *net)
 #endif
 #endif /* CONFIG_SYSFS */
 
-	if (dev_net(net) != &init_net)
+	if (!net_eq(dev_net(net), &init_net))
 		return 0;
 
 	return device_add(dev);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 1c1af2756f38..86ed7f44d083 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -280,7 +280,7 @@ out_undo:
 	list_del(&ops->list);
 	if (ops->exit) {
 		for_each_net(undo_net) {
-			if (undo_net == net)
+			if (net_eq(undo_net, net))
 				goto undone;
 			ops->exit(undo_net);
 		}
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 7db1de0497c6..fcfc5458c399 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -134,7 +134,7 @@ static __net_init int sysctl_core_net_init(struct net *net)
 	net->core.sysctl_somaxconn = SOMAXCONN;
 
 	tbl = netns_core_table;
-	if (net != &init_net) {
+	if (!net_eq(net, &init_net)) {
 		tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
 		if (tbl == NULL)
 			goto err_dup;
-- 
cgit v1.2.2


From 445409602c09219767c06497c0dc2285eac244ed Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 26 Nov 2009 06:07:08 +0000
Subject: veth: move loopback logic to common location

The veth driver contains code to forward an skb
from the start_xmit function of one network
device into the receive path of another device.

Moving that code into a common location lets us
reuse the code for direct forwarding of data
between macvlan ports, and possibly in other
drivers.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index e65af6041415..7775e8b48deb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -105,6 +105,7 @@
 #include <net/dst.h>
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
+#include <net/xfrm.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/kmod.h>
@@ -1419,6 +1420,45 @@ static inline void net_timestamp(struct sk_buff *skb)
 		skb->tstamp.tv64 = 0;
 }
 
+/**
+ * dev_forward_skb - loopback an skb to another netif
+ *
+ * @dev: destination network device
+ * @skb: buffer to forward
+ *
+ * return values:
+ *	NET_RX_SUCCESS	(no congestion)
+ *	NET_RX_DROP     (packet was dropped)
+ *
+ * dev_forward_skb can be used for injecting an skb from the
+ * start_xmit function of one device into the receive queue
+ * of another device.
+ *
+ * The receiving device may be in another namespace, so
+ * we have to clear all information in the skb that could
+ * impact namespace isolation.
+ */
+int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+	skb_orphan(skb);
+
+	if (!(dev->flags & IFF_UP))
+		return NET_RX_DROP;
+
+	if (skb->len > (dev->mtu + dev->hard_header_len))
+		return NET_RX_DROP;
+
+	skb_dst_drop(skb);
+	skb->tstamp.tv64 = 0;
+	skb->pkt_type = PACKET_HOST;
+	skb->protocol = eth_type_trans(skb, dev);
+	skb->mark = 0;
+	secpath_reset(skb);
+	nf_reset(skb);
+	return netif_rx(skb);
+}
+EXPORT_SYMBOL_GPL(dev_forward_skb);
+
 /*
  *	Support routine. Sends outgoing frames to any network
  *	taps currently in use.
-- 
cgit v1.2.2


From 3291b9db567e1ec38362024049cbd02987b1e277 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 29 Nov 2009 00:44:33 -0800
Subject: pktgen: NUMA aware

pktgen threads are bound to given CPU, we can allocate memory for
these threads in a NUMA aware way.

After a pktgen session on two threads, we can check flows memory was
allocated on right node, instead of a not related one.

# grep pktgen_thread_write /proc/vmallocinfo
0xffffc90007204000-0xffffc90007385000 1576960 pktgen_thread_write+0x3a4/0x6b0 [pktgen] pages=384 vmalloc N0=384
0xffffc90007386000-0xffffc90007507000 1576960 pktgen_thread_write+0x3a4/0x6b0 [pktgen] pages=384 vmalloc N1=384

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index b44104d3e6be..19ee493ec178 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3626,6 +3626,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 {
 	struct pktgen_dev *pkt_dev;
 	int err;
+	int node = cpu_to_node(t->cpu);
 
 	/* We don't allow a device to be on several threads */
 
@@ -3635,12 +3636,13 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 		return -EBUSY;
 	}
 
-	pkt_dev = kzalloc(sizeof(struct pktgen_dev), GFP_KERNEL);
+	pkt_dev = kzalloc_node(sizeof(struct pktgen_dev), GFP_KERNEL, node);
 	if (!pkt_dev)
 		return -ENOMEM;
 
 	strcpy(pkt_dev->odevname, ifname);
-	pkt_dev->flows = vmalloc(MAX_CFLOWS * sizeof(struct flow_state));
+	pkt_dev->flows = vmalloc_node(MAX_CFLOWS * sizeof(struct flow_state),
+				      node);
 	if (pkt_dev->flows == NULL) {
 		kfree(pkt_dev);
 		return -ENOMEM;
@@ -3702,7 +3704,8 @@ static int __init pktgen_create_thread(int cpu)
 	struct proc_dir_entry *pe;
 	struct task_struct *p;
 
-	t = kzalloc(sizeof(struct pktgen_thread), GFP_KERNEL);
+	t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL,
+			 cpu_to_node(cpu));
 	if (!t) {
 		printk(KERN_ERR "pktgen: ERROR: out of memory, can't "
 		       "create new thread.\n");
-- 
cgit v1.2.2


From f64f9e719261a87818dd192a3a2352e5b20fbd0f Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 29 Nov 2009 16:55:45 -0800
Subject: net: Move && and || to end of previous line

Not including net/atm/

Compiled tested x86 allyesconfig only
Added a > 80 column line or two, which I ignored.
Existing checkpatch plaints willfully, cheerfully ignored.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c    | 7 ++++---
 net/core/pktgen.c | 5 ++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 7775e8b48deb..5d131c2f84cc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2677,9 +2677,10 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 		return GRO_NORMAL;
 
 	for (p = napi->gro_list; p; p = p->next) {
-		NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev)
-			&& !compare_ether_header(skb_mac_header(p),
-						 skb_gro_mac_header(skb));
+		NAPI_GRO_CB(p)->same_flow =
+			(p->dev == skb->dev) &&
+			!compare_ether_header(skb_mac_header(p),
+					      skb_gro_mac_header(skb));
 		NAPI_GRO_CB(p)->flush = 0;
 	}
 
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 19ee493ec178..a23b45f08ec9 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2052,9 +2052,8 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 				read_lock_bh(&idev->lock);
 				for (ifp = idev->addr_list; ifp;
 				     ifp = ifp->if_next) {
-					if (ifp->scope == IFA_LINK
-					    && !(ifp->
-						 flags & IFA_F_TENTATIVE)) {
+					if (ifp->scope == IFA_LINK &&
+					    !(ifp->flags & IFA_F_TENTATIVE)) {
 						ipv6_addr_copy(&pkt_dev->
 							       cur_in6_saddr,
 							       &ifp->addr);
-- 
cgit v1.2.2


From a5ee155136b4a8f4ab0e4c9c064b661da475e298 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 29 Nov 2009 15:45:58 +0000
Subject: net: NETDEV_UNREGISTER_PERNET -> NETDEV_UNREGISTER_BATCH

The motivation for an additional notifier in batched netdevice
notification (rt_do_flush) only needs to be called once per batch not
once per namespace.

For further batching improvements I need a guarantee that the
netdevices are unregistered in order allowing me to unregister an all
of the network devices in a network namespace at the same time with
the guarantee that the loopback device is really and truly
unregistered last.

Additionally it appears that we moved the route cache flush after
the final synchronize_net, which seems wrong and there was no
explanation.  So I have restored the original location of the final
synchronize_net.

Cc: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 36 +++++++++---------------------------
 1 file changed, 9 insertions(+), 27 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 5d131c2f84cc..bb37ee1e0901 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1353,7 +1353,7 @@ rollback:
 				nb->notifier_call(nb, NETDEV_DOWN, dev);
 			}
 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
-			nb->notifier_call(nb, NETDEV_UNREGISTER_PERNET, dev);
+			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
 		}
 	}
 
@@ -4771,8 +4771,7 @@ static void net_set_todo(struct net_device *dev)
 
 static void rollback_registered_many(struct list_head *head)
 {
-	struct net_device *dev, *aux, *fdev;
-	LIST_HEAD(pernet_list);
+	struct net_device *dev;
 
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
@@ -4828,26 +4827,14 @@ static void rollback_registered_many(struct list_head *head)
 		netdev_unregister_kobject(dev);
 	}
 
-	synchronize_net();
+	/* Process any work delayed until the end of the batch */
+	dev = list_entry(head->next, struct net_device, unreg_list);
+	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
 
-	list_for_each_entry_safe(dev, aux, head, unreg_list) {
-		int new_net = 1;
-		list_for_each_entry(fdev, &pernet_list, unreg_list) {
-			if (net_eq(dev_net(dev), dev_net(fdev))) {
-				new_net = 0;
-				dev_put(dev);
-				break;
-			}
-		}
-		if (new_net)
-			list_move(&dev->unreg_list, &pernet_list);
-	}
+	synchronize_net();
 
-	list_for_each_entry_safe(dev, aux, &pernet_list, unreg_list) {
-		call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev);
-		list_move(&dev->unreg_list, head);
+	list_for_each_entry(dev, head, unreg_list)
 		dev_put(dev);
-	}
 }
 
 static void rollback_registered(struct net_device *dev)
@@ -5129,7 +5116,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 			/* Rebroadcast unregister notification */
 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
-			/* don't resend NETDEV_UNREGISTER_PERNET, _PERNET users
+			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
 			 * should have already handle it the first time */
 
 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
@@ -5442,11 +5429,6 @@ EXPORT_SYMBOL(unregister_netdevice_queue);
 /**
  *	unregister_netdevice_many - unregister many devices
  *	@head: list of devices
- *
- *	WARNING: Calling this modifies the given list
- *	(in rollback_registered_many). It may change the order of the elements
- *	in the list. However, you can assume it does not add or delete elements
- *	to/from the list.
  */
 void unregister_netdevice_many(struct list_head *head)
 {
@@ -5555,7 +5537,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	   this device. They should clean all the things.
 	*/
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
-	call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev);
+	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
 
 	/*
 	 *	Flush the unicast and multicast chains
-- 
cgit v1.2.2


From 2b035b39970740722598f7a9d548835f9bdd730f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <eric@conroxe.ebiederm.org>
Date: Sun, 29 Nov 2009 22:25:27 +0000
Subject: net: Batch network namespace destruction.

It is fairly common to kill several network namespaces at once.  Either
because they are nested one inside the other or because they are cooperating
in multiple machine networking experiments.  As the network stack control logic
does not parallelize easily batch up multiple network namespaces existing
together.

To get the full benefit of batching the virtual network devices to be
removed must be all removed in one batch.  For that purpose I have added
a loop after the last network device operations have run that batches
up all remaining network devices and deletes them.

An extra benefit is that the reorganization slightly shrinks the size
of the per network namespace data structures replaceing a work_struct
with a list_head.

In a trivial test with 4K namespaces this change reduced the cost of
a destroying 4K namespaces from 7+ minutes (at 12% cpu) to 44 seconds
(at 60% cpu).  The bulk of that 44s was spent in inet_twsk_purge.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 66 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 58 insertions(+), 8 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 86ed7f44d083..a42caa2b909b 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -8,8 +8,10 @@
 #include <linux/idr.h>
 #include <linux/rculist.h>
 #include <linux/nsproxy.h>
+#include <linux/netdevice.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/rtnetlink.h>
 
 /*
  *	Our network namespace constructor/destructor lists
@@ -27,6 +29,20 @@ EXPORT_SYMBOL(init_net);
 
 #define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */
 
+static void unregister_netdevices(struct net *net, struct list_head *list)
+{
+	struct net_device *dev;
+	/* At exit all network devices most be removed from a network
+	 * namespace.  Do this in the reverse order of registeration.
+	 */
+	for_each_netdev_reverse(net, dev) {
+		if (dev->rtnl_link_ops)
+			dev->rtnl_link_ops->dellink(dev, list);
+		else
+			unregister_netdevice_queue(dev, list);
+	}
+}
+
 /*
  * setup_net runs the initializers for the network namespace object.
  */
@@ -59,6 +75,13 @@ out_undo:
 	list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
 		if (ops->exit)
 			ops->exit(net);
+		if (&ops->list == first_device) {
+			LIST_HEAD(dev_kill_list);
+			rtnl_lock();
+			unregister_netdevices(net, &dev_kill_list);
+			unregister_netdevice_many(&dev_kill_list);
+			rtnl_unlock();
+		}
 	}
 
 	rcu_barrier();
@@ -147,18 +170,26 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
 	return net_create();
 }
 
+static DEFINE_SPINLOCK(cleanup_list_lock);
+static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
+
 static void cleanup_net(struct work_struct *work)
 {
 	struct pernet_operations *ops;
-	struct net *net;
+	struct net *net, *tmp;
+	LIST_HEAD(net_kill_list);
 
-	net = container_of(work, struct net, work);
+	/* Atomically snapshot the list of namespaces to cleanup */
+	spin_lock_irq(&cleanup_list_lock);
+	list_replace_init(&cleanup_list, &net_kill_list);
+	spin_unlock_irq(&cleanup_list_lock);
 
 	mutex_lock(&net_mutex);
 
 	/* Don't let anyone else find us. */
 	rtnl_lock();
-	list_del_rcu(&net->list);
+	list_for_each_entry(net, &net_kill_list, cleanup_list)
+		list_del_rcu(&net->list);
 	rtnl_unlock();
 
 	/*
@@ -170,8 +201,18 @@ static void cleanup_net(struct work_struct *work)
 
 	/* Run all of the network namespace exit methods */
 	list_for_each_entry_reverse(ops, &pernet_list, list) {
-		if (ops->exit)
-			ops->exit(net);
+		if (ops->exit) {
+			list_for_each_entry(net, &net_kill_list, cleanup_list)
+				ops->exit(net);
+		}
+		if (&ops->list == first_device) {
+			LIST_HEAD(dev_kill_list);
+			rtnl_lock();
+			list_for_each_entry(net, &net_kill_list, cleanup_list)
+				unregister_netdevices(net, &dev_kill_list);
+			unregister_netdevice_many(&dev_kill_list);
+			rtnl_unlock();
+		}
 	}
 
 	mutex_unlock(&net_mutex);
@@ -182,14 +223,23 @@ static void cleanup_net(struct work_struct *work)
 	rcu_barrier();
 
 	/* Finally it is safe to free my network namespace structure */
-	net_free(net);
+	list_for_each_entry_safe(net, tmp, &net_kill_list, cleanup_list) {
+		list_del_init(&net->cleanup_list);
+		net_free(net);
+	}
 }
+static DECLARE_WORK(net_cleanup_work, cleanup_net);
 
 void __put_net(struct net *net)
 {
 	/* Cleanup the network namespace in process context */
-	INIT_WORK(&net->work, cleanup_net);
-	queue_work(netns_wq, &net->work);
+	unsigned long flags;
+
+	spin_lock_irqsave(&cleanup_list_lock, flags);
+	list_add(&net->cleanup_list, &cleanup_list);
+	spin_unlock_irqrestore(&cleanup_list_lock, flags);
+
+	queue_work(netns_wq, &net_cleanup_work);
 }
 EXPORT_SYMBOL_GPL(__put_net);
 
-- 
cgit v1.2.2


From f875bae065334907796da12523f9df85c89f5712 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 29 Nov 2009 22:25:28 +0000
Subject: net: Automatically allocate per namespace data.

To get the full benefit of batched network namespace cleanup netowrk
device deletion needs to be performed by the generic code.  When
using register_pernet_gen_device and freeing the data in exit_net
it is impossible to delay allocation until after exit_net has called
as the device uninit methods are no longer safe.

To correct this, and to simplify working with per network namespace data
I have moved allocation and deletion of per network namespace data into
the network namespace core.  The core now frees the data only after
all of the network namespace exit routines have run.

Now it is only required to set the new fields .id and .size
in the pernet_operations structure if you want network namespace
data to be managed for you automatically.

This makes the current register_pernet_gen_device and
register_pernet_gen_subsys routines unnecessary.  For the moment
I have left them as compatibility wrappers in net_namespace.h
They will be removed once all of the users have been updated.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 188 +++++++++++++++++++++++++----------------------
 1 file changed, 102 insertions(+), 86 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a42caa2b909b..9679ad292da9 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -43,13 +43,40 @@ static void unregister_netdevices(struct net *net, struct list_head *list)
 	}
 }
 
+static int ops_init(const struct pernet_operations *ops, struct net *net)
+{
+	int err;
+	if (ops->id && ops->size) {
+		void *data = kzalloc(ops->size, GFP_KERNEL);
+		if (!data)
+			return -ENOMEM;
+
+		err = net_assign_generic(net, *ops->id, data);
+		if (err) {
+			kfree(data);
+			return err;
+		}
+	}
+	if (ops->init)
+		return ops->init(net);
+	return 0;
+}
+
+static void ops_free(const struct pernet_operations *ops, struct net *net)
+{
+	if (ops->id && ops->size) {
+		int id = *ops->id;
+		kfree(net_generic(net, id));
+	}
+}
+
 /*
  * setup_net runs the initializers for the network namespace object.
  */
 static __net_init int setup_net(struct net *net)
 {
 	/* Must be called with net_mutex held */
-	struct pernet_operations *ops;
+	const struct pernet_operations *ops, *saved_ops;
 	int error = 0;
 
 	atomic_set(&net->count, 1);
@@ -59,11 +86,9 @@ static __net_init int setup_net(struct net *net)
 #endif
 
 	list_for_each_entry(ops, &pernet_list, list) {
-		if (ops->init) {
-			error = ops->init(net);
-			if (error < 0)
-				goto out_undo;
-		}
+		error = ops_init(ops, net);
+		if (error < 0)
+			goto out_undo;
 	}
 out:
 	return error;
@@ -72,6 +97,7 @@ out_undo:
 	/* Walk through the list backwards calling the exit functions
 	 * for the pernet modules whose init functions did not fail.
 	 */
+	saved_ops = ops;
 	list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
 		if (ops->exit)
 			ops->exit(net);
@@ -83,6 +109,9 @@ out_undo:
 			rtnl_unlock();
 		}
 	}
+	ops = saved_ops;
+	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+		ops_free(ops, net);
 
 	rcu_barrier();
 	goto out;
@@ -175,7 +204,7 @@ static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
 
 static void cleanup_net(struct work_struct *work)
 {
-	struct pernet_operations *ops;
+	const struct pernet_operations *ops;
 	struct net *net, *tmp;
 	LIST_HEAD(net_kill_list);
 
@@ -214,6 +243,13 @@ static void cleanup_net(struct work_struct *work)
 			rtnl_unlock();
 		}
 	}
+	/* Free the net generic variables */
+	list_for_each_entry_reverse(ops, &pernet_list, list) {
+		if (ops->size && ops->id) {
+			list_for_each_entry(net, &net_kill_list, cleanup_list)
+				ops_free(ops, net);
+		}
+	}
 
 	mutex_unlock(&net_mutex);
 
@@ -309,16 +345,16 @@ static int __init net_ns_init(void)
 pure_initcall(net_ns_init);
 
 #ifdef CONFIG_NET_NS
-static int register_pernet_operations(struct list_head *list,
-				      struct pernet_operations *ops)
+static int __register_pernet_operations(struct list_head *list,
+					struct pernet_operations *ops)
 {
 	struct net *net, *undo_net;
 	int error;
 
 	list_add_tail(&ops->list, list);
-	if (ops->init) {
+	if (ops->init || (ops->id && ops->size)) {
 		for_each_net(net) {
-			error = ops->init(net);
+			error = ops_init(ops, net);
 			if (error)
 				goto out_undo;
 		}
@@ -336,10 +372,18 @@ out_undo:
 		}
 	}
 undone:
+	if (ops->size && ops->id) {
+		for_each_net(undo_net) {
+			if (net_eq(undo_net, net))
+				goto freed;
+			ops_free(ops, undo_net);
+		}
+	}
+freed:
 	return error;
 }
 
-static void unregister_pernet_operations(struct pernet_operations *ops)
+static void __unregister_pernet_operations(struct pernet_operations *ops)
 {
 	struct net *net;
 
@@ -347,27 +391,66 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
 	if (ops->exit)
 		for_each_net(net)
 			ops->exit(net);
+	if (ops->id && ops->size)
+		for_each_net(net)
+			ops_free(ops, net);
 }
 
 #else
 
-static int register_pernet_operations(struct list_head *list,
-				      struct pernet_operations *ops)
+static int __register_pernet_operations(struct list_head *list,
+					struct pernet_operations *ops)
 {
-	if (ops->init == NULL)
-		return 0;
-	return ops->init(&init_net);
+	int err = 0;
+	err = ops_init(ops, &init_net);
+	if (err)
+		ops_free(ops, &init_net);
+	return err;
+	
 }
 
-static void unregister_pernet_operations(struct pernet_operations *ops)
+static void __unregister_pernet_operations(struct pernet_operations *ops)
 {
 	if (ops->exit)
 		ops->exit(&init_net);
+	ops_free(ops, &init_net);
 }
-#endif
+
+#endif /* CONFIG_NET_NS */
 
 static DEFINE_IDA(net_generic_ids);
 
+static int register_pernet_operations(struct list_head *list,
+				      struct pernet_operations *ops)
+{
+	int error;
+
+	if (ops->id) {
+again:
+		error = ida_get_new_above(&net_generic_ids, 1, ops->id);
+		if (error < 0) {
+			if (error == -EAGAIN) {
+				ida_pre_get(&net_generic_ids, GFP_KERNEL);
+				goto again;
+			}
+			return error;
+		}
+	}
+	error = __register_pernet_operations(list, ops);
+	if (error && ops->id)
+		ida_remove(&net_generic_ids, *ops->id);
+
+	return error;
+}
+
+static void unregister_pernet_operations(struct pernet_operations *ops)
+{
+	
+	__unregister_pernet_operations(ops);
+	if (ops->id)
+		ida_remove(&net_generic_ids, *ops->id);
+}
+
 /**
  *      register_pernet_subsys - register a network namespace subsystem
  *	@ops:  pernet operations structure for the subsystem
@@ -414,38 +497,6 @@ void unregister_pernet_subsys(struct pernet_operations *module)
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
 
-int register_pernet_gen_subsys(int *id, struct pernet_operations *ops)
-{
-	int rv;
-
-	mutex_lock(&net_mutex);
-again:
-	rv = ida_get_new_above(&net_generic_ids, 1, id);
-	if (rv < 0) {
-		if (rv == -EAGAIN) {
-			ida_pre_get(&net_generic_ids, GFP_KERNEL);
-			goto again;
-		}
-		goto out;
-	}
-	rv = register_pernet_operations(first_device, ops);
-	if (rv < 0)
-		ida_remove(&net_generic_ids, *id);
-out:
-	mutex_unlock(&net_mutex);
-	return rv;
-}
-EXPORT_SYMBOL_GPL(register_pernet_gen_subsys);
-
-void unregister_pernet_gen_subsys(int id, struct pernet_operations *ops)
-{
-	mutex_lock(&net_mutex);
-	unregister_pernet_operations(ops);
-	ida_remove(&net_generic_ids, id);
-	mutex_unlock(&net_mutex);
-}
-EXPORT_SYMBOL_GPL(unregister_pernet_gen_subsys);
-
 /**
  *      register_pernet_device - register a network namespace device
  *	@ops:  pernet operations structure for the subsystem
@@ -477,30 +528,6 @@ int register_pernet_device(struct pernet_operations *ops)
 }
 EXPORT_SYMBOL_GPL(register_pernet_device);
 
-int register_pernet_gen_device(int *id, struct pernet_operations *ops)
-{
-	int error;
-	mutex_lock(&net_mutex);
-again:
-	error = ida_get_new_above(&net_generic_ids, 1, id);
-	if (error) {
-		if (error == -EAGAIN) {
-			ida_pre_get(&net_generic_ids, GFP_KERNEL);
-			goto again;
-		}
-		goto out;
-	}
-	error = register_pernet_operations(&pernet_list, ops);
-	if (error)
-		ida_remove(&net_generic_ids, *id);
-	else if (first_device == &pernet_list)
-		first_device = &ops->list;
-out:
-	mutex_unlock(&net_mutex);
-	return error;
-}
-EXPORT_SYMBOL_GPL(register_pernet_gen_device);
-
 /**
  *      unregister_pernet_device - unregister a network namespace netdevice
  *	@ops: pernet operations structure to manipulate
@@ -520,17 +547,6 @@ void unregister_pernet_device(struct pernet_operations *ops)
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_device);
 
-void unregister_pernet_gen_device(int id, struct pernet_operations *ops)
-{
-	mutex_lock(&net_mutex);
-	if (&ops->list == first_device)
-		first_device = first_device->next;
-	unregister_pernet_operations(ops);
-	ida_remove(&net_generic_ids, id);
-	mutex_unlock(&net_mutex);
-}
-EXPORT_SYMBOL_GPL(unregister_pernet_gen_device);
-
 static void net_generic_release(struct rcu_head *rcu)
 {
 	struct net_generic *ng;
-- 
cgit v1.2.2


From e008b5fc8dc7f46d9904001c7a2155eb1e7d35ab Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 29 Nov 2009 22:25:30 +0000
Subject: net: Simplfy default_device_exit and improve batching.

- Defer dellink to net_cleanup() allowing for batching.
- Fix comment.
- Use for_each_netdev_safe again as dev_change_net_namespace touches
  at most one network device (unlike veth dellink).

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index bb37ee1e0901..e3e18dee0bd3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5736,14 +5736,13 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
 
 static void __net_exit default_device_exit(struct net *net)
 {
-	struct net_device *dev;
+	struct net_device *dev, *aux;
 	/*
-	 * Push all migratable of the network devices back to the
+	 * Push all migratable network devices back to the
 	 * initial network namespace
 	 */
 	rtnl_lock();
-restart:
-	for_each_netdev(net, dev) {
+	for_each_netdev_safe(net, dev, aux) {
 		int err;
 		char fb_name[IFNAMSIZ];
 
@@ -5751,11 +5750,9 @@ restart:
 		if (dev->features & NETIF_F_NETNS_LOCAL)
 			continue;
 
-		/* Delete virtual devices */
-		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
-			dev->rtnl_link_ops->dellink(dev, NULL);
-			goto restart;
-		}
+		/* Leave virtual devices for the generic cleanup */
+		if (dev->rtnl_link_ops)
+			continue;
 
 		/* Push remaing network devices to init_net */
 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
@@ -5765,7 +5762,6 @@ restart:
 				__func__, dev->name, err);
 			BUG();
 		}
-		goto restart;
 	}
 	rtnl_unlock();
 }
-- 
cgit v1.2.2


From c81c2d95449cd218c2022ce6014c52fef1eb1f66 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Wed, 2 Dec 2009 16:49:02 +0000
Subject: skbuff: remove skb_dma_map/unmap

The two functions skb_dma_map/unmap are unsafe to use as they cause
problems when packets are cloned and sent to multiple devices while a HW
IOMMU is enabled.  Due to this it is best to remove the code so it is not
used by any other network driver maintainters.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/Makefile      |  1 -
 net/core/skb_dma_map.c | 65 --------------------------------------------------
 2 files changed, 66 deletions(-)
 delete mode 100644 net/core/skb_dma_map.c

(limited to 'net/core')

diff --git a/net/core/Makefile b/net/core/Makefile
index 796f46eece5f..08791ac3e05a 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -6,7 +6,6 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
 	 gen_stats.o gen_estimator.o net_namespace.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
-obj-$(CONFIG_HAS_DMA) += skb_dma_map.o
 
 obj-y		     += dev.o ethtool.o dev_mcast.o dst.o netevent.o \
 			neighbour.o rtnetlink.o utils.o link_watch.o filter.o
diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c
deleted file mode 100644
index 79687dfd6957..000000000000
--- a/net/core/skb_dma_map.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/* skb_dma_map.c: DMA mapping helpers for socket buffers.
- *
- * Copyright (C) David S. Miller <davem@davemloft.net>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/dma-mapping.h>
-#include <linux/skbuff.h>
-
-int skb_dma_map(struct device *dev, struct sk_buff *skb,
-		enum dma_data_direction dir)
-{
-	struct skb_shared_info *sp = skb_shinfo(skb);
-	dma_addr_t map;
-	int i;
-
-	map = dma_map_single(dev, skb->data,
-			     skb_headlen(skb), dir);
-	if (dma_mapping_error(dev, map))
-		goto out_err;
-
-	sp->dma_head = map;
-	for (i = 0; i < sp->nr_frags; i++) {
-		skb_frag_t *fp = &sp->frags[i];
-
-		map = dma_map_page(dev, fp->page, fp->page_offset,
-				   fp->size, dir);
-		if (dma_mapping_error(dev, map))
-			goto unwind;
-		sp->dma_maps[i] = map;
-	}
-
-	return 0;
-
-unwind:
-	while (--i >= 0) {
-		skb_frag_t *fp = &sp->frags[i];
-
-		dma_unmap_page(dev, sp->dma_maps[i],
-			       fp->size, dir);
-	}
-	dma_unmap_single(dev, sp->dma_head,
-			 skb_headlen(skb), dir);
-out_err:
-	return -ENOMEM;
-}
-EXPORT_SYMBOL(skb_dma_map);
-
-void skb_dma_unmap(struct device *dev, struct sk_buff *skb,
-		   enum dma_data_direction dir)
-{
-	struct skb_shared_info *sp = skb_shinfo(skb);
-	int i;
-
-	dma_unmap_single(dev, sp->dma_head,
-			 skb_headlen(skb), dir);
-	for (i = 0; i < sp->nr_frags; i++) {
-		skb_frag_t *fp = &sp->frags[i];
-
-		dma_unmap_page(dev, sp->dma_maps[i],
-			       fp->size, dir);
-	}
-}
-EXPORT_SYMBOL(skb_dma_unmap);
-- 
cgit v1.2.2


From 491deb24bf5bf7124141287aaf02c3219783ceab Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 3 Dec 2009 01:25:54 +0000
Subject: net 02/05: fib_rules: rename ifindex/ifname/FRA_IFNAME to
 iifindex/iifname/FRA_IIFNAME

commit 229e77eec406ad68662f18e49fda8b5d366768c5
Author: Patrick McHardy <kaber@trash.net>
Date:   Thu Dec 3 12:05:23 2009 +0100

    net: fib_rules: rename ifindex/ifname/FRA_IFNAME to iifindex/iifname/FRA_IIFNAME

    The next patch will add oif classification, rename interface related members
    and attributes to reflect that they're used for iif classification.

    Signed-off-by: Patrick McHardy <kaber@trash.net>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'net/core')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index bd309384f8b8..8e8028cdc87f 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -135,7 +135,7 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
 {
 	int ret = 0;
 
-	if (rule->ifindex && (rule->ifindex != fl->iif))
+	if (rule->iifindex && (rule->iifindex != fl->iif))
 		goto out;
 
 	if ((rule->mark ^ fl->mark) & rule->mark_mask)
@@ -248,14 +248,14 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	if (tb[FRA_PRIORITY])
 		rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
 
-	if (tb[FRA_IFNAME]) {
+	if (tb[FRA_IIFNAME]) {
 		struct net_device *dev;
 
-		rule->ifindex = -1;
-		nla_strlcpy(rule->ifname, tb[FRA_IFNAME], IFNAMSIZ);
-		dev = __dev_get_by_name(net, rule->ifname);
+		rule->iifindex = -1;
+		nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, rule->iifname);
 		if (dev)
-			rule->ifindex = dev->ifindex;
+			rule->iifindex = dev->ifindex;
 	}
 
 	if (tb[FRA_FWMARK]) {
@@ -388,8 +388,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		    (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
 			continue;
 
-		if (tb[FRA_IFNAME] &&
-		    nla_strcmp(tb[FRA_IFNAME], rule->ifname))
+		if (tb[FRA_IIFNAME] &&
+		    nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
 			continue;
 
 		if (tb[FRA_FWMARK] &&
@@ -447,7 +447,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 					 struct fib_rule *rule)
 {
 	size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr))
-			 + nla_total_size(IFNAMSIZ) /* FRA_IFNAME */
+			 + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */
 			 + nla_total_size(4) /* FRA_PRIORITY */
 			 + nla_total_size(4) /* FRA_TABLE */
 			 + nla_total_size(4) /* FRA_FWMARK */
@@ -481,11 +481,11 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL)
 		frh->flags |= FIB_RULE_UNRESOLVED;
 
-	if (rule->ifname[0]) {
-		NLA_PUT_STRING(skb, FRA_IFNAME, rule->ifname);
+	if (rule->iifname[0]) {
+		NLA_PUT_STRING(skb, FRA_IIFNAME, rule->iifname);
 
-		if (rule->ifindex == -1)
-			frh->flags |= FIB_RULE_DEV_DETACHED;
+		if (rule->iifindex == -1)
+			frh->flags |= FIB_RULE_IIF_DETACHED;
 	}
 
 	if (rule->pref)
@@ -600,9 +600,9 @@ static void attach_rules(struct list_head *rules, struct net_device *dev)
 	struct fib_rule *rule;
 
 	list_for_each_entry(rule, rules, list) {
-		if (rule->ifindex == -1 &&
-		    strcmp(dev->name, rule->ifname) == 0)
-			rule->ifindex = dev->ifindex;
+		if (rule->iifindex == -1 &&
+		    strcmp(dev->name, rule->iifname) == 0)
+			rule->iifindex = dev->ifindex;
 	}
 }
 
@@ -611,8 +611,8 @@ static void detach_rules(struct list_head *rules, struct net_device *dev)
 	struct fib_rule *rule;
 
 	list_for_each_entry(rule, rules, list)
-		if (rule->ifindex == dev->ifindex)
-			rule->ifindex = -1;
+		if (rule->iifindex == dev->ifindex)
+			rule->iifindex = -1;
 }
 
 
-- 
cgit v1.2.2


From 1b038a5e60c7812f19818e8a5df96d029e49c38f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 3 Dec 2009 01:25:56 +0000
Subject: net 03/05: fib_rules: add oif classification

commit 68144d350f4f6c348659c825cde6a82b34c27a91
Author: Patrick McHardy <kaber@trash.net>
Date:   Thu Dec 3 12:05:25 2009 +0100

    net: fib_rules: add oif classification

    Support routing table lookup based on the flow's oif. This is useful to
    classify packets originating from sockets bound to interfaces differently.

    The route cache already includes the oif and needs no changes.

    Signed-off-by: Patrick McHardy <kaber@trash.net>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 8e8028cdc87f..d1a70ad4b544 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -138,6 +138,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
 	if (rule->iifindex && (rule->iifindex != fl->iif))
 		goto out;
 
+	if (rule->oifindex && (rule->oifindex != fl->oif))
+		goto out;
+
 	if ((rule->mark ^ fl->mark) & rule->mark_mask)
 		goto out;
 
@@ -258,6 +261,16 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 			rule->iifindex = dev->ifindex;
 	}
 
+	if (tb[FRA_OIFNAME]) {
+		struct net_device *dev;
+
+		rule->oifindex = -1;
+		nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, rule->oifname);
+		if (dev)
+			rule->oifindex = dev->ifindex;
+	}
+
 	if (tb[FRA_FWMARK]) {
 		rule->mark = nla_get_u32(tb[FRA_FWMARK]);
 		if (rule->mark)
@@ -392,6 +405,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		    nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
 			continue;
 
+		if (tb[FRA_OIFNAME] &&
+		    nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
+			continue;
+
 		if (tb[FRA_FWMARK] &&
 		    (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
 			continue;
@@ -448,6 +465,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 {
 	size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr))
 			 + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */
+			 + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
 			 + nla_total_size(4) /* FRA_PRIORITY */
 			 + nla_total_size(4) /* FRA_TABLE */
 			 + nla_total_size(4) /* FRA_FWMARK */
@@ -488,6 +506,13 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 			frh->flags |= FIB_RULE_IIF_DETACHED;
 	}
 
+	if (rule->oifname[0]) {
+		NLA_PUT_STRING(skb, FRA_OIFNAME, rule->oifname);
+
+		if (rule->oifindex == -1)
+			frh->flags |= FIB_RULE_OIF_DETACHED;
+	}
+
 	if (rule->pref)
 		NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref);
 
@@ -603,6 +628,9 @@ static void attach_rules(struct list_head *rules, struct net_device *dev)
 		if (rule->iifindex == -1 &&
 		    strcmp(dev->name, rule->iifname) == 0)
 			rule->iifindex = dev->ifindex;
+		if (rule->oifindex == -1 &&
+		    strcmp(dev->name, rule->oifname) == 0)
+			rule->oifindex = dev->ifindex;
 	}
 }
 
@@ -610,9 +638,12 @@ static void detach_rules(struct list_head *rules, struct net_device *dev)
 {
 	struct fib_rule *rule;
 
-	list_for_each_entry(rule, rules, list)
+	list_for_each_entry(rule, rules, list) {
 		if (rule->iifindex == dev->ifindex)
 			rule->iifindex = -1;
+		if (rule->oifindex == dev->ifindex)
+			rule->oifindex = -1;
+	}
 }
 
 
-- 
cgit v1.2.2


From 5adef1809147a9c39119ffd5a13a1ca4fe23a411 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 3 Dec 2009 01:25:57 +0000
Subject: net 04/05: fib_rules: allow to delete local rule

commit d124356ce314fff22a047ea334379d5105b2d834
Author: Patrick McHardy <kaber@trash.net>
Date:   Thu Dec 3 12:16:35 2009 +0100

    net: fib_rules: allow to delete local rule

    Allow to delete the local rule and recreate it with a higher priority. This
    can be used to force packets with a local destination out on the wire instead
    of routing them to loopback. Additionally this patch allows to recreate rules
    with a priority of 0.

    Combined with the previous patch to allow oif classification, a socket can
    be bound to the desired interface and packets routed to the wire like this:

    # move local rule to lower priority
    ip rule add pref 1000 lookup local
    ip rule del pref 0

    # route packets of sockets bound to eth0 to the wire independant
    # of the destination address
    ip rule add pref 100 oif eth0 lookup 100
    ip route add default dev eth0 table 100

    Signed-off-by: Patrick McHardy <kaber@trash.net>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index d1a70ad4b544..ef0e7d9e664b 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -287,7 +287,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	rule->flags = frh->flags;
 	rule->table = frh_get_table(frh, tb);
 
-	if (!rule->pref && ops->default_pref)
+	if (!tb[FRA_PRIORITY] && ops->default_pref)
 		rule->pref = ops->default_pref(ops);
 
 	err = -EINVAL;
-- 
cgit v1.2.2


From 72ad937abd0a43b7cf2c557ba1f2ec75e608c516 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 3 Dec 2009 02:29:03 +0000
Subject: net: Add support for batching network namespace cleanups

- Add exit_list to struct net to support building lists of network
  namespaces to cleanup.

- Add exit_batch to pernet_operations to allow running operations only
  once during a network namespace exit.  Instead of once per network
  namespace.

- Factor opt ops_exit_list and ops_exit_free so the logic with cleanup
  up a network namespace does not need to be duplicated.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 122 +++++++++++++++++++++++------------------------
 1 file changed, 61 insertions(+), 61 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 9679ad292da9..6c7f6e04dbbf 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -70,6 +70,36 @@ static void ops_free(const struct pernet_operations *ops, struct net *net)
 	}
 }
 
+static void ops_exit_list(const struct pernet_operations *ops,
+			  struct list_head *net_exit_list)
+{
+	struct net *net;
+	if (ops->exit) {
+		list_for_each_entry(net, net_exit_list, exit_list)
+			ops->exit(net);
+	}
+	if (&ops->list == first_device) {
+		LIST_HEAD(dev_kill_list);
+		rtnl_lock();
+		list_for_each_entry(net, net_exit_list, exit_list)
+			unregister_netdevices(net, &dev_kill_list);
+		unregister_netdevice_many(&dev_kill_list);
+		rtnl_unlock();
+	}
+	if (ops->exit_batch)
+		ops->exit_batch(net_exit_list);
+}
+
+static void ops_free_list(const struct pernet_operations *ops,
+			  struct list_head *net_exit_list)
+{
+	struct net *net;
+	if (ops->size && ops->id) {
+		list_for_each_entry(net, net_exit_list, exit_list)
+			ops_free(ops, net);
+	}
+}
+
 /*
  * setup_net runs the initializers for the network namespace object.
  */
@@ -78,6 +108,7 @@ static __net_init int setup_net(struct net *net)
 	/* Must be called with net_mutex held */
 	const struct pernet_operations *ops, *saved_ops;
 	int error = 0;
+	LIST_HEAD(net_exit_list);
 
 	atomic_set(&net->count, 1);
 
@@ -97,21 +128,14 @@ out_undo:
 	/* Walk through the list backwards calling the exit functions
 	 * for the pernet modules whose init functions did not fail.
 	 */
+	list_add(&net->exit_list, &net_exit_list);
 	saved_ops = ops;
-	list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
-		if (ops->exit)
-			ops->exit(net);
-		if (&ops->list == first_device) {
-			LIST_HEAD(dev_kill_list);
-			rtnl_lock();
-			unregister_netdevices(net, &dev_kill_list);
-			unregister_netdevice_many(&dev_kill_list);
-			rtnl_unlock();
-		}
-	}
+	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+		ops_exit_list(ops, &net_exit_list);
+
 	ops = saved_ops;
 	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
-		ops_free(ops, net);
+		ops_free_list(ops, &net_exit_list);
 
 	rcu_barrier();
 	goto out;
@@ -207,6 +231,7 @@ static void cleanup_net(struct work_struct *work)
 	const struct pernet_operations *ops;
 	struct net *net, *tmp;
 	LIST_HEAD(net_kill_list);
+	LIST_HEAD(net_exit_list);
 
 	/* Atomically snapshot the list of namespaces to cleanup */
 	spin_lock_irq(&cleanup_list_lock);
@@ -217,8 +242,10 @@ static void cleanup_net(struct work_struct *work)
 
 	/* Don't let anyone else find us. */
 	rtnl_lock();
-	list_for_each_entry(net, &net_kill_list, cleanup_list)
+	list_for_each_entry(net, &net_kill_list, cleanup_list) {
 		list_del_rcu(&net->list);
+		list_add_tail(&net->exit_list, &net_exit_list);
+	}
 	rtnl_unlock();
 
 	/*
@@ -229,27 +256,12 @@ static void cleanup_net(struct work_struct *work)
 	synchronize_rcu();
 
 	/* Run all of the network namespace exit methods */
-	list_for_each_entry_reverse(ops, &pernet_list, list) {
-		if (ops->exit) {
-			list_for_each_entry(net, &net_kill_list, cleanup_list)
-				ops->exit(net);
-		}
-		if (&ops->list == first_device) {
-			LIST_HEAD(dev_kill_list);
-			rtnl_lock();
-			list_for_each_entry(net, &net_kill_list, cleanup_list)
-				unregister_netdevices(net, &dev_kill_list);
-			unregister_netdevice_many(&dev_kill_list);
-			rtnl_unlock();
-		}
-	}
+	list_for_each_entry_reverse(ops, &pernet_list, list)
+		ops_exit_list(ops, &net_exit_list);
+
 	/* Free the net generic variables */
-	list_for_each_entry_reverse(ops, &pernet_list, list) {
-		if (ops->size && ops->id) {
-			list_for_each_entry(net, &net_kill_list, cleanup_list)
-				ops_free(ops, net);
-		}
-	}
+	list_for_each_entry_reverse(ops, &pernet_list, list)
+		ops_free_list(ops, &net_exit_list);
 
 	mutex_unlock(&net_mutex);
 
@@ -259,8 +271,8 @@ static void cleanup_net(struct work_struct *work)
 	rcu_barrier();
 
 	/* Finally it is safe to free my network namespace structure */
-	list_for_each_entry_safe(net, tmp, &net_kill_list, cleanup_list) {
-		list_del_init(&net->cleanup_list);
+	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
+		list_del_init(&net->exit_list);
 		net_free(net);
 	}
 }
@@ -348,8 +360,9 @@ pure_initcall(net_ns_init);
 static int __register_pernet_operations(struct list_head *list,
 					struct pernet_operations *ops)
 {
-	struct net *net, *undo_net;
+	struct net *net;
 	int error;
+	LIST_HEAD(net_exit_list);
 
 	list_add_tail(&ops->list, list);
 	if (ops->init || (ops->id && ops->size)) {
@@ -357,6 +370,7 @@ static int __register_pernet_operations(struct list_head *list,
 			error = ops_init(ops, net);
 			if (error)
 				goto out_undo;
+			list_add_tail(&net->exit_list, &net_exit_list);
 		}
 	}
 	return 0;
@@ -364,36 +378,21 @@ static int __register_pernet_operations(struct list_head *list,
 out_undo:
 	/* If I have an error cleanup all namespaces I initialized */
 	list_del(&ops->list);
-	if (ops->exit) {
-		for_each_net(undo_net) {
-			if (net_eq(undo_net, net))
-				goto undone;
-			ops->exit(undo_net);
-		}
-	}
-undone:
-	if (ops->size && ops->id) {
-		for_each_net(undo_net) {
-			if (net_eq(undo_net, net))
-				goto freed;
-			ops_free(ops, undo_net);
-		}
-	}
-freed:
+	ops_exit_list(ops, &net_exit_list);
+	ops_free_list(ops, &net_exit_list);
 	return error;
 }
 
 static void __unregister_pernet_operations(struct pernet_operations *ops)
 {
 	struct net *net;
+	LIST_HEAD(net_exit_list);
 
 	list_del(&ops->list);
-	if (ops->exit)
-		for_each_net(net)
-			ops->exit(net);
-	if (ops->id && ops->size)
-		for_each_net(net)
-			ops_free(ops, net);
+	for_each_net(net)
+		list_add_tail(&net->exit_list, &net_exit_list);
+	ops_exit_list(ops, &net_exit_list);
+	ops_free_list(ops, &net_exit_list);
 }
 
 #else
@@ -411,9 +410,10 @@ static int __register_pernet_operations(struct list_head *list,
 
 static void __unregister_pernet_operations(struct pernet_operations *ops)
 {
-	if (ops->exit)
-		ops->exit(&init_net);
-	ops_free(ops, &init_net);
+	LIST_HEAD(net_exit_list);
+	list_add(&init_net.exit_list, &net_exit_list);
+	ops_exit_list(ops, &net_exit_list);
+	ops_free_list(ops, &net_exit_list);
 }
 
 #endif /* CONFIG_NET_NS */
-- 
cgit v1.2.2


From 04dc7f6be3a7b308f8545bb45772c9fb75f71aca Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 3 Dec 2009 02:29:04 +0000
Subject: net: Move network device exit batching

Move network device exit batching from a special case in
net_namespace.c to using common mechanisms in dev.c

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c           | 25 +++++++++++++++++++++++++
 net/core/net_namespace.c | 24 ------------------------
 2 files changed, 25 insertions(+), 24 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index e3e18dee0bd3..0913a08a87d6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5766,8 +5766,33 @@ static void __net_exit default_device_exit(struct net *net)
 	rtnl_unlock();
 }
 
+static void __net_exit default_device_exit_batch(struct list_head *net_list)
+{
+	/* At exit all network devices most be removed from a network
+	 * namespace.  Do this in the reverse order of registeration.
+	 * Do this across as many network namespaces as possible to
+	 * improve batching efficiency.
+	 */
+	struct net_device *dev;
+	struct net *net;
+	LIST_HEAD(dev_kill_list);
+
+	rtnl_lock();
+	list_for_each_entry(net, net_list, exit_list) {
+		for_each_netdev_reverse(net, dev) {
+			if (dev->rtnl_link_ops)
+				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
+			else
+				unregister_netdevice_queue(dev, &dev_kill_list);
+		}
+	}
+	unregister_netdevice_many(&dev_kill_list);
+	rtnl_unlock();
+}
+
 static struct pernet_operations __net_initdata default_device_ops = {
 	.exit = default_device_exit,
+	.exit_batch = default_device_exit_batch,
 };
 
 /*
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 6c7f6e04dbbf..4026a4cff93c 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -8,10 +8,8 @@
 #include <linux/idr.h>
 #include <linux/rculist.h>
 #include <linux/nsproxy.h>
-#include <linux/netdevice.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
-#include <net/rtnetlink.h>
 
 /*
  *	Our network namespace constructor/destructor lists
@@ -29,20 +27,6 @@ EXPORT_SYMBOL(init_net);
 
 #define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */
 
-static void unregister_netdevices(struct net *net, struct list_head *list)
-{
-	struct net_device *dev;
-	/* At exit all network devices most be removed from a network
-	 * namespace.  Do this in the reverse order of registeration.
-	 */
-	for_each_netdev_reverse(net, dev) {
-		if (dev->rtnl_link_ops)
-			dev->rtnl_link_ops->dellink(dev, list);
-		else
-			unregister_netdevice_queue(dev, list);
-	}
-}
-
 static int ops_init(const struct pernet_operations *ops, struct net *net)
 {
 	int err;
@@ -78,14 +62,6 @@ static void ops_exit_list(const struct pernet_operations *ops,
 		list_for_each_entry(net, net_exit_list, exit_list)
 			ops->exit(net);
 	}
-	if (&ops->list == first_device) {
-		LIST_HEAD(dev_kill_list);
-		rtnl_lock();
-		list_for_each_entry(net, net_exit_list, exit_list)
-			unregister_netdevices(net, &dev_kill_list);
-		unregister_netdevice_many(&dev_kill_list);
-		rtnl_unlock();
-	}
 	if (ops->exit_batch)
 		ops->exit_batch(net_exit_list);
 }
-- 
cgit v1.2.2


From 3a765edadb28cc736d185f67d1ba6bedcc85f4b9 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 3 Dec 2009 02:29:06 +0000
Subject: netns: Add an explicit rcu_barrier to
 unregister_pernet_{device|subsys}

This allows namespace exit methods to batch work that comes requires an
rcu barrier using call_rcu without having to treat the
unregister_pernet_operations cases specially.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 4026a4cff93c..bd8c4712ea24 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -413,8 +413,11 @@ again:
 		}
 	}
 	error = __register_pernet_operations(list, ops);
-	if (error && ops->id)
-		ida_remove(&net_generic_ids, *ops->id);
+	if (error) {
+		rcu_barrier();
+		if (ops->id)
+			ida_remove(&net_generic_ids, *ops->id);
+	}
 
 	return error;
 }
@@ -423,6 +426,7 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
 {
 	
 	__unregister_pernet_operations(ops);
+	rcu_barrier();
 	if (ops->id)
 		ida_remove(&net_generic_ids, *ops->id);
 }
-- 
cgit v1.2.2


From e9c5158ac26affd5d8ce006521bdfb7148090e18 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 3 Dec 2009 12:22:55 -0800
Subject: net: Allow fib_rule_unregister to batch

Refactor the code so fib_rules_register always takes a template instead
of the actual fib_rules_ops structure that will be used.  This is
required for network namespace support so 2 out of the 3 callers already
do this, it allows the error handling to be made common, and it allows
fib_rules_unregister to free the template for hte caller.

Modify fib_rules_unregister to use call_rcu instead of syncrhonize_rcu
to allw multiple namespaces to be cleaned up in the same rcu grace
period.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index ef0e7d9e664b..02a3b2c69c1e 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -72,7 +72,7 @@ static void flush_route_cache(struct fib_rules_ops *ops)
 		ops->flush_cache(ops);
 }
 
-int fib_rules_register(struct fib_rules_ops *ops)
+static int __fib_rules_register(struct fib_rules_ops *ops)
 {
 	int err = -EEXIST;
 	struct fib_rules_ops *o;
@@ -102,6 +102,28 @@ errout:
 	return err;
 }
 
+struct fib_rules_ops *
+fib_rules_register(struct fib_rules_ops *tmpl, struct net *net)
+{
+	struct fib_rules_ops *ops;
+	int err;
+
+	ops = kmemdup(tmpl, sizeof (*ops), GFP_KERNEL);
+	if (ops == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ops->rules_list);
+	ops->fro_net = net;
+
+	err = __fib_rules_register(ops);
+	if (err) {
+		kfree(ops);
+		ops = ERR_PTR(err);
+	}
+
+	return ops;
+}
+
 EXPORT_SYMBOL_GPL(fib_rules_register);
 
 void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
@@ -115,6 +137,15 @@ void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
 }
 EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops);
 
+static void fib_rules_put_rcu(struct rcu_head *head)
+{
+	struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu);
+	struct net *net = ops->fro_net;
+
+	release_net(net);
+	kfree(ops);
+}
+
 void fib_rules_unregister(struct fib_rules_ops *ops)
 {
 	struct net *net = ops->fro_net;
@@ -124,8 +155,7 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
 	fib_rules_cleanup_ops(ops);
 	spin_unlock(&net->rules_mod_lock);
 
-	synchronize_rcu();
-	release_net(net);
+	call_rcu(&ops->rcu, fib_rules_put_rcu);
 }
 
 EXPORT_SYMBOL_GPL(fib_rules_unregister);
-- 
cgit v1.2.2


From fc4a7489663250360cd40d5adf06a08d1c5d54df Mon Sep 17 00:00:00 2001
From: Patrick Mullaney <pmullaney@novell.com>
Date: Thu, 3 Dec 2009 15:59:22 -0800
Subject: netdevice: provide common routine for macvlan and vlan operstate
 management

Provide common routine for the transition of operational state for a leaf
device during a root device transition.

Signed-off-by: Patrick Mullaney <pmullaney@novell.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 0913a08a87d6..c36a17aafcf3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4900,6 +4900,33 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)
 }
 EXPORT_SYMBOL(netdev_fix_features);
 
+/**
+ *	netif_stacked_transfer_operstate -	transfer operstate
+ *	@rootdev: the root or lower level device to transfer state from
+ *	@dev: the device to transfer operstate to
+ *
+ *	Transfer operational state from root to device. This is normally
+ *	called when a stacking relationship exists between the root
+ *	device and the device(a leaf device).
+ */
+void netif_stacked_transfer_operstate(const struct net_device *rootdev,
+					struct net_device *dev)
+{
+	if (rootdev->operstate == IF_OPER_DORMANT)
+		netif_dormant_on(dev);
+	else
+		netif_dormant_off(dev);
+
+	if (netif_carrier_ok(rootdev)) {
+		if (!netif_carrier_ok(dev))
+			netif_carrier_on(dev);
+	} else {
+		if (netif_carrier_ok(dev))
+			netif_carrier_off(dev);
+	}
+}
+EXPORT_SYMBOL(netif_stacked_transfer_operstate);
+
 /**
  *	register_netdevice	- register a network device
  *	@dev: device to register
-- 
cgit v1.2.2


From e93737b0f0159a61772894943199fd3b6f315641 Mon Sep 17 00:00:00 2001
From: Krishna Kumar <krkumar2@in.ibm.com>
Date: Tue, 8 Dec 2009 22:26:02 +0000
Subject: net: Handle NETREG_UNINITIALIZED devices correctly

Fix two problems:

1. If unregister_netdevice_many() is called with both registered
   and unregistered devices, rollback_registered_many() bails out
   when it reaches the first unregistered device. The processing
   of the prior registered devices is unfinished, and the
   remaining devices are skipped, and possible registered netdev's
   are leaked/unregistered.

2. System hangs or panics depending on how the devices are passed,
   since when netdev_run_todo() runs, some devices were not fully
   processed.

Tested by passing intermingled unregistered and registered vlan
devices to unregister_netdevice_many() as follows:
	1. dev, fake_dev1, fake_dev2: hangs in run_todo
	   ("unregister_netdevice: waiting for eth1.100 to become
	    free. Usage count = 1")
	2. fake_dev1, dev, fake_dev2: failure during de-registration
	   and next registration, followed by a vlan driver Oops
	   during subsequent registration.

Confirmed that the patch fixes both cases.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index c36a17aafcf3..6fe7d739e59b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4771,21 +4771,23 @@ static void net_set_todo(struct net_device *dev)
 
 static void rollback_registered_many(struct list_head *head)
 {
-	struct net_device *dev;
+	struct net_device *dev, *tmp;
 
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
 
-	list_for_each_entry(dev, head, unreg_list) {
+	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
 		/* Some devices call without registering
-		 * for initialization unwind.
+		 * for initialization unwind. Remove those
+		 * devices and proceed with the remaining.
 		 */
 		if (dev->reg_state == NETREG_UNINITIALIZED) {
 			pr_debug("unregister_netdevice: device %s/%p never "
 				 "was registered\n", dev->name, dev);
 
 			WARN_ON(1);
-			return;
+			list_del(&dev->unreg_list);
+			continue;
 		}
 
 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
-- 
cgit v1.2.2


From d90a909e1f3e006a1d57fe11fd417173b6494701 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 12 Dec 2009 22:11:15 +0000
Subject: net: Fix userspace RTM_NEWLINK notifications.

I received some bug reports about userspace programs having problems
because after RTM_NEWLINK was received they could not immediate access
files under /proc/sys/net/ because they had not been registered yet.

The original problem was trivially fixed by moving the userspace
notification from rtnetlink_event() to the end of
register_netdevice().

When testing that change I discovered I was still getting RTM_NEWLINK
events before I could access proc and I was also getting RTM_NEWLINK
events after I was seeing RTM_DELLINK.  Things practically guaranteed
to confuse userspace.

After a little more investigation these extra notifications proved to
be from the new notifiers NETDEV_POST_INIT and NETDEV_UNREGISTER_BATCH
hitting the default case in rtnetlink_event, and triggering
unnecessary RTM_NEWLINK messages.

rtnetlink_event now explicitly handles NETDEV_UNREGISTER_BATCH and
NETDEV_POST_INIT to avoid sending the incorrect userspace
notifications.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       | 11 +++++++++++
 net/core/rtnetlink.c |  6 +++---
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6fe7d739e59b..be9924f60ec3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5035,6 +5035,11 @@ int register_netdevice(struct net_device *dev)
 		rollback_registered(dev);
 		dev->reg_state = NETREG_UNREGISTERED;
 	}
+	/*
+	 *	Prevent userspace races by waiting until the network
+	 *	device is fully setup before sending notifications.
+	 */
+	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
 
 out:
 	return ret;
@@ -5597,6 +5602,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	/* Notify protocols, that a new device appeared. */
 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
 
+	/*
+	 *	Prevent userspace races by waiting until the network
+	 *	device is fully setup before sending notifications.
+	 */
+	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
+
 	synchronize_net();
 	err = 0;
 out:
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 33148a568199..794bcb897ff0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1364,15 +1364,15 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_UNREGISTER:
 		rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
 		break;
-	case NETDEV_REGISTER:
-		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
-		break;
 	case NETDEV_UP:
 	case NETDEV_DOWN:
 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
 		break;
+	case NETDEV_POST_INIT:
+	case NETDEV_REGISTER:
 	case NETDEV_CHANGE:
 	case NETDEV_GOING_DOWN:
+	case NETDEV_UNREGISTER_BATCH:
 		break;
 	default:
 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
-- 
cgit v1.2.2


From 28dfef8febe48f59cf1e7596e1992a6a1893ca24 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 15 Dec 2009 16:46:48 -0800
Subject: const: constify remaining pipe_buf_operations

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/core/skbuff.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bfa3e7865a8c..93c4e060c91e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -93,7 +93,7 @@ static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
 
 
 /* Pipe buffer operations for a socket. */
-static struct pipe_buf_operations sock_pipe_buf_ops = {
+static const struct pipe_buf_operations sock_pipe_buf_ops = {
 	.can_merge = 0,
 	.map = generic_pipe_buf_map,
 	.unmap = generic_pipe_buf_unmap,
-- 
cgit v1.2.2


From 068a2de57ddf4f472e32e7af868613c574ad1d88 Mon Sep 17 00:00:00 2001
From: Krishna Kumar <krkumar2@in.ibm.com>
Date: Wed, 9 Dec 2009 20:59:58 +0000
Subject: net: release dst entry while cache-hot for GSO case too

Non-GSO code drops dst entry for performance reasons, but
the same is missing for GSO code. Drop dst while cache-hot
for GSO case too.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index be9924f60ec3..a8d68cdedbbe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1853,6 +1853,14 @@ gso:
 
 		skb->next = nskb->next;
 		nskb->next = NULL;
+
+		/*
+		 * If device doesnt need nskb->dst, release it right now while
+		 * its hot in this cpu cache
+		 */
+		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+			skb_dst_drop(nskb);
+
 		rc = ops->ndo_start_xmit(nskb, dev);
 		if (unlikely(rc != NETDEV_TX_OK)) {
 			if (rc & ~NETDEV_TX_MASK)
-- 
cgit v1.2.2


From f466dba1832f05006cf6caa9be41fb98d11cb848 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Wed, 23 Dec 2009 22:02:57 -0800
Subject: pktgen: ndo_start_xmit can return NET_XMIT_xxx values

This updates pktgen so that it does not decrement skb->users
when it receives valid NET_XMIT_xxx values.  These are now
valid return values from ndo_start_xmit in net-next-2.6.
They also indicate the skb has been consumed.

This fixes pktgen to work correctly with vlan devices.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index a23b45f08ec9..de0c2c726420 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -250,8 +250,7 @@ struct pktgen_dev {
 	__u64 count;		/* Default No packets to send */
 	__u64 sofar;		/* How many pkts we've sent so far */
 	__u64 tx_bytes;		/* How many bytes we've transmitted */
-	__u64 errors;		/* Errors when trying to transmit,
-				   pkts will be re-sent */
+	__u64 errors;		/* Errors when trying to transmit, */
 
 	/* runtime counters relating to clone_skb */
 
@@ -3465,6 +3464,12 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 		pkt_dev->seq_num++;
 		pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
 		break;
+	case NET_XMIT_DROP:
+	case NET_XMIT_CN:
+	case NET_XMIT_POLICED:
+		/* skb has been consumed */
+		pkt_dev->errors++;
+		break;
 	default: /* Drivers are not supposed to return other values! */
 		if (net_ratelimit())
 			pr_info("pktgen: %s xmit error: %d\n",
-- 
cgit v1.2.2


From 1f3c8804acba841b5573b953f5560d2683d2db0d Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <andy@greyhouse.net>
Date: Mon, 14 Dec 2009 10:48:58 +0000
Subject: bonding: allow arp_ip_targets on separate vlans to use arp validation

This allows a bond device to specify an arp_ip_target as a host that is
not on the same vlan as the base bond device and still use arp
validation.  A configuration like this, now works:

BONDING_OPTS="mode=active-backup arp_interval=1000 arp_ip_target=10.0.100.1 arp_validate=3"

1: lo: <LOOPBACK,UP,LOWER_UP> mtu 16436 qdisc noqueue
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: eth1: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast master bond0 qlen 1000
    link/ether 00:13:21:be:33:e9 brd ff:ff:ff:ff:ff:ff
3: eth0: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast master bond0 qlen 1000
    link/ether 00:13:21:be:33:e9 brd ff:ff:ff:ff:ff:ff
8: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue
    link/ether 00:13:21:be:33:e9 brd ff:ff:ff:ff:ff:ff
    inet6 fe80::213:21ff:febe:33e9/64 scope link
       valid_lft forever preferred_lft forever
9: bond0.100@bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue
    link/ether 00:13:21:be:33:e9 brd ff:ff:ff:ff:ff:ff
    inet 10.0.100.2/24 brd 10.0.100.255 scope global bond0.100
    inet6 fe80::213:21ff:febe:33e9/64 scope link
       valid_lft forever preferred_lft forever

Ethernet Channel Bonding Driver: v3.6.0 (September 26, 2009)

Bonding Mode: fault-tolerance (active-backup)
Primary Slave: None
Currently Active Slave: eth1
MII Status: up
MII Polling Interval (ms): 0
Up Delay (ms): 0
Down Delay (ms): 0
ARP Polling Interval (ms): 1000
ARP IP target/s (n.n.n.n form): 10.0.100.1

Slave Interface: eth1
MII Status: up
Link Failure Count: 1
Permanent HW addr: 00:40:05:30:ff:30

Slave Interface: eth0
MII Status: up
Link Failure Count: 0
Permanent HW addr: 00:13:21:be:33:e9

Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index a8d68cdedbbe..f9aa699ab6cb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2495,12 +2495,26 @@ ncls:
 	if (!skb)
 		goto out;
 
+	/*
+	 * Make sure frames received on VLAN interfaces stacked on
+	 * bonding interfaces still make their way to any base bonding
+	 * device that may have registered for a specific ptype.  The
+	 * handler may have to adjust skb->dev and orig_dev.
+	 *
+	 * null_or_orig can be overloaded since it will not be set when
+	 * using VLANs on top of bonding.  Putting it here prevents
+	 * disturbing the ptype_all handlers above.
+	 */
+	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
+	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
+		null_or_orig = vlan_dev_real_dev(skb->dev);
+	}
+
 	type = skb->protocol;
 	list_for_each_entry_rcu(ptype,
 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-		if (ptype->type == type &&
-		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
-		     ptype->dev == orig_dev)) {
+		if (ptype->type == type && (ptype->dev == null_or_orig ||
+		     ptype->dev == skb->dev || ptype->dev == orig_dev)) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
-- 
cgit v1.2.2


From ca8d9ea30bc79b2965a1d169dcb2f48f02af4d2d Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <andy@greyhouse.net>
Date: Wed, 6 Jan 2010 12:56:37 +0000
Subject: fix bonding: allow arp_ip_targets on separate vlans to use arp
 validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On Wed, Jan 06, 2010 at 10:10:03PM +0100, Eric Dumazet wrote:
> Le 06/01/2010 19:38, Eric Dumazet a écrit :
> >
> > (net-next-2.6 doesnt work well on my bond/vlan setup, I suspect I need a bisection)
>
> David, I had to revert 1f3c8804acba841b5573b953f5560d2683d2db0d
> (bonding: allow arp_ip_targets on separate vlans to use arp validation)
>
> Or else, my vlan devices dont work (unfortunatly I dont have much time
> these days to debug the thing)
>
> My config :
>
>               +---------+
> vlan.103 -----+ bond0   +--- eth1 (bnx2)
>               |         +
> vlan.825 -----+         +--- eth2 (tg3)
>               +---------+
>
> $ cat /proc/net/bonding/bond0
> Ethernet Channel Bonding Driver: v3.6.0 (September 26, 2009)
>
> Bonding Mode: fault-tolerance (active-backup)
> Primary Slave: None
> Currently Active Slave: eth2
> MII Status: up
> MII Polling Interval (ms): 100
> Up Delay (ms): 0
> Down Delay (ms): 0
>
> Slave Interface: eth1  (bnx2)
> MII Status: down
> Link Failure Count: 1
> Permanent HW addr: 00:1e:0b:ec:d3:d2
>
> Slave Interface: eth2   (tg3)
> MII Status: up
> Link Failure Count: 0
> Permanent HW addr: 00:1e:0b:92:78:50
>

This patch fixes up a problem with found with commit
1f3c8804acba841b5573b953f5560d2683d2db0d.  The original change
overloaded null_or_orig, but doing that prevented any packet handlers
that were not tied to a specific device (i.e. ptype->dev == NULL) from
ever receiving any frames.

The null_or_orig variable cannot be overloaded, and must be kept as NULL
to prevent the frame from being ignored by packet handlers designed to
accept frames on any interface.

Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index f9aa699ab6cb..d9ab9be0c323 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2430,6 +2430,7 @@ int netif_receive_skb(struct sk_buff *skb)
 	struct packet_type *ptype, *pt_prev;
 	struct net_device *orig_dev;
 	struct net_device *null_or_orig;
+	struct net_device *null_or_bond;
 	int ret = NET_RX_DROP;
 	__be16 type;
 
@@ -2500,21 +2501,19 @@ ncls:
 	 * bonding interfaces still make their way to any base bonding
 	 * device that may have registered for a specific ptype.  The
 	 * handler may have to adjust skb->dev and orig_dev.
-	 *
-	 * null_or_orig can be overloaded since it will not be set when
-	 * using VLANs on top of bonding.  Putting it here prevents
-	 * disturbing the ptype_all handlers above.
 	 */
+	null_or_bond = NULL;
 	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
 	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
-		null_or_orig = vlan_dev_real_dev(skb->dev);
+		null_or_bond = vlan_dev_real_dev(skb->dev);
 	}
 
 	type = skb->protocol;
 	list_for_each_entry_rcu(ptype,
 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
 		if (ptype->type == type && (ptype->dev == null_or_orig ||
-		     ptype->dev == skb->dev || ptype->dev == orig_dev)) {
+		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
+		     ptype->dev == null_or_bond)) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
-- 
cgit v1.2.2


From 2d13bafeba24f732e89b818b8c66b07893457570 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <hawk@comx.dk>
Date: Tue, 5 Jan 2010 05:50:52 +0000
Subject: net: Make it easier to parse /proc/net/dev contents.

The contents of /proc/net/dev is annoying to parse, because
it changes whether there is a space after the "ethX:" or not.
It depends upon the size of the "Receive bytes" counter,
if the number is below 7 digits, then there is whitespaces
else if the number is 8 digits or above there is no space
between the ":" and the number.

This patch changes the output to assure there is always a space
between the ":" and the number.  Given that all existing userspace
application already need to handle the whitespaces, I see
no breakage of existing tools.

Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index d9ab9be0c323..a008f6987a95 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3206,7 +3206,7 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
 {
 	const struct net_device_stats *stats = dev_get_stats(dev);
 
-	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
+	seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
 		   dev->name, stats->rx_bytes, stats->rx_packets,
 		   stats->rx_errors,
-- 
cgit v1.2.2


From 704da560c0a0120d8869187f511491a00951a1d3 Mon Sep 17 00:00:00 2001
From: Octavian Purdila <opurdila@ixiacom.com>
Date: Fri, 8 Jan 2010 00:00:09 -0800
Subject: tcp: update the netstamp_needed counter when cloning sockets

This fixes a netstamp_needed accounting issue when the listen socket
has SO_TIMESTAMP set:

    s = socket(AF_INET, SOCK_STREAM, 0);
    setsockopt(s, SOL_SOCKET, SO_TIMESTAMP, 1); -> netstamp_needed = 1
    bind(s, ...);
    listen(s, ...);
    s2 = accept(s, ...); -> netstamp_needed = 1
    close(s2); -> netstamp_needed = 0
    close(s); -> netstamp_needed = -1

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 76ff58d43e26..e1f6f225f012 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1205,6 +1205,10 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 
 		if (newsk->sk_prot->sockets_allocated)
 			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
+
+		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
+		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+			net_enable_timestamp();
 	}
 out:
 	return newsk;
-- 
cgit v1.2.2


From 508e14b4a4fb1a824a14f2c5b8d7df67b313f8e4 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <danborkmann@googlemail.com>
Date: Tue, 12 Jan 2010 14:27:30 +0000
Subject: netpoll: allow execution of multiple rx_hooks per interface

Signed-off-by: Daniel Borkmann <danborkmann@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netpoll.c | 169 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 106 insertions(+), 63 deletions(-)

(limited to 'net/core')

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 0b4d0d35ef40..7aa697253765 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -407,11 +407,24 @@ static void arp_reply(struct sk_buff *skb)
 	__be32 sip, tip;
 	unsigned char *sha;
 	struct sk_buff *send_skb;
-	struct netpoll *np = NULL;
+	struct netpoll *np, *tmp;
+	unsigned long flags;
+	int hits = 0;
+
+	if (list_empty(&npinfo->rx_np))
+		return;
+
+	/* Before checking the packet, we do some early
+	   inspection whether this is interesting at all */
+	spin_lock_irqsave(&npinfo->rx_lock, flags);
+	list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
+		if (np->dev == skb->dev)
+			hits++;
+	}
+	spin_unlock_irqrestore(&npinfo->rx_lock, flags);
 
-	if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
-		np = npinfo->rx_np;
-	if (!np)
+	/* No netpoll struct is using this dev */
+	if (!hits)
 		return;
 
 	/* No arp on this interface */
@@ -437,77 +450,91 @@ static void arp_reply(struct sk_buff *skb)
 	arp_ptr += skb->dev->addr_len;
 	memcpy(&sip, arp_ptr, 4);
 	arp_ptr += 4;
-	/* if we actually cared about dst hw addr, it would get copied here */
+	/* If we actually cared about dst hw addr,
+	   it would get copied here */
 	arp_ptr += skb->dev->addr_len;
 	memcpy(&tip, arp_ptr, 4);
 
 	/* Should we ignore arp? */
-	if (tip != np->local_ip ||
-	    ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
+	if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
 		return;
 
 	size = arp_hdr_len(skb->dev);
-	send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev),
-			    LL_RESERVED_SPACE(np->dev));
 
-	if (!send_skb)
-		return;
-
-	skb_reset_network_header(send_skb);
-	arp = (struct arphdr *) skb_put(send_skb, size);
-	send_skb->dev = skb->dev;
-	send_skb->protocol = htons(ETH_P_ARP);
+	spin_lock_irqsave(&npinfo->rx_lock, flags);
+	list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
+		if (tip != np->local_ip)
+			continue;
 
-	/* Fill the device header for the ARP frame */
-	if (dev_hard_header(send_skb, skb->dev, ptype,
-			    sha, np->dev->dev_addr,
-			    send_skb->len) < 0) {
-		kfree_skb(send_skb);
-		return;
-	}
+		send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev),
+				    LL_RESERVED_SPACE(np->dev));
+		if (!send_skb)
+			continue;
 
-	/*
-	 * Fill out the arp protocol part.
-	 *
-	 * we only support ethernet device type,
-	 * which (according to RFC 1390) should always equal 1 (Ethernet).
-	 */
+		skb_reset_network_header(send_skb);
+		arp = (struct arphdr *) skb_put(send_skb, size);
+		send_skb->dev = skb->dev;
+		send_skb->protocol = htons(ETH_P_ARP);
 
-	arp->ar_hrd = htons(np->dev->type);
-	arp->ar_pro = htons(ETH_P_IP);
-	arp->ar_hln = np->dev->addr_len;
-	arp->ar_pln = 4;
-	arp->ar_op = htons(type);
+		/* Fill the device header for the ARP frame */
+		if (dev_hard_header(send_skb, skb->dev, ptype,
+				    sha, np->dev->dev_addr,
+				    send_skb->len) < 0) {
+			kfree_skb(send_skb);
+			continue;
+		}
 
-	arp_ptr=(unsigned char *)(arp + 1);
-	memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len);
-	arp_ptr += np->dev->addr_len;
-	memcpy(arp_ptr, &tip, 4);
-	arp_ptr += 4;
-	memcpy(arp_ptr, sha, np->dev->addr_len);
-	arp_ptr += np->dev->addr_len;
-	memcpy(arp_ptr, &sip, 4);
+		/*
+		 * Fill out the arp protocol part.
+		 *
+		 * we only support ethernet device type,
+		 * which (according to RFC 1390) should
+		 * always equal 1 (Ethernet).
+		 */
 
-	netpoll_send_skb(np, send_skb);
+		arp->ar_hrd = htons(np->dev->type);
+		arp->ar_pro = htons(ETH_P_IP);
+		arp->ar_hln = np->dev->addr_len;
+		arp->ar_pln = 4;
+		arp->ar_op = htons(type);
+
+		arp_ptr = (unsigned char *)(arp + 1);
+		memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len);
+		arp_ptr += np->dev->addr_len;
+		memcpy(arp_ptr, &tip, 4);
+		arp_ptr += 4;
+		memcpy(arp_ptr, sha, np->dev->addr_len);
+		arp_ptr += np->dev->addr_len;
+		memcpy(arp_ptr, &sip, 4);
+
+		netpoll_send_skb(np, send_skb);
+
+		/* If there are several rx_hooks for the same address,
+		   we're fine by sending a single reply */
+		break;
+	}
+	spin_unlock_irqrestore(&npinfo->rx_lock, flags);
 }
 
 int __netpoll_rx(struct sk_buff *skb)
 {
 	int proto, len, ulen;
+	int hits = 0;
 	struct iphdr *iph;
 	struct udphdr *uh;
-	struct netpoll_info *npi = skb->dev->npinfo;
-	struct netpoll *np = npi->rx_np;
+	struct netpoll_info *npinfo = skb->dev->npinfo;
+	struct netpoll *np, *tmp;
 
-	if (!np)
+	if (list_empty(&npinfo->rx_np))
 		goto out;
+
 	if (skb->dev->type != ARPHRD_ETHER)
 		goto out;
 
 	/* check if netpoll clients need ARP */
 	if (skb->protocol == htons(ETH_P_ARP) &&
 	    atomic_read(&trapped)) {
-		skb_queue_tail(&npi->arp_tx, skb);
+		skb_queue_tail(&npinfo->arp_tx, skb);
 		return 1;
 	}
 
@@ -551,16 +578,23 @@ int __netpoll_rx(struct sk_buff *skb)
 		goto out;
 	if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
 		goto out;
-	if (np->local_ip && np->local_ip != iph->daddr)
-		goto out;
-	if (np->remote_ip && np->remote_ip != iph->saddr)
-		goto out;
-	if (np->local_port && np->local_port != ntohs(uh->dest))
-		goto out;
 
-	np->rx_hook(np, ntohs(uh->source),
-		    (char *)(uh+1),
-		    ulen - sizeof(struct udphdr));
+	list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
+		if (np->local_ip && np->local_ip != iph->daddr)
+			continue;
+		if (np->remote_ip && np->remote_ip != iph->saddr)
+			continue;
+		if (np->local_port && np->local_port != ntohs(uh->dest))
+			continue;
+
+		np->rx_hook(np, ntohs(uh->source),
+			       (char *)(uh+1),
+			       ulen - sizeof(struct udphdr));
+		hits++;
+	}
+
+	if (!hits)
+		goto out;
 
 	kfree_skb(skb);
 	return 1;
@@ -684,6 +718,7 @@ int netpoll_setup(struct netpoll *np)
 	struct net_device *ndev = NULL;
 	struct in_device *in_dev;
 	struct netpoll_info *npinfo;
+	struct netpoll *npe, *tmp;
 	unsigned long flags;
 	int err;
 
@@ -704,7 +739,7 @@ int netpoll_setup(struct netpoll *np)
 		}
 
 		npinfo->rx_flags = 0;
-		npinfo->rx_np = NULL;
+		INIT_LIST_HEAD(&npinfo->rx_np);
 
 		spin_lock_init(&npinfo->rx_lock);
 		skb_queue_head_init(&npinfo->arp_tx);
@@ -785,7 +820,7 @@ int netpoll_setup(struct netpoll *np)
 	if (np->rx_hook) {
 		spin_lock_irqsave(&npinfo->rx_lock, flags);
 		npinfo->rx_flags |= NETPOLL_RX_ENABLED;
-		npinfo->rx_np = np;
+		list_add_tail(&np->rx, &npinfo->rx_np);
 		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
 	}
 
@@ -801,9 +836,16 @@ int netpoll_setup(struct netpoll *np)
 	return 0;
 
  release:
-	if (!ndev->npinfo)
+	if (!ndev->npinfo) {
+		spin_lock_irqsave(&npinfo->rx_lock, flags);
+		list_for_each_entry_safe(npe, tmp, &npinfo->rx_np, rx) {
+			npe->dev = NULL;
+		}
+		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+
 		kfree(npinfo);
-	np->dev = NULL;
+	}
+
 	dev_put(ndev);
 	return err;
 }
@@ -823,10 +865,11 @@ void netpoll_cleanup(struct netpoll *np)
 	if (np->dev) {
 		npinfo = np->dev->npinfo;
 		if (npinfo) {
-			if (npinfo->rx_np == np) {
+			if (!list_empty(&npinfo->rx_np)) {
 				spin_lock_irqsave(&npinfo->rx_lock, flags);
-				npinfo->rx_np = NULL;
-				npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
+				list_del(&np->rx);
+				if (list_empty(&npinfo->rx_np))
+					npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
 				spin_unlock_irqrestore(&npinfo->rx_lock, flags);
 			}
 
-- 
cgit v1.2.2


From 4d0392be21d4710121f855c0caf57d32ffd1ced0 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hsweeten@visionengravers.com>
Date: Fri, 15 Jan 2010 01:08:58 -0800
Subject: net/core/sock.c: quiet sparse noise

In sock_getsockopt the symbol 'lv' is declared as an
unsigned int type, probably due to sizeof returning a
size_t which is really an unsigned int.

This produces a sparse warning for SO_PEERNAME due to
the sock->ops->getname() call:

warning: incorrect type in argument 3 (different signedness)
   expected int *sockaddr_len
   got unsigned int *<noident>

Quiet the warning by changing the type of 'lv' to an int.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index e1f6f225f012..10b1d3243a72 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -741,7 +741,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		struct timeval tm;
 	} v;
 
-	unsigned int lv = sizeof(int);
+	int lv = sizeof(int);
 	int len;
 
 	if (get_user(len, optlen))
-- 
cgit v1.2.2


From 2c8c1e7297e19bdef3c178c3ea41d898a7716e3e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 17 Jan 2010 03:35:32 +0000
Subject: net: spread __net_init, __net_exit

__net_init/__net_exit are apparently not going away, so use them
to full extent.

In some cases __net_init was removed, because it was called from
__net_exit code.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 2 +-
 net/core/rtnetlink.c | 4 ++--
 net/core/sock.c      | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 02a3b2c69c1e..9a24377146bf 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -708,7 +708,7 @@ static struct notifier_block fib_rules_notifier = {
 	.notifier_call = fib_rules_event,
 };
 
-static int fib_rules_net_init(struct net *net)
+static int __net_init fib_rules_net_init(struct net *net)
 {
 	INIT_LIST_HEAD(&net->rules_ops);
 	spin_lock_init(&net->rules_mod_lock);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 794bcb897ff0..62f3878a6010 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1386,7 +1386,7 @@ static struct notifier_block rtnetlink_dev_notifier = {
 };
 
 
-static int rtnetlink_net_init(struct net *net)
+static int __net_init rtnetlink_net_init(struct net *net)
 {
 	struct sock *sk;
 	sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX,
@@ -1397,7 +1397,7 @@ static int rtnetlink_net_init(struct net *net)
 	return 0;
 }
 
-static void rtnetlink_net_exit(struct net *net)
+static void __net_exit rtnetlink_net_exit(struct net *net)
 {
 	netlink_kernel_release(net->rtnl);
 	net->rtnl = NULL;
diff --git a/net/core/sock.c b/net/core/sock.c
index 10b1d3243a72..ceef50bd131b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2140,13 +2140,13 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
 
-static int sock_inuse_init_net(struct net *net)
+static int __net_init sock_inuse_init_net(struct net *net)
 {
 	net->core.inuse = alloc_percpu(struct prot_inuse);
 	return net->core.inuse ? 0 : -ENOMEM;
 }
 
-static void sock_inuse_exit_net(struct net *net)
+static void __net_exit sock_inuse_exit_net(struct net *net)
 {
 	free_percpu(net->core.inuse);
 }
-- 
cgit v1.2.2


From 11380a4b2d86fae9a6bce75c9373668cc323fe57 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 19 Jan 2010 13:46:10 -0800
Subject: net: Unexport napi_gro_flush().

Nothing outside of net/core/dev.c uses it.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index a008f6987a95..5747b9edc1bb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2582,7 +2582,7 @@ out:
 	return netif_receive_skb(skb);
 }
 
-void napi_gro_flush(struct napi_struct *napi)
+static void napi_gro_flush(struct napi_struct *napi)
 {
 	struct sk_buff *skb, *next;
 
@@ -2595,7 +2595,6 @@ void napi_gro_flush(struct napi_struct *napi)
 	napi->gro_count = 0;
 	napi->gro_list = NULL;
 }
-EXPORT_SYMBOL(napi_gro_flush);
 
 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
-- 
cgit v1.2.2


From 4b258461c0b31ded170a1a56b944b0fded1c887b Mon Sep 17 00:00:00 2001
From: Krishna Kumar <krkumar2@in.ibm.com>
Date: Thu, 21 Jan 2010 01:26:29 -0800
Subject: net: Optimize non-gso test checks

Avoid checking twice whether skb needs to be linearized, if one
skb_linearize was already done.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 5747b9edc1bb..4fad9db417b1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1982,6 +1982,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	return rc;
 }
 
+/*
+ * Returns true if either:
+ *	1. skb has frag_list and the device doesn't support FRAGLIST, or
+ *	2. skb is fragmented and the device does not support SG, or if
+ *	   at least one of fragments is in highmem and device does not
+ *	   support DMA from it.
+ */
+static inline int skb_needs_linearize(struct sk_buff *skb,
+				      struct net_device *dev)
+{
+	return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
+	       (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
+					      illegal_highdma(dev, skb)));
+}
+
 /**
  *	dev_queue_xmit - transmit a buffer
  *	@skb: buffer to transmit
@@ -2018,18 +2033,8 @@ int dev_queue_xmit(struct sk_buff *skb)
 	if (netif_needs_gso(dev, skb))
 		goto gso;
 
-	if (skb_has_frags(skb) &&
-	    !(dev->features & NETIF_F_FRAGLIST) &&
-	    __skb_linearize(skb))
-		goto out_kfree_skb;
-
-	/* Fragmented skb is linearized if device does not support SG,
-	 * or if at least one of fragments is in highmem and device
-	 * does not support DMA from it.
-	 */
-	if (skb_shinfo(skb)->nr_frags &&
-	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
-	    __skb_linearize(skb))
+	/* Convert a paged skb to linear, if required */
+	if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
 		goto out_kfree_skb;
 
 	/* If packet is not checksummed and device does not support
-- 
cgit v1.2.2


From 81c1ebfc4379f529b001e23164dd5c2282bdc0ec Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 22 Jan 2010 10:16:05 +0000
Subject: neigh: simplify seq_file code

Simpily pass 'struct neigh_table' with seq_file private pointer,
and save one dereference. Proc entry itself isn't interesting.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f35377b643e4..f2efd72da799 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2417,8 +2417,7 @@ EXPORT_SYMBOL(neigh_seq_stop);
 
 static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct proc_dir_entry *pde = seq->private;
-	struct neigh_table *tbl = pde->data;
+	struct neigh_table *tbl = seq->private;
 	int cpu;
 
 	if (*pos == 0)
@@ -2435,8 +2434,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
 
 static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct proc_dir_entry *pde = seq->private;
-	struct neigh_table *tbl = pde->data;
+	struct neigh_table *tbl = seq->private;
 	int cpu;
 
 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
@@ -2455,8 +2453,7 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
 
 static int neigh_stat_seq_show(struct seq_file *seq, void *v)
 {
-	struct proc_dir_entry *pde = seq->private;
-	struct neigh_table *tbl = pde->data;
+	struct neigh_table *tbl = seq->private;
 	struct neigh_statistics *st = v;
 
 	if (v == SEQ_START_TOKEN) {
@@ -2501,7 +2498,7 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file)
 
 	if (!ret) {
 		struct seq_file *sf = file->private_data;
-		sf->private = PDE(inode);
+		sf->private = PDE(inode)->data;
 	}
 	return ret;
 };
-- 
cgit v1.2.2


From 32e7bfc41110bc8f29ec0f293c3bcee6645fef34 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 25 Jan 2010 13:36:10 -0800
Subject: net: use helpers to access uc list V2

This patch introduces three macros to work with uc list from net drivers.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 4fad9db417b1..2cba5c521e56 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3665,10 +3665,10 @@ void __dev_set_rx_mode(struct net_device *dev)
 		/* Unicast addresses changes may only happen under the rtnl,
 		 * therefore calling __dev_set_promiscuity here is safe.
 		 */
-		if (dev->uc.count > 0 && !dev->uc_promisc) {
+		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 			__dev_set_promiscuity(dev, 1);
 			dev->uc_promisc = 1;
-		} else if (dev->uc.count == 0 && dev->uc_promisc) {
+		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 			__dev_set_promiscuity(dev, -1);
 			dev->uc_promisc = 0;
 		}
-- 
cgit v1.2.2


From 8a83a00b0735190384a348156837918271034144 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 30 Jan 2010 12:23:03 +0000
Subject: net: maintain namespace isolation between vlan and real device

In the vlan and macvlan drivers, the start_xmit function forwards
data to the dev_queue_xmit function for another device, which may
potentially belong to a different namespace.

To make sure that classification stays within a single namespace,
this resets the potentially critical fields.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 2cba5c521e56..94c1eeed25e5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1448,13 +1448,10 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 	if (skb->len > (dev->mtu + dev->hard_header_len))
 		return NET_RX_DROP;
 
-	skb_dst_drop(skb);
+	skb_set_dev(skb, dev);
 	skb->tstamp.tv64 = 0;
 	skb->pkt_type = PACKET_HOST;
 	skb->protocol = eth_type_trans(skb, dev);
-	skb->mark = 0;
-	secpath_reset(skb);
-	nf_reset(skb);
 	return netif_rx(skb);
 }
 EXPORT_SYMBOL_GPL(dev_forward_skb);
@@ -1614,6 +1611,36 @@ static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
 	return false;
 }
 
+/**
+ * skb_dev_set -- assign a new device to a buffer
+ * @skb: buffer for the new device
+ * @dev: network device
+ *
+ * If an skb is owned by a device already, we have to reset
+ * all data private to the namespace a device belongs to
+ * before assigning it a new device.
+ */
+#ifdef CONFIG_NET_NS
+void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
+{
+	skb_dst_drop(skb);
+	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
+		secpath_reset(skb);
+		nf_reset(skb);
+		skb_init_secmark(skb);
+		skb->mark = 0;
+		skb->priority = 0;
+		skb->nf_trace = 0;
+		skb->ipvs_property = 0;
+#ifdef CONFIG_NET_SCHED
+		skb->tc_index = 0;
+#endif
+	}
+	skb->dev = dev;
+}
+EXPORT_SYMBOL(skb_set_dev);
+#endif /* CONFIG_NET_NS */
+
 /*
  * Invalidate hardware checksum when packet is to be mangled, and
  * complete checksum manually on outgoing path.
-- 
cgit v1.2.2


From 1b3f720bf033fde1fbb6231f9b156b918c5f68d8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 4 Feb 2010 14:00:41 -0800
Subject: pktgen: Fix freezing problem

Add missing try_to_freeze() to one of the pktgen_thread_worker() code
paths so that it doesn't block suspend/hibernation.

Fixes http://bugzilla.kernel.org/show_bug.cgi?id=15006

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reported-and-tested-by: Ciprian Dorin Craciun <ciprian.craciun@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index de0c2c726420..2e692afdc55d 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3524,6 +3524,7 @@ static int pktgen_thread_worker(void *arg)
 			wait_event_interruptible_timeout(t->queue,
 							 t->control != 0,
 							 HZ/10);
+			try_to_freeze();
 			continue;
 		}
 
-- 
cgit v1.2.2


From 2fc1b5dd99f66d93ffc23fd8df82d384c1a354c8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 8 Feb 2010 15:00:39 -0800
Subject: dst: call cond_resched() in dst_gc_task()

Kernel bugzilla #15239

On some workloads, it is quite possible to get a huge dst list to
process in dst_gc_task(), and trigger soft lockup detection.

Fix is to call cond_resched(), as we run in process context.

Reported-by: Pawel Staszewski <pstaszewski@itcare.pl>
Tested-by: Pawel Staszewski <pstaszewski@itcare.pl>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dst.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dst.c b/net/core/dst.c
index 57bc4d5b8d08..cb1b3488b739 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -17,6 +17,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <net/net_namespace.h>
+#include <linux/sched.h>
 
 #include <net/dst.h>
 
@@ -79,6 +80,7 @@ loop:
 	while ((dst = next) != NULL) {
 		next = dst->next;
 		prefetch(&next->next);
+		cond_resched();
 		if (likely(atomic_read(&dst->__refcnt))) {
 			last->next = dst;
 			last = dst;
-- 
cgit v1.2.2


From 15682bc488d4af8c9bb998844a94281025e0a333 Mon Sep 17 00:00:00 2001
From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Date: Wed, 10 Feb 2010 20:03:05 -0800
Subject: ethtool: Introduce n-tuple filter programming support

This patchset enables the ethtool layer to program n-tuple
filters to an underlying device.  The idea is to allow capable
hardware to have static rules applied that can assist steering
flows into appropriate queues.

Hardware that is known to support these types of filters today
are ixgbe and niu.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c     |   5 +
 net/core/ethtool.c | 329 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 333 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 94c1eeed25e5..ae75f25ac0a5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5419,6 +5419,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	netdev_init_queues(dev);
 
+	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
+	dev->ethtool_ntuple_list.count = 0;
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);
 	INIT_LIST_HEAD(&dev->link_watch_list);
@@ -5455,6 +5457,9 @@ void free_netdev(struct net_device *dev)
 	/* Flush device addresses */
 	dev_addr_flush(dev);
 
+	/* Clear ethtool n-tuple list */
+	ethtool_ntuple_flush(dev);
+
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index d8aee584e8d1..6ec73d3983a3 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -120,7 +120,7 @@ int ethtool_op_set_ufo(struct net_device *dev, u32 data)
  * NETIF_F_xxx values in include/linux/netdevice.h
  */
 static const u32 flags_dup_features =
-	ETH_FLAG_LRO;
+	(ETH_FLAG_LRO | ETH_FLAG_NTUPLE);
 
 u32 ethtool_op_get_flags(struct net_device *dev)
 {
@@ -139,9 +139,26 @@ int ethtool_op_set_flags(struct net_device *dev, u32 data)
 	else
 		dev->features &= ~NETIF_F_LRO;
 
+	if (data & ETH_FLAG_NTUPLE)
+		dev->features |= NETIF_F_NTUPLE;
+	else
+		dev->features &= ~NETIF_F_NTUPLE;
+
 	return 0;
 }
 
+void ethtool_ntuple_flush(struct net_device *dev)
+{
+	struct ethtool_rx_ntuple_flow_spec_container *fsc, *f;
+
+	list_for_each_entry_safe(fsc, f, &dev->ethtool_ntuple_list.list, list) {
+		list_del(&fsc->list);
+		kfree(fsc);
+	}
+	dev->ethtool_ntuple_list.count = 0;
+}
+EXPORT_SYMBOL(ethtool_ntuple_flush);
+
 /* Handlers for each ethtool command */
 
 static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
@@ -266,6 +283,307 @@ err_out:
 	return ret;
 }
 
+static int __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
+                                  struct ethtool_rx_ntuple_flow_spec *spec)
+{
+	struct ethtool_rx_ntuple_flow_spec_container *fsc;
+
+	/* don't add filters forever */
+	if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY)
+		return 0;
+
+	fsc = kmalloc(sizeof(*fsc), GFP_ATOMIC);
+	if (!fsc)
+		return -ENOMEM;
+
+	/* Copy the whole filter over */
+	fsc->fs.flow_type = spec->flow_type;
+	memcpy(&fsc->fs.h_u, &spec->h_u, sizeof(spec->h_u));
+	memcpy(&fsc->fs.m_u, &spec->m_u, sizeof(spec->m_u));
+
+	fsc->fs.vlan_tag = spec->vlan_tag;
+	fsc->fs.vlan_tag_mask = spec->vlan_tag_mask;
+	fsc->fs.data = spec->data;
+	fsc->fs.data_mask = spec->data_mask;
+	fsc->fs.action = spec->action;
+
+	/* add to the list */
+	list_add_tail_rcu(&fsc->list, &list->list);
+	list->count++;
+
+	return 0;
+}
+
+static int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_rx_ntuple cmd;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	int ret;
+
+	if (!ops->set_rx_ntuple)
+		return -EOPNOTSUPP;
+
+	if (!(dev->features & NETIF_F_NTUPLE))
+		return -EINVAL;
+
+	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ops->set_rx_ntuple(dev, &cmd);
+
+	/*
+	 * Cache filter in dev struct for GET operation only if
+	 * the underlying driver doesn't have its own GET operation, and
+	 * only if the filter was added successfully.
+	 */
+	if (!ops->get_rx_ntuple && !ret)
+		if (__rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &cmd.fs))
+			return -ENOMEM;
+
+	return ret;
+}
+
+static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_gstrings gstrings;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_rx_ntuple_flow_spec_container *fsc;
+	u8 *data;
+	char *p;
+	int ret, i, num_strings = 0;
+
+	if (!ops->get_sset_count)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
+		return -EFAULT;
+
+	ret = ops->get_sset_count(dev, gstrings.string_set);
+	if (ret < 0)
+		return ret;
+
+	gstrings.len = ret;
+
+	data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
+	if (!data)
+		return -ENOMEM;
+
+	if (ops->get_rx_ntuple) {
+		/* driver-specific filter grab */
+		ret = ops->get_rx_ntuple(dev, gstrings.string_set, data);
+		goto copy;
+	}
+
+	/* default ethtool filter grab */
+	i = 0;
+	p = (char *)data;
+	list_for_each_entry(fsc, &dev->ethtool_ntuple_list.list, list) {
+		sprintf(p, "Filter %d:\n", i);
+		p += ETH_GSTRING_LEN;
+		num_strings++;
+
+		switch (fsc->fs.flow_type) {
+		case TCP_V4_FLOW:
+			sprintf(p, "\tFlow Type: TCP\n");
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			break;
+		case UDP_V4_FLOW:
+			sprintf(p, "\tFlow Type: UDP\n");
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			break;
+		case SCTP_V4_FLOW:
+			sprintf(p, "\tFlow Type: SCTP\n");
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			break;
+		case AH_ESP_V4_FLOW:
+			sprintf(p, "\tFlow Type: AH ESP\n");
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			break;
+		case ESP_V4_FLOW:
+			sprintf(p, "\tFlow Type: ESP\n");
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			break;
+		case IP_USER_FLOW:
+			sprintf(p, "\tFlow Type: Raw IP\n");
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			break;
+		case IPV4_FLOW:
+			sprintf(p, "\tFlow Type: IPv4\n");
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			break;
+		default:
+			sprintf(p, "\tFlow Type: Unknown\n");
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			goto unknown_filter;
+		};
+
+		/* now the rest of the filters */
+		switch (fsc->fs.flow_type) {
+		case TCP_V4_FLOW:
+		case UDP_V4_FLOW:
+		case SCTP_V4_FLOW:
+			sprintf(p, "\tSrc IP addr: 0x%x\n",
+			        fsc->fs.h_u.tcp_ip4_spec.ip4src);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tSrc IP mask: 0x%x\n",
+			        fsc->fs.m_u.tcp_ip4_spec.ip4src);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tDest IP addr: 0x%x\n",
+			        fsc->fs.h_u.tcp_ip4_spec.ip4dst);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tDest IP mask: 0x%x\n",
+			        fsc->fs.m_u.tcp_ip4_spec.ip4dst);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tSrc Port: %d, mask: 0x%x\n",
+			        fsc->fs.h_u.tcp_ip4_spec.psrc,
+			        fsc->fs.m_u.tcp_ip4_spec.psrc);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tDest Port: %d, mask: 0x%x\n",
+			        fsc->fs.h_u.tcp_ip4_spec.pdst,
+			        fsc->fs.m_u.tcp_ip4_spec.pdst);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tTOS: %d, mask: 0x%x\n",
+			        fsc->fs.h_u.tcp_ip4_spec.tos,
+			        fsc->fs.m_u.tcp_ip4_spec.tos);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			break;
+		case AH_ESP_V4_FLOW:
+		case ESP_V4_FLOW:
+			sprintf(p, "\tSrc IP addr: 0x%x\n",
+			        fsc->fs.h_u.ah_ip4_spec.ip4src);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tSrc IP mask: 0x%x\n",
+			        fsc->fs.m_u.ah_ip4_spec.ip4src);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tDest IP addr: 0x%x\n",
+			        fsc->fs.h_u.ah_ip4_spec.ip4dst);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tDest IP mask: 0x%x\n",
+			        fsc->fs.m_u.ah_ip4_spec.ip4dst);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tSPI: %d, mask: 0x%x\n",
+			        fsc->fs.h_u.ah_ip4_spec.spi,
+			        fsc->fs.m_u.ah_ip4_spec.spi);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tTOS: %d, mask: 0x%x\n",
+			        fsc->fs.h_u.ah_ip4_spec.tos,
+			        fsc->fs.m_u.ah_ip4_spec.tos);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			break;
+		case IP_USER_FLOW:
+			sprintf(p, "\tSrc IP addr: 0x%x\n",
+			        fsc->fs.h_u.raw_ip4_spec.ip4src);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tSrc IP mask: 0x%x\n",
+			        fsc->fs.m_u.raw_ip4_spec.ip4src);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tDest IP addr: 0x%x\n",
+			        fsc->fs.h_u.raw_ip4_spec.ip4dst);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tDest IP mask: 0x%x\n",
+			        fsc->fs.m_u.raw_ip4_spec.ip4dst);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			break;
+		case IPV4_FLOW:
+			sprintf(p, "\tSrc IP addr: 0x%x\n",
+			        fsc->fs.h_u.usr_ip4_spec.ip4src);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tSrc IP mask: 0x%x\n",
+			        fsc->fs.m_u.usr_ip4_spec.ip4src);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tDest IP addr: 0x%x\n",
+			        fsc->fs.h_u.usr_ip4_spec.ip4dst);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tDest IP mask: 0x%x\n",
+			        fsc->fs.m_u.usr_ip4_spec.ip4dst);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tL4 bytes: 0x%x, mask: 0x%x\n",
+			        fsc->fs.h_u.usr_ip4_spec.l4_4_bytes,
+			        fsc->fs.m_u.usr_ip4_spec.l4_4_bytes);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tTOS: %d, mask: 0x%x\n",
+			        fsc->fs.h_u.usr_ip4_spec.tos,
+			        fsc->fs.m_u.usr_ip4_spec.tos);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tIP Version: %d, mask: 0x%x\n",
+			        fsc->fs.h_u.usr_ip4_spec.ip_ver,
+			        fsc->fs.m_u.usr_ip4_spec.ip_ver);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			sprintf(p, "\tProtocol: %d, mask: 0x%x\n",
+			        fsc->fs.h_u.usr_ip4_spec.proto,
+			        fsc->fs.m_u.usr_ip4_spec.proto);
+			p += ETH_GSTRING_LEN;
+			num_strings++;
+			break;
+		};
+		sprintf(p, "\tVLAN: %d, mask: 0x%x\n",
+		        fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask);
+		p += ETH_GSTRING_LEN;
+		num_strings++;
+		sprintf(p, "\tUser-defined: 0x%Lx\n", fsc->fs.data);
+		p += ETH_GSTRING_LEN;
+		num_strings++;
+		sprintf(p, "\tUser-defined mask: 0x%Lx\n", fsc->fs.data_mask);
+		p += ETH_GSTRING_LEN;
+		num_strings++;
+		if (fsc->fs.action == ETHTOOL_RXNTUPLE_ACTION_DROP)
+			sprintf(p, "\tAction: Drop\n");
+		else
+			sprintf(p, "\tAction: Direct to queue %d\n",
+			        fsc->fs.action);
+		p += ETH_GSTRING_LEN;
+		num_strings++;
+unknown_filter:
+		i++;
+	}
+copy:
+	/* indicate to userspace how many strings we actually have */
+	gstrings.len = num_strings;
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
+		goto out;
+	useraddr += sizeof(gstrings);
+	if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
+		goto out;
+	ret = 0;
+
+out:
+	kfree(data);
+	return ret;
+}
+
 static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
 {
 	struct ethtool_regs regs;
@@ -313,6 +631,9 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr)
 	if (copy_from_user(&reset, useraddr, sizeof(reset)))
 		return -EFAULT;
 
+	/* Clear ethtool n-tuple list */
+	ethtool_ntuple_flush(dev);
+
 	ret = dev->ethtool_ops->reset(dev, &reset.data);
 	if (ret)
 		return ret;
@@ -1112,6 +1433,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_RESET:
 		rc = ethtool_reset(dev, useraddr);
 		break;
+	case ETHTOOL_SRXNTUPLE:
+		rc = ethtool_set_rx_ntuple(dev, useraddr);
+		break;
+	case ETHTOOL_GRXNTUPLE:
+		rc = ethtool_get_rx_ntuple(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.2


From 8e5574211d96c0552f84c757718475fdb4021be7 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Thu, 11 Feb 2010 12:14:23 -0800
Subject: ethtool: Use explicit designated initializers for .cmd

Initialize the .cmd member of various ethtool using a designated struct
initializer rather.  This makes things a teeny bit more robust, although
the chance of a struct layout changing is extremely remote, and also
makes the code a little easier to read.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6ec73d3983a3..a1280f643bf4 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -163,7 +163,7 @@ EXPORT_SYMBOL(ethtool_ntuple_flush);
 
 static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
 {
-	struct ethtool_cmd cmd = { ETHTOOL_GSET };
+	struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET };
 	int err;
 
 	if (!dev->ethtool_ops->get_settings)
@@ -645,7 +645,7 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr)
 
 static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
 {
-	struct ethtool_wolinfo wol = { ETHTOOL_GWOL };
+	struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
 
 	if (!dev->ethtool_ops->get_wol)
 		return -EOPNOTSUPP;
@@ -779,7 +779,7 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
 
 static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr)
 {
-	struct ethtool_coalesce coalesce = { ETHTOOL_GCOALESCE };
+	struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
 
 	if (!dev->ethtool_ops->get_coalesce)
 		return -EOPNOTSUPP;
@@ -806,7 +806,7 @@ static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr)
 
 static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
 {
-	struct ethtool_ringparam ringparam = { ETHTOOL_GRINGPARAM };
+	struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM };
 
 	if (!dev->ethtool_ops->get_ringparam)
 		return -EOPNOTSUPP;
@@ -1160,7 +1160,7 @@ static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
 static int ethtool_get_value(struct net_device *dev, char __user *useraddr,
 			     u32 cmd, u32 (*actor)(struct net_device *))
 {
-	struct ethtool_value edata = { cmd };
+	struct ethtool_value edata = { .cmd = cmd };
 
 	if (!actor)
 		return -EOPNOTSUPP;
-- 
cgit v1.2.2


From 4cd24eaf0c6ee7f0242e34ee77ec899f255e66b5 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 8 Feb 2010 04:30:35 +0000
Subject: net: use netdev_mc_count and netdev_mc_empty when appropriate

This patch replaces dev->mc_count in all drivers (hopefully I didn't miss
anything). Used spatch and did small tweaks and conding style changes when
it was suitable.

Jirka

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index ae75f25ac0a5..d1cf53d0d597 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4263,7 +4263,7 @@ static void dev_addr_discard(struct net_device *dev)
 	netif_addr_lock_bh(dev);
 
 	__dev_addr_discard(&dev->mc_list);
-	dev->mc_count = 0;
+	netdev_mc_count(dev) = 0;
 
 	netif_addr_unlock_bh(dev);
 }
-- 
cgit v1.2.2


From ebc08a6f47ee76ecad8e9f26c26e6ec9b46ca659 Mon Sep 17 00:00:00 2001
From: "Williams, Mitch A" <mitch.a.williams@intel.com>
Date: Wed, 10 Feb 2010 01:44:05 +0000
Subject: rtnetlink: Add VF config code to rtnetlink

Add code to allow rtnetlink clients to query and set VF information through
the PF driver.
Signed-off-by: Mitch Williams <mitch.a.williams@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 62f3878a6010..42da96a4eeee 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -35,6 +35,7 @@
 #include <linux/security.h>
 #include <linux/mutex.h>
 #include <linux/if_addr.h>
+#include <linux/pci.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -580,6 +581,15 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 	a->tx_compressed = b->tx_compressed;
 };
 
+static inline int rtnl_vfinfo_size(const struct net_device *dev)
+{
+	if (dev->dev.parent && dev_is_pci(dev->dev.parent))
+		return dev_num_vf(dev->dev.parent) *
+			sizeof(struct ifla_vf_info);
+	else
+		return 0;
+}
+
 static inline size_t if_nlmsg_size(const struct net_device *dev)
 {
 	return NLMSG_ALIGN(sizeof(struct ifinfomsg))
@@ -597,6 +607,8 @@ static inline size_t if_nlmsg_size(const struct net_device *dev)
 	       + nla_total_size(4) /* IFLA_MASTER */
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
 	       + nla_total_size(1) /* IFLA_LINKMODE */
+	       + nla_total_size(4) /* IFLA_NUM_VF */
+	       + nla_total_size(rtnl_vfinfo_size(dev)) /* IFLA_VFINFO */
 	       + rtnl_link_get_size(dev); /* IFLA_LINKINFO */
 }
 
@@ -665,6 +677,17 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	stats = dev_get_stats(dev);
 	copy_rtnl_link_stats(nla_data(attr), stats);
 
+	if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) {
+		int i;
+		struct ifla_vf_info ivi;
+
+		NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent));
+		for (i = 0; i < dev_num_vf(dev->dev.parent); i++) {
+			if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
+				break;
+			NLA_PUT(skb, IFLA_VFINFO, sizeof(ivi), &ivi);
+		}
+	}
 	if (dev->rtnl_link_ops) {
 		if (rtnl_link_fill(skb, dev) < 0)
 			goto nla_put_failure;
@@ -725,6 +748,12 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
 	[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },
+	[IFLA_VF_MAC]		= { .type = NLA_BINARY,
+				    .len = sizeof(struct ifla_vf_mac) },
+	[IFLA_VF_VLAN]		= { .type = NLA_BINARY,
+				    .len = sizeof(struct ifla_vf_vlan) },
+	[IFLA_VF_TX_RATE]	= { .type = NLA_BINARY,
+				    .len = sizeof(struct ifla_vf_tx_rate) },
 };
 EXPORT_SYMBOL(ifla_policy);
 
@@ -898,6 +927,44 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		write_unlock_bh(&dev_base_lock);
 	}
 
+	if (tb[IFLA_VF_MAC]) {
+		struct ifla_vf_mac *ivm;
+		ivm = nla_data(tb[IFLA_VF_MAC]);
+		write_lock_bh(&dev_base_lock);
+		if (ops->ndo_set_vf_mac)
+			err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac);
+		write_unlock_bh(&dev_base_lock);
+		if (err < 0)
+			goto errout;
+		modified = 1;
+	}
+
+	if (tb[IFLA_VF_VLAN]) {
+		struct ifla_vf_vlan *ivv;
+		ivv = nla_data(tb[IFLA_VF_VLAN]);
+		write_lock_bh(&dev_base_lock);
+		if (ops->ndo_set_vf_vlan)
+			err = ops->ndo_set_vf_vlan(dev, ivv->vf,
+						   (u16)ivv->vlan,
+						   (u8)ivv->qos);
+		write_unlock_bh(&dev_base_lock);
+		if (err < 0)
+			goto errout;
+		modified = 1;
+	}
+	err = 0;
+
+	if (tb[IFLA_VF_TX_RATE]) {
+		struct ifla_vf_tx_rate *ivt;
+		ivt = nla_data(tb[IFLA_VF_TX_RATE]);
+		write_lock_bh(&dev_base_lock);
+		if (ops->ndo_set_vf_tx_rate)
+			err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, ivt->rate);
+		write_unlock_bh(&dev_base_lock);
+		if (err < 0)
+			goto errout;
+		modified = 1;
+	}
 	err = 0;
 
 errout:
-- 
cgit v1.2.2


From e858911804f5ecadb41afd61582a11f68d416328 Mon Sep 17 00:00:00 2001
From: Peter Waskiewicz <peter.p.waskiewicz.jr@intel.com>
Date: Fri, 12 Feb 2010 13:48:05 +0000
Subject: ethtool: Fix filter addition when caching n-tuple filters

We can allow a filter to be added successfully to the underlying
hardware, but still return an error if the cached list memory
allocation fails.  This patch fixes that condition.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index a1280f643bf4..fbbe4b49116b 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -283,18 +283,17 @@ err_out:
 	return ret;
 }
 
-static int __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
-                                  struct ethtool_rx_ntuple_flow_spec *spec)
+static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
+                              struct ethtool_rx_ntuple_flow_spec *spec,
+                              struct ethtool_rx_ntuple_flow_spec_container *fsc)
 {
-	struct ethtool_rx_ntuple_flow_spec_container *fsc;
 
 	/* don't add filters forever */
-	if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY)
-		return 0;
-
-	fsc = kmalloc(sizeof(*fsc), GFP_ATOMIC);
-	if (!fsc)
-		return -ENOMEM;
+	if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY) {
+		/* free the container */
+		kfree(fsc);
+		return;
+	}
 
 	/* Copy the whole filter over */
 	fsc->fs.flow_type = spec->flow_type;
@@ -310,14 +309,13 @@ static int __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
 	/* add to the list */
 	list_add_tail_rcu(&fsc->list, &list->list);
 	list->count++;
-
-	return 0;
 }
 
 static int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_rx_ntuple cmd;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL;
 	int ret;
 
 	if (!ops->set_rx_ntuple)
@@ -329,16 +327,26 @@ static int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr)
 	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
 		return -EFAULT;
 
-	ret = ops->set_rx_ntuple(dev, &cmd);
-
 	/*
 	 * Cache filter in dev struct for GET operation only if
 	 * the underlying driver doesn't have its own GET operation, and
-	 * only if the filter was added successfully.
+	 * only if the filter was added successfully.  First make sure we
+	 * can allocate the filter, then continue if successful.
 	 */
-	if (!ops->get_rx_ntuple && !ret)
-		if (__rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &cmd.fs))
+	if (!ops->get_rx_ntuple) {
+		fsc = kmalloc(sizeof(*fsc), GFP_ATOMIC);
+		if (!fsc)
 			return -ENOMEM;
+	}
+
+	ret = ops->set_rx_ntuple(dev, &cmd);
+	if (ret) {
+		kfree(fsc);
+		return ret;
+	}
+
+	if (!ops->get_rx_ntuple)
+		__rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &cmd.fs, fsc);
 
 	return ret;
 }
-- 
cgit v1.2.2


From 0d643e1fb4207711d9c148b5c6a2820550a4a154 Mon Sep 17 00:00:00 2001
From: Peter Waskiewicz <peter.p.waskiewicz.jr@intel.com>
Date: Fri, 12 Feb 2010 13:48:25 +0000
Subject: ethtool: Move n-tuple capability check into set_flags

set_flags should check if the underlying device supports
n-tuple filter programming before setting the device flags
on the netdevice.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index fbbe4b49116b..794cf57078cd 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -134,15 +134,21 @@ u32 ethtool_op_get_flags(struct net_device *dev)
 
 int ethtool_op_set_flags(struct net_device *dev, u32 data)
 {
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
 	if (data & ETH_FLAG_LRO)
 		dev->features |= NETIF_F_LRO;
 	else
 		dev->features &= ~NETIF_F_LRO;
 
-	if (data & ETH_FLAG_NTUPLE)
+	if (data & ETH_FLAG_NTUPLE) {
+		if (!ops->set_rx_ntuple)
+			return -EOPNOTSUPP;
 		dev->features |= NETIF_F_NTUPLE;
-	else
+	} else {
+		/* safe to clear regardless */
 		dev->features &= ~NETIF_F_NTUPLE;
+	}
 
 	return 0;
 }
@@ -318,9 +324,6 @@ static int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr)
 	struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL;
 	int ret;
 
-	if (!ops->set_rx_ntuple)
-		return -EOPNOTSUPP;
-
 	if (!(dev->features & NETIF_F_NTUPLE))
 		return -EINVAL;
 
-- 
cgit v1.2.2


From 339c6e99853d2ef1f02ad8a313e079050a300427 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 15 Feb 2010 21:51:33 -0800
Subject: ethtool: reduce stack usage

dev_ethtool() is currently using 604 bytes of stack, even with gcc-4.4.2

objdump -d vmlinux | scripts/checkstack.pl
...
0xc04bbc33 dev_ethtool [vmlinux]:			604
...
Adding noinline attributes to selected functions can reduce stack usage.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 794cf57078cd..82cae3bca78d 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -197,7 +197,10 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
 	return dev->ethtool_ops->set_settings(dev, &cmd);
 }
 
-static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
+/*
+ * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
+ */
+static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_drvinfo info;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -232,7 +235,10 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
 	return 0;
 }
 
-static int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr)
+/*
+ * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
+ */
+static noinline int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_rxnfc cmd;
 
@@ -245,7 +251,10 @@ static int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr)
 	return dev->ethtool_ops->set_rxnfc(dev, &cmd);
 }
 
-static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr)
+/*
+ * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
+ */
+static noinline int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_rxnfc info;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -317,7 +326,10 @@ static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
 	list->count++;
 }
 
-static int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr)
+/*
+ * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
+ */
+static noinline int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_rx_ntuple cmd;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -788,7 +800,10 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
 	return ret;
 }
 
-static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr)
+/*
+ * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
+ */
+static noinline int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
 
@@ -802,7 +817,10 @@ static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr)
 	return 0;
 }
 
-static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr)
+/*
+ * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
+ */
+static noinline int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_coalesce coalesce;
 
@@ -1212,7 +1230,10 @@ static int ethtool_set_value(struct net_device *dev, char __user *useraddr,
 	return actor(dev, edata.data);
 }
 
-static int ethtool_flash_device(struct net_device *dev, char __user *useraddr)
+/*
+ * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
+ */
+static noinline int ethtool_flash_device(struct net_device *dev, char __user *useraddr)
 {
 	struct ethtool_flash efl;
 
-- 
cgit v1.2.2


From 1cab819b5e244e1b853c7b440981e6a960da3bfb Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Thu, 11 Feb 2010 13:48:29 +0000
Subject: ethtool: allow non-admin user to read GRO settings.

Looks like an oversight in GRO design.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index d8aee584e8d1..236a9988ea91 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -927,6 +927,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GPERMADDR:
 	case ETHTOOL_GUFO:
 	case ETHTOOL_GGSO:
+	case ETHTOOL_GGRO:
 	case ETHTOOL_GFLAGS:
 	case ETHTOOL_GPFLAGS:
 	case ETHTOOL_GRXFH:
-- 
cgit v1.2.2


From 54716e3beb0ab20c49471348dfe399a71bfc8fd3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 14 Feb 2010 03:27:03 +0000
Subject: net neigh: Decouple per interface neighbour table controls from
 binary sysctls

Stop computing the number of neighbour table settings we have by
counting the number of binary sysctls.  This behaviour was silly
and meant that we could not add another neighbour table setting
without also adding another binary sysctl.

Don't pass the binary sysctl path for neighour table entries
into neigh_sysctl_register.  These parameters are no longer
used and so are just dead code.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f2efd72da799..d102f6d9abdc 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2556,9 +2556,11 @@ EXPORT_SYMBOL(neigh_app_ns);
 
 #ifdef CONFIG_SYSCTL
 
+#define NEIGH_VARS_MAX 19
+
 static struct neigh_sysctl_table {
 	struct ctl_table_header *sysctl_header;
-	struct ctl_table neigh_vars[__NET_NEIGH_MAX];
+	struct ctl_table neigh_vars[NEIGH_VARS_MAX];
 	char *dev_name;
 } neigh_sysctl_template __read_mostly = {
 	.neigh_vars = {
@@ -2675,8 +2677,7 @@ static struct neigh_sysctl_table {
 };
 
 int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
-			  int p_id, int pdev_id, char *p_name,
-			  proc_handler *handler)
+			  char *p_name, proc_handler *handler)
 {
 	struct neigh_sysctl_table *t;
 	const char *dev_name_source = NULL;
-- 
cgit v1.2.2


From dc4c2c31053ba5bf685d273cd62ecca406dddb2d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 12 Feb 2010 11:41:39 +0000
Subject: net: remove INIT_RCU_HEAD() usage

call_rcu() will unconditionally reinitialize RCU head anyway.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/drop_monitor.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index b8e9d3a86887..f8c874975350 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -296,7 +296,6 @@ static int dropmon_net_event(struct notifier_block *ev_block,
 
 		new_stat->dev = dev;
 		new_stat->last_rx = jiffies;
-		INIT_RCU_HEAD(&new_stat->rcu);
 		spin_lock(&trace_state_lock);
 		list_add_rcu(&new_stat->list, &hw_stats_list);
 		spin_unlock(&trace_state_lock);
-- 
cgit v1.2.2


From faf234220fb79a05891477a75180e1d9f7ab4105 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 17 Feb 2010 09:34:12 +0000
Subject: net: use kasprintf() for socket cache names

kasprintf() makes code smaller.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index ceef50bd131b..472a59f205b0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2228,13 +2228,10 @@ int proto_register(struct proto *prot, int alloc_slab)
 		}
 
 		if (prot->rsk_prot != NULL) {
-			static const char mask[] = "request_sock_%s";
-
-			prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
 			if (prot->rsk_prot->slab_name == NULL)
 				goto out_free_sock_slab;
 
-			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
 								 prot->rsk_prot->obj_size, 0,
 								 SLAB_HWCACHE_ALIGN, NULL);
@@ -2247,14 +2244,11 @@ int proto_register(struct proto *prot, int alloc_slab)
 		}
 
 		if (prot->twsk_prot != NULL) {
-			static const char mask[] = "tw_sock_%s";
-
-			prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
 
 			if (prot->twsk_prot->twsk_slab_name == NULL)
 				goto out_free_request_sock_slab;
 
-			sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
 			prot->twsk_prot->twsk_slab =
 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
 						  prot->twsk_prot->twsk_obj_size,
-- 
cgit v1.2.2


From 7af3351f71f4b3b5dbccb66cdc9b097052760a7f Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 17 Feb 2010 09:26:54 +0000
Subject: ethtool: Don't flush n-tuple list from ethtool_reset()

The n-tuple list should be flushed if and only if the ETH_RESET_FILTER
flag is set and the driver is able to reset filtering/flow direction
hardware without also resetting a component whose flag is not set.
This test is best left to the driver.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index d08a0c7675bf..31b1eddc1b84 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -654,9 +654,6 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr)
 	if (copy_from_user(&reset, useraddr, sizeof(reset)))
 		return -EFAULT;
 
-	/* Clear ethtool n-tuple list */
-	ethtool_ntuple_flush(dev);
-
 	ret = dev->ethtool_ops->reset(dev, &reset.data);
 	if (ret)
 		return ret;
-- 
cgit v1.2.2


From e76b69cc0133952c98aa1ad6330cacacd269fd64 Mon Sep 17 00:00:00 2001
From: Ajit Khaparde <ajitkhaparde@gmail.com>
Date: Tue, 16 Feb 2010 20:25:43 +0000
Subject: net: bug fix for vlan + gro issue

Traffic (tcp) doesnot start on a vlan interface when gro is enabled.
Even the tcp handshake was not taking place.
This is because, the eth_type_trans call before the netif_receive_skb
in napi_gro_finish() resets the skb->dev to napi->dev from the previously
set vlan netdev interface. This causes the ip_route_input to drop the
incoming packet considering it as a packet coming from a martian source.

I could repro this on 2.6.32.7 (stable) and 2.6.33-rc7.
With this fix, the traffic starts and the test runs fine on both vlan
and non-vlan interfaces.

CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Patrick McHardy <kaber@trash.net>
Signed-off-by: Ajit Khaparde <ajitk@serverengines.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index d1cf53d0d597..1968980f513a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2813,7 +2813,7 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
 	switch (ret) {
 	case GRO_NORMAL:
 	case GRO_HELD:
-		skb->protocol = eth_type_trans(skb, napi->dev);
+		skb->protocol = eth_type_trans(skb, skb->dev);
 
 		if (ret == GRO_HELD)
 			skb_gro_pull(skb, -ETH_HLEN);
-- 
cgit v1.2.2


From 5ff3f073670b544a9c0547cc6fef1f7eed5762ed Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 14 Feb 2010 01:01:00 +0000
Subject: net: export attach/detach filter routines

Export sk_attach_filter/sk_detach_filter routines,
so that tun module can use them.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 08db7b9143a3..7517110ff4ae 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -529,6 +529,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 		sk_filter_delayed_uncharge(sk, old_fp);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(sk_attach_filter);
 
 int sk_detach_filter(struct sock *sk)
 {
@@ -545,3 +546,4 @@ int sk_detach_filter(struct sock *sk)
 	rcu_read_unlock_bh();
 	return ret;
 }
+EXPORT_SYMBOL_GPL(sk_detach_filter);
-- 
cgit v1.2.2


From b8afe6416101549e877f8470f2a160df69676166 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 19 Feb 2010 13:23:47 +0000
Subject: net-sysfs: Use rtnl_trylock in wireless sysfs methods.

The wireless sysfs methods like the rest of the networking sysfs
methods are removed with the rtnl_lock held and block until
the existing methods stop executing.  So use rtnl_trylock
and restart_syscall so that the code continues to work.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index fbc1c7472c5e..099c753c4213 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -410,7 +410,8 @@ static ssize_t wireless_show(struct device *d, char *buf,
 	const struct iw_statistics *iw;
 	ssize_t ret = -EINVAL;
 
-	rtnl_lock();
+	if (!rtnl_trylock())
+		return restart_syscall();
 	if (dev_isalive(dev)) {
 		iw = get_wireless_stats(dev);
 		if (iw)
-- 
cgit v1.2.2


From bd55775c8dd656fc69b3a42a1c4ab32abb7e8af9 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Mon, 22 Feb 2010 16:20:22 -0800
Subject: xfrm: SA lookups signature with mark

pass mark to all SA lookups to prepare them for when we add code
to have them search.

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 2e692afdc55d..43923811bd6a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2188,12 +2188,13 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)
 /* If there was already an IPSEC SA, we keep it as is, else
  * we go look for it ...
 */
+#define DUMMY_MARK 0
 static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
 {
 	struct xfrm_state *x = pkt_dev->flows[flow].x;
 	if (!x) {
 		/*slow path: we dont already have xfrm_state*/
-		x = xfrm_stateonly_find(&init_net,
+		x = xfrm_stateonly_find(&init_net, DUMMY_MARK,
 					(xfrm_address_t *)&pkt_dev->cur_daddr,
 					(xfrm_address_t *)&pkt_dev->cur_saddr,
 					AF_INET,
-- 
cgit v1.2.2


From c4d49794ff2838038fd9756eae39c39a5a685833 Mon Sep 17 00:00:00 2001
From: Ajit Khaparde <ajitkhaparde@gmail.com>
Date: Tue, 16 Feb 2010 20:25:43 +0000
Subject: net: bug fix for vlan + gro issue

Traffic (tcp) doesnot start on a vlan interface when gro is enabled.
Even the tcp handshake was not taking place.
This is because, the eth_type_trans call before the netif_receive_skb
in napi_gro_finish() resets the skb->dev to napi->dev from the previously
set vlan netdev interface. This causes the ip_route_input to drop the
incoming packet considering it as a packet coming from a martian source.

I could repro this on 2.6.32.7 (stable) and 2.6.33-rc7.
With this fix, the traffic starts and the test runs fine on both vlan
and non-vlan interfaces.

CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Patrick McHardy <kaber@trash.net>
Signed-off-by: Ajit Khaparde <ajitk@serverengines.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index be9924f60ec3..ec874218b206 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2761,7 +2761,7 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
 	switch (ret) {
 	case GRO_NORMAL:
 	case GRO_HELD:
-		skb->protocol = eth_type_trans(skb, napi->dev);
+		skb->protocol = eth_type_trans(skb, skb->dev);
 
 		if (ret == GRO_HELD)
 			skb_gro_pull(skb, -ETH_HLEN);
-- 
cgit v1.2.2


From a898def29e4119bc01ebe7ca97423181f4c0ea2d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 22 Feb 2010 17:04:49 -0800
Subject: net: Add checking to rcu_dereference() primitives

Update rcu_dereference() primitives to use new lockdep-based
checking. The rcu_dereference() in __in6_dev_get() may be
protected either by rcu_read_lock() or RTNL, per Eric Dumazet.
The rcu_dereference() in __sk_free() is protected by the fact
that it is never reached if an update could change it.  Check
for this by using rcu_dereference_check() to verify that the
struct sock's ->sk_wmem_alloc counter is zero.

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1266887105-1528-5-git-send-email-paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 net/core/dev.c       | 2 +-
 net/core/filter.c    | 6 +++---
 net/core/rtnetlink.c | 8 ++++++++
 net/core/sock.c      | 3 ++-
 4 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index ec874218b206..bb1f1da2b8a7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2041,7 +2041,7 @@ gso:
 	rcu_read_lock_bh();
 
 	txq = dev_pick_tx(dev, skb);
-	q = rcu_dereference(txq->qdisc);
+	q = rcu_dereference_bh(txq->qdisc);
 
 #ifdef CONFIG_NET_CLS_ACT
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
diff --git a/net/core/filter.c b/net/core/filter.c
index 08db7b9143a3..3541aa48d21d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -86,7 +86,7 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
 		return err;
 
 	rcu_read_lock_bh();
-	filter = rcu_dereference(sk->sk_filter);
+	filter = rcu_dereference_bh(sk->sk_filter);
 	if (filter) {
 		unsigned int pkt_len = sk_run_filter(skb, filter->insns,
 				filter->len);
@@ -521,7 +521,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	}
 
 	rcu_read_lock_bh();
-	old_fp = rcu_dereference(sk->sk_filter);
+	old_fp = rcu_dereference_bh(sk->sk_filter);
 	rcu_assign_pointer(sk->sk_filter, fp);
 	rcu_read_unlock_bh();
 
@@ -536,7 +536,7 @@ int sk_detach_filter(struct sock *sk)
 	struct sk_filter *filter;
 
 	rcu_read_lock_bh();
-	filter = rcu_dereference(sk->sk_filter);
+	filter = rcu_dereference_bh(sk->sk_filter);
 	if (filter) {
 		rcu_assign_pointer(sk->sk_filter, NULL);
 		sk_filter_delayed_uncharge(sk, filter);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 794bcb897ff0..4c7d3f635ba7 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -89,6 +89,14 @@ int rtnl_is_locked(void)
 }
 EXPORT_SYMBOL(rtnl_is_locked);
 
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rtnl_is_held(void)
+{
+	return lockdep_is_held(&rtnl_mutex);
+}
+EXPORT_SYMBOL(lockdep_rtnl_is_held);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+
 static struct rtnl_link *rtnl_msg_handlers[NPROTO];
 
 static inline int rtm_msgindex(int msgtype)
diff --git a/net/core/sock.c b/net/core/sock.c
index e1f6f225f012..305cba401ae6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1073,7 +1073,8 @@ static void __sk_free(struct sock *sk)
 	if (sk->sk_destruct)
 		sk->sk_destruct(sk);
 
-	filter = rcu_dereference(sk->sk_filter);
+	filter = rcu_dereference_check(sk->sk_filter,
+				       atomic_read(&sk->sk_wmem_alloc) == 0);
 	if (filter) {
 		sk_filter_uncharge(sk, filter);
 		rcu_assign_pointer(sk->sk_filter, NULL);
-- 
cgit v1.2.2


From 4edb246626be6e031950205c885bdf29fb2ff1eb Mon Sep 17 00:00:00 2001
From: "Williams, Mitch A" <mitch.a.williams@intel.com>
Date: Wed, 24 Feb 2010 21:59:56 +0000
Subject: rtnetlink: clean up SR-IOV config interface

This patch consists of a few minor cleanups to the SR-IOV
configurion code in rtnetlink.
- Remove unneccesary lock
- Remove unneccesary casts
- Return correct error code for no driver support

These changes are based on comments from Patrick McHardy

Signed-off-by: Mitch Williams <mitch.a.williams@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 42da96a4eeee..4dd4c3cdc442 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -930,10 +930,9 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 	if (tb[IFLA_VF_MAC]) {
 		struct ifla_vf_mac *ivm;
 		ivm = nla_data(tb[IFLA_VF_MAC]);
-		write_lock_bh(&dev_base_lock);
+		err = -EOPNOTSUPP;
 		if (ops->ndo_set_vf_mac)
 			err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac);
-		write_unlock_bh(&dev_base_lock);
 		if (err < 0)
 			goto errout;
 		modified = 1;
@@ -942,12 +941,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 	if (tb[IFLA_VF_VLAN]) {
 		struct ifla_vf_vlan *ivv;
 		ivv = nla_data(tb[IFLA_VF_VLAN]);
-		write_lock_bh(&dev_base_lock);
+		err = -EOPNOTSUPP;
 		if (ops->ndo_set_vf_vlan)
 			err = ops->ndo_set_vf_vlan(dev, ivv->vf,
-						   (u16)ivv->vlan,
-						   (u8)ivv->qos);
-		write_unlock_bh(&dev_base_lock);
+						   ivv->vlan,
+						   ivv->qos);
 		if (err < 0)
 			goto errout;
 		modified = 1;
@@ -957,10 +955,9 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 	if (tb[IFLA_VF_TX_RATE]) {
 		struct ifla_vf_tx_rate *ivt;
 		ivt = nla_data(tb[IFLA_VF_TX_RATE]);
-		write_lock_bh(&dev_base_lock);
+		err = -EOPNOTSUPP;
 		if (ops->ndo_set_vf_tx_rate)
 			err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, ivt->rate);
-		write_unlock_bh(&dev_base_lock);
 		if (err < 0)
 			goto errout;
 		modified = 1;
-- 
cgit v1.2.2


From e5e26d75f490d7d41f25a4b39ed6db1713beb417 Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Wed, 24 Feb 2010 14:01:38 +0000
Subject: netdev: use list_first_entry macro

Use list_first_entry macro; no longer any need to use
'next' directly in list to find first entry.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 1968980f513a..eb7f1a4fefc6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3018,7 +3018,7 @@ static void net_rx_action(struct softirq_action *h)
 		 * entries to the tail of this list, and only ->poll()
 		 * calls can remove this head entry from the list.
 		 */
-		n = list_entry(list->next, struct napi_struct, poll_list);
+		n = list_first_entry(list, struct napi_struct, poll_list);
 
 		have = netpoll_poll_lock(n);
 
@@ -4882,7 +4882,7 @@ static void rollback_registered_many(struct list_head *head)
 	}
 
 	/* Process any work delayed until the end of the batch */
-	dev = list_entry(head->next, struct net_device, unreg_list);
+	dev = list_first_entry(head, struct net_device, unreg_list);
 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
 
 	synchronize_net();
@@ -5268,7 +5268,7 @@ void netdev_run_todo(void)
 
 	while (!list_empty(&list)) {
 		struct net_device *dev
-			= list_entry(list.next, struct net_device, todo_list);
+			= list_first_entry(&list, struct net_device, todo_list);
 		list_del(&dev->todo_list);
 
 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
-- 
cgit v1.2.2


From c79c5ffdce14abb4de3878c5aa8c3c6e5093c69b Mon Sep 17 00:00:00 2001
From: Peter Waskiewicz <peter.p.waskiewicz.jr@intel.com>
Date: Fri, 26 Feb 2010 01:54:20 +0000
Subject: ethtool: Add n-tuple string length to drvinfo and return it

The drvinfo struct should include the number of strings that
get_rx_ntuple will return.  It will be variable if an underlying
driver implements its own get_rx_ntuple routine, so userspace
needs to know how much data is coming.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 31b1eddc1b84..1c94f48e27b5 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -224,6 +224,9 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use
 		rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
 		if (rc >= 0)
 			info.n_priv_flags = rc;
+		rc = ops->get_sset_count(dev, ETH_SS_NTUPLE_FILTERS);
+		if (rc >= 0)
+			info.n_ntuples = rc;
 	}
 	if (ops->get_regs_len)
 		info.regdump_len = ops->get_regs_len(dev);
-- 
cgit v1.2.2


From 6e17d45ae310758ab30623a42ad070858c9a48de Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 24 Feb 2010 22:49:15 +0000
Subject: net: add addr len check to dev_mc_add

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev_mcast.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index 9e2fa39f22a3..fd91569e2394 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -96,6 +96,8 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 	int err;
 
 	netif_addr_lock_bh(dev);
+	if (alen != dev->addr_len)
+		return -EINVAL;
 	err = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl);
 	if (!err)
 		__dev_set_rx_mode(dev);
-- 
cgit v1.2.2


From 738b0343e73604750feb107e063c28b3ca36cb84 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 26 Feb 2010 05:12:02 -0800
Subject: Revert "ethtool: Add n-tuple string length to drvinfo and return it"

This reverts commit c79c5ffdce14abb4de3878c5aa8c3c6e5093c69b.

As Jeff points out we can't break the user visible interface
like this, we need to add this into the reserved[] thing.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 1c94f48e27b5..31b1eddc1b84 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -224,9 +224,6 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use
 		rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
 		if (rc >= 0)
 			info.n_priv_flags = rc;
-		rc = ops->get_sset_count(dev, ETH_SS_NTUPLE_FILTERS);
-		if (rc >= 0)
-			info.n_ntuples = rc;
 	}
 	if (ops->get_regs_len)
 		info.regdump_len = ops->get_regs_len(dev);
-- 
cgit v1.2.2


From 10de05afe01c12cedc42eb9ce05b111eed6c8210 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 26 Feb 2010 06:34:50 +0000
Subject: rtnetlink: ignore NETDEV_PRE_UP notifier in rtnetlink_event()

Commit 3b8bcfd (net: introduce pre-up netdev notifier) added a new
notifier which is run before a device is set UP for use by cfg80211.

The patch missed to add the new notifier to the ignore list in
rtnetlink_event(), so we currently get an unnecessary netlink
notification before a device is set UP.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4dd4c3cdc442..b7c7dfd86507 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1432,6 +1432,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_DOWN:
 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
 		break;
+	case NETDEV_PRE_UP:
 	case NETDEV_POST_INIT:
 	case NETDEV_REGISTER:
 	case NETDEV_CHANGE:
-- 
cgit v1.2.2


From a2835763e130c343ace5320c20d33c281e7097b7 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 26 Feb 2010 06:34:51 +0000
Subject: rtnetlink: handle rtnl_link netlink notifications manually

In order to support specifying device flags during device creation,
we must be able to roll back device registration in case setting the
flags fails without sending any notifications related to the device
to userspace.

This patch changes rollback_registered_many() and register_netdevice()
to manually send netlink notifications for devices not handled by
rtnl_link and allows to defer notifications for devices handled by
rtnl_link until setup is complete.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       | 8 +++++++-
 net/core/rtnetlink.c | 4 +---
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index eb7f1a4fefc6..75332b089529 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4865,6 +4865,10 @@ static void rollback_registered_many(struct list_head *head)
 		*/
 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
+		if (!dev->rtnl_link_ops ||
+		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
+
 		/*
 		 *	Flush the unicast and multicast chains
 		 */
@@ -5091,7 +5095,9 @@ int register_netdevice(struct net_device *dev)
 	 *	Prevent userspace races by waiting until the network
 	 *	device is fully setup before sending notifications.
 	 */
-	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
+	if (!dev->rtnl_link_ops ||
+	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
 
 out:
 	return ret;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b7c7dfd86507..020e43bfef5f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1425,9 +1425,6 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	struct net_device *dev = ptr;
 
 	switch (event) {
-	case NETDEV_UNREGISTER:
-		rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
-		break;
 	case NETDEV_UP:
 	case NETDEV_DOWN:
 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
@@ -1437,6 +1434,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_REGISTER:
 	case NETDEV_CHANGE:
 	case NETDEV_GOING_DOWN:
+	case NETDEV_UNREGISTER:
 	case NETDEV_UNREGISTER_BATCH:
 		break;
 	default:
-- 
cgit v1.2.2


From bd38081160bb3d036db98472e537b6a7dd4da51a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 26 Feb 2010 06:34:53 +0000
Subject: dev: support deferring device flag change notifications

Split dev_change_flags() into two functions: __dev_change_flags() to
perform the actual changes and __dev_notify_flags() to invoke netdevice
notifiers. This will be used by rtnl_link to defer netlink notifications
until the device has been fully configured.

This changes ordering of some operations, in particular:

- netlink notifications are sent after all changes have been performed.
  As a side effect this surpresses one unnecessary netlink message when
  the IFF_UP and other flags are changed simultaneously.

- The NETDEV_UP/NETDEV_DOWN and NETDEV_CHANGE notifiers are invoked
  after all changes have been performed. Their relative is unchanged.

- net_dmaengine_put() is invoked before the NETDEV_DOWN notifier instead
  of afterwards. This should not make any difference since both RX and TX
  are already shut down at this point.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       | 163 ++++++++++++++++++++++++++++++++-------------------
 net/core/rtnetlink.c |   2 -
 2 files changed, 104 insertions(+), 61 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 75332b089529..e5972f7f7e1b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1113,32 +1113,13 @@ void dev_load(struct net *net, const char *name)
 }
 EXPORT_SYMBOL(dev_load);
 
-/**
- *	dev_open	- prepare an interface for use.
- *	@dev:	device to open
- *
- *	Takes a device from down to up state. The device's private open
- *	function is invoked and then the multicast lists are loaded. Finally
- *	the device is moved into the up state and a %NETDEV_UP message is
- *	sent to the netdev notifier chain.
- *
- *	Calling this function on an active interface is a nop. On a failure
- *	a negative errno code is returned.
- */
-int dev_open(struct net_device *dev)
+static int __dev_open(struct net_device *dev)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int ret;
 
 	ASSERT_RTNL();
 
-	/*
-	 *	Is it already up?
-	 */
-
-	if (dev->flags & IFF_UP)
-		return 0;
-
 	/*
 	 *	Is it even present?
 	 */
@@ -1187,36 +1168,57 @@ int dev_open(struct net_device *dev)
 		 *	Wakeup transmit queue engine
 		 */
 		dev_activate(dev);
-
-		/*
-		 *	... and announce new interface.
-		 */
-		call_netdevice_notifiers(NETDEV_UP, dev);
 	}
 
 	return ret;
 }
-EXPORT_SYMBOL(dev_open);
 
 /**
- *	dev_close - shutdown an interface.
- *	@dev: device to shutdown
+ *	dev_open	- prepare an interface for use.
+ *	@dev:	device to open
  *
- *	This function moves an active device into down state. A
- *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
- *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
- *	chain.
+ *	Takes a device from down to up state. The device's private open
+ *	function is invoked and then the multicast lists are loaded. Finally
+ *	the device is moved into the up state and a %NETDEV_UP message is
+ *	sent to the netdev notifier chain.
+ *
+ *	Calling this function on an active interface is a nop. On a failure
+ *	a negative errno code is returned.
  */
-int dev_close(struct net_device *dev)
+int dev_open(struct net_device *dev)
+{
+	int ret;
+
+	/*
+	 *	Is it already up?
+	 */
+	if (dev->flags & IFF_UP)
+		return 0;
+
+	/*
+	 *	Open device
+	 */
+	ret = __dev_open(dev);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 *	... and announce new interface.
+	 */
+	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
+	call_netdevice_notifiers(NETDEV_UP, dev);
+
+	return ret;
+}
+EXPORT_SYMBOL(dev_open);
+
+static int __dev_close(struct net_device *dev)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
-	ASSERT_RTNL();
 
+	ASSERT_RTNL();
 	might_sleep();
 
-	if (!(dev->flags & IFF_UP))
-		return 0;
-
 	/*
 	 *	Tell people we are going down, so that they can
 	 *	prepare to death, when device is still operating.
@@ -1252,14 +1254,34 @@ int dev_close(struct net_device *dev)
 	dev->flags &= ~IFF_UP;
 
 	/*
-	 * Tell people we are down
+	 *	Shutdown NET_DMA
 	 */
-	call_netdevice_notifiers(NETDEV_DOWN, dev);
+	net_dmaengine_put();
+
+	return 0;
+}
+
+/**
+ *	dev_close - shutdown an interface.
+ *	@dev: device to shutdown
+ *
+ *	This function moves an active device into down state. A
+ *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
+ *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
+ *	chain.
+ */
+int dev_close(struct net_device *dev)
+{
+	if (!(dev->flags & IFF_UP))
+		return 0;
+
+	__dev_close(dev);
 
 	/*
-	 *	Shutdown NET_DMA
+	 * Tell people we are down
 	 */
-	net_dmaengine_put();
+	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
+	call_netdevice_notifiers(NETDEV_DOWN, dev);
 
 	return 0;
 }
@@ -4299,18 +4321,10 @@ unsigned dev_get_flags(const struct net_device *dev)
 }
 EXPORT_SYMBOL(dev_get_flags);
 
-/**
- *	dev_change_flags - change device settings
- *	@dev: device
- *	@flags: device state flags
- *
- *	Change settings on device based state flags. The flags are
- *	in the userspace exported format.
- */
-int dev_change_flags(struct net_device *dev, unsigned flags)
+int __dev_change_flags(struct net_device *dev, unsigned int flags)
 {
-	int ret, changes;
 	int old_flags = dev->flags;
+	int ret;
 
 	ASSERT_RTNL();
 
@@ -4341,17 +4355,12 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
 
 	ret = 0;
 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
-		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
+		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
 
 		if (!ret)
 			dev_set_rx_mode(dev);
 	}
 
-	if (dev->flags & IFF_UP &&
-	    ((old_flags ^ dev->flags) & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
-					  IFF_VOLATILE)))
-		call_netdevice_notifiers(NETDEV_CHANGE, dev);
-
 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 
@@ -4370,11 +4379,47 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
 		dev_set_allmulti(dev, inc);
 	}
 
-	/* Exclude state transition flags, already notified */
-	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
+	return ret;
+}
+
+void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
+{
+	unsigned int changes = dev->flags ^ old_flags;
+
+	if (changes & IFF_UP) {
+		if (dev->flags & IFF_UP)
+			call_netdevice_notifiers(NETDEV_UP, dev);
+		else
+			call_netdevice_notifiers(NETDEV_DOWN, dev);
+	}
+
+	if (dev->flags & IFF_UP &&
+	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
+		call_netdevice_notifiers(NETDEV_CHANGE, dev);
+}
+
+/**
+ *	dev_change_flags - change device settings
+ *	@dev: device
+ *	@flags: device state flags
+ *
+ *	Change settings on device based state flags. The flags are
+ *	in the userspace exported format.
+ */
+int dev_change_flags(struct net_device *dev, unsigned flags)
+{
+	int ret, changes;
+	int old_flags = dev->flags;
+
+	ret = __dev_change_flags(dev, flags);
+	if (ret < 0)
+		return ret;
+
+	changes = old_flags ^ dev->flags;
 	if (changes)
 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
 
+	__dev_notify_flags(dev, old_flags);
 	return ret;
 }
 EXPORT_SYMBOL(dev_change_flags);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 020e43bfef5f..c21ec4236dd0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1427,8 +1427,6 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	switch (event) {
 	case NETDEV_UP:
 	case NETDEV_DOWN:
-		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
-		break;
 	case NETDEV_PRE_UP:
 	case NETDEV_POST_INIT:
 	case NETDEV_REGISTER:
-- 
cgit v1.2.2


From 3729d5021257b283f7fce33d957893162ccb2c9d Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 26 Feb 2010 06:34:54 +0000
Subject: rtnetlink: support specifying device flags on device creation

commit e8469ed959c373c2ff9e6f488aa5a14971aebe1f
Author: Patrick McHardy <kaber@trash.net>
Date:   Tue Feb 23 20:41:30 2010 +0100

Support specifying the initial device flags when creating a device though
rtnl_link. Devices allocated by rtnl_create_link() are marked as INITIALIZING
in order to surpress netlink registration notifications. To complete setup,
rtnl_configure_link() must be called, which performs the device flag changes
and invokes the deferred notifiers if everything went well.

Two examples:

# add macvlan to eth0
#
$ ip link add link eth0 up allmulticast on type macvlan

[LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN
    link/ether 26:f8:84:02:f9:2a brd ff:ff:ff:ff:ff:ff
[ROUTE]ff00::/8 dev macvlan0  table local  metric 256  mtu 1500 advmss 1440 hoplimit 0
[ROUTE]fe80::/64 dev macvlan0  proto kernel  metric 256  mtu 1500 advmss 1440 hoplimit 0
[LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500
    link/ether 26:f8:84:02:f9:2a
[ADDR]11: macvlan0    inet6 fe80::24f8:84ff:fe02:f92a/64 scope link
       valid_lft forever preferred_lft forever
[ROUTE]local fe80::24f8:84ff:fe02:f92a via :: dev lo  table local  proto none  metric 0  mtu 16436 advmss 16376 hoplimit 0
[ROUTE]default via fe80::215:e9ff:fef0:10f8 dev macvlan0  proto kernel  metric 1024  mtu 1500 advmss 1440 hoplimit 0
[NEIGH]fe80::215:e9ff:fef0:10f8 dev macvlan0 lladdr 00:15:e9:f0:10:f8 router STALE
[ROUTE]2001:6f8:974::/64 dev macvlan0  proto kernel  metric 256  expires 0sec mtu 1500 advmss 1440 hoplimit 0
[PREFIX]prefix 2001:6f8:974::/64 dev macvlan0 onlink autoconf valid 14400 preferred 131084
[ADDR]11: macvlan0    inet6 2001:6f8:974:0:24f8:84ff:fe02:f92a/64 scope global dynamic
       valid_lft 86399sec preferred_lft 14399sec

# add VLAN to eth1, eth1 is down
#
$ ip link add link eth1 up type vlan id 1000
RTNETLINK answers: Network is down

<no events>

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 52 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 9 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c21ec4236dd0..d1472a423323 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -549,6 +549,19 @@ static void set_operstate(struct net_device *dev, unsigned char transition)
 	}
 }
 
+static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
+					   const struct ifinfomsg *ifm)
+{
+	unsigned int flags = ifm->ifi_flags;
+
+	/* bugwards compatibility: ifi_change == 0 is treated as ~0 */
+	if (ifm->ifi_change)
+		flags = (flags & ifm->ifi_change) |
+			(dev->flags & ~ifm->ifi_change);
+
+	return flags;
+}
+
 static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 				 const struct net_device_stats *b)
 {
@@ -904,13 +917,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 	}
 
 	if (ifm->ifi_flags || ifm->ifi_change) {
-		unsigned int flags = ifm->ifi_flags;
-
-		/* bugwards compatibility: ifi_change == 0 is treated as ~0 */
-		if (ifm->ifi_change)
-			flags = (flags & ifm->ifi_change) |
-				(dev->flags & ~ifm->ifi_change);
-		err = dev_change_flags(dev, flags);
+		err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
 		if (err < 0)
 			goto errout;
 	}
@@ -1053,6 +1060,26 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	return 0;
 }
 
+int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
+{
+	unsigned int old_flags;
+	int err;
+
+	old_flags = dev->flags;
+	if (ifm && (ifm->ifi_flags || ifm->ifi_change)) {
+		err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
+		if (err < 0)
+			return err;
+	}
+
+	dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
+	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
+
+	__dev_notify_flags(dev, old_flags);
+	return 0;
+}
+EXPORT_SYMBOL(rtnl_configure_link);
+
 struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
 	char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])
 {
@@ -1074,6 +1101,7 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
 
 	dev_net_set(dev, net);
 	dev->rtnl_link_ops = ops;
+	dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
 	dev->real_num_tx_queues = real_num_queues;
 
 	if (strchr(dev->name, '%')) {
@@ -1203,7 +1231,7 @@ replay:
 		if (!(nlh->nlmsg_flags & NLM_F_CREATE))
 			return -ENODEV;
 
-		if (ifm->ifi_index || ifm->ifi_flags || ifm->ifi_change)
+		if (ifm->ifi_index)
 			return -EOPNOTSUPP;
 		if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
 			return -EOPNOTSUPP;
@@ -1234,9 +1262,15 @@ replay:
 			err = ops->newlink(net, dev, tb, data);
 		else
 			err = register_netdevice(dev);
-		if (err < 0 && !IS_ERR(dev))
+		if (err < 0 && !IS_ERR(dev)) {
 			free_netdev(dev);
+			goto out;
+		}
 
+		err = rtnl_configure_link(dev, ifm);
+		if (err < 0)
+			unregister_netdevice(dev);
+out:
 		put_net(dest_net);
 		return err;
 	}
-- 
cgit v1.2.2


From 9675478bbafed08848bf8d7e28400d5e46330b23 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Fri, 26 Feb 2010 21:43:38 +0000
Subject: ethtool: do not set some flags, if others failed

NETIF_F_NTUPLE flag setting introduced a bug:  non-ntuple flags
like LRO may be successfully set, before ioctl(2) returns failure
to userspace.

The set-flags operation should be all-or-none, rather than leaving
things in an inconsistent state prior to reporting failure to
userspace.

Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 31b1eddc1b84..0f2f82185ec4 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -135,21 +135,23 @@ u32 ethtool_op_get_flags(struct net_device *dev)
 int ethtool_op_set_flags(struct net_device *dev, u32 data)
 {
 	const struct ethtool_ops *ops = dev->ethtool_ops;
+	unsigned long features = dev->features;
 
 	if (data & ETH_FLAG_LRO)
-		dev->features |= NETIF_F_LRO;
+		features |= NETIF_F_LRO;
 	else
-		dev->features &= ~NETIF_F_LRO;
+		features &= ~NETIF_F_LRO;
 
 	if (data & ETH_FLAG_NTUPLE) {
 		if (!ops->set_rx_ntuple)
 			return -EOPNOTSUPP;
-		dev->features |= NETIF_F_NTUPLE;
+		features |= NETIF_F_NTUPLE;
 	} else {
 		/* safe to clear regardless */
-		dev->features &= ~NETIF_F_NTUPLE;
+		features &= ~NETIF_F_NTUPLE;
 	}
 
+	dev->features = features;
 	return 0;
 }
 
-- 
cgit v1.2.2


From 76dadd76c265a0cdb5a76aa4eef03fcc9639b388 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 28 Feb 2010 01:20:36 +0000
Subject: scm: Only support SCM_RIGHTS on unix domain sockets.

We use scm_send and scm_recv on both unix domain and
netlink sockets, but only unix domain sockets support
everything required for file descriptor passing,
so error if someone attempts to pass file descriptors
over netlink sockets.

Cc: stable@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/scm.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core')

diff --git a/net/core/scm.c b/net/core/scm.c
index b7ba91b074b3..9b264634acfd 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -156,6 +156,8 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 		switch (cmsg->cmsg_type)
 		{
 		case SCM_RIGHTS:
+			if (!sock->ops || sock->ops->family != PF_UNIX)
+				goto error;
 			err=scm_fp_copy(cmsg, &p->fp);
 			if (err<0)
 				goto error;
-- 
cgit v1.2.2


From 8eae939f1400326b06d0c9afe53d2a484a326871 Mon Sep 17 00:00:00 2001
From: Zhu Yi <yi.zhu@intel.com>
Date: Thu, 4 Mar 2010 18:01:40 +0000
Subject: net: add limit for socket backlog

We got system OOM while running some UDP netperf testing on the loopback
device. The case is multiple senders sent stream UDP packets to a single
receiver via loopback on local host. Of course, the receiver is not able
to handle all the packets in time. But we surprisingly found that these
packets were not discarded due to the receiver's sk->sk_rcvbuf limit.
Instead, they are kept queuing to sk->sk_backlog and finally ate up all
the memory. We believe this is a secure hole that a none privileged user
can crash the system.

The root cause for this problem is, when the receiver is doing
__release_sock() (i.e. after userspace recv, kernel udp_recvmsg ->
skb_free_datagram_locked -> release_sock), it moves skbs from backlog to
sk_receive_queue with the softirq enabled. In the above case, multiple
busy senders will almost make it an endless loop. The skbs in the
backlog end up eat all the system memory.

The issue is not only for UDP. Any protocols using socket backlog is
potentially affected. The patch adds limit for socket backlog so that
the backlog size cannot be expanded endlessly.

Reported-by: Alex Shi <alex.shi@intel.com>
Cc: David Miller <davem@davemloft.net>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru
Cc: "Pekka Savola (ipv6)" <pekkas@netcore.fi>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Vlad Yasevich <vladislav.yasevich@hp.com>
Cc: Sridhar Samudrala <sri@us.ibm.com>
Cc: Jon Maloy <jon.maloy@ericsson.com>
Cc: Allan Stephens <allan.stephens@windriver.com>
Cc: Andrew Hendry <andrew.hendry@gmail.com>
Signed-off-by: Zhu Yi <yi.zhu@intel.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index fcd397a762ff..6e22dc973d23 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -340,8 +340,12 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 		rc = sk_backlog_rcv(sk, skb);
 
 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
-	} else
-		sk_add_backlog(sk, skb);
+	} else if (sk_add_backlog_limited(sk, skb)) {
+		bh_unlock_sock(sk);
+		atomic_inc(&sk->sk_drops);
+		goto discard_and_relse;
+	}
+
 	bh_unlock_sock(sk);
 out:
 	sock_put(sk);
@@ -1139,6 +1143,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		sock_lock_init(newsk);
 		bh_lock_sock(newsk);
 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
+		newsk->sk_backlog.len = 0;
 
 		atomic_set(&newsk->sk_rmem_alloc, 0);
 		/*
@@ -1542,6 +1547,12 @@ static void __release_sock(struct sock *sk)
 
 		bh_lock_sock(sk);
 	} while ((skb = sk->sk_backlog.head) != NULL);
+
+	/*
+	 * Doing the zeroing here guarantee we can not loop forever
+	 * while a wild producer attempts to flood us.
+	 */
+	sk->sk_backlog.len = 0;
 }
 
 /**
@@ -1874,6 +1885,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_allocation	=	GFP_KERNEL;
 	sk->sk_rcvbuf		=	sysctl_rmem_default;
 	sk->sk_sndbuf		=	sysctl_wmem_default;
+	sk->sk_backlog.limit	=	sk->sk_rcvbuf << 1;
 	sk->sk_state		=	TCP_CLOSE;
 	sk_set_socket(sk, sock);
 
-- 
cgit v1.2.2


From a3a858ff18a72a8d388e31ab0d98f7e944841a62 Mon Sep 17 00:00:00 2001
From: Zhu Yi <yi.zhu@intel.com>
Date: Thu, 4 Mar 2010 18:01:47 +0000
Subject: net: backlog functions rename

sk_add_backlog -> __sk_add_backlog
sk_add_backlog_limited -> sk_add_backlog

Signed-off-by: Zhu Yi <yi.zhu@intel.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 6e22dc973d23..61a65a2e0455 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -340,7 +340,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 		rc = sk_backlog_rcv(sk, skb);
 
 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
-	} else if (sk_add_backlog_limited(sk, skb)) {
+	} else if (sk_add_backlog(sk, skb)) {
 		bh_unlock_sock(sk);
 		atomic_inc(&sk->sk_drops);
 		goto discard_and_relse;
-- 
cgit v1.2.2


From 723b2f57ad83ee7087acf9a95e8e289414b1f521 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jgarzik@redhat.com>
Date: Wed, 3 Mar 2010 22:51:50 +0000
Subject: ethtool: Add direct access to ops->get_sset_count

This patch is an alternative approach for accessing string
counts, vs. the drvinfo indirect approach.  This way the drvinfo
space doesn't run out, and we don't break ABI later.

Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 0f2f82185ec4..70075c47ada8 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -214,6 +214,10 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use
 	info.cmd = ETHTOOL_GDRVINFO;
 	ops->get_drvinfo(dev, &info);
 
+	/*
+	 * this method of obtaining string set info is deprecated;
+	 * consider using ETHTOOL_GSSET_INFO instead
+	 */
 	if (ops->get_sset_count) {
 		int rc;
 
@@ -237,6 +241,71 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use
 	return 0;
 }
 
+/*
+ * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
+ */
+static noinline int ethtool_get_sset_info(struct net_device *dev,
+                                          void __user *useraddr)
+{
+	struct ethtool_sset_info info;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	u64 sset_mask;
+	int i, idx = 0, n_bits = 0, ret, rc;
+	u32 *info_buf = NULL;
+
+	if (!ops->get_sset_count)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&info, useraddr, sizeof(info)))
+		return -EFAULT;
+
+	/* store copy of mask, because we zero struct later on */
+	sset_mask = info.sset_mask;
+	if (!sset_mask)
+		return 0;
+
+	/* calculate size of return buffer */
+	for (i = 0; i < 64; i++)
+		if (sset_mask & (1ULL << i))
+			n_bits++;
+
+	memset(&info, 0, sizeof(info));
+	info.cmd = ETHTOOL_GSSET_INFO;
+
+	info_buf = kzalloc(n_bits * sizeof(u32), GFP_USER);
+	if (!info_buf)
+		return -ENOMEM;
+
+	/*
+	 * fill return buffer based on input bitmask and successful
+	 * get_sset_count return
+	 */
+	for (i = 0; i < 64; i++) {
+		if (!(sset_mask & (1ULL << i)))
+			continue;
+
+		rc = ops->get_sset_count(dev, i);
+		if (rc >= 0) {
+			info.sset_mask |= (1ULL << i);
+			info_buf[idx++] = rc;
+		}
+	}
+
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, &info, sizeof(info)))
+		goto out;
+
+	useraddr += offsetof(struct ethtool_sset_info, data);
+	if (copy_to_user(useraddr, info_buf, idx * sizeof(u32)))
+		goto out;
+
+	ret = 0;
+
+out:
+	kfree(info_buf);
+	return ret;
+}
+
 /*
  * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
  */
@@ -1471,6 +1540,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GRXNTUPLE:
 		rc = ethtool_get_rx_ntuple(dev, useraddr);
 		break;
+	case ETHTOOL_GSSET_INFO:
+		rc = ethtool_get_sset_info(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.2


From d17792ebdf90289c9fd1bce888076d3d60ecd53b Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Thu, 4 Mar 2010 08:21:53 +0000
Subject: ethtool: Add direct access to ops->get_sset_count

On 03/04/2010 09:26 AM, Ben Hutchings wrote:
> On Thu, 2010-03-04 at 00:51 -0800, Jeff Kirsher wrote:
>> From: Jeff Garzik<jgarzik@redhat.com>
>>
>> This patch is an alternative approach for accessing string
>> counts, vs. the drvinfo indirect approach.  This way the drvinfo
>> space doesn't run out, and we don't break ABI later.
> [...]
>> --- a/net/core/ethtool.c
>> +++ b/net/core/ethtool.c
>> @@ -214,6 +214,10 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use
>>   	info.cmd = ETHTOOL_GDRVINFO;
>>   	ops->get_drvinfo(dev,&info);
>>
>> +	/*
>> +	 * this method of obtaining string set info is deprecated;
>> +	 * consider using ETHTOOL_GSSET_INFO instead
>> +	 */
>
> This comment belongs on the interface (ethtool.h) not the
> implementation.

Debatable -- the current comment is located at the callsite of
ops->get_sset_count(), which is where an implementor might think to add
a new call.  Not all the numeric fields in ethtool_drvinfo are obtained
from ->get_sset_count().

Hence the "some" in the attached patch to include/linux/ethtool.h,
addressing your comment.

> [...]
>> +static noinline int ethtool_get_sset_info(struct net_device *dev,
>> +                                          void __user *useraddr)
>> +{
> [...]
>> +	/* calculate size of return buffer */
>> +	for (i = 0; i<  64; i++)
>> +		if (sset_mask&  (1ULL<<  i))
>> +			n_bits++;
> [...]
>
> We have a function for this:
>
> 	n_bits = hweight64(sset_mask);

Agreed.

I've attached a follow-up patch, which should enable my/Jeff's kernel
patch to be applied, followed by this one.

Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 70075c47ada8..33d2ded50f84 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/ethtool.h>
 #include <linux/netdevice.h>
+#include <linux/bitops.h>
 #include <asm/uaccess.h>
 
 /*
@@ -216,7 +217,7 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use
 
 	/*
 	 * this method of obtaining string set info is deprecated;
-	 * consider using ETHTOOL_GSSET_INFO instead
+	 * Use ETHTOOL_GSSET_INFO instead.
 	 */
 	if (ops->get_sset_count) {
 		int rc;
@@ -265,9 +266,7 @@ static noinline int ethtool_get_sset_info(struct net_device *dev,
 		return 0;
 
 	/* calculate size of return buffer */
-	for (i = 0; i < 64; i++)
-		if (sset_mask & (1ULL << i))
-			n_bits++;
+	n_bits = hweight64(sset_mask);
 
 	memset(&info, 0, sizeof(info));
 	info.cmd = ETHTOOL_GSSET_INFO;
-- 
cgit v1.2.2


From 72150e9b7fec217fbd646a29ea2f65a3d4d55ea9 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 6 Mar 2010 01:04:45 +0000
Subject: sock.c: potential null dereference

We test that "prot->rsk_prot" is non-null right before we dereference it
on this line.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 61a65a2e0455..c5812bbc2cc9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2288,7 +2288,8 @@ out_free_request_sock_slab:
 		prot->rsk_prot->slab = NULL;
 	}
 out_free_request_sock_slab_name:
-	kfree(prot->rsk_prot->slab_name);
+	if (prot->rsk_prot)
+		kfree(prot->rsk_prot->slab_name);
 out_free_sock_slab:
 	kmem_cache_destroy(prot->slab);
 	prot->slab = NULL;
-- 
cgit v1.2.2


From f5c445ed4148434f142be0263a8ad7cb58503e8a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 8 Mar 2010 12:17:04 -0800
Subject: ethtool: Use noinline_for_stack

Use self documenting noinline_for_stack instead of duplicated comments.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 40 ++++++++--------------------------------
 1 file changed, 8 insertions(+), 32 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 33d2ded50f84..f4cb6b6299d9 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -200,10 +200,7 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
 	return dev->ethtool_ops->set_settings(dev, &cmd);
 }
 
-/*
- * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
- */
-static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
+static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_drvinfo info;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -242,10 +239,7 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use
 	return 0;
 }
 
-/*
- * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
- */
-static noinline int ethtool_get_sset_info(struct net_device *dev,
+static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
                                           void __user *useraddr)
 {
 	struct ethtool_sset_info info;
@@ -305,10 +299,7 @@ out:
 	return ret;
 }
 
-/*
- * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
- */
-static noinline int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr)
+static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_rxnfc cmd;
 
@@ -321,10 +312,7 @@ static noinline int ethtool_set_rxnfc(struct net_device *dev, void __user *usera
 	return dev->ethtool_ops->set_rxnfc(dev, &cmd);
 }
 
-/*
- * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
- */
-static noinline int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr)
+static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_rxnfc info;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -396,10 +384,7 @@ static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
 	list->count++;
 }
 
-/*
- * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
- */
-static noinline int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr)
+static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_rx_ntuple cmd;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -867,10 +852,7 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
 	return ret;
 }
 
-/*
- * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
- */
-static noinline int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr)
+static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
 
@@ -884,10 +866,7 @@ static noinline int ethtool_get_coalesce(struct net_device *dev, void __user *us
 	return 0;
 }
 
-/*
- * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
- */
-static noinline int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr)
+static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_coalesce coalesce;
 
@@ -1297,10 +1276,7 @@ static int ethtool_set_value(struct net_device *dev, char __user *useraddr,
 	return actor(dev, edata.data);
 }
 
-/*
- * noinline attribute so that gcc doesnt use too much stack in dev_ethtool()
- */
-static noinline int ethtool_flash_device(struct net_device *dev, char __user *useraddr)
+static noinline_for_stack int ethtool_flash_device(struct net_device *dev, char __user *useraddr)
 {
 	struct ethtool_flash efl;
 
-- 
cgit v1.2.2


From 0a141509ede48ac33ef756ac1640f4d3f46fa2db Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 9 Mar 2010 19:40:54 +0000
Subject: net: Annotates neigh_invalidate()

Annotates neigh_invalidate() with __releases() and __acquires() for
sparse sake.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d102f6d9abdc..6cee6434da67 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -771,6 +771,8 @@ static __inline__ int neigh_max_probes(struct neighbour *n)
 }
 
 static void neigh_invalidate(struct neighbour *neigh)
+	__releases(neigh->lock)
+	__acquires(neigh->lock)
 {
 	struct sk_buff *skb;
 
-- 
cgit v1.2.2


From 3041f5170751e3522aa1bd6e8ca5d98e846720b0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 9 Mar 2010 19:09:08 +0000
Subject: net: Fix dev_mc_add()

Commit 6e17d45a (net: add addr len check to dev_mc_add)
added a bug in dev_mc_add(), since it can now exit with a lock
imbalance.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev_mcast.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index fd91569e2394..3dc295beb483 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -97,8 +97,9 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 
 	netif_addr_lock_bh(dev);
 	if (alen != dev->addr_len)
-		return -EINVAL;
-	err = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl);
+		err = -EINVAL;
+	else
+		err = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl);
 	if (!err)
 		__dev_set_rx_mode(dev);
 	netif_addr_unlock_bh(dev);
-- 
cgit v1.2.2


From 21edbb223ed2af88b090e7945af7d91d672e3aa6 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 16 Mar 2010 05:29:54 +0000
Subject: NET: netpoll, fix potential NULL ptr dereference

Stanse found that one error path in netpoll_setup dereferences npinfo
even though it is NULL. Avoid that by adding new label and go to that
instead.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Daniel Borkmann <danborkmann@googlemail.com>
Cc: David S. Miller <davem@davemloft.net>
Acked-by: chavey@google.com
Acked-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netpoll.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 7aa697253765..d4ec38fa64e6 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -735,7 +735,7 @@ int netpoll_setup(struct netpoll *np)
 		npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
 		if (!npinfo) {
 			err = -ENOMEM;
-			goto release;
+			goto put;
 		}
 
 		npinfo->rx_flags = 0;
@@ -845,7 +845,7 @@ int netpoll_setup(struct netpoll *np)
 
 		kfree(npinfo);
 	}
-
+put:
 	dev_put(ndev);
 	return err;
 }
-- 
cgit v1.2.2


From 0641e4fbf2f824faee00ea74c459a088d94905fd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 18 Mar 2010 21:16:45 -0700
Subject: net: Potential null skb->dev dereference

When doing "ifenslave -d bond0 eth0", there is chance to get NULL
dereference in netif_receive_skb(), because dev->master suddenly becomes
NULL after we tested it.

We should use ACCESS_ONCE() to avoid this (or rcu_dereference())

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index bcc490cc9452..59d4394d2ce8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2483,6 +2483,7 @@ int netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
 	struct net_device *orig_dev;
+	struct net_device *master;
 	struct net_device *null_or_orig;
 	struct net_device *null_or_bond;
 	int ret = NET_RX_DROP;
@@ -2503,11 +2504,12 @@ int netif_receive_skb(struct sk_buff *skb)
 
 	null_or_orig = NULL;
 	orig_dev = skb->dev;
-	if (orig_dev->master) {
-		if (skb_bond_should_drop(skb))
+	master = ACCESS_ONCE(orig_dev->master);
+	if (master) {
+		if (skb_bond_should_drop(skb, master))
 			null_or_orig = orig_dev; /* deliver only exact match */
 		else
-			skb->dev = orig_dev->master;
+			skb->dev = master;
 	}
 
 	__get_cpu_var(netdev_rx_stat).total++;
-- 
cgit v1.2.2


From 5fc05f8764f301138003ff562a31ad3721f1675f Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Sun, 21 Mar 2010 22:59:58 +0000
Subject: netpoll: warn when there are spaces in parameters

v2: update according to Frans' comments.

Currently, if we leave spaces before dst port,
netconsole will silently accept it as 0. Warn about this.

Also, when spaces appear in other places, make them
visible in error messages.

Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: David Miller <davem@davemloft.net>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netpoll.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index d4ec38fa64e6..6f9206b36dc2 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -614,7 +614,7 @@ void netpoll_print_options(struct netpoll *np)
 			 np->name, np->local_port);
 	printk(KERN_INFO "%s: local IP %pI4\n",
 			 np->name, &np->local_ip);
-	printk(KERN_INFO "%s: interface %s\n",
+	printk(KERN_INFO "%s: interface '%s'\n",
 			 np->name, np->dev_name);
 	printk(KERN_INFO "%s: remote port %d\n",
 			 np->name, np->remote_port);
@@ -661,6 +661,9 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
 		if ((delim = strchr(cur, '@')) == NULL)
 			goto parse_failed;
 		*delim = 0;
+		if (*cur == ' ' || *cur == '\t')
+			printk(KERN_INFO "%s: warning: whitespace"
+					"is not allowed\n", np->name);
 		np->remote_port = simple_strtol(cur, NULL, 10);
 		cur = delim;
 	}
@@ -708,7 +711,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
 	return 0;
 
  parse_failed:
-	printk(KERN_INFO "%s: couldn't parse config at %s!\n",
+	printk(KERN_INFO "%s: couldn't parse config at '%s'!\n",
 	       np->name, cur);
 	return -1;
 }
-- 
cgit v1.2.2


From 5a0e3ad6af8660be21ca98a971cd00f331318c05 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 24 Mar 2010 17:04:11 +0900
Subject: include cleanup: Update gfp.h and slab.h includes to prepare for
 breaking implicit slab.h inclusion from percpu.h

percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files.  percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.

percpu.h -> slab.h dependency is about to be removed.  Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability.  As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.

  http://userweb.kernel.org/~tj/misc/slabh-sweep.py

The script does the followings.

* Scan files for gfp and slab usages and update includes such that
  only the necessary includes are there.  ie. if only gfp is used,
  gfp.h, if slab is used, slab.h.

* When the script inserts a new include, it looks at the include
  blocks and try to put the new include such that its order conforms
  to its surrounding.  It's put in the include block which contains
  core kernel includes, in the same order that the rest are ordered -
  alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
  doesn't seem to be any matching order.

* If the script can't find a place to put a new include (mostly
  because the file doesn't have fitting include block), it prints out
  an error message indicating which .h file needs to be added to the
  file.

The conversion was done in the following steps.

1. The initial automatic conversion of all .c files updated slightly
   over 4000 files, deleting around 700 includes and adding ~480 gfp.h
   and ~3000 slab.h inclusions.  The script emitted errors for ~400
   files.

2. Each error was manually checked.  Some didn't need the inclusion,
   some needed manual addition while adding it to implementation .h or
   embedding .c file was more appropriate for others.  This step added
   inclusions to around 150 files.

3. The script was run again and the output was compared to the edits
   from #2 to make sure no file was left behind.

4. Several build tests were done and a couple of problems were fixed.
   e.g. lib/decompress_*.c used malloc/free() wrappers around slab
   APIs requiring slab.h to be added manually.

5. The script was run on all .h files but without automatically
   editing them as sprinkling gfp.h and slab.h inclusions around .h
   files could easily lead to inclusion dependency hell.  Most gfp.h
   inclusion directives were ignored as stuff from gfp.h was usually
   wildly available and often used in preprocessor macros.  Each
   slab.h inclusion directive was examined and added manually as
   necessary.

6. percpu.h was updated not to include slab.h.

7. Build test were done on the following configurations and failures
   were fixed.  CONFIG_GCOV_KERNEL was turned off for all tests (as my
   distributed build env didn't work with gcov compiles) and a few
   more options had to be turned off depending on archs to make things
   build (like ipr on powerpc/64 which failed due to missing writeq).

   * x86 and x86_64 UP and SMP allmodconfig and a custom test config.
   * powerpc and powerpc64 SMP allmodconfig
   * sparc and sparc64 SMP allmodconfig
   * ia64 SMP allmodconfig
   * s390 SMP allmodconfig
   * alpha SMP allmodconfig
   * um on x86_64 SMP allmodconfig

8. percpu.h modifications were reverted so that it could be applied as
   a separate patch and serve as bisection point.

Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable easily on most builds of
the specific arch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
---
 net/core/datagram.c        | 1 +
 net/core/dev.c             | 1 +
 net/core/drop_monitor.c    | 1 +
 net/core/dst.c             | 1 +
 net/core/ethtool.c         | 1 +
 net/core/fib_rules.c       | 1 +
 net/core/filter.c          | 1 +
 net/core/gen_estimator.c   | 1 +
 net/core/iovec.c           | 1 -
 net/core/link_watch.c      | 1 -
 net/core/neighbour.c       | 1 +
 net/core/net-sysfs.c       | 1 +
 net/core/net-traces.c      | 1 +
 net/core/netpoll.c         | 1 +
 net/core/scm.c             | 1 +
 net/core/sysctl_net_core.c | 1 +
 16 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index 95c2e0840d0d..2dccd4ee591b 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -48,6 +48,7 @@
 #include <linux/poll.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
+#include <linux/slab.h>
 
 #include <net/protocol.h>
 #include <linux/skbuff.h>
diff --git a/net/core/dev.c b/net/core/dev.c
index 59d4394d2ce8..1c8a0ce473a8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -80,6 +80,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/hash.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/mutex.h>
 #include <linux/string.h>
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index f8c874975350..cf208d8042b1 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -21,6 +21,7 @@
 #include <linux/percpu.h>
 #include <linux/timer.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include <net/genetlink.h>
 #include <net/netevent.h>
 
diff --git a/net/core/dst.c b/net/core/dst.c
index cb1b3488b739..f307bc18f6a0 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -12,6 +12,7 @@
 #include <linux/workqueue.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/string.h>
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f4cb6b6299d9..9d55c57f318a 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -18,6 +18,7 @@
 #include <linux/ethtool.h>
 #include <linux/netdevice.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 /*
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 9a24377146bf..d2c3e7dc2e5f 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -10,6 +10,7 @@
 
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
diff --git a/net/core/filter.c b/net/core/filter.c
index d38ef7fd50f0..ff943bed21af 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -25,6 +25,7 @@
 #include <linux/inet.h>
 #include <linux/netdevice.h>
 #include <linux/if_packet.h>
+#include <linux/gfp.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/netlink.h>
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 493775f4f2f1..cf8e70392fe0 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -32,6 +32,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
 #include <linux/rbtree.h>
+#include <linux/slab.h>
 #include <net/sock.h>
 #include <net/gen_stats.h>
 
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 16ad45d4882b..1e7f4e91a935 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -20,7 +20,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/net.h>
 #include <linux/in6.h>
 #include <asm/uaccess.h>
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 5910b555a54a..bdbce2f5875b 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -19,7 +19,6 @@
 #include <linux/rtnetlink.h>
 #include <linux/jiffies.h>
 #include <linux/spinlock.h>
-#include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 #include <asm/types.h>
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 6cee6434da67..bff37908bd55 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -15,6 +15,7 @@
  *	Harald Welte		Add neighbour cache statistics like rtstat
  */
 
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 099c753c4213..59cfc7d8fc45 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -13,6 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
+#include <linux/slab.h>
 #include <net/sock.h>
 #include <linux/rtnetlink.h>
 #include <linux/wireless.h>
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index f1e982c508bb..afa6380ed88a 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -19,6 +19,7 @@
 #include <linux/workqueue.h>
 #include <linux/netlink.h>
 #include <linux/net_dropmon.h>
+#include <linux/slab.h>
 
 #include <asm/unaligned.h>
 #include <asm/bitops.h>
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 6f9206b36dc2..a58f59b97597 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -22,6 +22,7 @@
 #include <linux/delay.h>
 #include <linux/rcupdate.h>
 #include <linux/workqueue.h>
+#include <linux/slab.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <asm/unaligned.h>
diff --git a/net/core/scm.c b/net/core/scm.c
index 9b264634acfd..b88f6f9d0b97 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -26,6 +26,7 @@
 #include <linux/security.h>
 #include <linux/pid.h>
 #include <linux/nsproxy.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 06124872af5b..b7b6b8208f75 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -12,6 +12,7 @@
 #include <linux/netdevice.h>
 #include <linux/ratelimit.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 
 #include <net/ip.h>
 #include <net/sock.h>
-- 
cgit v1.2.2


From 8728c544a9cbdcb0034aa5c45706c5f953f030ee Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 11 Apr 2010 21:18:17 +0000
Subject: net: dev_pick_tx() fix

When dev_pick_tx() caches tx queue_index on a socket, we must check
socket dst_entry matches skb one, or risk a crash later, as reported by
Denys Fedorysychenko, if old packets are in flight during a route
change, involving devices with different number of queues.

Bug introduced by commit a4ee3ce3
(net: Use sk_tx_queue_mapping for connected sockets)

Reported-by: Denys Fedorysychenko <nuclearcat@nuclearcat.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 1c8a0ce473a8..92584bfef09b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1989,8 +1989,12 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 			if (dev->real_num_tx_queues > 1)
 				queue_index = skb_tx_hash(dev, skb);
 
-			if (sk && sk->sk_dst_cache)
-				sk_tx_queue_set(sk, queue_index);
+			if (sk) {
+				struct dst_entry *dst = rcu_dereference(sk->sk_dst_cache);
+
+				if (dst && skb_dst(skb) == dst)
+					sk_tx_queue_set(sk, queue_index);
+			}
 		}
 	}
 
-- 
cgit v1.2.2


From 05d17608a69b3ae653ea5c9857283bef3439c733 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 20 Apr 2010 00:25:58 +0000
Subject: net: Fix an RCU warning in dev_pick_tx()

Fix the following RCU warning in dev_pick_tx():

===================================================
[ INFO: suspicious rcu_dereference_check() usage. ]
---------------------------------------------------
net/core/dev.c:1993 invoked rcu_dereference_check() without protection!

other info that might help us debug this:

rcu_scheduler_active = 1, debug_locks = 0
2 locks held by swapper/0:
 #0:  (&idev->mc_ifc_timer){+.-...}, at: [<ffffffff81039e65>] run_timer_softirq+0x17b/0x278
 #1:  (rcu_read_lock_bh){.+....}, at: [<ffffffff812ea3eb>] dev_queue_xmit+0x14e/0x4dc

stack backtrace:
Pid: 0, comm: swapper Not tainted 2.6.34-rc5-cachefs #4
Call Trace:
 <IRQ>  [<ffffffff810516c4>] lockdep_rcu_dereference+0xaa/0xb2
 [<ffffffff812ea4f6>] dev_queue_xmit+0x259/0x4dc
 [<ffffffff812ea3eb>] ? dev_queue_xmit+0x14e/0x4dc
 [<ffffffff81052324>] ? trace_hardirqs_on+0xd/0xf
 [<ffffffff81035362>] ? local_bh_enable_ip+0xbc/0xc1
 [<ffffffff812f0954>] neigh_resolve_output+0x24b/0x27c
 [<ffffffff8134f673>] ip6_output_finish+0x7c/0xb4
 [<ffffffff81350c34>] ip6_output2+0x256/0x261
 [<ffffffff81052324>] ? trace_hardirqs_on+0xd/0xf
 [<ffffffff813517fb>] ip6_output+0xbbc/0xbcb
 [<ffffffff8135bc5d>] ? fib6_force_start_gc+0x2b/0x2d
 [<ffffffff81368acb>] mld_sendpack+0x273/0x39d
 [<ffffffff81368858>] ? mld_sendpack+0x0/0x39d
 [<ffffffff81052099>] ? mark_held_locks+0x52/0x70
 [<ffffffff813692fc>] mld_ifc_timer_expire+0x24f/0x288
 [<ffffffff81039ed6>] run_timer_softirq+0x1ec/0x278
 [<ffffffff81039e65>] ? run_timer_softirq+0x17b/0x278
 [<ffffffff813690ad>] ? mld_ifc_timer_expire+0x0/0x288
 [<ffffffff81035531>] ? __do_softirq+0x69/0x140
 [<ffffffff8103556a>] __do_softirq+0xa2/0x140
 [<ffffffff81002e0c>] call_softirq+0x1c/0x28
 [<ffffffff81004b54>] do_softirq+0x38/0x80
 [<ffffffff81034f06>] irq_exit+0x45/0x47
 [<ffffffff810177c3>] smp_apic_timer_interrupt+0x88/0x96
 [<ffffffff810028d3>] apic_timer_interrupt+0x13/0x20
 <EOI>  [<ffffffff810488dd>] ? __atomic_notifier_call_chain+0x0/0x86
 [<ffffffff810096bf>] ? mwait_idle+0x6e/0x78
 [<ffffffff810096b6>] ? mwait_idle+0x65/0x78
 [<ffffffff810011cb>] cpu_idle+0x4d/0x83
 [<ffffffff81380b05>] rest_init+0xb9/0xc0
 [<ffffffff81380a4c>] ? rest_init+0x0/0xc0
 [<ffffffff8168dcf0>] start_kernel+0x392/0x39d
 [<ffffffff8168d2a3>] x86_64_start_reservations+0xb3/0xb7
 [<ffffffff8168d38b>] x86_64_start_kernel+0xe4/0xeb

An rcu_dereference() should be an rcu_dereference_bh().

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 92584bfef09b..f769098774b7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1990,7 +1990,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 				queue_index = skb_tx_hash(dev, skb);
 
 			if (sk) {
-				struct dst_entry *dst = rcu_dereference(sk->sk_dst_cache);
+				struct dst_entry *dst = rcu_dereference_bh(sk->sk_dst_cache);
 
 				if (dst && skb_dst(skb) == dst)
 					sk_tx_queue_set(sk, queue_index);
-- 
cgit v1.2.2


From 80032cffb95edff4fc216b1cb21682257be326b7 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Wed, 21 Apr 2010 23:53:27 +0000
Subject: rtnetlink: potential ERR_PTR dereference

In the original code, if rtnl_create_link() returned an ERR_PTR then that
would get passed to rtnl_configure_link() which dereferences it.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4568120d8533..fe776c9ddeca 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1270,10 +1270,11 @@ replay:
 			err = ops->newlink(net, dev, tb, data);
 		else
 			err = register_netdevice(dev);
-		if (err < 0 && !IS_ERR(dev)) {
+
+		if (err < 0 && !IS_ERR(dev))
 			free_netdev(dev);
+		if (err < 0)
 			goto out;
-		}
 
 		err = rtnl_configure_link(dev, ifm);
 		if (err < 0)
-- 
cgit v1.2.2


From 6ec82562ffc6f297d0de36d65776cff8e5704867 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 6 May 2010 00:53:53 -0700
Subject: veth: Dont kfree_skb() after dev_forward_skb()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In case of congestion, netif_rx() frees the skb, so we must assume
dev_forward_skb() also consume skb.

Bug introduced by commit 445409602c092
(veth: move loopback logic to common location)

We must change dev_forward_skb() to always consume skb, and veth to not
double free it.

Bug report : http://marc.info/?l=linux-netdev&m=127310770900442&w=3

Reported-by: Martín Ferrari <martin.ferrari@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index f769098774b7..264137fce3a2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1451,7 +1451,7 @@ static inline void net_timestamp(struct sk_buff *skb)
  *
  * return values:
  *	NET_RX_SUCCESS	(no congestion)
- *	NET_RX_DROP     (packet was dropped)
+ *	NET_RX_DROP     (packet was dropped, but freed)
  *
  * dev_forward_skb can be used for injecting an skb from the
  * start_xmit function of one device into the receive queue
@@ -1465,12 +1465,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 {
 	skb_orphan(skb);
 
-	if (!(dev->flags & IFF_UP))
-		return NET_RX_DROP;
-
-	if (skb->len > (dev->mtu + dev->hard_header_len))
+	if (!(dev->flags & IFF_UP) ||
+	    (skb->len > (dev->mtu + dev->hard_header_len))) {
+		kfree_skb(skb);
 		return NET_RX_DROP;
-
+	}
 	skb_set_dev(skb, dev);
 	skb->tstamp.tv64 = 0;
 	skb->pkt_type = PACKET_HOST;
-- 
cgit v1.2.2


From c02db8c6290bb992442fec1407643c94cc414375 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Sun, 16 May 2010 01:05:45 -0700
Subject: rtnetlink: make SR-IOV VF interface symmetric

Now we have a set of nested attributes:

  IFLA_VFINFO_LIST (NESTED)
    IFLA_VF_INFO (NESTED)
      IFLA_VF_MAC
      IFLA_VF_VLAN
      IFLA_VF_TX_RATE

This allows a single set to operate on multiple attributes if desired.
Among other things, it means a dump can be replayed to set state.

The current interface has yet to be released, so this seems like
something to consider for 2.6.34.

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 159 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 110 insertions(+), 49 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index fe776c9ddeca..31e85d327aa2 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -602,12 +602,19 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 	a->tx_compressed = b->tx_compressed;
 };
 
+/* All VF info */
 static inline int rtnl_vfinfo_size(const struct net_device *dev)
 {
-	if (dev->dev.parent && dev_is_pci(dev->dev.parent))
-		return dev_num_vf(dev->dev.parent) *
-			sizeof(struct ifla_vf_info);
-	else
+	if (dev->dev.parent && dev_is_pci(dev->dev.parent)) {
+
+		int num_vfs = dev_num_vf(dev->dev.parent);
+		size_t size = nlmsg_total_size(sizeof(struct nlattr));
+		size += nlmsg_total_size(num_vfs * sizeof(struct nlattr));
+		size += num_vfs * (sizeof(struct ifla_vf_mac) +
+				  sizeof(struct ifla_vf_vlan) +
+				  sizeof(struct ifla_vf_tx_rate));
+		return size;
+	} else
 		return 0;
 }
 
@@ -629,7 +636,7 @@ static inline size_t if_nlmsg_size(const struct net_device *dev)
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
 	       + nla_total_size(1) /* IFLA_LINKMODE */
 	       + nla_total_size(4) /* IFLA_NUM_VF */
-	       + nla_total_size(rtnl_vfinfo_size(dev)) /* IFLA_VFINFO */
+	       + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */
 	       + rtnl_link_get_size(dev); /* IFLA_LINKINFO */
 }
 
@@ -700,14 +707,37 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 
 	if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) {
 		int i;
-		struct ifla_vf_info ivi;
 
-		NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent));
-		for (i = 0; i < dev_num_vf(dev->dev.parent); i++) {
+		struct nlattr *vfinfo, *vf;
+		int num_vfs = dev_num_vf(dev->dev.parent);
+
+		NLA_PUT_U32(skb, IFLA_NUM_VF, num_vfs);
+		vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
+		if (!vfinfo)
+			goto nla_put_failure;
+		for (i = 0; i < num_vfs; i++) {
+			struct ifla_vf_info ivi;
+			struct ifla_vf_mac vf_mac;
+			struct ifla_vf_vlan vf_vlan;
+			struct ifla_vf_tx_rate vf_tx_rate;
 			if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
 				break;
-			NLA_PUT(skb, IFLA_VFINFO, sizeof(ivi), &ivi);
+			vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf;
+			memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+			vf_vlan.vlan = ivi.vlan;
+			vf_vlan.qos = ivi.qos;
+			vf_tx_rate.rate = ivi.tx_rate;
+			vf = nla_nest_start(skb, IFLA_VF_INFO);
+			if (!vf) {
+				nla_nest_cancel(skb, vfinfo);
+				goto nla_put_failure;
+			}
+			NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac);
+			NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan);
+			NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate);
+			nla_nest_end(skb, vf);
 		}
+		nla_nest_end(skb, vfinfo);
 	}
 	if (dev->rtnl_link_ops) {
 		if (rtnl_link_fill(skb, dev) < 0)
@@ -769,12 +799,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
 	[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },
-	[IFLA_VF_MAC]		= { .type = NLA_BINARY,
-				    .len = sizeof(struct ifla_vf_mac) },
-	[IFLA_VF_VLAN]		= { .type = NLA_BINARY,
-				    .len = sizeof(struct ifla_vf_vlan) },
-	[IFLA_VF_TX_RATE]	= { .type = NLA_BINARY,
-				    .len = sizeof(struct ifla_vf_tx_rate) },
+	[IFLA_VFINFO_LIST]	= {. type = NLA_NESTED },
 };
 EXPORT_SYMBOL(ifla_policy);
 
@@ -783,6 +808,19 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
 	[IFLA_INFO_DATA]	= { .type = NLA_NESTED },
 };
 
+static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = {
+	[IFLA_VF_INFO]		= { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
+	[IFLA_VF_MAC]		= { .type = NLA_BINARY,
+				    .len = sizeof(struct ifla_vf_mac) },
+	[IFLA_VF_VLAN]		= { .type = NLA_BINARY,
+				    .len = sizeof(struct ifla_vf_vlan) },
+	[IFLA_VF_TX_RATE]	= { .type = NLA_BINARY,
+				    .len = sizeof(struct ifla_vf_tx_rate) },
+};
+
 struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
 {
 	struct net *net;
@@ -812,6 +850,52 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
 	return 0;
 }
 
+static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
+{
+	int rem, err = -EINVAL;
+	struct nlattr *vf;
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	nla_for_each_nested(vf, attr, rem) {
+		switch (nla_type(vf)) {
+		case IFLA_VF_MAC: {
+			struct ifla_vf_mac *ivm;
+			ivm = nla_data(vf);
+			err = -EOPNOTSUPP;
+			if (ops->ndo_set_vf_mac)
+				err = ops->ndo_set_vf_mac(dev, ivm->vf,
+							  ivm->mac);
+			break;
+		}
+		case IFLA_VF_VLAN: {
+			struct ifla_vf_vlan *ivv;
+			ivv = nla_data(vf);
+			err = -EOPNOTSUPP;
+			if (ops->ndo_set_vf_vlan)
+				err = ops->ndo_set_vf_vlan(dev, ivv->vf,
+							   ivv->vlan,
+							   ivv->qos);
+			break;
+		}
+		case IFLA_VF_TX_RATE: {
+			struct ifla_vf_tx_rate *ivt;
+			ivt = nla_data(vf);
+			err = -EOPNOTSUPP;
+			if (ops->ndo_set_vf_tx_rate)
+				err = ops->ndo_set_vf_tx_rate(dev, ivt->vf,
+							      ivt->rate);
+			break;
+		}
+		default:
+			err = -EINVAL;
+			break;
+		}
+		if (err)
+			break;
+	}
+	return err;
+}
+
 static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		      struct nlattr **tb, char *ifname, int modified)
 {
@@ -942,40 +1026,17 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		write_unlock_bh(&dev_base_lock);
 	}
 
-	if (tb[IFLA_VF_MAC]) {
-		struct ifla_vf_mac *ivm;
-		ivm = nla_data(tb[IFLA_VF_MAC]);
-		err = -EOPNOTSUPP;
-		if (ops->ndo_set_vf_mac)
-			err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac);
-		if (err < 0)
-			goto errout;
-		modified = 1;
-	}
-
-	if (tb[IFLA_VF_VLAN]) {
-		struct ifla_vf_vlan *ivv;
-		ivv = nla_data(tb[IFLA_VF_VLAN]);
-		err = -EOPNOTSUPP;
-		if (ops->ndo_set_vf_vlan)
-			err = ops->ndo_set_vf_vlan(dev, ivv->vf,
-						   ivv->vlan,
-						   ivv->qos);
-		if (err < 0)
-			goto errout;
-		modified = 1;
-	}
-	err = 0;
-
-	if (tb[IFLA_VF_TX_RATE]) {
-		struct ifla_vf_tx_rate *ivt;
-		ivt = nla_data(tb[IFLA_VF_TX_RATE]);
-		err = -EOPNOTSUPP;
-		if (ops->ndo_set_vf_tx_rate)
-			err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, ivt->rate);
-		if (err < 0)
-			goto errout;
-		modified = 1;
+	if (tb[IFLA_VFINFO_LIST]) {
+		struct nlattr *attr;
+		int rem;
+		nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
+			if (nla_type(attr) != IFLA_VF_INFO)
+				goto errout;
+			err = do_setvfinfo(dev, attr);
+			if (err < 0)
+				goto errout;
+			modified = 1;
+		}
 	}
 	err = 0;
 
-- 
cgit v1.2.2