From 3fff4c42bd0a89869a0eb1e7874cc06ffa4aa0f5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 22 Sep 2009 16:18:09 +0200 Subject: printk: Remove ratelimit.h from kernel.h Decouple kernel.h from ratelimit.h: the global declaration of printk's ratelimit_state is not needed, and it leads to messy circular dependencies due to ratelimit.h's (new) adding of a spinlock_types.h include. Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds Cc: David S. Miller LKML-Reference: Signed-off-by: Ingo Molnar --- net/core/sysctl_net_core.c | 2 ++ net/core/utils.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net/core') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7db1de0497c6..887c03c4e3c6 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -10,7 +10,9 @@ #include #include #include +#include #include + #include #include diff --git a/net/core/utils.c b/net/core/utils.c index 83221aee7084..838250241d26 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -24,6 +24,8 @@ #include #include #include +#include + #include #include -- cgit v1.2.2 From a9828ec6bc0b7e19a65f7e13daa8bd35a926a753 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 1 Oct 2009 11:33:03 +0000 Subject: ethtool: Remove support for obsolete string query operations The in-tree implementations have all been converted to get_sset_count(). Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 58 ++++++++++-------------------------------------------- 1 file changed, 10 insertions(+), 48 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4c12ddb5f5ee..e1951084b973 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -198,13 +198,6 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (rc >= 0) info.n_priv_flags = rc; - } else { - /* code path for obsolete hooks */ - - if (ops->self_test_count) - info.testinfo_len = ops->self_test_count(dev); - if (ops->get_stats_count) - info.n_stats = ops->get_stats_count(dev); } if (ops->get_regs_len) info.regdump_len = ops->get_regs_len(dev); @@ -684,16 +677,10 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) u64 *data; int ret, test_len; - if (!ops->self_test) - return -EOPNOTSUPP; - if (!ops->get_sset_count && !ops->self_test_count) + if (!ops->self_test || !ops->get_sset_count) return -EOPNOTSUPP; - if (ops->get_sset_count) - test_len = ops->get_sset_count(dev, ETH_SS_TEST); - else - /* code path for obsolete hook */ - test_len = ops->self_test_count(dev); + test_len = ops->get_sset_count(dev, ETH_SS_TEST); if (test_len < 0) return test_len; WARN_ON(test_len == 0); @@ -728,36 +715,17 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) u8 *data; int ret; - if (!ops->get_strings) + if (!ops->get_strings || !ops->get_sset_count) return -EOPNOTSUPP; if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) return -EFAULT; - if (ops->get_sset_count) { - ret = ops->get_sset_count(dev, gstrings.string_set); - if (ret < 0) - return ret; - - gstrings.len = ret; - } else { - /* code path for obsolete hooks */ - - switch (gstrings.string_set) { - case ETH_SS_TEST: - if (!ops->self_test_count) - return -EOPNOTSUPP; - gstrings.len = ops->self_test_count(dev); - break; - case ETH_SS_STATS: - if (!ops->get_stats_count) - return -EOPNOTSUPP; - gstrings.len = ops->get_stats_count(dev); - break; - default: - return -EINVAL; - } - } + ret = ops->get_sset_count(dev, gstrings.string_set); + if (ret < 0) + return ret; + + gstrings.len = ret; data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); if (!data) @@ -798,16 +766,10 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) u64 *data; int ret, n_stats; - if (!ops->get_ethtool_stats) - return -EOPNOTSUPP; - if (!ops->get_sset_count && !ops->get_stats_count) + if (!ops->get_ethtool_stats || !ops->get_sset_count) return -EOPNOTSUPP; - if (ops->get_sset_count) - n_stats = ops->get_sset_count(dev, ETH_SS_STATS); - else - /* code path for obsolete hook */ - n_stats = ops->get_stats_count(dev); + n_stats = ops->get_sset_count(dev, ETH_SS_STATS); if (n_stats < 0) return n_stats; WARN_ON(n_stats == 0); -- cgit v1.2.2 From 0835acfe72e43b2f9bd46ec8c0d219e94c3525e0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Sep 2009 13:03:33 +0000 Subject: pktgen: Avoid dirtying skb->users when txq is full We can avoid two atomic ops on skb->users if packet is not going to be sent to the device (because hardware txqueue is full) Signed-off-by: Eric Dumazet Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/pktgen.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b69455217ed6..e856ab0d0745 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3441,12 +3441,14 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) txq = netdev_get_tx_queue(odev, queue_map); __netif_tx_lock_bh(txq); - atomic_inc(&(pkt_dev->skb->users)); - if (unlikely(netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq))) + if (unlikely(netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq))) { ret = NETDEV_TX_BUSY; - else - ret = (*xmit)(pkt_dev->skb, odev); + pkt_dev->last_ok = 0; + goto unlock; + } + atomic_inc(&(pkt_dev->skb->users)); + ret = (*xmit)(pkt_dev->skb, odev); switch (ret) { case NETDEV_TX_OK: @@ -3468,6 +3470,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) atomic_dec(&(pkt_dev->skb->users)); pkt_dev->last_ok = 0; } +unlock: __netif_tx_unlock_bh(txq); /* If pkt_dev->count is zero, then run forever */ -- cgit v1.2.2 From 7ffbe3fdace0bdfcdab8dc6c77506feda0871f79 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Oct 2009 05:15:27 +0000 Subject: net: introduce NETDEV_POST_INIT notifier For various purposes including a wireless extensions bugfix, we need to hook into the netdev creation before before netdev_register_kobject(). This will also ease doing the dev type assignment that Marcel was working on for cfg80211 drivers w/o touching them all. Signed-off-by: Johannes Berg Signed-off-by: Marcel Holtmann Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b8f74cfb1bfd..a74c8fd69556 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4836,6 +4836,12 @@ int register_netdevice(struct net_device *dev) dev->features |= NETIF_F_GSO; netdev_initialize_kobject(dev); + + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); + ret = notifier_to_errno(ret); + if (ret) + goto err_uninit; + ret = netdev_register_kobject(dev); if (ret) goto err_uninit; -- cgit v1.2.2 From d519e17e2d01a0ee9abe083019532061b4438065 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Fri, 2 Oct 2009 09:26:12 +0000 Subject: net: export device speed and duplex via sysfs This patch exports the link-speed (in Mbps) and duplex of an interface via sysfs. This eliminates the need to use ethtool just to check the link-speed. Not requiring 'ethtool' and not relying on the SIOCETHTOOL ioctl should be helpful in an embedded environment where space is at a premium as well. NOTE: This patch also intentionally allows non-root users to check the link speed and duplex -- something not possible with ethtool. Here's some sample output: # cat /sys/class/net/eth0/speed 100 # cat /sys/class/net/eth0/duplex half # ethtool eth0 Settings for eth0: Supported ports: [ TP ] Supported link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Half 1000baseT/Full Supports auto-negotiation: Yes Advertised link modes: Not reported Advertised auto-negotiation: No Speed: 100Mb/s Duplex: Half Port: Twisted Pair PHYAD: 1 Transceiver: internal Auto-negotiation: off Supports Wake-on: g Wake-on: g Current message level: 0x000000ff (255) Link detected: yes Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 821d30918cfc..effb78410eb2 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -130,6 +130,44 @@ static ssize_t show_carrier(struct device *dev, return -EINVAL; } +static ssize_t show_speed(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (netif_running(netdev) && netdev->ethtool_ops->get_settings) { + struct ethtool_cmd cmd = { ETHTOOL_GSET }; + + if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) + ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); + } + rtnl_unlock(); + return ret; +} + +static ssize_t show_duplex(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (netif_running(netdev) && netdev->ethtool_ops->get_settings) { + struct ethtool_cmd cmd = { ETHTOOL_GSET }; + + if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) + ret = sprintf(buf, "%s\n", cmd.duplex ? "full" : "half"); + } + rtnl_unlock(); + return ret; +} + static ssize_t show_dormant(struct device *dev, struct device_attribute *attr, char *buf) { @@ -259,6 +297,8 @@ static struct device_attribute net_class_attributes[] = { __ATTR(address, S_IRUGO, show_address, NULL), __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), __ATTR(carrier, S_IRUGO, show_carrier, NULL), + __ATTR(speed, S_IRUGO, show_speed, NULL), + __ATTR(duplex, S_IRUGO, show_duplex, NULL), __ATTR(dormant, S_IRUGO, show_dormant, NULL), __ATTR(operstate, S_IRUGO, show_operstate, NULL), __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu), -- cgit v1.2.2 From d250a5f90e53f5e150618186230795352d154c88 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2009 10:32:18 +0000 Subject: pkt_sched: gen_estimator: Dont report fake rate estimators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jarek Poplawski a écrit : > > > Hmm... So you made me to do some "real" work here, and guess what?: > there is one serious checkpatch warning! ;-) Plus, this new parameter > should be added to the function description. Otherwise: > Signed-off-by: Jarek Poplawski > > Thanks, > Jarek P. > > PS: I guess full "Don't" would show we really mean it... Okay :) Here is the last round, before the night ! Thanks again [RFC] pkt_sched: gen_estimator: Don't report fake rate estimators We currently send TCA_STATS_RATE_EST elements to netlink users, even if no estimator is running. # tc -s -d qdisc qdisc pfifo_fast 0: dev eth0 root bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 Sent 112833764978 bytes 1495081739 pkt (dropped 0, overlimits 0 requeues 0) rate 0bit 0pps backlog 0b 0p requeues 0 User has no way to tell if the "rate 0bit 0pps" is a real estimation, or a fake one (because no estimator is active) After this patch, tc command output is : $ tc -s -d qdisc qdisc pfifo_fast 0: dev eth0 root bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 Sent 561075 bytes 1196 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 We add a parameter to gnet_stats_copy_rate_est() function so that it can use gen_estimator_active(bstats, r), as suggested by Jarek. This parameter can be NULL if check is not necessary, (htb for example has a mandatory rate estimator) Signed-off-by: Eric Dumazet Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/gen_stats.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 8569310268ab..393b1d8618e2 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -127,6 +127,7 @@ gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) /** * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV * @d: dumping handle + * @b: basic statistics * @r: rate estimator statistics * * Appends the rate estimator statistics to the top level TLV created by @@ -136,8 +137,13 @@ gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r) +gnet_stats_copy_rate_est(struct gnet_dump *d, + const struct gnet_stats_basic_packed *b, + struct gnet_stats_rate_est *r) { + if (b && !gen_estimator_active(b, r)) + return 0; + if (d->compat_tc_stats) { d->tc_stats.bps = r->bps; d->tc_stats.pps = r->pps; -- cgit v1.2.2 From d73d3a8cb4723e161589864741d8528d70b350eb Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 5 Oct 2009 10:59:58 +0000 Subject: ethtool: Add reset operation After updating firmware stored in flash, users may wish to reset the relevant hardware and start the new firmware immediately. This should not be completely automatic as it may be disruptive. A selective reset may also be useful for debugging or diagnostics. This adds a separate reset operation which takes flags indicating the components to be reset. Drivers are allowed to reset only a subset of those requested, and must indicate the actual subset. This allows the use of generic component masks and some future expansion. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e1951084b973..d8aee584e8d1 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -302,6 +302,26 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) return ret; } +static int ethtool_reset(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value reset; + int ret; + + if (!dev->ethtool_ops->reset) + return -EOPNOTSUPP; + + if (copy_from_user(&reset, useraddr, sizeof(reset))) + return -EFAULT; + + ret = dev->ethtool_ops->reset(dev, &reset.data); + if (ret) + return ret; + + if (copy_to_user(useraddr, &reset, sizeof(reset))) + return -EFAULT; + return 0; +} + static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol = { ETHTOOL_GWOL }; @@ -1089,6 +1109,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_FLASHDEV: rc = ethtool_flash_device(dev, useraddr); break; + case ETHTOOL_RESET: + rc = ethtool_reset(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.2 From 3d23e349d807177eaf519d444677cee86b1a04cf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 29 Sep 2009 23:27:28 +0200 Subject: wext: refactor Refactor wext to * split out iwpriv handling * split out iwspy handling * split out procfs support * allow cfg80211 to have wireless extensions compat code w/o CONFIG_WIRELESS_EXT After this, drivers need to - select WIRELESS_EXT - for wext support - select WEXT_PRIV - for iwpriv support - select WEXT_SPY - for iwspy support except cfg80211 -- which gets new hooks in wext-core.c and can then get wext handlers without CONFIG_WIRELESS_EXT. Wireless extensions procfs support is auto-selected based on PROC_FS and anything that requires the wext core (i.e. WIRELESS_EXT or CFG80211_WEXT). Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/core/net-sysfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index effb78410eb2..9b07535c2889 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -543,8 +543,12 @@ int netdev_register_kobject(struct net_device *net) *groups++ = &netstat_group; #ifdef CONFIG_WIRELESS_EXT_SYSFS - if (net->wireless_handlers || net->ieee80211_ptr) + if (net->ieee80211_ptr) *groups++ = &wireless_group; +#ifdef CONFIG_WIRELESS_EXT + else if (net->wireless_handlers) + *groups++ = &wireless_group; +#endif #endif #endif /* CONFIG_SYSFS */ -- cgit v1.2.2 From d9f5950f90292f7cc42834338dfd5f44dc4cc4ca Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Wed, 7 Oct 2009 12:24:25 +0000 Subject: net: Make UFO on master device independent of attached devices Now that software UFO is supported, UFO can be enabled on master devices like bridge, bond even though the attached device doesn't support this feature in hardware. This allows UFO to be used between KVM host and guest even when a physical interface attached to the bridge doesn't support UFO. Signed-off-by: Sridhar Samudrala Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a74c8fd69556..510ff205d5db 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5489,7 +5489,7 @@ unsigned long netdev_increment_features(unsigned long all, unsigned long one, one |= NETIF_F_ALL_CSUM; one |= all & NETIF_F_ONE_FOR_ALL; - all &= one | NETIF_F_LLTX | NETIF_F_GSO; + all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO; all |= one & mask & NETIF_F_ONE_FOR_ALL; return all; -- cgit v1.2.2 From 3b885787ea4112eaa80945999ea0901bf742707f Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 12 Oct 2009 13:26:31 -0700 Subject: net: Generalize socket rx gap / receive queue overflow cmsg Create a new socket level option to report number of queue overflows Recently I augmented the AF_PACKET protocol to report the number of frames lost on the socket receive queue between any two enqueued frames. This value was exported via a SOL_PACKET level cmsg. AFter I completed that work it was requested that this feature be generalized so that any datagram oriented socket could make use of this option. As such I've created this patch, It creates a new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a SOL_SOCKET level cmsg that reports the nubmer of times the sk_receive_queue overflowed between any two given frames. It also augments the AF_PACKET protocol to take advantage of this new feature (as it previously did not touch sk->sk_drops, which this patch uses to record the overflow count). Tested successfully by me. Notes: 1) Unlike my previous patch, this patch simply records the sk_drops value, which is not a number of drops between packets, but rather a total number of drops. Deltas must be computed in user space. 2) While this patch currently works with datagram oriented protocols, it will also be accepted by non-datagram oriented protocols. I'm not sure if thats agreeable to everyone, but my argument in favor of doing so is that, for those protocols which aren't applicable to this option, sk_drops will always be zero, and reporting no drops on a receive queue that isn't used for those non-participating protocols seems reasonable to me. This also saves us having to code in a per-protocol opt in mechanism. 3) This applies cleanly to net-next assuming that commit 977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted Signed-off-by: Neil Horman Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 7626b6aacd68..43ca2c995393 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -276,6 +276,8 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err = 0; int skb_len; + unsigned long flags; + struct sk_buff_head *list = &sk->sk_receive_queue; /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces number of warnings when compiling with -W --ANK @@ -305,7 +307,10 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ skb_len = skb->len; - skb_queue_tail(&sk->sk_receive_queue, skb); + spin_lock_irqsave(&list->lock, flags); + skb->dropcount = atomic_read(&sk->sk_drops); + __skb_queue_tail(list, skb); + spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, skb_len); @@ -702,6 +707,12 @@ set_rcvbuf: /* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */ + case SO_RXQ_OVFL: + if (valbool) + sock_set_flag(sk, SOCK_RXQ_OVFL); + else + sock_reset_flag(sk, SOCK_RXQ_OVFL); + break; default: ret = -ENOPROTOOPT; break; @@ -901,6 +912,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_mark; break; + case SO_RXQ_OVFL: + v.val = !!sock_flag(sk, SOCK_RXQ_OVFL); + break; + default: return -ENOPROTOOPT; } -- cgit v1.2.2 From 89d71a66c40d629e3b1285def543ab1425558cd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Oct 2009 05:34:20 +0000 Subject: net: Use netdev_alloc_skb_ip_align() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 510ff205d5db..28b0b9e992a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2604,20 +2604,13 @@ EXPORT_SYMBOL(napi_reuse_skb); struct sk_buff *napi_get_frags(struct napi_struct *napi) { - struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; if (!skb) { - skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); - if (!skb) - goto out; - - skb_reserve(skb, NET_IP_ALIGN); - - napi->skb = skb; + skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); + if (skb) + napi->skb = skb; } - -out: return skb; } EXPORT_SYMBOL(napi_get_frags); -- cgit v1.2.2 From 766e9037cc139ee25ed93ee5ad11e1450c4b99f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Oct 2009 20:40:11 -0700 Subject: net: sk_drops consolidation sock_queue_rcv_skb() can update sk_drops itself, removing need for callers to take care of it. This is more consistent since sock_queue_rcv_skb() also reads sk_drops when queueing a skb. This adds sk_drops managment to many protocols that not cared yet. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 43ca2c995393..38713aa3faf2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -274,7 +274,7 @@ static void sock_disable_timestamp(struct sock *sk, int flag) int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { - int err = 0; + int err; int skb_len; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; @@ -284,17 +284,17 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned)sk->sk_rcvbuf) { - err = -ENOMEM; - goto out; + atomic_inc(&sk->sk_drops); + return -ENOMEM; } err = sk_filter(sk, skb); if (err) - goto out; + return err; if (!sk_rmem_schedule(sk, skb->truesize)) { - err = -ENOBUFS; - goto out; + atomic_inc(&sk->sk_drops); + return -ENOBUFS; } skb->dev = NULL; @@ -314,8 +314,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, skb_len); -out: - return err; + return 0; } EXPORT_SYMBOL(sock_queue_rcv_skb); -- cgit v1.2.2 From 8edf19c2fe028563fc6ea9cb1995b8ee4172d4b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Oct 2009 00:12:40 +0000 Subject: net: sk_drops consolidation part 2 - skb_kill_datagram() can increment sk->sk_drops itself, not callers. - UDP on IPV4 & IPV6 dropped frames (because of bad checksum or policy checks) increment sk_drops Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/datagram.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 1c6cf3a1a4f6..4d57f5e12b05 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -262,6 +262,7 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) } kfree_skb(skb); + atomic_inc(&sk->sk_drops); sk_mem_reclaim_partial(sk); return err; -- cgit v1.2.2 From 7e75f93eda027d9f9e0203ee6ffd210ea92e98f3 Mon Sep 17 00:00:00 2001 From: jamal Date: Mon, 19 Oct 2009 02:17:56 +0000 Subject: pkt_sched: ingress socket filter by mark Allow bpf to set a filter to drop packets that dont match a specific mark Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index d1d779ca096d..e3987e1d4839 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -303,6 +303,9 @@ load_b: case SKF_AD_IFINDEX: A = skb->dev->ifindex; continue; + case SKF_AD_MARK: + A = skb->mark; + continue; case SKF_AD_NLATTR: { struct nlattr *nla; -- cgit v1.2.2 From d19742fb1c68e6db83b76e06dea5a374c99e104f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Oct 2009 01:06:22 -0700 Subject: filter: Add SKF_AD_QUEUE instruction It can help being able to filter packets on their queue_mapping. If filter performance is not good, we could add a "numqueue" field in struct packet_type, so that netif_nit_deliver() and other functions can directly ignore packets with not expected queue number. Lets experiment this simple filter extension first. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index e3987e1d4839..08db7b9143a3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -306,6 +306,9 @@ load_b: case SKF_AD_MARK: A = skb->mark; continue; + case SKF_AD_QUEUE: + A = skb->queue_mapping; + continue; case SKF_AD_NLATTR: { struct nlattr *nla; -- cgit v1.2.2 From e022f0b4a03f4fff9323b509df023b8af635716e Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Mon, 19 Oct 2009 23:46:20 +0000 Subject: net: Introduce sk_tx_queue_mapping Introduce sk_tx_queue_mapping; and functions that set, test and get this value. Reset sk_tx_queue_mapping to -1 whenever the dst cache is set/reset, and in socket alloc. Setting txq to -1 and using valid txq=<0 to n-1> allows the tx path to use the value of sk_tx_queue_mapping directly instead of subtracting 1 on every tx. Signed-off-by: Krishna Kumar Signed-off-by: David S. Miller --- net/core/sock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 38713aa3faf2..934d9673f084 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -357,6 +357,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) struct dst_entry *dst = sk->sk_dst_cache; if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + sk_tx_queue_clear(sk); sk->sk_dst_cache = NULL; dst_release(dst); return NULL; @@ -953,7 +954,8 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) void *sptr = nsk->sk_security; #endif BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) != - sizeof(osk->sk_node) + sizeof(osk->sk_refcnt)); + sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) + + sizeof(osk->sk_tx_queue_mapping)); memcpy(&nsk->sk_copy_start, &osk->sk_copy_start, osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start)); #ifdef CONFIG_SECURITY_NETWORK @@ -997,6 +999,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; + sk_tx_queue_clear(sk); } return sk; -- cgit v1.2.2 From ea94ff3b55188df157a8740bdf3976a87563d705 Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Mon, 19 Oct 2009 23:46:45 +0000 Subject: net: Fix for dst_negative_advice dst_negative_advice() should check for changed dst and reset sk_tx_queue_mapping accordingly. Pass sock to the callers of dst_negative_advice. (sk_reset_txq is defined just for use by dst_negative_advice. The only way I could find to get around this is to move dst_negative_() from dst.h to dst.c, include sock.h in dst.c, etc) Signed-off-by: Krishna Kumar Signed-off-by: David S. Miller --- net/core/sock.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 934d9673f084..5a51512f638a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -352,6 +352,12 @@ discard_and_relse: } EXPORT_SYMBOL(sk_receive_skb); +void sk_reset_txq(struct sock *sk) +{ + sk_tx_queue_clear(sk); +} +EXPORT_SYMBOL(sk_reset_txq); + struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk->sk_dst_cache; -- cgit v1.2.2 From a4ee3ce3293dc931fab19beb472a8bde1295aebe Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Mon, 19 Oct 2009 23:50:07 +0000 Subject: net: Use sk_tx_queue_mapping for connected sockets For connected sockets, the first run of dev_pick_tx saves the calculated txq in sk_tx_queue_mapping. This is not saved if either the device has a queue select or the socket is not connected. Next iterations of dev_pick_tx uses the cached value of sk_tx_queue_mapping. Signed-off-by: Krishna Kumar Signed-off-by: David S. Miller --- net/core/dev.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 28b0b9e992a0..fa88dcd476d6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1791,13 +1791,25 @@ EXPORT_SYMBOL(skb_tx_hash); static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { - const struct net_device_ops *ops = dev->netdev_ops; - u16 queue_index = 0; + u16 queue_index; + struct sock *sk = skb->sk; + + if (sk_tx_queue_recorded(sk)) { + queue_index = sk_tx_queue_get(sk); + } else { + const struct net_device_ops *ops = dev->netdev_ops; - if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb); - else if (dev->real_num_tx_queues > 1) - queue_index = skb_tx_hash(dev, skb); + if (ops->ndo_select_queue) { + queue_index = ops->ndo_select_queue(dev, skb); + } else { + queue_index = 0; + if (dev->real_num_tx_queues > 1) + queue_index = skb_tx_hash(dev, skb); + + if (sk && sk->sk_dst_cache) + sk_tx_queue_set(sk, queue_index); + } + } skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); -- cgit v1.2.2 From a3d1289126e7b14307074b76bf1677015ea5036f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Oct 2009 10:59:31 +0000 Subject: rtnetlink: rtnl_setlink() and rtnl_getlink() changes rtnl_getlink() & rtnl_setlink() run with RTNL held, we can use __dev_get_by_index() and __dev_get_by_name() variants and avoid dev_hold()/dev_put() Adds to rtnl_getlink() the capability to find a device by its name, not only by its index. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index eb42873f2a3a..ba13b0974a7b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -910,9 +910,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = dev_get_by_index(net, ifm->ifi_index); + dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = dev_get_by_name(net, ifname); + dev = __dev_get_by_name(net, ifname); else goto errout; @@ -922,11 +922,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } if ((err = validate_linkmsg(dev, tb)) < 0) - goto errout_dev; + goto errout; err = do_setlink(dev, ifm, tb, ifname, 0); -errout_dev: - dev_put(dev); errout: return err; } @@ -1154,6 +1152,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; + char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; @@ -1163,19 +1162,23 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (err < 0) return err; + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + ifm = nlmsg_data(nlh); - if (ifm->ifi_index > 0) { - dev = dev_get_by_index(net, ifm->ifi_index); - if (dev == NULL) - return -ENODEV; - } else + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME]) + dev = __dev_get_by_name(net, ifname); + else return -EINVAL; + if (dev == NULL) + return -ENODEV; + nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); - if (nskb == NULL) { - err = -ENOBUFS; - goto errout; - } + if (nskb == NULL) + return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0, 0); @@ -1183,11 +1186,8 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); - goto errout; - } - err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); -errout: - dev_put(dev); + } else + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); return err; } -- cgit v1.2.2 From 7c28bd0b8ec4d128bd7660671d1b626b0abc471f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 24 Oct 2009 06:13:17 -0700 Subject: rtnetlink: speedup rtnl_dump_ifinfo() When handling large number of netdevice, rtnl_dump_ifinfo() is very slow because it has O(N^2) complexity. Instead of scanning one single list, we can use the 256 sub lists of the dev_index hash table. This considerably speedups "ip link" operations Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 7 ++----- net/core/rtnetlink.c | 37 ++++++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index fa88dcd476d6..e7bada1d5ed9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -193,18 +193,15 @@ static struct list_head ptype_all __read_mostly; /* Taps */ DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); -#define NETDEV_HASHBITS 8 -#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) - static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; + return &net->dev_name_head[hash & (NETDEV_HASHENTRIES - 1)]; } static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) { - return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; + return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } /* Device list insertion */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ba13b0974a7b..52ea418d5302 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -682,22 +682,33 @@ nla_put_failure: static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - int idx; - int s_idx = cb->args[0]; + int h, s_h; + int idx = 0, s_idx; struct net_device *dev; - - idx = 0; - for_each_netdev(net, dev) { - if (idx < s_idx) - goto cont; - if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, 0, NLM_F_MULTI) <= 0) - break; + struct hlist_head *head; + struct hlist_node *node; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, 0, + NLM_F_MULTI) <= 0) + goto out; cont: - idx++; + idx++; + } } - cb->args[0] = idx; +out: + cb->args[1] = idx; + cb->args[0] = h; return skb->len; } -- cgit v1.2.2 From 05423b241311c9380b7280179295bac7794281b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 26 Oct 2009 18:40:35 -0700 Subject: vlan: allow null VLAN ID to be used We currently use a 16 bit field (vlan_tci) to store VLAN ID/PRIO on a skb. Null value is used as a special value, meaning vlan tagging not enabled. This forbids use of null vlan ID. As pointed by David, some drivers use the 3 high order bits (PRIO) As VLAN ID is 12 bits, we can use the remaining bit (CFI) as a flag, and allow null VLAN ID. In case future code really wants to use VLAN_CFI_MASK, we'll have to use a bit outside of vlan_tci. #define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ #define VLAN_PRIO_SHIFT 13 #define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator */ #define VLAN_TAG_PRESENT VLAN_CFI_MASK #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ Reported-by: Gertjan Hofman Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e7bada1d5ed9..950c13fa60d2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2300,7 +2300,7 @@ int netif_receive_skb(struct sk_buff *skb) if (!skb->tstamp.tv64) net_timestamp(skb); - if (skb->vlan_tci && vlan_hwaccel_do_receive(skb)) + if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) return NET_RX_SUCCESS; /* if we've gotten here through NAPI, check netpoll */ -- cgit v1.2.2 From 44a0873d52282f24b1894c58c0f157e0f626ddc9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Oct 2009 07:03:04 +0000 Subject: net: Introduce unregister_netdevice_queue() This patchs adds an unreg_list anchor to struct net_device, and introduces an unregister_netdevice_queue() function, able to queue a net_device to a list instead of immediately unregister it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 950c13fa60d2..ff94e2b8df7f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5245,25 +5245,31 @@ void synchronize_net(void) EXPORT_SYMBOL(synchronize_net); /** - * unregister_netdevice - remove device from the kernel + * unregister_netdevice_queue - remove device from the kernel * @dev: device - * + * @head: list + * This function shuts down a device interface and removes it * from the kernel tables. + * If head not NULL, device is queued to be unregistered later. * * Callers must hold the rtnl semaphore. You may want * unregister_netdev() instead of this. */ -void unregister_netdevice(struct net_device *dev) +void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) { ASSERT_RTNL(); - rollback_registered(dev); - /* Finish processing unregister after unlock */ - net_set_todo(dev); + if (head) { + list_add_tail(&dev->unreg_list, head); + } else { + rollback_registered(dev); + /* Finish processing unregister after unlock */ + net_set_todo(dev); + } } -EXPORT_SYMBOL(unregister_netdevice); +EXPORT_SYMBOL(unregister_netdevice_queue); /** * unregister_netdev - remove device from the kernel -- cgit v1.2.2 From 9b5e383c11b08784eb0087617f880077982ef769 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Oct 2009 07:04:19 +0000 Subject: net: Introduce unregister_netdevice_many() Introduce rollback_registered_many() and unregister_netdevice_many() rollback_registered_many() is able to perform necessary steps at device dismantle time, factorizing two expensive synchronize_net() calls. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 97 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 32 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ff94e2b8df7f..04d3e3014020 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4637,59 +4637,76 @@ static void net_set_todo(struct net_device *dev) list_add_tail(&dev->todo_list, &net_todo_list); } -static void rollback_registered(struct net_device *dev) +static void rollback_registered_many(struct list_head *head) { + struct net_device *dev; + BUG_ON(dev_boot_phase); ASSERT_RTNL(); - /* Some devices call without registering for initialization unwind. */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " - "was registered\n", dev->name, dev); + list_for_each_entry(dev, head, unreg_list) { + /* Some devices call without registering + * for initialization unwind. + */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never " + "was registered\n", dev->name, dev); - WARN_ON(1); - return; - } + WARN_ON(1); + return; + } - BUG_ON(dev->reg_state != NETREG_REGISTERED); + BUG_ON(dev->reg_state != NETREG_REGISTERED); - /* If device is running, close it first. */ - dev_close(dev); + /* If device is running, close it first. */ + dev_close(dev); - /* And unlink it from device chain. */ - unlist_netdevice(dev); + /* And unlink it from device chain. */ + unlist_netdevice(dev); - dev->reg_state = NETREG_UNREGISTERING; + dev->reg_state = NETREG_UNREGISTERING; + } synchronize_net(); - /* Shutdown queueing discipline. */ - dev_shutdown(dev); + list_for_each_entry(dev, head, unreg_list) { + /* Shutdown queueing discipline. */ + dev_shutdown(dev); - /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - /* - * Flush the unicast and multicast chains - */ - dev_unicast_flush(dev); - dev_addr_discard(dev); + /* + * Flush the unicast and multicast chains + */ + dev_unicast_flush(dev); + dev_addr_discard(dev); - if (dev->netdev_ops->ndo_uninit) - dev->netdev_ops->ndo_uninit(dev); + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); - /* Notifier chain MUST detach us from master device. */ - WARN_ON(dev->master); + /* Notifier chain MUST detach us from master device. */ + WARN_ON(dev->master); - /* Remove entries from kobject tree */ - netdev_unregister_kobject(dev); + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); + } synchronize_net(); - dev_put(dev); + list_for_each_entry(dev, head, unreg_list) + dev_put(dev); +} + +static void rollback_registered(struct net_device *dev) +{ + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + rollback_registered_many(&single); } static void __netdev_init_queue_locks_one(struct net_device *dev, @@ -5271,6 +5288,22 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); +/** + * unregister_netdevice_many - unregister many devices + * @head: list of devices + * + */ +void unregister_netdevice_many(struct list_head *head) +{ + struct net_device *dev; + + if (!list_empty(head)) { + rollback_registered_many(head); + list_for_each_entry(dev, head, unreg_list) + net_set_todo(dev); + } +} + /** * unregister_netdev - remove device from the kernel * @dev: device -- cgit v1.2.2 From 23289a37e2b127dfc4de1313fba15bb4c9f0cd5b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Oct 2009 07:06:36 +0000 Subject: net: add a list_head parameter to dellink() method Adding a list_head parameter to rtnl_link_ops->dellink() methods allow us to queue devices on a list, in order to dismantle them all at once. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/core/rtnetlink.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 04d3e3014020..4513dfd5718e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5629,7 +5629,7 @@ restart: /* Delete virtual devices */ if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) { - dev->rtnl_link_ops->dellink(dev); + dev->rtnl_link_ops->dellink(dev, NULL); goto restart; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 52ea418d5302..391a62cd9df6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -248,7 +248,7 @@ static LIST_HEAD(link_ops); int __rtnl_link_register(struct rtnl_link_ops *ops) { if (!ops->dellink) - ops->dellink = unregister_netdevice; + ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); return 0; @@ -277,13 +277,13 @@ EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) { struct net_device *dev; -restart: + LIST_HEAD(list_kill); + for_each_netdev(net, dev) { - if (dev->rtnl_link_ops == ops) { - ops->dellink(dev); - goto restart; - } + if (dev->rtnl_link_ops == ops) + ops->dellink(dev, &list_kill); } + unregister_netdevice_many(&list_kill); } void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) @@ -972,7 +972,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (!ops) return -EOPNOTSUPP; - ops->dellink(dev); + ops->dellink(dev, NULL); return 0; } -- cgit v1.2.2 From 63c8099d90096db56ee1c66c31f05d4fcfbc1c69 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Oct 2009 07:06:49 +0000 Subject: vlan: Optimize multiple unregistration Use unregister_netdevice_many() to speedup master device unregister. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4513dfd5718e..09551cc143a9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5303,6 +5303,7 @@ void unregister_netdevice_many(struct list_head *head) net_set_todo(dev); } } +EXPORT_SYMBOL(unregister_netdevice_many); /** * unregister_netdev - remove device from the kernel -- cgit v1.2.2 From ac5e3af9996fb911d4fdff910a8ac3cb7fc63a94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 26 Oct 2009 01:23:33 +0000 Subject: net: sysfs: ethtool_ops can be NULL commit d519e17e2d01a0ee9abe083019532061b4438065 (net: export device speed and duplex via sysfs) made the wrong assumption that netdev->ethtool_ops was always set. This makes possible to crash kernel and let rtnl in locked state. modprobe dummy ip link set dummy0 up (udev runs and crash) Signed-off-by: Eric Dumazet Acked-by: Andy Gospodarek Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 753c420060df..89de182353b0 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -139,7 +139,9 @@ static ssize_t show_speed(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev) && netdev->ethtool_ops->get_settings) { + if (netif_running(netdev) && + netdev->ethtool_ops && + netdev->ethtool_ops->get_settings) { struct ethtool_cmd cmd = { ETHTOOL_GSET }; if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) @@ -158,7 +160,9 @@ static ssize_t show_duplex(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev) && netdev->ethtool_ops->get_settings) { + if (netif_running(netdev) && + netdev->ethtool_ops && + netdev->ethtool_ops->get_settings) { struct ethtool_cmd cmd = { ETHTOOL_GSET }; if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) -- cgit v1.2.2 From fb699dfd426a189fe33b91586c15176a75c8aed0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Oct 2009 19:18:49 +0000 Subject: net: Introduce dev_get_by_index_rcu() Some workloads hit dev_base_lock rwlock pretty hard. We can use RCU lookups to avoid touching this rwlock. netdevices are already freed after a RCU grace period, so this patch adds no penalty at device dismantle time. dev_ifname() converted to dev_get_by_index_rcu() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 09551cc143a9..68a1bb68b5a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -214,12 +214,15 @@ static int list_netdevice(struct net_device *dev) write_lock_bh(&dev_base_lock); list_add_tail(&dev->dev_list, &net->dev_base_head); hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); - hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); + hlist_add_head_rcu(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); return 0; } -/* Device list removal */ +/* Device list removal + * caller must respect a RCU grace period before freeing/reusing dev + */ static void unlist_netdevice(struct net_device *dev) { ASSERT_RTNL(); @@ -228,7 +231,7 @@ static void unlist_netdevice(struct net_device *dev) write_lock_bh(&dev_base_lock); list_del(&dev->dev_list); hlist_del(&dev->name_hlist); - hlist_del(&dev->index_hlist); + hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); } @@ -646,6 +649,31 @@ struct net_device *__dev_get_by_index(struct net *net, int ifindex) } EXPORT_SYMBOL(__dev_get_by_index); +/** + * dev_get_by_index_rcu - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry_rcu(dev, p, head, index_hlist) + if (dev->ifindex == ifindex) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_index_rcu); + /** * dev_get_by_index - find a device by its ifindex @@ -662,11 +690,11 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifindex); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (dev) dev_hold(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_index); @@ -2939,15 +2967,15 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifr.ifr_ifindex); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); if (!dev) { - read_unlock(&dev_base_lock); + rcu_read_unlock(); return -ENODEV; } strcpy(ifr.ifr_name, dev->name); - read_unlock(&dev_base_lock); + rcu_read_unlock(); if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) return -EFAULT; -- cgit v1.2.2 From 5b252f0c2f98df21fadf0f6cf189b87a0b938228 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 29 Oct 2009 07:17:09 +0000 Subject: gro: Name the GRO result enumeration type This clarifies which return and parameter types are GRO result codes and not RX result codes. Signed-off-by: Ben Hutchings Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 68a1bb68b5a8..1dc13744684c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2476,7 +2476,7 @@ void napi_gro_flush(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_flush); -int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; struct packet_type *ptype; @@ -2484,7 +2484,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; int same_flow; int mac_len; - int ret; + enum gro_result ret; if (!(skb->dev->features & NETIF_F_GRO)) goto normal; @@ -2568,7 +2568,8 @@ normal: } EXPORT_SYMBOL(dev_gro_receive); -static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static gro_result_t +__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; @@ -2585,7 +2586,7 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) return dev_gro_receive(napi, skb); } -int napi_skb_finish(int ret, struct sk_buff *skb) +int napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { int err = NET_RX_SUCCESS; @@ -2600,6 +2601,10 @@ int napi_skb_finish(int ret, struct sk_buff *skb) case GRO_MERGED_FREE: kfree_skb(skb); break; + + case GRO_HELD: + case GRO_MERGED: + break; } return err; @@ -2652,7 +2657,8 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) +int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, + gro_result_t ret) { int err = NET_RX_SUCCESS; @@ -2674,6 +2680,9 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) case GRO_MERGED_FREE: napi_reuse_skb(napi, skb); break; + + case GRO_MERGED: + break; } return err; -- cgit v1.2.2 From c7c4b3b6e976b95facbb723951bdcd554a3530a4 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 29 Oct 2009 21:36:53 -0700 Subject: gro: Change all receive functions to return GRO result codes This will allow drivers to adjust their receive path dynamically based on whether GRO is being applied successfully. Currently all in-tree callers ignore the return values of these functions and do not need to be changed. Signed-off-by: Ben Hutchings Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1dc13744684c..631cc40da197 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2586,18 +2586,15 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) return dev_gro_receive(napi, skb); } -int napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { - int err = NET_RX_SUCCESS; - switch (ret) { case GRO_NORMAL: - return netif_receive_skb(skb); + if (netif_receive_skb(skb)) + ret = GRO_DROP; + break; case GRO_DROP: - err = NET_RX_DROP; - /* fall through */ - case GRO_MERGED_FREE: kfree_skb(skb); break; @@ -2607,7 +2604,7 @@ int napi_skb_finish(gro_result_t ret, struct sk_buff *skb) break; } - return err; + return ret; } EXPORT_SYMBOL(napi_skb_finish); @@ -2627,7 +2624,7 @@ void skb_gro_reset_offset(struct sk_buff *skb) } EXPORT_SYMBOL(skb_gro_reset_offset); -int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { skb_gro_reset_offset(skb); @@ -2657,26 +2654,21 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, - gro_result_t ret) +gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, + gro_result_t ret) { - int err = NET_RX_SUCCESS; - switch (ret) { case GRO_NORMAL: case GRO_HELD: skb->protocol = eth_type_trans(skb, napi->dev); - if (ret == GRO_NORMAL) - return netif_receive_skb(skb); - - skb_gro_pull(skb, -ETH_HLEN); + if (ret == GRO_HELD) + skb_gro_pull(skb, -ETH_HLEN); + else if (netif_receive_skb(skb)) + ret = GRO_DROP; break; case GRO_DROP: - err = NET_RX_DROP; - /* fall through */ - case GRO_MERGED_FREE: napi_reuse_skb(napi, skb); break; @@ -2685,7 +2677,7 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, break; } - return err; + return ret; } EXPORT_SYMBOL(napi_frags_finish); @@ -2726,12 +2718,12 @@ out: } EXPORT_SYMBOL(napi_frags_skb); -int napi_gro_frags(struct napi_struct *napi) +gro_result_t napi_gro_frags(struct napi_struct *napi) { struct sk_buff *skb = napi_frags_skb(napi); if (!skb) - return NET_RX_DROP; + return GRO_DROP; return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); } -- cgit v1.2.2 From 0bd8d53656da72bc065766b5f2a05ca738020b8a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 30 Oct 2009 01:40:11 -0700 Subject: net: use hlist_for_each_entry() Small cleanup of __dev_get_by_name() and __dev_get_by_index() to use hlist_for_each_entry() : They'll look like their _rcu variant. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 631cc40da197..94f42a15fff1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -587,13 +587,13 @@ __setup("netdev=", netdev_boot_setup); struct net_device *__dev_get_by_name(struct net *net, const char *name) { struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); - hlist_for_each(p, dev_name_hash(net, name)) { - struct net_device *dev - = hlist_entry(p, struct net_device, name_hlist); + hlist_for_each_entry(dev, p, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; - } + return NULL; } EXPORT_SYMBOL(__dev_get_by_name); @@ -638,13 +638,13 @@ EXPORT_SYMBOL(dev_get_by_name); struct net_device *__dev_get_by_index(struct net *net, int ifindex) { struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); - hlist_for_each(p, dev_index_hash(net, ifindex)) { - struct net_device *dev - = hlist_entry(p, struct net_device, index_hlist); + hlist_for_each_entry(dev, p, head, index_hlist) if (dev->ifindex == ifindex) return dev; - } + return NULL; } EXPORT_SYMBOL(__dev_get_by_index); -- cgit v1.2.2 From 0c509a6c9393b27a8c5a01acd4a72616206cfc24 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 29 Oct 2009 14:18:21 +0000 Subject: net: Allow devices to specify a device specific sysfs group. This isn't beautifully abstracted, but it is simple, simplifies uses and so far is only needed for the bonding driver. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 89de182353b0..157645c0da73 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -544,8 +544,11 @@ int netdev_register_kobject(struct net_device *net) dev_set_name(dev, "%s", net->name); #ifdef CONFIG_SYSFS - *groups++ = &netstat_group; + /* Allow for a device specific group */ + if (*groups) + groups++; + *groups++ = &netstat_group; #ifdef CONFIG_WIRELESS_EXT_SYSFS if (net->ieee80211_ptr) *groups++ = &wireless_group; -- cgit v1.2.2 From 72c9528bab94cc052d00ce241b8e85f5d71e45f0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 30 Oct 2009 07:11:27 +0000 Subject: net: Introduce dev_get_by_name_rcu() Some workloads hit dev_base_lock rwlock pretty hard. We can use RCU lookups to avoid touching this rwlock (and avoid touching netdevice refcount) netdevices are already freed after a RCU grace period, so this patch adds no penalty at device dismantle time. However, it adds a synchronize_rcu() call in dev_change_name() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 94f42a15fff1..f54d8b8a434b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -213,7 +213,7 @@ static int list_netdevice(struct net_device *dev) write_lock_bh(&dev_base_lock); list_add_tail(&dev->dev_list, &net->dev_base_head); - hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); @@ -230,7 +230,7 @@ static void unlist_netdevice(struct net_device *dev) /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); list_del(&dev->dev_list); - hlist_del(&dev->name_hlist); + hlist_del_rcu(&dev->name_hlist); hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); } @@ -598,6 +598,32 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name) } EXPORT_SYMBOL(__dev_get_by_name); +/** + * dev_get_by_name_rcu - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry_rcu(dev, p, head, name_hlist) + if (!strncmp(dev->name, name, IFNAMSIZ)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_name_rcu); + /** * dev_get_by_name - find a device by its name * @net: the applicable net namespace @@ -614,11 +640,11 @@ struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_name(net, name); + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); if (dev) dev_hold(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_name); @@ -960,7 +986,12 @@ rollback: write_lock_bh(&dev_base_lock); hlist_del(&dev->name_hlist); - hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); + write_unlock_bh(&dev_base_lock); + + synchronize_rcu(); + + write_lock_bh(&dev_base_lock); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); write_unlock_bh(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); @@ -1062,9 +1093,9 @@ void dev_load(struct net *net, const char *name) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_name(net, name); - read_unlock(&dev_base_lock); + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + rcu_read_unlock(); if (!dev && capable(CAP_NET_ADMIN)) request_module("%s", name); -- cgit v1.2.2 From 9fdce099bb72df534daa6193318feaec177998fc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Oct 2009 14:51:13 +0000 Subject: veth: Fix unregister_netdevice_queue for veth I tested the recent unregister many changes and got a weird, nasty and seemingly unrelasted kernel oops. Changing unregister_netdevice_queue to use list_move_tail fixes the problem for me. ip link add type veth rmmod veth ls /sys/class/net/ showed one of the veth devices still present. A subsequent ip link oopsed the box. Signed-off-by: "Eric W. Biederman" Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f54d8b8a434b..3c40d545a035 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5258,6 +5258,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, netdev_init_queues(dev); INIT_LIST_HEAD(&dev->napi_list); + INIT_LIST_HEAD(&dev->unreg_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); @@ -5339,7 +5340,7 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) ASSERT_RTNL(); if (head) { - list_add_tail(&dev->unreg_list, head); + list_move_tail(&dev->unreg_list, head); } else { rollback_registered(dev); /* Finish processing unregister after unlock */ -- cgit v1.2.2 From 3710becf8a58a5c6c4e797e3a3c968c161abdb41 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 1 Nov 2009 19:42:09 +0000 Subject: net: RCU locking for simple ioctl() All ioctls() implemented by dev_ifsioc_locked() : SIOCGIFFLAGS, SIOCGIFMETRIC, SIOCGIFMTU, SIOCGIFHWADDR, SIOCGIFSLAVE, SIOCGIFMAP, SIOCGIFINDEX & SIOCGIFTXQLEN can use RCU lock instead of dev_base_lock rwlock Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3c40d545a035..76a1502efe67 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4315,12 +4315,12 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) EXPORT_SYMBOL(dev_set_mac_address); /* - * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock) + * Perform the SIOCxIFxxx calls, inside rcu_read_lock() */ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) { int err; - struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); if (!dev) return -ENODEV; @@ -4552,9 +4552,9 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCGIFINDEX: case SIOCGIFTXQLEN: dev_load(net, ifr.ifr_name); - read_lock(&dev_base_lock); + rcu_read_lock(); ret = dev_ifsioc_locked(net, &ifr, cmd); - read_unlock(&dev_base_lock); + rcu_read_unlock(); if (!ret) { if (colon) *colon = ':'; -- cgit v1.2.2 From c6d14c84566d6b70ad9dc1618db0dec87cca9300 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Nov 2009 05:43:23 -0800 Subject: net: Introduce for_each_netdev_rcu() iterator Adds RCU management to the list of netdevices. Convert some for_each_netdev() users to RCU version, if it can avoid read_lock-ing dev_base_lock Ie: read_lock(&dev_base_loack); for_each_netdev(net, dev) some_action(); read_unlock(&dev_base_lock); becomes : rcu_read_lock(); for_each_netdev_rcu(net, dev) some_action(); rcu_read_unlock(); Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 76a1502efe67..bf629ac08b87 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -175,7 +175,7 @@ static struct list_head ptype_all __read_mostly; /* Taps */ * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. * - * Pure readers hold dev_base_lock for reading. + * Pure readers hold dev_base_lock for reading, or rcu_read_lock() * * Writers must hold the rtnl semaphore while they loop through the * dev_base_head list, and hold dev_base_lock for writing when they do the @@ -212,7 +212,7 @@ static int list_netdevice(struct net_device *dev) ASSERT_RTNL(); write_lock_bh(&dev_base_lock); - list_add_tail(&dev->dev_list, &net->dev_base_head); + list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); @@ -229,7 +229,7 @@ static void unlist_netdevice(struct net_device *dev) /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); - list_del(&dev->dev_list); + list_del_rcu(&dev->dev_list); hlist_del_rcu(&dev->name_hlist); hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); @@ -799,15 +799,15 @@ struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags, struct net_device *dev, *ret; ret = NULL; - read_lock(&dev_base_lock); - for_each_netdev(net, dev) { + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { dev_hold(dev); ret = dev; break; } } - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } EXPORT_SYMBOL(dev_get_by_flags); @@ -3077,18 +3077,18 @@ static int dev_ifconf(struct net *net, char __user *arg) * in detail. */ void *dev_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(dev_base_lock) + __acquires(RCU) { struct net *net = seq_file_net(seq); loff_t off; struct net_device *dev; - read_lock(&dev_base_lock); + rcu_read_lock(); if (!*pos) return SEQ_START_TOKEN; off = 1; - for_each_netdev(net, dev) + for_each_netdev_rcu(net, dev) if (off++ == *pos) return dev; @@ -3097,16 +3097,18 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos) void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net *net = seq_file_net(seq); + struct net_device *dev = (v == SEQ_START_TOKEN) ? + first_net_device(seq_file_net(seq)) : + next_net_device((struct net_device *)v); + ++*pos; - return v == SEQ_START_TOKEN ? - first_net_device(net) : next_net_device((struct net_device *)v); + return rcu_dereference(dev); } void dev_seq_stop(struct seq_file *seq, void *v) - __releases(dev_base_lock) + __releases(RCU) { - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) -- cgit v1.2.2 From bf8e56bfc4fcfcef9f08e6233dc619706807893a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Nov 2009 21:03:39 -0800 Subject: net: sock_bindtodevice() RCU-ification Avoid dev_hold()/dev_put() in sock_bindtodevice() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 5a51512f638a..38820eaecd43 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -420,14 +420,16 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) if (devname[0] == '\0') { index = 0; } else { - struct net_device *dev = dev_get_by_name(net, devname); + struct net_device *dev; + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, devname); + if (dev) + index = dev->ifindex; + rcu_read_unlock(); ret = -ENODEV; if (!dev) goto out; - - index = dev->ifindex; - dev_put(dev); } lock_sock(sk); -- cgit v1.2.2 From baac8564547ac7f944af1c2e8cc6fdd57f2836a4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Nov 2009 21:04:32 -0800 Subject: pktgen: tx_bytes might be slightly wrong cur_pkt_size can be changed in proc fs while pktgen is running, we better use a private field to get precise tx-bytes counter. Signed-off-by: Ben Greear Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/pktgen.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 5ce017bf4afa..d38470a32792 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -340,6 +340,7 @@ struct pktgen_dev { __u16 cur_udp_src; __u16 cur_queue_map; __u32 cur_pkt_size; + __u32 last_pkt_size; __u8 hh[14]; /* = { @@ -3434,7 +3435,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->clone_count--; /* back out increment, OOM */ return; } - + pkt_dev->last_pkt_size = pkt_dev->skb->len; pkt_dev->allocated_skbs++; pkt_dev->clone_count = 0; /* reset counter */ } @@ -3461,7 +3462,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; - pkt_dev->tx_bytes += pkt_dev->cur_pkt_size; + pkt_dev->tx_bytes += pkt_dev->last_pkt_size; break; default: /* Drivers are not supposed to return other values! */ if (net_ratelimit()) -- cgit v1.2.2 From 000ba2e43f33901859fd794bb33c885909d53b3b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 5 Nov 2009 22:37:11 -0800 Subject: net: Fix build warning in sock_bindtodevice(). net/core/sock.c: In function 'sock_setsockopt': net/core/sock.c:396: warning: 'index' may be used uninitialized in this function net/core/sock.c:396: note: 'index' was declared here GCC can't see that all paths initialize index, so just set it to the default (0) and eliminate the specific code block that handles the null device name string. Signed-off-by: David S. Miller --- net/core/sock.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 38820eaecd43..76ff58d43e26 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -417,9 +417,8 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) if (copy_from_user(devname, optval, optlen)) goto out; - if (devname[0] == '\0') { - index = 0; - } else { + index = 0; + if (devname[0] != '\0') { struct net_device *dev; rcu_read_lock(); -- cgit v1.2.2 From d3bcfefaca27c1bfc8f740f5fff5b25d52ed1211 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 6 Nov 2009 22:17:25 -0800 Subject: net: Replace old style lock initializer SPIN_LOCK_UNLOCKED is deprecated. Use DEFINE_SPINLOCK instead. Signed-off-by: Thomas Gleixner Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 0a113f26bc9f..b8e9d3a86887 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -41,7 +41,7 @@ static void send_dm_alert(struct work_struct *unused); * netlink alerts */ static int trace_state = TRACE_OFF; -static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(trace_state_lock); struct per_cpu_dm_data { struct work_struct dm_alert_work; -- cgit v1.2.2 From e0d087af725b09358336098a6b57bb7f90f96175 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Nov 2009 01:26:17 -0800 Subject: rtnetlink: Cleanups Pure cleanups patch Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 54 ++++++++++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 31 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 391a62cd9df6..e2f3317f290f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -38,7 +38,6 @@ #include #include -#include #include #include @@ -53,8 +52,7 @@ #include #include -struct rtnl_link -{ +struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; }; @@ -65,6 +63,7 @@ void rtnl_lock(void) { mutex_lock(&rtnl_mutex); } +EXPORT_SYMBOL(rtnl_lock); void __rtnl_unlock(void) { @@ -76,16 +75,19 @@ void rtnl_unlock(void) /* This fellow will unlock it for us. */ netdev_run_todo(); } +EXPORT_SYMBOL(rtnl_unlock); int rtnl_trylock(void) { return mutex_trylock(&rtnl_mutex); } +EXPORT_SYMBOL(rtnl_trylock); int rtnl_is_locked(void) { return mutex_is_locked(&rtnl_mutex); } +EXPORT_SYMBOL(rtnl_is_locked); static struct rtnl_link *rtnl_msg_handlers[NPROTO]; @@ -168,7 +170,6 @@ int __rtnl_register(int protocol, int msgtype, return 0; } - EXPORT_SYMBOL_GPL(__rtnl_register); /** @@ -188,7 +189,6 @@ void rtnl_register(int protocol, int msgtype, "protocol = %d, message type = %d\n", protocol, msgtype); } - EXPORT_SYMBOL_GPL(rtnl_register); /** @@ -213,7 +213,6 @@ int rtnl_unregister(int protocol, int msgtype) return 0; } - EXPORT_SYMBOL_GPL(rtnl_unregister); /** @@ -230,7 +229,6 @@ void rtnl_unregister_all(int protocol) kfree(rtnl_msg_handlers[protocol]); rtnl_msg_handlers[protocol] = NULL; } - EXPORT_SYMBOL_GPL(rtnl_unregister_all); static LIST_HEAD(link_ops); @@ -253,7 +251,6 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) list_add_tail(&ops->list, &link_ops); return 0; } - EXPORT_SYMBOL_GPL(__rtnl_link_register); /** @@ -271,7 +268,6 @@ int rtnl_link_register(struct rtnl_link_ops *ops) rtnl_unlock(); return err; } - EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) @@ -309,7 +305,6 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) } list_del(&ops->list); } - EXPORT_SYMBOL_GPL(__rtnl_link_unregister); /** @@ -322,7 +317,6 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops) __rtnl_link_unregister(ops); rtnl_unlock(); } - EXPORT_SYMBOL_GPL(rtnl_link_unregister); static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) @@ -427,12 +421,13 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data struct rtattr *rta; int size = RTA_LENGTH(attrlen); - rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); + rta = (struct rtattr *)skb_put(skb, RTA_ALIGN(size)); rta->rta_type = attrtype; rta->rta_len = size; memcpy(RTA_DATA(rta), data, attrlen); memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); } +EXPORT_SYMBOL(__rta_fill); int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo) { @@ -454,6 +449,7 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) return nlmsg_unicast(rtnl, skb, pid); } +EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, struct nlmsghdr *nlh, gfp_t flags) @@ -466,6 +462,7 @@ void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, nlmsg_notify(rtnl, skb, pid, group, report, flags); } +EXPORT_SYMBOL(rtnl_notify); void rtnl_set_sk_err(struct net *net, u32 group, int error) { @@ -473,6 +470,7 @@ void rtnl_set_sk_err(struct net *net, u32 group, int error) netlink_set_err(rtnl, 0, group, error); } +EXPORT_SYMBOL(rtnl_set_sk_err); int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) { @@ -501,6 +499,7 @@ nla_put_failure: nla_nest_cancel(skb, mx); return -EMSGSIZE; } +EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, u32 ts, u32 tsage, long expires, u32 error) @@ -520,14 +519,13 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } - EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); static void set_operstate(struct net_device *dev, unsigned char transition) { unsigned char operstate = dev->operstate; - switch(transition) { + switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || operstate == IF_OPER_UNKNOWN) && @@ -728,6 +726,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, }; +EXPORT_SYMBOL(ifla_policy); static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_KIND] = { .type = NLA_STRING }, @@ -932,7 +931,8 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) goto errout; } - if ((err = validate_linkmsg(dev, tb)) < 0) + err = validate_linkmsg(dev, tb); + if (err < 0) goto errout; err = do_setlink(dev, ifm, tb, ifname, 0); @@ -985,7 +985,8 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname, unsigned int real_num_queues = 1; if (ops->get_tx_queues) { - err = ops->get_tx_queues(net, tb, &num_queues, &real_num_queues); + err = ops->get_tx_queues(net, tb, &num_queues, + &real_num_queues); if (err) goto err; } @@ -1026,6 +1027,7 @@ err_free: err: return ERR_PTR(err); } +EXPORT_SYMBOL(rtnl_create_link); static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { @@ -1059,7 +1061,8 @@ replay: else dev = NULL; - if ((err = validate_linkmsg(dev, tb)) < 0) + err = validate_linkmsg(dev, tb); + if (err < 0) return err; if (tb[IFLA_LINKINFO]) { @@ -1210,7 +1213,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) if (s_idx == 0) s_idx = 1; - for (idx=1; idxnlh->nlmsg_type-RTM_BASE; if (idx < s_idx || idx == PF_PACKET) continue; @@ -1277,7 +1280,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) return 0; - family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; + family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family; if (family >= NPROTO) return -EAFNOSUPPORT; @@ -1310,7 +1313,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlh->nlmsg_len > min_len) { int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); - struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len); + struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len); while (RTA_OK(attr, attrlen)) { unsigned flavor = attr->rta_type; @@ -1416,14 +1419,3 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all); } -EXPORT_SYMBOL(__rta_fill); -EXPORT_SYMBOL(rtnetlink_put_metrics); -EXPORT_SYMBOL(rtnl_lock); -EXPORT_SYMBOL(rtnl_trylock); -EXPORT_SYMBOL(rtnl_unlock); -EXPORT_SYMBOL(rtnl_is_locked); -EXPORT_SYMBOL(rtnl_unicast); -EXPORT_SYMBOL(rtnl_notify); -EXPORT_SYMBOL(rtnl_set_sk_err); -EXPORT_SYMBOL(rtnl_create_link); -EXPORT_SYMBOL(ifla_policy); -- cgit v1.2.2 From 81adee47dfb608df3ad0b91d230fb3cef75f0060 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 8 Nov 2009 00:53:51 -0800 Subject: net: Support specifying the network namespace upon device creation. There is no good reason to not support userspace specifying the network namespace during device creation, and it makes it easier to create a network device and pass it to a child network namespace with a well known name. We have to be careful to ensure that the target network namespace for the new device exists through the life of the call. To keep that logic clear I have factored out the network namespace grabbing logic into rtnl_link_get_net. In addtion we need to continue to pass the source network namespace to the rtnl_link_ops.newlink method so that we can find the base device source network namespace. Signed-off-by: Eric W. Biederman Acked-by: Eric Dumazet --- net/core/rtnetlink.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e2f3317f290f..33148a568199 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -733,6 +733,20 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_DATA] = { .type = NLA_NESTED }, }; +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net; + /* Examine the link attributes and figure out which + * network namespace we are talking about. + */ + if (tb[IFLA_NET_NS_PID]) + net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + else + net = get_net(src_net); + return net; +} +EXPORT_SYMBOL(rtnl_link_get_net); + static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) { if (dev) { @@ -756,8 +770,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, int err; if (tb[IFLA_NET_NS_PID]) { - struct net *net; - net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + struct net *net = rtnl_link_get_net(dev_net(dev), tb); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; @@ -976,8 +989,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return 0; } -struct net_device *rtnl_create_link(struct net *net, char *ifname, - const struct rtnl_link_ops *ops, struct nlattr *tb[]) +struct net_device *rtnl_create_link(struct net *src_net, struct net *net, + char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) { int err; struct net_device *dev; @@ -985,7 +998,7 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname, unsigned int real_num_queues = 1; if (ops->get_tx_queues) { - err = ops->get_tx_queues(net, tb, &num_queues, + err = ops->get_tx_queues(src_net, tb, &num_queues, &real_num_queues); if (err) goto err; @@ -995,16 +1008,16 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname, if (!dev) goto err; + dev_net_set(dev, net); + dev->rtnl_link_ops = ops; dev->real_num_tx_queues = real_num_queues; + if (strchr(dev->name, '%')) { err = dev_alloc_name(dev, dev->name); if (err < 0) goto err_free; } - dev_net_set(dev, net); - dev->rtnl_link_ops = ops; - if (tb[IFLA_MTU]) dev->mtu = nla_get_u32(tb[IFLA_MTU]); if (tb[IFLA_ADDRESS]) @@ -1083,6 +1096,7 @@ replay: if (1) { struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; + struct net *dest_net; if (ops) { if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { @@ -1147,17 +1161,19 @@ replay: if (!ifname[0]) snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - dev = rtnl_create_link(net, ifname, ops, tb); + dest_net = rtnl_link_get_net(net, tb); + dev = rtnl_create_link(net, dest_net, ifname, ops, tb); if (IS_ERR(dev)) err = PTR_ERR(dev); else if (ops->newlink) - err = ops->newlink(dev, tb, data); + err = ops->newlink(net, dev, tb, data); else err = register_netdevice(dev); - if (err < 0 && !IS_ERR(dev)) free_netdev(dev); + + put_net(dest_net); return err; } } -- cgit v1.2.2 From e84af6ddef0e447c56b256a2bb5a90af11bdac2e Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 10 Nov 2009 14:11:01 +0000 Subject: skbuff: Do not allow skb recycling with disabled IRQs NAPI drivers try to recycle SKBs in their polling routine, but we generally don't know the context in which the polling will be called, and the skb recycling itself may require IRQs to be enabled. This patch adds irqs_disabled() test to the skb_recycle_check() routine, so that we'll not let the drivers hit the skb recycling path with IRQs disabled. As a side effect, this patch actually disables skb recycling for some [broken] drivers. E.g. gianfar driver grabs an irqsave spinlock during TX ring processing, and then tries to recycle an skb, and that caused the following badness: nf_conntrack version 0.5.0 (1008 buckets, 4032 max) ------------[ cut here ]------------ Badness at kernel/softirq.c:143 NIP: c003e3c4 LR: c423a528 CTR: c003e344 ... NIP [c003e3c4] local_bh_enable+0x80/0xc4 LR [c423a528] destroy_conntrack+0xd4/0x13c [nf_conntrack] Call Trace: [c15d1b60] [c003e32c] local_bh_disable+0x1c/0x34 (unreliable) [c15d1b70] [c423a528] destroy_conntrack+0xd4/0x13c [nf_conntrack] [c15d1b80] [c02c6370] nf_conntrack_destroy+0x3c/0x70 Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 80a96166df39..941bac907484 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -493,6 +493,9 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size) { struct skb_shared_info *shinfo; + if (irqs_disabled()) + return 0; + if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) return 0; -- cgit v1.2.2 From 08e9897d512fe7a67e46209543b3815b57a36dc7 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 10 Nov 2009 07:20:34 +0000 Subject: netdev: fold name hash properly (v3) The full_name_hash function does not produce well distributed values in the lower bits, so most code uses hash_32() to fold it. This is really a bug introduced when name hashing was added, back in 2.5 when I added name hashing. hash_32 is all that is needed since full_name_hash returns unsigned int which is only 32 bits on 64 bit platforms. Also, there is no point in using hash_32 on ifindex, because the is naturally sequential and usually well distributed. Signed-off-by: Stephen Hemminger Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bf629ac08b87..ad8e320ceba7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -196,7 +197,7 @@ EXPORT_SYMBOL(dev_base_lock); static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &net->dev_name_head[hash & (NETDEV_HASHENTRIES - 1)]; + return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) -- cgit v1.2.2 From f8572d8f2a2ba75408b97dc24ef47c83671795d7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 5 Nov 2009 13:32:03 -0800 Subject: sysctl net: Remove unused binary sysctl code Now that sys_sysctl is a compatiblity wrapper around /proc/sys all sysctl strategy routines, and all ctl_name and strategy entries in the sysctl tables are unused, and can be revmoed. In addition neigh_sysctl_register has been modified to no longer take a strategy argument and it's callers have been modified not to pass one. Cc: "David Miller" Cc: Hideaki YOSHIFUJI Cc: netdev@vger.kernel.org Signed-off-by: Eric W. Biederman --- net/core/neighbour.c | 47 ++++++---------------------------------------- net/core/sysctl_net_core.c | 21 ++++----------------- 2 files changed, 10 insertions(+), 58 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e587e6819698..2b54e6c6a7c8 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2566,21 +2566,18 @@ static struct neigh_sysctl_table { } neigh_sysctl_template __read_mostly = { .neigh_vars = { { - .ctl_name = NET_NEIGH_MCAST_SOLICIT, .procname = "mcast_solicit", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NEIGH_UCAST_SOLICIT, .procname = "ucast_solicit", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NEIGH_APP_SOLICIT, .procname = "app_solicit", .maxlen = sizeof(int), .mode = 0644, @@ -2593,38 +2590,30 @@ static struct neigh_sysctl_table { .proc_handler = proc_dointvec_userhz_jiffies, }, { - .ctl_name = NET_NEIGH_REACHABLE_TIME, .procname = "base_reachable_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_NEIGH_DELAY_PROBE_TIME, .procname = "delay_first_probe_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_NEIGH_GC_STALE_TIME, .procname = "gc_stale_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_NEIGH_UNRES_QLEN, .procname = "unres_qlen", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NEIGH_PROXY_QLEN, .procname = "proxy_qlen", .maxlen = sizeof(int), .mode = 0644, @@ -2649,45 +2638,36 @@ static struct neigh_sysctl_table { .proc_handler = proc_dointvec_userhz_jiffies, }, { - .ctl_name = NET_NEIGH_RETRANS_TIME_MS, .procname = "retrans_time_ms", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, - .strategy = sysctl_ms_jiffies, }, { - .ctl_name = NET_NEIGH_REACHABLE_TIME_MS, .procname = "base_reachable_time_ms", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, - .strategy = sysctl_ms_jiffies, }, { - .ctl_name = NET_NEIGH_GC_INTERVAL, .procname = "gc_interval", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_NEIGH_GC_THRESH1, .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NEIGH_GC_THRESH2, .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NEIGH_GC_THRESH3, .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, @@ -2699,7 +2679,7 @@ static struct neigh_sysctl_table { int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, int p_id, int pdev_id, char *p_name, - proc_handler *handler, ctl_handler *strategy) + proc_handler *handler) { struct neigh_sysctl_table *t; const char *dev_name_source = NULL; @@ -2710,10 +2690,10 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, #define NEIGH_CTL_PATH_DEV 3 struct ctl_path neigh_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "proto", .ctl_name = 0, }, - { .procname = "neigh", .ctl_name = 0, }, - { .procname = "default", .ctl_name = NET_PROTO_CONF_DEFAULT, }, + { .procname = "net", }, + { .procname = "proto", }, + { .procname = "neigh", }, + { .procname = "default", }, { }, }; @@ -2738,7 +2718,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, if (dev) { dev_name_source = dev->name; - neigh_path[NEIGH_CTL_PATH_DEV].ctl_name = dev->ifindex; /* Terminate the table early */ memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14])); } else { @@ -2750,31 +2729,19 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, } - if (handler || strategy) { + if (handler) { /* RetransTime */ t->neigh_vars[3].proc_handler = handler; - t->neigh_vars[3].strategy = strategy; t->neigh_vars[3].extra1 = dev; - if (!strategy) - t->neigh_vars[3].ctl_name = CTL_UNNUMBERED; /* ReachableTime */ t->neigh_vars[4].proc_handler = handler; - t->neigh_vars[4].strategy = strategy; t->neigh_vars[4].extra1 = dev; - if (!strategy) - t->neigh_vars[4].ctl_name = CTL_UNNUMBERED; /* RetransTime (in milliseconds)*/ t->neigh_vars[12].proc_handler = handler; - t->neigh_vars[12].strategy = strategy; t->neigh_vars[12].extra1 = dev; - if (!strategy) - t->neigh_vars[12].ctl_name = CTL_UNNUMBERED; /* ReachableTime (in milliseconds) */ t->neigh_vars[13].proc_handler = handler; - t->neigh_vars[13].strategy = strategy; t->neigh_vars[13].extra1 = dev; - if (!strategy) - t->neigh_vars[13].ctl_name = CTL_UNNUMBERED; } t->dev_name = kstrdup(dev_name_source, GFP_KERNEL); @@ -2782,9 +2749,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, goto free; neigh_path[NEIGH_CTL_PATH_DEV].procname = t->dev_name; - neigh_path[NEIGH_CTL_PATH_NEIGH].ctl_name = pdev_id; neigh_path[NEIGH_CTL_PATH_PROTO].procname = p_name; - neigh_path[NEIGH_CTL_PATH_PROTO].ctl_name = p_id; t->sysctl_header = register_net_sysctl_table(neigh_parms_net(p), neigh_path, t->neigh_vars); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7db1de0497c6..1ce4e6e85125 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -17,7 +17,6 @@ static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { - .ctl_name = NET_CORE_WMEM_MAX, .procname = "wmem_max", .data = &sysctl_wmem_max, .maxlen = sizeof(int), @@ -25,7 +24,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_RMEM_MAX, .procname = "rmem_max", .data = &sysctl_rmem_max, .maxlen = sizeof(int), @@ -33,7 +31,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_WMEM_DEFAULT, .procname = "wmem_default", .data = &sysctl_wmem_default, .maxlen = sizeof(int), @@ -41,7 +38,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_RMEM_DEFAULT, .procname = "rmem_default", .data = &sysctl_rmem_default, .maxlen = sizeof(int), @@ -49,7 +45,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_DEV_WEIGHT, .procname = "dev_weight", .data = &weight_p, .maxlen = sizeof(int), @@ -57,7 +52,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_MAX_BACKLOG, .procname = "netdev_max_backlog", .data = &netdev_max_backlog, .maxlen = sizeof(int), @@ -65,16 +59,13 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_MSG_COST, .procname = "message_cost", .data = &net_ratelimit_state.interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_CORE_MSG_BURST, .procname = "message_burst", .data = &net_ratelimit_state.burst, .maxlen = sizeof(int), @@ -82,7 +73,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_CORE_OPTMEM_MAX, .procname = "optmem_max", .data = &sysctl_optmem_max, .maxlen = sizeof(int), @@ -91,7 +81,6 @@ static struct ctl_table net_core_table[] = { }, #endif /* CONFIG_NET */ { - .ctl_name = NET_CORE_BUDGET, .procname = "netdev_budget", .data = &netdev_budget, .maxlen = sizeof(int), @@ -99,31 +88,29 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_WARNINGS, .procname = "warnings", .data = &net_msg_warn, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, - { .ctl_name = 0 } + { } }; static struct ctl_table netns_core_table[] = { { - .ctl_name = NET_CORE_SOMAXCONN, .procname = "somaxconn", .data = &init_net.core.sysctl_somaxconn, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, - { .ctl_name = 0 } + { } }; __net_initdata struct ctl_path net_core_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "core", .ctl_name = NET_CORE, }, + { .procname = "net", }, + { .procname = "core", }, { }, }; -- cgit v1.2.2 From 572a9d7b6fc7f20f573664063324c086be310c42 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 10 Nov 2009 06:14:14 +0000 Subject: net: allow to propagate errors through ->ndo_hard_start_xmit() Currently the ->ndo_hard_start_xmit() callbacks are only permitted to return one of the NETDEV_TX codes. This prevents any kind of error propagation for virtual devices, like queue congestion of the underlying device in case of layered devices, or unreachability in case of tunnels. This patches changes the NET_XMIT codes to avoid clashes with the NETDEV_TX codes and changes the two callers of dev_hard_start_xmit() to expect either errno codes, NET_XMIT codes or NETDEV_TX codes as return value. In case of qdisc_restart(), all non NETDEV_TX codes are mapped to NETDEV_TX_OK since no error propagation is possible when using qdiscs. In case of dev_queue_xmit(), the error is propagated upwards. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ad8e320ceba7..548340b57296 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1757,7 +1757,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; - int rc; + int rc = NETDEV_TX_OK; if (likely(!skb->next)) { if (!list_empty(&ptype_all)) @@ -1805,6 +1805,8 @@ gso: nskb->next = NULL; rc = ops->ndo_start_xmit(nskb, dev); if (unlikely(rc != NETDEV_TX_OK)) { + if (rc & ~NETDEV_TX_MASK) + goto out_kfree_gso_skb; nskb->next = skb->next; skb->next = nskb; return rc; @@ -1814,11 +1816,12 @@ gso: return NETDEV_TX_BUSY; } while (skb->next); - skb->destructor = DEV_GSO_CB(skb)->destructor; - +out_kfree_gso_skb: + if (likely(skb->next == NULL)) + skb->destructor = DEV_GSO_CB(skb)->destructor; out_kfree_skb: kfree_skb(skb); - return NETDEV_TX_OK; + return rc; } static u32 skb_tx_hashrnd; @@ -1906,6 +1909,23 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } +static inline bool dev_xmit_complete(int rc) +{ + /* successful transmission */ + if (rc == NETDEV_TX_OK) + return true; + + /* error while transmitting, driver consumed skb */ + if (rc < 0) + return true; + + /* error while queueing to a different device, driver consumed skb */ + if (rc & NET_XMIT_MASK) + return true; + + return false; +} + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -2003,8 +2023,8 @@ gso: HARD_TX_LOCK(dev, txq, cpu); if (!netif_tx_queue_stopped(txq)) { - rc = NET_XMIT_SUCCESS; - if (!dev_hard_start_xmit(skb, dev, txq)) { + rc = dev_hard_start_xmit(skb, dev, txq); + if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; } -- cgit v1.2.2 From ed04642f753b1240fc65c2978b7365e93209972a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Nov 2009 21:54:04 +0000 Subject: net: check the return value of ndo_select_queue() Check the return value of ndo_select_queue(). If the value isn't smaller than the real_num_tx_queues, print a warning message, and reset it to zero. Signed-off-by: Changli Gao Signed-off-by: Eric Dumazet ---- Signed-off-by: David S. Miller --- net/core/dev.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 548340b57296..32045df1da5c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1848,6 +1848,20 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) } EXPORT_SYMBOL(skb_tx_hash); +static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) +{ + if (unlikely(queue_index >= dev->real_num_tx_queues)) { + if (net_ratelimit()) { + WARN(1, "%s selects TX queue %d, but " + "real number of TX queues is %d\n", + dev->name, queue_index, + dev->real_num_tx_queues); + } + return 0; + } + return queue_index; +} + static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { @@ -1861,6 +1875,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, if (ops->ndo_select_queue) { queue_index = ops->ndo_select_queue(dev, skb); + queue_index = dev_cap_txqueue(dev, queue_index); } else { queue_index = 0; if (dev->real_num_tx_queues > 1) -- cgit v1.2.2 From 9a1654ba0b50402a6bd03c7b0fe9b0200a5ea7b1 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Sun, 15 Nov 2009 07:20:12 +0000 Subject: net: Optimize hard_start_xmit() return checking Recent changes in the TX error propagation require additional checking and masking of values returned from hard_start_xmit(), mainly to separate cases where skb was consumed. This aim can be simplified by changing the order of NETDEV_TX and NET_XMIT codes, because the latter are treated similarly to negative (ERRNO) values. After this change much simpler dev_xmit_complete() is also used in sch_direct_xmit(), so it is moved to netdevice.h. Additionally NET_RX definitions in netdevice.h are moved up from between TX codes to avoid confusion while reading the TX comment. Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/dev.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 32045df1da5c..4b24d79414e3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1924,23 +1924,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } -static inline bool dev_xmit_complete(int rc) -{ - /* successful transmission */ - if (rc == NETDEV_TX_OK) - return true; - - /* error while transmitting, driver consumed skb */ - if (rc < 0) - return true; - - /* error while queueing to a different device, driver consumed skb */ - if (rc & NET_XMIT_MASK) - return true; - - return false; -} - /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit -- cgit v1.2.2 From d83345adf96bc13a5e360f4649a2e68ef968dec0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Nov 2009 03:36:51 +0000 Subject: net: add dev_txq_stats_fold() helper Some drivers ndo_get_stats() method need to perform txqueue stats folding. Move folding from dev_get_stats() to a new dev_txq_stats_fold() function Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d867522290b9..c3e0578d29d1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5168,6 +5168,32 @@ void netdev_run_todo(void) } } +/** + * dev_txq_stats_fold - fold tx_queues stats + * @dev: device to get statistics from + * @stats: struct net_device_stats to hold results + */ +void dev_txq_stats_fold(const struct net_device *dev, + struct net_device_stats *stats) +{ + unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; + unsigned int i; + struct netdev_queue *txq; + + for (i = 0; i < dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(dev, i); + tx_bytes += txq->tx_bytes; + tx_packets += txq->tx_packets; + tx_dropped += txq->tx_dropped; + } + if (tx_bytes || tx_packets || tx_dropped) { + stats->tx_bytes = tx_bytes; + stats->tx_packets = tx_packets; + stats->tx_dropped = tx_dropped; + } +} +EXPORT_SYMBOL(dev_txq_stats_fold); + /** * dev_get_stats - get network device statistics * @dev: device to get statistics from @@ -5182,25 +5208,9 @@ const struct net_device_stats *dev_get_stats(struct net_device *dev) if (ops->ndo_get_stats) return ops->ndo_get_stats(dev); - else { - unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; - struct net_device_stats *stats = &dev->stats; - unsigned int i; - struct netdev_queue *txq; - - for (i = 0; i < dev->num_tx_queues; i++) { - txq = netdev_get_tx_queue(dev, i); - tx_bytes += txq->tx_bytes; - tx_packets += txq->tx_packets; - tx_dropped += txq->tx_dropped; - } - if (tx_bytes || tx_packets || tx_dropped) { - stats->tx_bytes = tx_bytes; - stats->tx_packets = tx_packets; - stats->tx_dropped = tx_dropped; - } - return stats; - } + + dev_txq_stats_fold(dev, &dev->stats); + return &dev->stats; } EXPORT_SYMBOL(dev_get_stats); -- cgit v1.2.2 From 395264d509aec45149745843d9a737140a1ece16 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Mon, 16 Nov 2009 13:49:35 +0000 Subject: net: introduce NETDEV_UNREGISTER_PERNET This new event is called once for each unique net namespace in batched unregister operations (with the argument set to a random device from that namespace) and once per device in non-batched unregister operations. It allows us to factorize some device unregister work such as clearing the routing cache. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/core/dev.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c3e0578d29d1..e25fe5d9343b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1344,6 +1344,7 @@ rollback: nb->notifier_call(nb, NETDEV_DOWN, dev); } nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + nb->notifier_call(nb, NETDEV_UNREGISTER_PERNET, dev); } } @@ -4721,7 +4722,8 @@ static void net_set_todo(struct net_device *dev) static void rollback_registered_many(struct list_head *head) { - struct net_device *dev; + struct net_device *dev, *aux, *fdev; + LIST_HEAD(pernet_list); BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -4779,8 +4781,24 @@ static void rollback_registered_many(struct list_head *head) synchronize_net(); - list_for_each_entry(dev, head, unreg_list) + list_for_each_entry_safe(dev, aux, head, unreg_list) { + int new_net = 1; + list_for_each_entry(fdev, &pernet_list, unreg_list) { + if (dev_net(dev) == dev_net(fdev)) { + new_net = 0; + dev_put(dev); + break; + } + } + if (new_net) + list_move(&dev->unreg_list, &pernet_list); + } + + list_for_each_entry_safe(dev, aux, &pernet_list, unreg_list) { + call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev); + list_move(&dev->unreg_list, head); dev_put(dev); + } } static void rollback_registered(struct net_device *dev) @@ -5074,6 +5092,8 @@ static void netdev_wait_allrefs(struct net_device *dev) /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + /* don't resend NETDEV_UNREGISTER_PERNET, _PERNET users + * should have already handle it the first time */ if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { @@ -5385,6 +5405,10 @@ EXPORT_SYMBOL(unregister_netdevice_queue); * unregister_netdevice_many - unregister many devices * @head: list of devices * + * WARNING: Calling this modifies the given list + * (in rollback_registered_many). It may change the order of the elements + * in the list. However, you can assume it does not add or delete elements + * to/from the list. */ void unregister_netdevice_many(struct list_head *head) { @@ -5504,6 +5528,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char this device. They should clean all the things. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev); /* * Flush the unicast and multicast chains -- cgit v1.2.2 From e014debecd3ee3832e6476b3a9c948edfcfd1250 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Nov 2009 05:59:21 +0000 Subject: linkwatch: linkwatch_forget_dev() to speedup device dismantle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Herbert Xu a écrit : > On Tue, Nov 17, 2009 at 04:26:04AM -0800, David Miller wrote: >> Really, the link watch stuff is just due for a redesign. I don't >> think a simple hack is going to cut it this time, sorry Eric :-) > > I have no objections against any redesigns, but since the only > caller of linkwatch_forget_dev runs in process context with the > RTNL, it could also legally emit those events. Thanks guys, here an updated version then, before linkwatch surgery ? In this version, I force the event to be sent synchronously. [PATCH net-next-2.6] linkwatch: linkwatch_forget_dev() to speedup device dismantle time ip link del eth3.103 ; time ip link del eth3.104 ; time ip link del eth3.105 real 0m0.266s user 0m0.000s sys 0m0.001s real 0m0.770s user 0m0.000s sys 0m0.000s real 0m1.022s user 0m0.000s sys 0m0.000s One problem of current schem in vlan dismantle phase is the holding of device done by following chain : vlan_dev_stop() -> netif_carrier_off(dev) -> linkwatch_fire_event(dev) -> dev_hold() ... And __linkwatch_run_queue() runs up to one second later... A generic fix to this problem is to add a linkwatch_forget_dev() method to unlink the device from the list of watched devices. dev->link_watch_next becomes dev->link_watch_list (and use a bit more memory), to be able to unlink device in O(1). After patch : time ip link del eth3.103 ; time ip link del eth3.104 ; time ip link del eth3.105 real 0m0.024s user 0m0.000s sys 0m0.000s real 0m0.032s user 0m0.000s sys 0m0.001s real 0m0.033s user 0m0.000s sys 0m0.000s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++ net/core/link_watch.c | 94 +++++++++++++++++++++++++++++++-------------------- 2 files changed, 60 insertions(+), 37 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e25fe5d9343b..c128af708ebf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5085,6 +5085,8 @@ static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; + linkwatch_forget_dev(dev); + rebroadcast_time = warning_time = jiffies; while (atomic_read(&dev->refcnt) != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { @@ -5311,6 +5313,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->link_watch_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index bf8f7af699d7..5910b555a54a 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -35,7 +35,7 @@ static unsigned long linkwatch_nextevent; static void linkwatch_event(struct work_struct *dummy); static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); -static struct net_device *lweventlist; +static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) @@ -89,8 +89,10 @@ static void linkwatch_add_event(struct net_device *dev) unsigned long flags; spin_lock_irqsave(&lweventlist_lock, flags); - dev->link_watch_next = lweventlist; - lweventlist = dev; + if (list_empty(&dev->link_watch_list)) { + list_add_tail(&dev->link_watch_list, &lweventlist); + dev_hold(dev); + } spin_unlock_irqrestore(&lweventlist_lock, flags); } @@ -133,9 +135,35 @@ static void linkwatch_schedule_work(int urgent) } +static void linkwatch_do_dev(struct net_device *dev) +{ + /* + * Make sure the above read is complete since it can be + * rewritten as soon as we clear the bit below. + */ + smp_mb__before_clear_bit(); + + /* We are about to handle this device, + * so new events can be accepted + */ + clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + + rfc2863_policy(dev); + if (dev->flags & IFF_UP) { + if (netif_carrier_ok(dev)) + dev_activate(dev); + else + dev_deactivate(dev); + + netdev_state_change(dev); + } + dev_put(dev); +} + static void __linkwatch_run_queue(int urgent_only) { - struct net_device *next; + struct net_device *dev; + LIST_HEAD(wrk); /* * Limit the number of linkwatch events to one @@ -153,46 +181,40 @@ static void __linkwatch_run_queue(int urgent_only) clear_bit(LW_URGENT, &linkwatch_flags); spin_lock_irq(&lweventlist_lock); - next = lweventlist; - lweventlist = NULL; - spin_unlock_irq(&lweventlist_lock); + list_splice_init(&lweventlist, &wrk); - while (next) { - struct net_device *dev = next; + while (!list_empty(&wrk)) { - next = dev->link_watch_next; + dev = list_first_entry(&wrk, struct net_device, link_watch_list); + list_del_init(&dev->link_watch_list); if (urgent_only && !linkwatch_urgent_event(dev)) { - linkwatch_add_event(dev); + list_add_tail(&dev->link_watch_list, &lweventlist); continue; } - - /* - * Make sure the above read is complete since it can be - * rewritten as soon as we clear the bit below. - */ - smp_mb__before_clear_bit(); - - /* We are about to handle this device, - * so new events can be accepted - */ - clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); - - rfc2863_policy(dev); - if (dev->flags & IFF_UP) { - if (netif_carrier_ok(dev)) - dev_activate(dev); - else - dev_deactivate(dev); - - netdev_state_change(dev); - } - - dev_put(dev); + spin_unlock_irq(&lweventlist_lock); + linkwatch_do_dev(dev); + spin_lock_irq(&lweventlist_lock); } - if (lweventlist) + if (!list_empty(&lweventlist)) linkwatch_schedule_work(0); + spin_unlock_irq(&lweventlist_lock); +} + +void linkwatch_forget_dev(struct net_device *dev) +{ + unsigned long flags; + int clean = 0; + + spin_lock_irqsave(&lweventlist_lock, flags); + if (!list_empty(&dev->link_watch_list)) { + list_del_init(&dev->link_watch_list); + clean = 1; + } + spin_unlock_irqrestore(&lweventlist_lock, flags); + if (clean) + linkwatch_do_dev(dev); } @@ -216,8 +238,6 @@ void linkwatch_fire_event(struct net_device *dev) bool urgent = linkwatch_urgent_event(dev); if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { - dev_hold(dev); - linkwatch_add_event(dev); } else if (!urgent) return; -- cgit v1.2.2 From d90310243fd750240755e217c5faa13e24f41536 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 18 Nov 2009 02:36:59 +0000 Subject: net: device name allocation cleanups Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/core/dev.c | 69 ++++++++++++++++++++-------------------------------------- 1 file changed, 24 insertions(+), 45 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c128af708ebf..9977288583b8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -893,7 +893,8 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) free_page((unsigned long) inuse); } - snprintf(buf, IFNAMSIZ, name, i); + if (buf != name) + snprintf(buf, IFNAMSIZ, name, i); if (!__dev_get_by_name(net, buf)) return i; @@ -933,6 +934,21 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); +static int dev_get_valid_name(struct net *net, const char *name, char *buf, + bool fmt) +{ + if (!dev_valid_name(name)) + return -EINVAL; + + if (fmt && strchr(name, '%')) + return __dev_alloc_name(net, name, buf); + else if (__dev_get_by_name(net, name)) + return -EEXIST; + else if (buf != name) + strlcpy(buf, name, IFNAMSIZ); + + return 0; +} /** * dev_change_name - change name of a device @@ -956,22 +972,14 @@ int dev_change_name(struct net_device *dev, const char *newname) if (dev->flags & IFF_UP) return -EBUSY; - if (!dev_valid_name(newname)) - return -EINVAL; - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) return 0; memcpy(oldname, dev->name, IFNAMSIZ); - if (strchr(newname, '%')) { - err = dev_alloc_name(dev, newname); - if (err < 0) - return err; - } else if (__dev_get_by_name(net, newname)) - return -EEXIST; - else - strlcpy(dev->name, newname, IFNAMSIZ); + err = dev_get_valid_name(net, newname, dev->name, 1); + if (err < 0) + return err; rollback: /* For now only devices in the initial network namespace @@ -4883,8 +4891,6 @@ EXPORT_SYMBOL(netdev_fix_features); int register_netdevice(struct net_device *dev) { - struct hlist_head *head; - struct hlist_node *p; int ret; struct net *net = dev_net(dev); @@ -4913,26 +4919,14 @@ int register_netdevice(struct net_device *dev) } } - if (!dev_valid_name(dev->name)) { - ret = -EINVAL; + ret = dev_get_valid_name(net, dev->name, dev->name, 0); + if (ret) goto err_uninit; - } dev->ifindex = dev_new_index(net); if (dev->iflink == -1) dev->iflink = dev->ifindex; - /* Check for existence of name */ - head = dev_name_hash(net, dev->name); - hlist_for_each(p, head) { - struct net_device *d - = hlist_entry(p, struct net_device, name_hlist); - if (!strncmp(d->name, dev->name, IFNAMSIZ)) { - ret = -EEXIST; - goto err_uninit; - } - } - /* Fix illegal checksum combinations */ if ((dev->features & NETIF_F_HW_CSUM) && (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { @@ -5460,8 +5454,6 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - char buf[IFNAMSIZ]; - const char *destname; int err; ASSERT_RTNL(); @@ -5494,20 +5486,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char * we can use it in the destination network namespace. */ err = -EEXIST; - destname = dev->name; - if (__dev_get_by_name(net, destname)) { + if (__dev_get_by_name(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) goto out; - if (!dev_valid_name(pat)) - goto out; - if (strchr(pat, '%')) { - if (__dev_alloc_name(net, pat, buf) < 0) - goto out; - destname = buf; - } else - destname = pat; - if (__dev_get_by_name(net, destname)) + if (dev_get_valid_name(net, pat, dev->name, 1)) goto out; } @@ -5544,10 +5527,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Actually switch the network namespace */ dev_net_set(dev, net); - /* Assign the new device name */ - if (destname != dev->name) - strcpy(dev->name, destname); - /* If there is an ifindex conflict assign a new one */ if (__dev_get_by_index(net, dev->ifindex)) { int iflink = (dev->iflink == dev->ifindex); -- cgit v1.2.2 From 8964be4a9a5ca8cab1219bb046db2f6d1936227c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Nov 2009 15:35:04 -0800 Subject: net: rename skb->iif to skb->skb_iif To help grep games, rename iif to skb_iif Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- net/core/skbuff.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 9977288583b8..09f3d6b9c0c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2287,7 +2287,7 @@ static int ing_filter(struct sk_buff *skb) if (MAX_RED_LOOP < ttl++) { printk(KERN_WARNING "Redir loop detected Dropping packet (%d->%d)\n", - skb->iif, dev->ifindex); + skb->skb_iif, dev->ifindex); return TC_ACT_SHOT; } @@ -2395,8 +2395,8 @@ int netif_receive_skb(struct sk_buff *skb) if (netpoll_receive_skb(skb)) return NET_RX_DROP; - if (!skb->iif) - skb->iif = skb->dev->ifindex; + if (!skb->skb_iif) + skb->skb_iif = skb->dev->ifindex; null_or_orig = NULL; orig_dev = skb->dev; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 739b8f4dd327..bfa3e7865a8c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -549,7 +549,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif new->protocol = old->protocol; new->mark = old->mark; - new->iif = old->iif; + new->skb_iif = old->skb_iif; __nf_copy(new, old); #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) -- cgit v1.2.2 From 6ebfbc065624790772398f5b327ac33a7ae3880b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 22 Nov 2009 20:43:13 -0800 Subject: net: Fix missing kernel-doc notation Fix the following htmldocs warning: Warning(net/core/dev.c:5378): bad line: Signed-off-by: Jaswinder Singh Rajput Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 09f3d6b9c0c8..ccefa2473c39 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5375,7 +5375,7 @@ EXPORT_SYMBOL(synchronize_net); * unregister_netdevice_queue - remove device from the kernel * @dev: device * @head: list - + * * This function shuts down a device interface and removes it * from the kernel tables. * If head not NULL, device is queued to be unregistered later. -- cgit v1.2.2 From 09ad9bc752519cc167d0a573e1acf69b5c707c67 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Nov 2009 15:14:13 -0800 Subject: net: use net_eq to compare nets Generated with the following semantic patch @@ struct net *n1; struct net *n2; @@ - n1 == n2 + net_eq(n1, n2) @@ struct net *n1; struct net *n2; @@ - n1 != n2 + !net_eq(n1, n2) applied over {include,net,drivers/net}. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- net/core/neighbour.c | 2 +- net/core/net-sysfs.c | 4 ++-- net/core/net_namespace.c | 2 +- net/core/sysctl_net_core.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ccefa2473c39..e65af6041415 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -985,7 +985,7 @@ rollback: /* For now only devices in the initial network namespace * are in sysfs. */ - if (net == &init_net) { + if (net_eq(net, &init_net)) { ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); @@ -4792,7 +4792,7 @@ static void rollback_registered_many(struct list_head *head) list_for_each_entry_safe(dev, aux, head, unreg_list) { int new_net = 1; list_for_each_entry(fdev, &pernet_list, unreg_list) { - if (dev_net(dev) == dev_net(fdev)) { + if (net_eq(dev_net(dev), dev_net(fdev))) { new_net = 0; dev_put(dev); break; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e587e6819698..a08a35bf0a7b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2092,7 +2092,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (h > s_h) s_idx = 0; for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) { - if (dev_net(n->dev) != net) + if (!net_eq(dev_net(n->dev), net)) continue; if (idx < s_idx) goto next; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 157645c0da73..fbc1c7472c5e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -525,7 +525,7 @@ void netdev_unregister_kobject(struct net_device * net) kobject_get(&dev->kobj); - if (dev_net(net) != &init_net) + if (!net_eq(dev_net(net), &init_net)) return; device_del(dev); @@ -559,7 +559,7 @@ int netdev_register_kobject(struct net_device *net) #endif #endif /* CONFIG_SYSFS */ - if (dev_net(net) != &init_net) + if (!net_eq(dev_net(net), &init_net)) return 0; return device_add(dev); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1c1af2756f38..86ed7f44d083 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -280,7 +280,7 @@ out_undo: list_del(&ops->list); if (ops->exit) { for_each_net(undo_net) { - if (undo_net == net) + if (net_eq(undo_net, net)) goto undone; ops->exit(undo_net); } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7db1de0497c6..fcfc5458c399 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -134,7 +134,7 @@ static __net_init int sysctl_core_net_init(struct net *net) net->core.sysctl_somaxconn = SOMAXCONN; tbl = netns_core_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; -- cgit v1.2.2 From 445409602c09219767c06497c0dc2285eac244ed Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Nov 2009 06:07:08 +0000 Subject: veth: move loopback logic to common location The veth driver contains code to forward an skb from the start_xmit function of one network device into the receive path of another device. Moving that code into a common location lets us reuse the code for direct forwarding of data between macvlan ports, and possibly in other drivers. Signed-off-by: Arnd Bergmann Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e65af6041415..7775e8b48deb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -105,6 +105,7 @@ #include #include #include +#include #include #include #include @@ -1419,6 +1420,45 @@ static inline void net_timestamp(struct sk_buff *skb) skb->tstamp.tv64 = 0; } +/** + * dev_forward_skb - loopback an skb to another netif + * + * @dev: destination network device + * @skb: buffer to forward + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + * dev_forward_skb can be used for injecting an skb from the + * start_xmit function of one device into the receive queue + * of another device. + * + * The receiving device may be in another namespace, so + * we have to clear all information in the skb that could + * impact namespace isolation. + */ +int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + skb_orphan(skb); + + if (!(dev->flags & IFF_UP)) + return NET_RX_DROP; + + if (skb->len > (dev->mtu + dev->hard_header_len)) + return NET_RX_DROP; + + skb_dst_drop(skb); + skb->tstamp.tv64 = 0; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, dev); + skb->mark = 0; + secpath_reset(skb); + nf_reset(skb); + return netif_rx(skb); +} +EXPORT_SYMBOL_GPL(dev_forward_skb); + /* * Support routine. Sends outgoing frames to any network * taps currently in use. -- cgit v1.2.2 From 3291b9db567e1ec38362024049cbd02987b1e277 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2009 00:44:33 -0800 Subject: pktgen: NUMA aware pktgen threads are bound to given CPU, we can allocate memory for these threads in a NUMA aware way. After a pktgen session on two threads, we can check flows memory was allocated on right node, instead of a not related one. # grep pktgen_thread_write /proc/vmallocinfo 0xffffc90007204000-0xffffc90007385000 1576960 pktgen_thread_write+0x3a4/0x6b0 [pktgen] pages=384 vmalloc N0=384 0xffffc90007386000-0xffffc90007507000 1576960 pktgen_thread_write+0x3a4/0x6b0 [pktgen] pages=384 vmalloc N1=384 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/pktgen.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b44104d3e6be..19ee493ec178 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3626,6 +3626,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) { struct pktgen_dev *pkt_dev; int err; + int node = cpu_to_node(t->cpu); /* We don't allow a device to be on several threads */ @@ -3635,12 +3636,13 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) return -EBUSY; } - pkt_dev = kzalloc(sizeof(struct pktgen_dev), GFP_KERNEL); + pkt_dev = kzalloc_node(sizeof(struct pktgen_dev), GFP_KERNEL, node); if (!pkt_dev) return -ENOMEM; strcpy(pkt_dev->odevname, ifname); - pkt_dev->flows = vmalloc(MAX_CFLOWS * sizeof(struct flow_state)); + pkt_dev->flows = vmalloc_node(MAX_CFLOWS * sizeof(struct flow_state), + node); if (pkt_dev->flows == NULL) { kfree(pkt_dev); return -ENOMEM; @@ -3702,7 +3704,8 @@ static int __init pktgen_create_thread(int cpu) struct proc_dir_entry *pe; struct task_struct *p; - t = kzalloc(sizeof(struct pktgen_thread), GFP_KERNEL); + t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL, + cpu_to_node(cpu)); if (!t) { printk(KERN_ERR "pktgen: ERROR: out of memory, can't " "create new thread.\n"); -- cgit v1.2.2 From f64f9e719261a87818dd192a3a2352e5b20fbd0f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 29 Nov 2009 16:55:45 -0800 Subject: net: Move && and || to end of previous line Not including net/atm/ Compiled tested x86 allyesconfig only Added a > 80 column line or two, which I ignored. Existing checkpatch plaints willfully, cheerfully ignored. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/core/dev.c | 7 ++++--- net/core/pktgen.c | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 7775e8b48deb..5d131c2f84cc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2677,9 +2677,10 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) return GRO_NORMAL; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev) - && !compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + NAPI_GRO_CB(p)->same_flow = + (p->dev == skb->dev) && + !compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->flush = 0; } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 19ee493ec178..a23b45f08ec9 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2052,9 +2052,8 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) read_lock_bh(&idev->lock); for (ifp = idev->addr_list; ifp; ifp = ifp->if_next) { - if (ifp->scope == IFA_LINK - && !(ifp-> - flags & IFA_F_TENTATIVE)) { + if (ifp->scope == IFA_LINK && + !(ifp->flags & IFA_F_TENTATIVE)) { ipv6_addr_copy(&pkt_dev-> cur_in6_saddr, &ifp->addr); -- cgit v1.2.2 From a5ee155136b4a8f4ab0e4c9c064b661da475e298 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 29 Nov 2009 15:45:58 +0000 Subject: net: NETDEV_UNREGISTER_PERNET -> NETDEV_UNREGISTER_BATCH The motivation for an additional notifier in batched netdevice notification (rt_do_flush) only needs to be called once per batch not once per namespace. For further batching improvements I need a guarantee that the netdevices are unregistered in order allowing me to unregister an all of the network devices in a network namespace at the same time with the guarantee that the loopback device is really and truly unregistered last. Additionally it appears that we moved the route cache flush after the final synchronize_net, which seems wrong and there was no explanation. So I have restored the original location of the final synchronize_net. Cc: Octavian Purdila Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 36 +++++++++--------------------------- 1 file changed, 9 insertions(+), 27 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5d131c2f84cc..bb37ee1e0901 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1353,7 +1353,7 @@ rollback: nb->notifier_call(nb, NETDEV_DOWN, dev); } nb->notifier_call(nb, NETDEV_UNREGISTER, dev); - nb->notifier_call(nb, NETDEV_UNREGISTER_PERNET, dev); + nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); } } @@ -4771,8 +4771,7 @@ static void net_set_todo(struct net_device *dev) static void rollback_registered_many(struct list_head *head) { - struct net_device *dev, *aux, *fdev; - LIST_HEAD(pernet_list); + struct net_device *dev; BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -4828,26 +4827,14 @@ static void rollback_registered_many(struct list_head *head) netdev_unregister_kobject(dev); } - synchronize_net(); + /* Process any work delayed until the end of the batch */ + dev = list_entry(head->next, struct net_device, unreg_list); + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); - list_for_each_entry_safe(dev, aux, head, unreg_list) { - int new_net = 1; - list_for_each_entry(fdev, &pernet_list, unreg_list) { - if (net_eq(dev_net(dev), dev_net(fdev))) { - new_net = 0; - dev_put(dev); - break; - } - } - if (new_net) - list_move(&dev->unreg_list, &pernet_list); - } + synchronize_net(); - list_for_each_entry_safe(dev, aux, &pernet_list, unreg_list) { - call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev); - list_move(&dev->unreg_list, head); + list_for_each_entry(dev, head, unreg_list) dev_put(dev); - } } static void rollback_registered(struct net_device *dev) @@ -5129,7 +5116,7 @@ static void netdev_wait_allrefs(struct net_device *dev) /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - /* don't resend NETDEV_UNREGISTER_PERNET, _PERNET users + /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users * should have already handle it the first time */ if (test_bit(__LINK_STATE_LINKWATCH_PENDING, @@ -5442,11 +5429,6 @@ EXPORT_SYMBOL(unregister_netdevice_queue); /** * unregister_netdevice_many - unregister many devices * @head: list of devices - * - * WARNING: Calling this modifies the given list - * (in rollback_registered_many). It may change the order of the elements - * in the list. However, you can assume it does not add or delete elements - * to/from the list. */ void unregister_netdevice_many(struct list_head *head) { @@ -5555,7 +5537,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char this device. They should clean all the things. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev); + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); /* * Flush the unicast and multicast chains -- cgit v1.2.2 From 2b035b39970740722598f7a9d548835f9bdd730f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 29 Nov 2009 22:25:27 +0000 Subject: net: Batch network namespace destruction. It is fairly common to kill several network namespaces at once. Either because they are nested one inside the other or because they are cooperating in multiple machine networking experiments. As the network stack control logic does not parallelize easily batch up multiple network namespaces existing together. To get the full benefit of batching the virtual network devices to be removed must be all removed in one batch. For that purpose I have added a loop after the last network device operations have run that batches up all remaining network devices and deletes them. An extra benefit is that the reorganization slightly shrinks the size of the per network namespace data structures replaceing a work_struct with a list_head. In a trivial test with 4K namespaces this change reduced the cost of a destroying 4K namespaces from 7+ minutes (at 12% cpu) to 44 seconds (at 60% cpu). The bulk of that 44s was spent in inet_twsk_purge. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/net_namespace.c | 66 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 86ed7f44d083..a42caa2b909b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -8,8 +8,10 @@ #include #include #include +#include #include #include +#include /* * Our network namespace constructor/destructor lists @@ -27,6 +29,20 @@ EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ +static void unregister_netdevices(struct net *net, struct list_head *list) +{ + struct net_device *dev; + /* At exit all network devices most be removed from a network + * namespace. Do this in the reverse order of registeration. + */ + for_each_netdev_reverse(net, dev) { + if (dev->rtnl_link_ops) + dev->rtnl_link_ops->dellink(dev, list); + else + unregister_netdevice_queue(dev, list); + } +} + /* * setup_net runs the initializers for the network namespace object. */ @@ -59,6 +75,13 @@ out_undo: list_for_each_entry_continue_reverse(ops, &pernet_list, list) { if (ops->exit) ops->exit(net); + if (&ops->list == first_device) { + LIST_HEAD(dev_kill_list); + rtnl_lock(); + unregister_netdevices(net, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + } } rcu_barrier(); @@ -147,18 +170,26 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) return net_create(); } +static DEFINE_SPINLOCK(cleanup_list_lock); +static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ + static void cleanup_net(struct work_struct *work) { struct pernet_operations *ops; - struct net *net; + struct net *net, *tmp; + LIST_HEAD(net_kill_list); - net = container_of(work, struct net, work); + /* Atomically snapshot the list of namespaces to cleanup */ + spin_lock_irq(&cleanup_list_lock); + list_replace_init(&cleanup_list, &net_kill_list); + spin_unlock_irq(&cleanup_list_lock); mutex_lock(&net_mutex); /* Don't let anyone else find us. */ rtnl_lock(); - list_del_rcu(&net->list); + list_for_each_entry(net, &net_kill_list, cleanup_list) + list_del_rcu(&net->list); rtnl_unlock(); /* @@ -170,8 +201,18 @@ static void cleanup_net(struct work_struct *work) /* Run all of the network namespace exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->exit) - ops->exit(net); + if (ops->exit) { + list_for_each_entry(net, &net_kill_list, cleanup_list) + ops->exit(net); + } + if (&ops->list == first_device) { + LIST_HEAD(dev_kill_list); + rtnl_lock(); + list_for_each_entry(net, &net_kill_list, cleanup_list) + unregister_netdevices(net, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + } } mutex_unlock(&net_mutex); @@ -182,14 +223,23 @@ static void cleanup_net(struct work_struct *work) rcu_barrier(); /* Finally it is safe to free my network namespace structure */ - net_free(net); + list_for_each_entry_safe(net, tmp, &net_kill_list, cleanup_list) { + list_del_init(&net->cleanup_list); + net_free(net); + } } +static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { /* Cleanup the network namespace in process context */ - INIT_WORK(&net->work, cleanup_net); - queue_work(netns_wq, &net->work); + unsigned long flags; + + spin_lock_irqsave(&cleanup_list_lock, flags); + list_add(&net->cleanup_list, &cleanup_list); + spin_unlock_irqrestore(&cleanup_list_lock, flags); + + queue_work(netns_wq, &net_cleanup_work); } EXPORT_SYMBOL_GPL(__put_net); -- cgit v1.2.2 From f875bae065334907796da12523f9df85c89f5712 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 29 Nov 2009 22:25:28 +0000 Subject: net: Automatically allocate per namespace data. To get the full benefit of batched network namespace cleanup netowrk device deletion needs to be performed by the generic code. When using register_pernet_gen_device and freeing the data in exit_net it is impossible to delay allocation until after exit_net has called as the device uninit methods are no longer safe. To correct this, and to simplify working with per network namespace data I have moved allocation and deletion of per network namespace data into the network namespace core. The core now frees the data only after all of the network namespace exit routines have run. Now it is only required to set the new fields .id and .size in the pernet_operations structure if you want network namespace data to be managed for you automatically. This makes the current register_pernet_gen_device and register_pernet_gen_subsys routines unnecessary. For the moment I have left them as compatibility wrappers in net_namespace.h They will be removed once all of the users have been updated. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/net_namespace.c | 188 +++++++++++++++++++++++++---------------------- 1 file changed, 102 insertions(+), 86 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a42caa2b909b..9679ad292da9 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -43,13 +43,40 @@ static void unregister_netdevices(struct net *net, struct list_head *list) } } +static int ops_init(const struct pernet_operations *ops, struct net *net) +{ + int err; + if (ops->id && ops->size) { + void *data = kzalloc(ops->size, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = net_assign_generic(net, *ops->id, data); + if (err) { + kfree(data); + return err; + } + } + if (ops->init) + return ops->init(net); + return 0; +} + +static void ops_free(const struct pernet_operations *ops, struct net *net) +{ + if (ops->id && ops->size) { + int id = *ops->id; + kfree(net_generic(net, id)); + } +} + /* * setup_net runs the initializers for the network namespace object. */ static __net_init int setup_net(struct net *net) { /* Must be called with net_mutex held */ - struct pernet_operations *ops; + const struct pernet_operations *ops, *saved_ops; int error = 0; atomic_set(&net->count, 1); @@ -59,11 +86,9 @@ static __net_init int setup_net(struct net *net) #endif list_for_each_entry(ops, &pernet_list, list) { - if (ops->init) { - error = ops->init(net); - if (error < 0) - goto out_undo; - } + error = ops_init(ops, net); + if (error < 0) + goto out_undo; } out: return error; @@ -72,6 +97,7 @@ out_undo: /* Walk through the list backwards calling the exit functions * for the pernet modules whose init functions did not fail. */ + saved_ops = ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) { if (ops->exit) ops->exit(net); @@ -83,6 +109,9 @@ out_undo: rtnl_unlock(); } } + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_free(ops, net); rcu_barrier(); goto out; @@ -175,7 +204,7 @@ static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ static void cleanup_net(struct work_struct *work) { - struct pernet_operations *ops; + const struct pernet_operations *ops; struct net *net, *tmp; LIST_HEAD(net_kill_list); @@ -214,6 +243,13 @@ static void cleanup_net(struct work_struct *work) rtnl_unlock(); } } + /* Free the net generic variables */ + list_for_each_entry_reverse(ops, &pernet_list, list) { + if (ops->size && ops->id) { + list_for_each_entry(net, &net_kill_list, cleanup_list) + ops_free(ops, net); + } + } mutex_unlock(&net_mutex); @@ -309,16 +345,16 @@ static int __init net_ns_init(void) pure_initcall(net_ns_init); #ifdef CONFIG_NET_NS -static int register_pernet_operations(struct list_head *list, - struct pernet_operations *ops) +static int __register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) { struct net *net, *undo_net; int error; list_add_tail(&ops->list, list); - if (ops->init) { + if (ops->init || (ops->id && ops->size)) { for_each_net(net) { - error = ops->init(net); + error = ops_init(ops, net); if (error) goto out_undo; } @@ -336,10 +372,18 @@ out_undo: } } undone: + if (ops->size && ops->id) { + for_each_net(undo_net) { + if (net_eq(undo_net, net)) + goto freed; + ops_free(ops, undo_net); + } + } +freed: return error; } -static void unregister_pernet_operations(struct pernet_operations *ops) +static void __unregister_pernet_operations(struct pernet_operations *ops) { struct net *net; @@ -347,27 +391,66 @@ static void unregister_pernet_operations(struct pernet_operations *ops) if (ops->exit) for_each_net(net) ops->exit(net); + if (ops->id && ops->size) + for_each_net(net) + ops_free(ops, net); } #else -static int register_pernet_operations(struct list_head *list, - struct pernet_operations *ops) +static int __register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) { - if (ops->init == NULL) - return 0; - return ops->init(&init_net); + int err = 0; + err = ops_init(ops, &init_net); + if (err) + ops_free(ops, &init_net); + return err; + } -static void unregister_pernet_operations(struct pernet_operations *ops) +static void __unregister_pernet_operations(struct pernet_operations *ops) { if (ops->exit) ops->exit(&init_net); + ops_free(ops, &init_net); } -#endif + +#endif /* CONFIG_NET_NS */ static DEFINE_IDA(net_generic_ids); +static int register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + int error; + + if (ops->id) { +again: + error = ida_get_new_above(&net_generic_ids, 1, ops->id); + if (error < 0) { + if (error == -EAGAIN) { + ida_pre_get(&net_generic_ids, GFP_KERNEL); + goto again; + } + return error; + } + } + error = __register_pernet_operations(list, ops); + if (error && ops->id) + ida_remove(&net_generic_ids, *ops->id); + + return error; +} + +static void unregister_pernet_operations(struct pernet_operations *ops) +{ + + __unregister_pernet_operations(ops); + if (ops->id) + ida_remove(&net_generic_ids, *ops->id); +} + /** * register_pernet_subsys - register a network namespace subsystem * @ops: pernet operations structure for the subsystem @@ -414,38 +497,6 @@ void unregister_pernet_subsys(struct pernet_operations *module) } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); -int register_pernet_gen_subsys(int *id, struct pernet_operations *ops) -{ - int rv; - - mutex_lock(&net_mutex); -again: - rv = ida_get_new_above(&net_generic_ids, 1, id); - if (rv < 0) { - if (rv == -EAGAIN) { - ida_pre_get(&net_generic_ids, GFP_KERNEL); - goto again; - } - goto out; - } - rv = register_pernet_operations(first_device, ops); - if (rv < 0) - ida_remove(&net_generic_ids, *id); -out: - mutex_unlock(&net_mutex); - return rv; -} -EXPORT_SYMBOL_GPL(register_pernet_gen_subsys); - -void unregister_pernet_gen_subsys(int id, struct pernet_operations *ops) -{ - mutex_lock(&net_mutex); - unregister_pernet_operations(ops); - ida_remove(&net_generic_ids, id); - mutex_unlock(&net_mutex); -} -EXPORT_SYMBOL_GPL(unregister_pernet_gen_subsys); - /** * register_pernet_device - register a network namespace device * @ops: pernet operations structure for the subsystem @@ -477,30 +528,6 @@ int register_pernet_device(struct pernet_operations *ops) } EXPORT_SYMBOL_GPL(register_pernet_device); -int register_pernet_gen_device(int *id, struct pernet_operations *ops) -{ - int error; - mutex_lock(&net_mutex); -again: - error = ida_get_new_above(&net_generic_ids, 1, id); - if (error) { - if (error == -EAGAIN) { - ida_pre_get(&net_generic_ids, GFP_KERNEL); - goto again; - } - goto out; - } - error = register_pernet_operations(&pernet_list, ops); - if (error) - ida_remove(&net_generic_ids, *id); - else if (first_device == &pernet_list) - first_device = &ops->list; -out: - mutex_unlock(&net_mutex); - return error; -} -EXPORT_SYMBOL_GPL(register_pernet_gen_device); - /** * unregister_pernet_device - unregister a network namespace netdevice * @ops: pernet operations structure to manipulate @@ -520,17 +547,6 @@ void unregister_pernet_device(struct pernet_operations *ops) } EXPORT_SYMBOL_GPL(unregister_pernet_device); -void unregister_pernet_gen_device(int id, struct pernet_operations *ops) -{ - mutex_lock(&net_mutex); - if (&ops->list == first_device) - first_device = first_device->next; - unregister_pernet_operations(ops); - ida_remove(&net_generic_ids, id); - mutex_unlock(&net_mutex); -} -EXPORT_SYMBOL_GPL(unregister_pernet_gen_device); - static void net_generic_release(struct rcu_head *rcu) { struct net_generic *ng; -- cgit v1.2.2 From e008b5fc8dc7f46d9904001c7a2155eb1e7d35ab Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 29 Nov 2009 22:25:30 +0000 Subject: net: Simplfy default_device_exit and improve batching. - Defer dellink to net_cleanup() allowing for batching. - Fix comment. - Use for_each_netdev_safe again as dev_change_net_namespace touches at most one network device (unlike veth dellink). Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bb37ee1e0901..e3e18dee0bd3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5736,14 +5736,13 @@ static struct pernet_operations __net_initdata netdev_net_ops = { static void __net_exit default_device_exit(struct net *net) { - struct net_device *dev; + struct net_device *dev, *aux; /* - * Push all migratable of the network devices back to the + * Push all migratable network devices back to the * initial network namespace */ rtnl_lock(); -restart: - for_each_netdev(net, dev) { + for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; @@ -5751,11 +5750,9 @@ restart: if (dev->features & NETIF_F_NETNS_LOCAL) continue; - /* Delete virtual devices */ - if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) { - dev->rtnl_link_ops->dellink(dev, NULL); - goto restart; - } + /* Leave virtual devices for the generic cleanup */ + if (dev->rtnl_link_ops) + continue; /* Push remaing network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); @@ -5765,7 +5762,6 @@ restart: __func__, dev->name, err); BUG(); } - goto restart; } rtnl_unlock(); } -- cgit v1.2.2 From c81c2d95449cd218c2022ce6014c52fef1eb1f66 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 2 Dec 2009 16:49:02 +0000 Subject: skbuff: remove skb_dma_map/unmap The two functions skb_dma_map/unmap are unsafe to use as they cause problems when packets are cloned and sent to multiple devices while a HW IOMMU is enabled. Due to this it is best to remove the code so it is not used by any other network driver maintainters. Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/core/Makefile | 1 - net/core/skb_dma_map.c | 65 -------------------------------------------------- 2 files changed, 66 deletions(-) delete mode 100644 net/core/skb_dma_map.c (limited to 'net/core') diff --git a/net/core/Makefile b/net/core/Makefile index 796f46eece5f..08791ac3e05a 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -6,7 +6,6 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ gen_stats.o gen_estimator.o net_namespace.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-$(CONFIG_HAS_DMA) += skb_dma_map.o obj-y += dev.o ethtool.o dev_mcast.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c deleted file mode 100644 index 79687dfd6957..000000000000 --- a/net/core/skb_dma_map.c +++ /dev/null @@ -1,65 +0,0 @@ -/* skb_dma_map.c: DMA mapping helpers for socket buffers. - * - * Copyright (C) David S. Miller - */ - -#include -#include -#include -#include - -int skb_dma_map(struct device *dev, struct sk_buff *skb, - enum dma_data_direction dir) -{ - struct skb_shared_info *sp = skb_shinfo(skb); - dma_addr_t map; - int i; - - map = dma_map_single(dev, skb->data, - skb_headlen(skb), dir); - if (dma_mapping_error(dev, map)) - goto out_err; - - sp->dma_head = map; - for (i = 0; i < sp->nr_frags; i++) { - skb_frag_t *fp = &sp->frags[i]; - - map = dma_map_page(dev, fp->page, fp->page_offset, - fp->size, dir); - if (dma_mapping_error(dev, map)) - goto unwind; - sp->dma_maps[i] = map; - } - - return 0; - -unwind: - while (--i >= 0) { - skb_frag_t *fp = &sp->frags[i]; - - dma_unmap_page(dev, sp->dma_maps[i], - fp->size, dir); - } - dma_unmap_single(dev, sp->dma_head, - skb_headlen(skb), dir); -out_err: - return -ENOMEM; -} -EXPORT_SYMBOL(skb_dma_map); - -void skb_dma_unmap(struct device *dev, struct sk_buff *skb, - enum dma_data_direction dir) -{ - struct skb_shared_info *sp = skb_shinfo(skb); - int i; - - dma_unmap_single(dev, sp->dma_head, - skb_headlen(skb), dir); - for (i = 0; i < sp->nr_frags; i++) { - skb_frag_t *fp = &sp->frags[i]; - - dma_unmap_page(dev, sp->dma_maps[i], - fp->size, dir); - } -} -EXPORT_SYMBOL(skb_dma_unmap); -- cgit v1.2.2 From 491deb24bf5bf7124141287aaf02c3219783ceab Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 3 Dec 2009 01:25:54 +0000 Subject: net 02/05: fib_rules: rename ifindex/ifname/FRA_IFNAME to iifindex/iifname/FRA_IIFNAME commit 229e77eec406ad68662f18e49fda8b5d366768c5 Author: Patrick McHardy Date: Thu Dec 3 12:05:23 2009 +0100 net: fib_rules: rename ifindex/ifname/FRA_IFNAME to iifindex/iifname/FRA_IIFNAME The next patch will add oif classification, rename interface related members and attributes to reflect that they're used for iif classification. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/fib_rules.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index bd309384f8b8..8e8028cdc87f 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -135,7 +135,7 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, { int ret = 0; - if (rule->ifindex && (rule->ifindex != fl->iif)) + if (rule->iifindex && (rule->iifindex != fl->iif)) goto out; if ((rule->mark ^ fl->mark) & rule->mark_mask) @@ -248,14 +248,14 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (tb[FRA_PRIORITY]) rule->pref = nla_get_u32(tb[FRA_PRIORITY]); - if (tb[FRA_IFNAME]) { + if (tb[FRA_IIFNAME]) { struct net_device *dev; - rule->ifindex = -1; - nla_strlcpy(rule->ifname, tb[FRA_IFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, rule->ifname); + rule->iifindex = -1; + nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, rule->iifname); if (dev) - rule->ifindex = dev->ifindex; + rule->iifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { @@ -388,8 +388,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) (rule->pref != nla_get_u32(tb[FRA_PRIORITY]))) continue; - if (tb[FRA_IFNAME] && - nla_strcmp(tb[FRA_IFNAME], rule->ifname)) + if (tb[FRA_IIFNAME] && + nla_strcmp(tb[FRA_IIFNAME], rule->iifname)) continue; if (tb[FRA_FWMARK] && @@ -447,7 +447,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) - + nla_total_size(IFNAMSIZ) /* FRA_IFNAME */ + + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_FWMARK */ @@ -481,11 +481,11 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL) frh->flags |= FIB_RULE_UNRESOLVED; - if (rule->ifname[0]) { - NLA_PUT_STRING(skb, FRA_IFNAME, rule->ifname); + if (rule->iifname[0]) { + NLA_PUT_STRING(skb, FRA_IIFNAME, rule->iifname); - if (rule->ifindex == -1) - frh->flags |= FIB_RULE_DEV_DETACHED; + if (rule->iifindex == -1) + frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->pref) @@ -600,9 +600,9 @@ static void attach_rules(struct list_head *rules, struct net_device *dev) struct fib_rule *rule; list_for_each_entry(rule, rules, list) { - if (rule->ifindex == -1 && - strcmp(dev->name, rule->ifname) == 0) - rule->ifindex = dev->ifindex; + if (rule->iifindex == -1 && + strcmp(dev->name, rule->iifname) == 0) + rule->iifindex = dev->ifindex; } } @@ -611,8 +611,8 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) struct fib_rule *rule; list_for_each_entry(rule, rules, list) - if (rule->ifindex == dev->ifindex) - rule->ifindex = -1; + if (rule->iifindex == dev->ifindex) + rule->iifindex = -1; } -- cgit v1.2.2 From 1b038a5e60c7812f19818e8a5df96d029e49c38f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 3 Dec 2009 01:25:56 +0000 Subject: net 03/05: fib_rules: add oif classification commit 68144d350f4f6c348659c825cde6a82b34c27a91 Author: Patrick McHardy Date: Thu Dec 3 12:05:25 2009 +0100 net: fib_rules: add oif classification Support routing table lookup based on the flow's oif. This is useful to classify packets originating from sockets bound to interfaces differently. The route cache already includes the oif and needs no changes. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/fib_rules.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 8e8028cdc87f..d1a70ad4b544 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -138,6 +138,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->iifindex && (rule->iifindex != fl->iif)) goto out; + if (rule->oifindex && (rule->oifindex != fl->oif)) + goto out; + if ((rule->mark ^ fl->mark) & rule->mark_mask) goto out; @@ -258,6 +261,16 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) rule->iifindex = dev->ifindex; } + if (tb[FRA_OIFNAME]) { + struct net_device *dev; + + rule->oifindex = -1; + nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, rule->oifname); + if (dev) + rule->oifindex = dev->ifindex; + } + if (tb[FRA_FWMARK]) { rule->mark = nla_get_u32(tb[FRA_FWMARK]); if (rule->mark) @@ -392,6 +405,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) nla_strcmp(tb[FRA_IIFNAME], rule->iifname)) continue; + if (tb[FRA_OIFNAME] && + nla_strcmp(tb[FRA_OIFNAME], rule->oifname)) + continue; + if (tb[FRA_FWMARK] && (rule->mark != nla_get_u32(tb[FRA_FWMARK]))) continue; @@ -448,6 +465,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_FWMARK */ @@ -488,6 +506,13 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->flags |= FIB_RULE_IIF_DETACHED; } + if (rule->oifname[0]) { + NLA_PUT_STRING(skb, FRA_OIFNAME, rule->oifname); + + if (rule->oifindex == -1) + frh->flags |= FIB_RULE_OIF_DETACHED; + } + if (rule->pref) NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref); @@ -603,6 +628,9 @@ static void attach_rules(struct list_head *rules, struct net_device *dev) if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) rule->iifindex = dev->ifindex; + if (rule->oifindex == -1 && + strcmp(dev->name, rule->oifname) == 0) + rule->oifindex = dev->ifindex; } } @@ -610,9 +638,12 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; - list_for_each_entry(rule, rules, list) + list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) rule->iifindex = -1; + if (rule->oifindex == dev->ifindex) + rule->oifindex = -1; + } } -- cgit v1.2.2 From 5adef1809147a9c39119ffd5a13a1ca4fe23a411 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 3 Dec 2009 01:25:57 +0000 Subject: net 04/05: fib_rules: allow to delete local rule commit d124356ce314fff22a047ea334379d5105b2d834 Author: Patrick McHardy Date: Thu Dec 3 12:16:35 2009 +0100 net: fib_rules: allow to delete local rule Allow to delete the local rule and recreate it with a higher priority. This can be used to force packets with a local destination out on the wire instead of routing them to loopback. Additionally this patch allows to recreate rules with a priority of 0. Combined with the previous patch to allow oif classification, a socket can be bound to the desired interface and packets routed to the wire like this: # move local rule to lower priority ip rule add pref 1000 lookup local ip rule del pref 0 # route packets of sockets bound to eth0 to the wire independant # of the destination address ip rule add pref 100 oif eth0 lookup 100 ip route add default dev eth0 table 100 Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/fib_rules.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index d1a70ad4b544..ef0e7d9e664b 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -287,7 +287,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); - if (!rule->pref && ops->default_pref) + if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); err = -EINVAL; -- cgit v1.2.2 From 72ad937abd0a43b7cf2c557ba1f2ec75e608c516 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2009 02:29:03 +0000 Subject: net: Add support for batching network namespace cleanups - Add exit_list to struct net to support building lists of network namespaces to cleanup. - Add exit_batch to pernet_operations to allow running operations only once during a network namespace exit. Instead of once per network namespace. - Factor opt ops_exit_list and ops_exit_free so the logic with cleanup up a network namespace does not need to be duplicated. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/net_namespace.c | 122 +++++++++++++++++++++++------------------------ 1 file changed, 61 insertions(+), 61 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9679ad292da9..6c7f6e04dbbf 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -70,6 +70,36 @@ static void ops_free(const struct pernet_operations *ops, struct net *net) } } +static void ops_exit_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + if (ops->exit) { + list_for_each_entry(net, net_exit_list, exit_list) + ops->exit(net); + } + if (&ops->list == first_device) { + LIST_HEAD(dev_kill_list); + rtnl_lock(); + list_for_each_entry(net, net_exit_list, exit_list) + unregister_netdevices(net, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + } + if (ops->exit_batch) + ops->exit_batch(net_exit_list); +} + +static void ops_free_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + if (ops->size && ops->id) { + list_for_each_entry(net, net_exit_list, exit_list) + ops_free(ops, net); + } +} + /* * setup_net runs the initializers for the network namespace object. */ @@ -78,6 +108,7 @@ static __net_init int setup_net(struct net *net) /* Must be called with net_mutex held */ const struct pernet_operations *ops, *saved_ops; int error = 0; + LIST_HEAD(net_exit_list); atomic_set(&net->count, 1); @@ -97,21 +128,14 @@ out_undo: /* Walk through the list backwards calling the exit functions * for the pernet modules whose init functions did not fail. */ + list_add(&net->exit_list, &net_exit_list); saved_ops = ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) { - if (ops->exit) - ops->exit(net); - if (&ops->list == first_device) { - LIST_HEAD(dev_kill_list); - rtnl_lock(); - unregister_netdevices(net, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } - } + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_exit_list(ops, &net_exit_list); + ops = saved_ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_free(ops, net); + ops_free_list(ops, &net_exit_list); rcu_barrier(); goto out; @@ -207,6 +231,7 @@ static void cleanup_net(struct work_struct *work) const struct pernet_operations *ops; struct net *net, *tmp; LIST_HEAD(net_kill_list); + LIST_HEAD(net_exit_list); /* Atomically snapshot the list of namespaces to cleanup */ spin_lock_irq(&cleanup_list_lock); @@ -217,8 +242,10 @@ static void cleanup_net(struct work_struct *work) /* Don't let anyone else find us. */ rtnl_lock(); - list_for_each_entry(net, &net_kill_list, cleanup_list) + list_for_each_entry(net, &net_kill_list, cleanup_list) { list_del_rcu(&net->list); + list_add_tail(&net->exit_list, &net_exit_list); + } rtnl_unlock(); /* @@ -229,27 +256,12 @@ static void cleanup_net(struct work_struct *work) synchronize_rcu(); /* Run all of the network namespace exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->exit) { - list_for_each_entry(net, &net_kill_list, cleanup_list) - ops->exit(net); - } - if (&ops->list == first_device) { - LIST_HEAD(dev_kill_list); - rtnl_lock(); - list_for_each_entry(net, &net_kill_list, cleanup_list) - unregister_netdevices(net, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } - } + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_exit_list(ops, &net_exit_list); + /* Free the net generic variables */ - list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->size && ops->id) { - list_for_each_entry(net, &net_kill_list, cleanup_list) - ops_free(ops, net); - } - } + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_free_list(ops, &net_exit_list); mutex_unlock(&net_mutex); @@ -259,8 +271,8 @@ static void cleanup_net(struct work_struct *work) rcu_barrier(); /* Finally it is safe to free my network namespace structure */ - list_for_each_entry_safe(net, tmp, &net_kill_list, cleanup_list) { - list_del_init(&net->cleanup_list); + list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { + list_del_init(&net->exit_list); net_free(net); } } @@ -348,8 +360,9 @@ pure_initcall(net_ns_init); static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { - struct net *net, *undo_net; + struct net *net; int error; + LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); if (ops->init || (ops->id && ops->size)) { @@ -357,6 +370,7 @@ static int __register_pernet_operations(struct list_head *list, error = ops_init(ops, net); if (error) goto out_undo; + list_add_tail(&net->exit_list, &net_exit_list); } } return 0; @@ -364,36 +378,21 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - if (ops->exit) { - for_each_net(undo_net) { - if (net_eq(undo_net, net)) - goto undone; - ops->exit(undo_net); - } - } -undone: - if (ops->size && ops->id) { - for_each_net(undo_net) { - if (net_eq(undo_net, net)) - goto freed; - ops_free(ops, undo_net); - } - } -freed: + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { struct net *net; + LIST_HEAD(net_exit_list); list_del(&ops->list); - if (ops->exit) - for_each_net(net) - ops->exit(net); - if (ops->id && ops->size) - for_each_net(net) - ops_free(ops, net); + for_each_net(net) + list_add_tail(&net->exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); } #else @@ -411,9 +410,10 @@ static int __register_pernet_operations(struct list_head *list, static void __unregister_pernet_operations(struct pernet_operations *ops) { - if (ops->exit) - ops->exit(&init_net); - ops_free(ops, &init_net); + LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); } #endif /* CONFIG_NET_NS */ -- cgit v1.2.2 From 04dc7f6be3a7b308f8545bb45772c9fb75f71aca Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2009 02:29:04 +0000 Subject: net: Move network device exit batching Move network device exit batching from a special case in net_namespace.c to using common mechanisms in dev.c Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 25 +++++++++++++++++++++++++ net/core/net_namespace.c | 24 ------------------------ 2 files changed, 25 insertions(+), 24 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e3e18dee0bd3..0913a08a87d6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5766,8 +5766,33 @@ static void __net_exit default_device_exit(struct net *net) rtnl_unlock(); } +static void __net_exit default_device_exit_batch(struct list_head *net_list) +{ + /* At exit all network devices most be removed from a network + * namespace. Do this in the reverse order of registeration. + * Do this across as many network namespaces as possible to + * improve batching efficiency. + */ + struct net_device *dev; + struct net *net; + LIST_HEAD(dev_kill_list); + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + for_each_netdev_reverse(net, dev) { + if (dev->rtnl_link_ops) + dev->rtnl_link_ops->dellink(dev, &dev_kill_list); + else + unregister_netdevice_queue(dev, &dev_kill_list); + } + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); +} + static struct pernet_operations __net_initdata default_device_ops = { .exit = default_device_exit, + .exit_batch = default_device_exit_batch, }; /* diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6c7f6e04dbbf..4026a4cff93c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -8,10 +8,8 @@ #include #include #include -#include #include #include -#include /* * Our network namespace constructor/destructor lists @@ -29,20 +27,6 @@ EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ -static void unregister_netdevices(struct net *net, struct list_head *list) -{ - struct net_device *dev; - /* At exit all network devices most be removed from a network - * namespace. Do this in the reverse order of registeration. - */ - for_each_netdev_reverse(net, dev) { - if (dev->rtnl_link_ops) - dev->rtnl_link_ops->dellink(dev, list); - else - unregister_netdevice_queue(dev, list); - } -} - static int ops_init(const struct pernet_operations *ops, struct net *net) { int err; @@ -78,14 +62,6 @@ static void ops_exit_list(const struct pernet_operations *ops, list_for_each_entry(net, net_exit_list, exit_list) ops->exit(net); } - if (&ops->list == first_device) { - LIST_HEAD(dev_kill_list); - rtnl_lock(); - list_for_each_entry(net, net_exit_list, exit_list) - unregister_netdevices(net, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } if (ops->exit_batch) ops->exit_batch(net_exit_list); } -- cgit v1.2.2 From 3a765edadb28cc736d185f67d1ba6bedcc85f4b9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2009 02:29:06 +0000 Subject: netns: Add an explicit rcu_barrier to unregister_pernet_{device|subsys} This allows namespace exit methods to batch work that comes requires an rcu barrier using call_rcu without having to treat the unregister_pernet_operations cases specially. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/net_namespace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 4026a4cff93c..bd8c4712ea24 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -413,8 +413,11 @@ again: } } error = __register_pernet_operations(list, ops); - if (error && ops->id) - ida_remove(&net_generic_ids, *ops->id); + if (error) { + rcu_barrier(); + if (ops->id) + ida_remove(&net_generic_ids, *ops->id); + } return error; } @@ -423,6 +426,7 @@ static void unregister_pernet_operations(struct pernet_operations *ops) { __unregister_pernet_operations(ops); + rcu_barrier(); if (ops->id) ida_remove(&net_generic_ids, *ops->id); } -- cgit v1.2.2 From e9c5158ac26affd5d8ce006521bdfb7148090e18 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2009 12:22:55 -0800 Subject: net: Allow fib_rule_unregister to batch Refactor the code so fib_rules_register always takes a template instead of the actual fib_rules_ops structure that will be used. This is required for network namespace support so 2 out of the 3 callers already do this, it allows the error handling to be made common, and it allows fib_rules_unregister to free the template for hte caller. Modify fib_rules_unregister to use call_rcu instead of syncrhonize_rcu to allw multiple namespaces to be cleaned up in the same rcu grace period. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/fib_rules.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index ef0e7d9e664b..02a3b2c69c1e 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -72,7 +72,7 @@ static void flush_route_cache(struct fib_rules_ops *ops) ops->flush_cache(ops); } -int fib_rules_register(struct fib_rules_ops *ops) +static int __fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; struct fib_rules_ops *o; @@ -102,6 +102,28 @@ errout: return err; } +struct fib_rules_ops * +fib_rules_register(struct fib_rules_ops *tmpl, struct net *net) +{ + struct fib_rules_ops *ops; + int err; + + ops = kmemdup(tmpl, sizeof (*ops), GFP_KERNEL); + if (ops == NULL) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ops->rules_list); + ops->fro_net = net; + + err = __fib_rules_register(ops); + if (err) { + kfree(ops); + ops = ERR_PTR(err); + } + + return ops; +} + EXPORT_SYMBOL_GPL(fib_rules_register); void fib_rules_cleanup_ops(struct fib_rules_ops *ops) @@ -115,6 +137,15 @@ void fib_rules_cleanup_ops(struct fib_rules_ops *ops) } EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops); +static void fib_rules_put_rcu(struct rcu_head *head) +{ + struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu); + struct net *net = ops->fro_net; + + release_net(net); + kfree(ops); +} + void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; @@ -124,8 +155,7 @@ void fib_rules_unregister(struct fib_rules_ops *ops) fib_rules_cleanup_ops(ops); spin_unlock(&net->rules_mod_lock); - synchronize_rcu(); - release_net(net); + call_rcu(&ops->rcu, fib_rules_put_rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); -- cgit v1.2.2 From fc4a7489663250360cd40d5adf06a08d1c5d54df Mon Sep 17 00:00:00 2001 From: Patrick Mullaney Date: Thu, 3 Dec 2009 15:59:22 -0800 Subject: netdevice: provide common routine for macvlan and vlan operstate management Provide common routine for the transition of operational state for a leaf device during a root device transition. Signed-off-by: Patrick Mullaney Acked-by: Arnd Bergmann Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0913a08a87d6..c36a17aafcf3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4900,6 +4900,33 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); +/** + * netif_stacked_transfer_operstate - transfer operstate + * @rootdev: the root or lower level device to transfer state from + * @dev: the device to transfer operstate to + * + * Transfer operational state from root to device. This is normally + * called when a stacking relationship exists between the root + * device and the device(a leaf device). + */ +void netif_stacked_transfer_operstate(const struct net_device *rootdev, + struct net_device *dev) +{ + if (rootdev->operstate == IF_OPER_DORMANT) + netif_dormant_on(dev); + else + netif_dormant_off(dev); + + if (netif_carrier_ok(rootdev)) { + if (!netif_carrier_ok(dev)) + netif_carrier_on(dev); + } else { + if (netif_carrier_ok(dev)) + netif_carrier_off(dev); + } +} +EXPORT_SYMBOL(netif_stacked_transfer_operstate); + /** * register_netdevice - register a network device * @dev: device to register -- cgit v1.2.2 From e93737b0f0159a61772894943199fd3b6f315641 Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Tue, 8 Dec 2009 22:26:02 +0000 Subject: net: Handle NETREG_UNINITIALIZED devices correctly Fix two problems: 1. If unregister_netdevice_many() is called with both registered and unregistered devices, rollback_registered_many() bails out when it reaches the first unregistered device. The processing of the prior registered devices is unfinished, and the remaining devices are skipped, and possible registered netdev's are leaked/unregistered. 2. System hangs or panics depending on how the devices are passed, since when netdev_run_todo() runs, some devices were not fully processed. Tested by passing intermingled unregistered and registered vlan devices to unregister_netdevice_many() as follows: 1. dev, fake_dev1, fake_dev2: hangs in run_todo ("unregister_netdevice: waiting for eth1.100 to become free. Usage count = 1") 2. fake_dev1, dev, fake_dev2: failure during de-registration and next registration, followed by a vlan driver Oops during subsequent registration. Confirmed that the patch fixes both cases. Signed-off-by: Krishna Kumar Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c36a17aafcf3..6fe7d739e59b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4771,21 +4771,23 @@ static void net_set_todo(struct net_device *dev) static void rollback_registered_many(struct list_head *head) { - struct net_device *dev; + struct net_device *dev, *tmp; BUG_ON(dev_boot_phase); ASSERT_RTNL(); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry_safe(dev, tmp, head, unreg_list) { /* Some devices call without registering - * for initialization unwind. + * for initialization unwind. Remove those + * devices and proceed with the remaining. */ if (dev->reg_state == NETREG_UNINITIALIZED) { pr_debug("unregister_netdevice: device %s/%p never " "was registered\n", dev->name, dev); WARN_ON(1); - return; + list_del(&dev->unreg_list); + continue; } BUG_ON(dev->reg_state != NETREG_REGISTERED); -- cgit v1.2.2 From d90a909e1f3e006a1d57fe11fd417173b6494701 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 12 Dec 2009 22:11:15 +0000 Subject: net: Fix userspace RTM_NEWLINK notifications. I received some bug reports about userspace programs having problems because after RTM_NEWLINK was received they could not immediate access files under /proc/sys/net/ because they had not been registered yet. The original problem was trivially fixed by moving the userspace notification from rtnetlink_event() to the end of register_netdevice(). When testing that change I discovered I was still getting RTM_NEWLINK events before I could access proc and I was also getting RTM_NEWLINK events after I was seeing RTM_DELLINK. Things practically guaranteed to confuse userspace. After a little more investigation these extra notifications proved to be from the new notifiers NETDEV_POST_INIT and NETDEV_UNREGISTER_BATCH hitting the default case in rtnetlink_event, and triggering unnecessary RTM_NEWLINK messages. rtnetlink_event now explicitly handles NETDEV_UNREGISTER_BATCH and NETDEV_POST_INIT to avoid sending the incorrect userspace notifications. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 11 +++++++++++ net/core/rtnetlink.c | 6 +++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 6fe7d739e59b..be9924f60ec3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5035,6 +5035,11 @@ int register_netdevice(struct net_device *dev) rollback_registered(dev); dev->reg_state = NETREG_UNREGISTERED; } + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); out: return ret; @@ -5597,6 +5602,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + synchronize_net(); err = 0; out: diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 33148a568199..794bcb897ff0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1364,15 +1364,15 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_UNREGISTER: rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); break; - case NETDEV_REGISTER: - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); - break; case NETDEV_UP: case NETDEV_DOWN: rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); break; + case NETDEV_POST_INIT: + case NETDEV_REGISTER: case NETDEV_CHANGE: case NETDEV_GOING_DOWN: + case NETDEV_UNREGISTER_BATCH: break; default: rtmsg_ifinfo(RTM_NEWLINK, dev, 0); -- cgit v1.2.2 From 28dfef8febe48f59cf1e7596e1992a6a1893ca24 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 15 Dec 2009 16:46:48 -0800 Subject: const: constify remaining pipe_buf_operations Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bfa3e7865a8c..93c4e060c91e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -93,7 +93,7 @@ static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, /* Pipe buffer operations for a socket. */ -static struct pipe_buf_operations sock_pipe_buf_ops = { +static const struct pipe_buf_operations sock_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, -- cgit v1.2.2 From 068a2de57ddf4f472e32e7af868613c574ad1d88 Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Wed, 9 Dec 2009 20:59:58 +0000 Subject: net: release dst entry while cache-hot for GSO case too Non-GSO code drops dst entry for performance reasons, but the same is missing for GSO code. Drop dst while cache-hot for GSO case too. Signed-off-by: Krishna Kumar Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index be9924f60ec3..a8d68cdedbbe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1853,6 +1853,14 @@ gso: skb->next = nskb->next; nskb->next = NULL; + + /* + * If device doesnt need nskb->dst, release it right now while + * its hot in this cpu cache + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(nskb); + rc = ops->ndo_start_xmit(nskb, dev); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) -- cgit v1.2.2 From f466dba1832f05006cf6caa9be41fb98d11cb848 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 23 Dec 2009 22:02:57 -0800 Subject: pktgen: ndo_start_xmit can return NET_XMIT_xxx values This updates pktgen so that it does not decrement skb->users when it receives valid NET_XMIT_xxx values. These are now valid return values from ndo_start_xmit in net-next-2.6. They also indicate the skb has been consumed. This fixes pktgen to work correctly with vlan devices. Signed-off-by: John Fastabend Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/core/pktgen.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index a23b45f08ec9..de0c2c726420 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -250,8 +250,7 @@ struct pktgen_dev { __u64 count; /* Default No packets to send */ __u64 sofar; /* How many pkts we've sent so far */ __u64 tx_bytes; /* How many bytes we've transmitted */ - __u64 errors; /* Errors when trying to transmit, - pkts will be re-sent */ + __u64 errors; /* Errors when trying to transmit, */ /* runtime counters relating to clone_skb */ @@ -3465,6 +3464,12 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->seq_num++; pkt_dev->tx_bytes += pkt_dev->last_pkt_size; break; + case NET_XMIT_DROP: + case NET_XMIT_CN: + case NET_XMIT_POLICED: + /* skb has been consumed */ + pkt_dev->errors++; + break; default: /* Drivers are not supposed to return other values! */ if (net_ratelimit()) pr_info("pktgen: %s xmit error: %d\n", -- cgit v1.2.2 From 1f3c8804acba841b5573b953f5560d2683d2db0d Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Mon, 14 Dec 2009 10:48:58 +0000 Subject: bonding: allow arp_ip_targets on separate vlans to use arp validation This allows a bond device to specify an arp_ip_target as a host that is not on the same vlan as the base bond device and still use arp validation. A configuration like this, now works: BONDING_OPTS="mode=active-backup arp_interval=1000 arp_ip_target=10.0.100.1 arp_validate=3" 1: lo: mtu 16436 qdisc noqueue link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 127.0.0.1/8 scope host lo inet6 ::1/128 scope host valid_lft forever preferred_lft forever 2: eth1: mtu 1500 qdisc pfifo_fast master bond0 qlen 1000 link/ether 00:13:21:be:33:e9 brd ff:ff:ff:ff:ff:ff 3: eth0: mtu 1500 qdisc pfifo_fast master bond0 qlen 1000 link/ether 00:13:21:be:33:e9 brd ff:ff:ff:ff:ff:ff 8: bond0: mtu 1500 qdisc noqueue link/ether 00:13:21:be:33:e9 brd ff:ff:ff:ff:ff:ff inet6 fe80::213:21ff:febe:33e9/64 scope link valid_lft forever preferred_lft forever 9: bond0.100@bond0: mtu 1500 qdisc noqueue link/ether 00:13:21:be:33:e9 brd ff:ff:ff:ff:ff:ff inet 10.0.100.2/24 brd 10.0.100.255 scope global bond0.100 inet6 fe80::213:21ff:febe:33e9/64 scope link valid_lft forever preferred_lft forever Ethernet Channel Bonding Driver: v3.6.0 (September 26, 2009) Bonding Mode: fault-tolerance (active-backup) Primary Slave: None Currently Active Slave: eth1 MII Status: up MII Polling Interval (ms): 0 Up Delay (ms): 0 Down Delay (ms): 0 ARP Polling Interval (ms): 1000 ARP IP target/s (n.n.n.n form): 10.0.100.1 Slave Interface: eth1 MII Status: up Link Failure Count: 1 Permanent HW addr: 00:40:05:30:ff:30 Slave Interface: eth0 MII Status: up Link Failure Count: 0 Permanent HW addr: 00:13:21:be:33:e9 Signed-off-by: Andy Gospodarek Signed-off-by: Jay Vosburgh Signed-off-by: David S. Miller --- net/core/dev.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a8d68cdedbbe..f9aa699ab6cb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2495,12 +2495,26 @@ ncls: if (!skb) goto out; + /* + * Make sure frames received on VLAN interfaces stacked on + * bonding interfaces still make their way to any base bonding + * device that may have registered for a specific ptype. The + * handler may have to adjust skb->dev and orig_dev. + * + * null_or_orig can be overloaded since it will not be set when + * using VLANs on top of bonding. Putting it here prevents + * disturbing the ptype_all handlers above. + */ + if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && + (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { + null_or_orig = vlan_dev_real_dev(skb->dev); + } + type = skb->protocol; list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && - (ptype->dev == null_or_orig || ptype->dev == skb->dev || - ptype->dev == orig_dev)) { + if (ptype->type == type && (ptype->dev == null_or_orig || + ptype->dev == skb->dev || ptype->dev == orig_dev)) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; -- cgit v1.2.2 From ca8d9ea30bc79b2965a1d169dcb2f48f02af4d2d Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Wed, 6 Jan 2010 12:56:37 +0000 Subject: fix bonding: allow arp_ip_targets on separate vlans to use arp validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Wed, Jan 06, 2010 at 10:10:03PM +0100, Eric Dumazet wrote: > Le 06/01/2010 19:38, Eric Dumazet a écrit : > > > > (net-next-2.6 doesnt work well on my bond/vlan setup, I suspect I need a bisection) > > David, I had to revert 1f3c8804acba841b5573b953f5560d2683d2db0d > (bonding: allow arp_ip_targets on separate vlans to use arp validation) > > Or else, my vlan devices dont work (unfortunatly I dont have much time > these days to debug the thing) > > My config : > > +---------+ > vlan.103 -----+ bond0 +--- eth1 (bnx2) > | + > vlan.825 -----+ +--- eth2 (tg3) > +---------+ > > $ cat /proc/net/bonding/bond0 > Ethernet Channel Bonding Driver: v3.6.0 (September 26, 2009) > > Bonding Mode: fault-tolerance (active-backup) > Primary Slave: None > Currently Active Slave: eth2 > MII Status: up > MII Polling Interval (ms): 100 > Up Delay (ms): 0 > Down Delay (ms): 0 > > Slave Interface: eth1 (bnx2) > MII Status: down > Link Failure Count: 1 > Permanent HW addr: 00:1e:0b:ec:d3:d2 > > Slave Interface: eth2 (tg3) > MII Status: up > Link Failure Count: 0 > Permanent HW addr: 00:1e:0b:92:78:50 > This patch fixes up a problem with found with commit 1f3c8804acba841b5573b953f5560d2683d2db0d. The original change overloaded null_or_orig, but doing that prevented any packet handlers that were not tied to a specific device (i.e. ptype->dev == NULL) from ever receiving any frames. The null_or_orig variable cannot be overloaded, and must be kept as NULL to prevent the frame from being ignored by packet handlers designed to accept frames on any interface. Signed-off-by: Andy Gospodarek Signed-off-by: Jay Vosburgh Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f9aa699ab6cb..d9ab9be0c323 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2430,6 +2430,7 @@ int netif_receive_skb(struct sk_buff *skb) struct packet_type *ptype, *pt_prev; struct net_device *orig_dev; struct net_device *null_or_orig; + struct net_device *null_or_bond; int ret = NET_RX_DROP; __be16 type; @@ -2500,21 +2501,19 @@ ncls: * bonding interfaces still make their way to any base bonding * device that may have registered for a specific ptype. The * handler may have to adjust skb->dev and orig_dev. - * - * null_or_orig can be overloaded since it will not be set when - * using VLANs on top of bonding. Putting it here prevents - * disturbing the ptype_all handlers above. */ + null_or_bond = NULL; if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { - null_or_orig = vlan_dev_real_dev(skb->dev); + null_or_bond = vlan_dev_real_dev(skb->dev); } type = skb->protocol; list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { if (ptype->type == type && (ptype->dev == null_or_orig || - ptype->dev == skb->dev || ptype->dev == orig_dev)) { + ptype->dev == skb->dev || ptype->dev == orig_dev || + ptype->dev == null_or_bond)) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; -- cgit v1.2.2 From 2d13bafeba24f732e89b818b8c66b07893457570 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 5 Jan 2010 05:50:52 +0000 Subject: net: Make it easier to parse /proc/net/dev contents. The contents of /proc/net/dev is annoying to parse, because it changes whether there is a space after the "ethX:" or not. It depends upon the size of the "Receive bytes" counter, if the number is below 7 digits, then there is whitespaces else if the number is 8 digits or above there is no space between the ":" and the number. This patch changes the output to assure there is always a space between the ":" and the number. Given that all existing userspace application already need to handle the whitespaces, I see no breakage of existing tools. Signed-off-by: Jesper Dangaard Brouer Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d9ab9be0c323..a008f6987a95 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3206,7 +3206,7 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { const struct net_device_stats *stats = dev_get_stats(dev); - seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " + seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, -- cgit v1.2.2 From 704da560c0a0120d8869187f511491a00951a1d3 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Fri, 8 Jan 2010 00:00:09 -0800 Subject: tcp: update the netstamp_needed counter when cloning sockets This fixes a netstamp_needed accounting issue when the listen socket has SO_TIMESTAMP set: s = socket(AF_INET, SOCK_STREAM, 0); setsockopt(s, SOL_SOCKET, SO_TIMESTAMP, 1); -> netstamp_needed = 1 bind(s, ...); listen(s, ...); s2 = accept(s, ...); -> netstamp_needed = 1 close(s2); -> netstamp_needed = 0 close(s); -> netstamp_needed = -1 Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/core/sock.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 76ff58d43e26..e1f6f225f012 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1205,6 +1205,10 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) if (newsk->sk_prot->sockets_allocated) percpu_counter_inc(newsk->sk_prot->sockets_allocated); + + if (sock_flag(newsk, SOCK_TIMESTAMP) || + sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE)) + net_enable_timestamp(); } out: return newsk; -- cgit v1.2.2 From 508e14b4a4fb1a824a14f2c5b8d7df67b313f8e4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Jan 2010 14:27:30 +0000 Subject: netpoll: allow execution of multiple rx_hooks per interface Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/netpoll.c | 169 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 106 insertions(+), 63 deletions(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 0b4d0d35ef40..7aa697253765 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -407,11 +407,24 @@ static void arp_reply(struct sk_buff *skb) __be32 sip, tip; unsigned char *sha; struct sk_buff *send_skb; - struct netpoll *np = NULL; + struct netpoll *np, *tmp; + unsigned long flags; + int hits = 0; + + if (list_empty(&npinfo->rx_np)) + return; + + /* Before checking the packet, we do some early + inspection whether this is interesting at all */ + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->dev == skb->dev) + hits++; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); - if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev) - np = npinfo->rx_np; - if (!np) + /* No netpoll struct is using this dev */ + if (!hits) return; /* No arp on this interface */ @@ -437,77 +450,91 @@ static void arp_reply(struct sk_buff *skb) arp_ptr += skb->dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; - /* if we actually cared about dst hw addr, it would get copied here */ + /* If we actually cared about dst hw addr, + it would get copied here */ arp_ptr += skb->dev->addr_len; memcpy(&tip, arp_ptr, 4); /* Should we ignore arp? */ - if (tip != np->local_ip || - ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) + if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) return; size = arp_hdr_len(skb->dev); - send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), - LL_RESERVED_SPACE(np->dev)); - if (!send_skb) - return; - - skb_reset_network_header(send_skb); - arp = (struct arphdr *) skb_put(send_skb, size); - send_skb->dev = skb->dev; - send_skb->protocol = htons(ETH_P_ARP); + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (tip != np->local_ip) + continue; - /* Fill the device header for the ARP frame */ - if (dev_hard_header(send_skb, skb->dev, ptype, - sha, np->dev->dev_addr, - send_skb->len) < 0) { - kfree_skb(send_skb); - return; - } + send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), + LL_RESERVED_SPACE(np->dev)); + if (!send_skb) + continue; - /* - * Fill out the arp protocol part. - * - * we only support ethernet device type, - * which (according to RFC 1390) should always equal 1 (Ethernet). - */ + skb_reset_network_header(send_skb); + arp = (struct arphdr *) skb_put(send_skb, size); + send_skb->dev = skb->dev; + send_skb->protocol = htons(ETH_P_ARP); - arp->ar_hrd = htons(np->dev->type); - arp->ar_pro = htons(ETH_P_IP); - arp->ar_hln = np->dev->addr_len; - arp->ar_pln = 4; - arp->ar_op = htons(type); + /* Fill the device header for the ARP frame */ + if (dev_hard_header(send_skb, skb->dev, ptype, + sha, np->dev->dev_addr, + send_skb->len) < 0) { + kfree_skb(send_skb); + continue; + } - arp_ptr=(unsigned char *)(arp + 1); - memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &tip, 4); - arp_ptr += 4; - memcpy(arp_ptr, sha, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &sip, 4); + /* + * Fill out the arp protocol part. + * + * we only support ethernet device type, + * which (according to RFC 1390) should + * always equal 1 (Ethernet). + */ - netpoll_send_skb(np, send_skb); + arp->ar_hrd = htons(np->dev->type); + arp->ar_pro = htons(ETH_P_IP); + arp->ar_hln = np->dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr = (unsigned char *)(arp + 1); + memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &tip, 4); + arp_ptr += 4; + memcpy(arp_ptr, sha, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &sip, 4); + + netpoll_send_skb(np, send_skb); + + /* If there are several rx_hooks for the same address, + we're fine by sending a single reply */ + break; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); } int __netpoll_rx(struct sk_buff *skb) { int proto, len, ulen; + int hits = 0; struct iphdr *iph; struct udphdr *uh; - struct netpoll_info *npi = skb->dev->npinfo; - struct netpoll *np = npi->rx_np; + struct netpoll_info *npinfo = skb->dev->npinfo; + struct netpoll *np, *tmp; - if (!np) + if (list_empty(&npinfo->rx_np)) goto out; + if (skb->dev->type != ARPHRD_ETHER) goto out; /* check if netpoll clients need ARP */ if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) { - skb_queue_tail(&npi->arp_tx, skb); + skb_queue_tail(&npinfo->arp_tx, skb); return 1; } @@ -551,16 +578,23 @@ int __netpoll_rx(struct sk_buff *skb) goto out; if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) goto out; - if (np->local_ip && np->local_ip != iph->daddr) - goto out; - if (np->remote_ip && np->remote_ip != iph->saddr) - goto out; - if (np->local_port && np->local_port != ntohs(uh->dest)) - goto out; - np->rx_hook(np, ntohs(uh->source), - (char *)(uh+1), - ulen - sizeof(struct udphdr)); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->local_ip && np->local_ip != iph->daddr) + continue; + if (np->remote_ip && np->remote_ip != iph->saddr) + continue; + if (np->local_port && np->local_port != ntohs(uh->dest)) + continue; + + np->rx_hook(np, ntohs(uh->source), + (char *)(uh+1), + ulen - sizeof(struct udphdr)); + hits++; + } + + if (!hits) + goto out; kfree_skb(skb); return 1; @@ -684,6 +718,7 @@ int netpoll_setup(struct netpoll *np) struct net_device *ndev = NULL; struct in_device *in_dev; struct netpoll_info *npinfo; + struct netpoll *npe, *tmp; unsigned long flags; int err; @@ -704,7 +739,7 @@ int netpoll_setup(struct netpoll *np) } npinfo->rx_flags = 0; - npinfo->rx_np = NULL; + INIT_LIST_HEAD(&npinfo->rx_np); spin_lock_init(&npinfo->rx_lock); skb_queue_head_init(&npinfo->arp_tx); @@ -785,7 +820,7 @@ int netpoll_setup(struct netpoll *np) if (np->rx_hook) { spin_lock_irqsave(&npinfo->rx_lock, flags); npinfo->rx_flags |= NETPOLL_RX_ENABLED; - npinfo->rx_np = np; + list_add_tail(&np->rx, &npinfo->rx_np); spin_unlock_irqrestore(&npinfo->rx_lock, flags); } @@ -801,9 +836,16 @@ int netpoll_setup(struct netpoll *np) return 0; release: - if (!ndev->npinfo) + if (!ndev->npinfo) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(npe, tmp, &npinfo->rx_np, rx) { + npe->dev = NULL; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + kfree(npinfo); - np->dev = NULL; + } + dev_put(ndev); return err; } @@ -823,10 +865,11 @@ void netpoll_cleanup(struct netpoll *np) if (np->dev) { npinfo = np->dev->npinfo; if (npinfo) { - if (npinfo->rx_np == np) { + if (!list_empty(&npinfo->rx_np)) { spin_lock_irqsave(&npinfo->rx_lock, flags); - npinfo->rx_np = NULL; - npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; + list_del(&np->rx); + if (list_empty(&npinfo->rx_np)) + npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; spin_unlock_irqrestore(&npinfo->rx_lock, flags); } -- cgit v1.2.2 From 4d0392be21d4710121f855c0caf57d32ffd1ced0 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 15 Jan 2010 01:08:58 -0800 Subject: net/core/sock.c: quiet sparse noise In sock_getsockopt the symbol 'lv' is declared as an unsigned int type, probably due to sizeof returning a size_t which is really an unsigned int. This produces a sparse warning for SO_PEERNAME due to the sock->ops->getname() call: warning: incorrect type in argument 3 (different signedness) expected int *sockaddr_len got unsigned int * Quiet the warning by changing the type of 'lv' to an int. Signed-off-by: H Hartley Sweeten Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index e1f6f225f012..10b1d3243a72 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -741,7 +741,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, struct timeval tm; } v; - unsigned int lv = sizeof(int); + int lv = sizeof(int); int len; if (get_user(len, optlen)) -- cgit v1.2.2 From 2c8c1e7297e19bdef3c178c3ea41d898a7716e3e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 17 Jan 2010 03:35:32 +0000 Subject: net: spread __net_init, __net_exit __net_init/__net_exit are apparently not going away, so use them to full extent. In some cases __net_init was removed, because it was called from __net_exit code. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/core/fib_rules.c | 2 +- net/core/rtnetlink.c | 4 ++-- net/core/sock.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 02a3b2c69c1e..9a24377146bf 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -708,7 +708,7 @@ static struct notifier_block fib_rules_notifier = { .notifier_call = fib_rules_event, }; -static int fib_rules_net_init(struct net *net) +static int __net_init fib_rules_net_init(struct net *net) { INIT_LIST_HEAD(&net->rules_ops); spin_lock_init(&net->rules_mod_lock); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 794bcb897ff0..62f3878a6010 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1386,7 +1386,7 @@ static struct notifier_block rtnetlink_dev_notifier = { }; -static int rtnetlink_net_init(struct net *net) +static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, @@ -1397,7 +1397,7 @@ static int rtnetlink_net_init(struct net *net) return 0; } -static void rtnetlink_net_exit(struct net *net) +static void __net_exit rtnetlink_net_exit(struct net *net) { netlink_kernel_release(net->rtnl); net->rtnl = NULL; diff --git a/net/core/sock.c b/net/core/sock.c index 10b1d3243a72..ceef50bd131b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2140,13 +2140,13 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot) } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); -static int sock_inuse_init_net(struct net *net) +static int __net_init sock_inuse_init_net(struct net *net) { net->core.inuse = alloc_percpu(struct prot_inuse); return net->core.inuse ? 0 : -ENOMEM; } -static void sock_inuse_exit_net(struct net *net) +static void __net_exit sock_inuse_exit_net(struct net *net) { free_percpu(net->core.inuse); } -- cgit v1.2.2 From 11380a4b2d86fae9a6bce75c9373668cc323fe57 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 19 Jan 2010 13:46:10 -0800 Subject: net: Unexport napi_gro_flush(). Nothing outside of net/core/dev.c uses it. Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a008f6987a95..5747b9edc1bb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2582,7 +2582,7 @@ out: return netif_receive_skb(skb); } -void napi_gro_flush(struct napi_struct *napi) +static void napi_gro_flush(struct napi_struct *napi) { struct sk_buff *skb, *next; @@ -2595,7 +2595,6 @@ void napi_gro_flush(struct napi_struct *napi) napi->gro_count = 0; napi->gro_list = NULL; } -EXPORT_SYMBOL(napi_gro_flush); enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { -- cgit v1.2.2 From 4b258461c0b31ded170a1a56b944b0fded1c887b Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Thu, 21 Jan 2010 01:26:29 -0800 Subject: net: Optimize non-gso test checks Avoid checking twice whether skb needs to be linearized, if one skb_linearize was already done. Signed-off-by: Krishna Kumar Signed-off-by: David S. Miller --- net/core/dev.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5747b9edc1bb..4fad9db417b1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1982,6 +1982,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } +/* + * Returns true if either: + * 1. skb has frag_list and the device doesn't support FRAGLIST, or + * 2. skb is fragmented and the device does not support SG, or if + * at least one of fragments is in highmem and device does not + * support DMA from it. + */ +static inline int skb_needs_linearize(struct sk_buff *skb, + struct net_device *dev) +{ + return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) || + (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || + illegal_highdma(dev, skb))); +} + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -2018,18 +2033,8 @@ int dev_queue_xmit(struct sk_buff *skb) if (netif_needs_gso(dev, skb)) goto gso; - if (skb_has_frags(skb) && - !(dev->features & NETIF_F_FRAGLIST) && - __skb_linearize(skb)) - goto out_kfree_skb; - - /* Fragmented skb is linearized if device does not support SG, - * or if at least one of fragments is in highmem and device - * does not support DMA from it. - */ - if (skb_shinfo(skb)->nr_frags && - (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && - __skb_linearize(skb)) + /* Convert a paged skb to linear, if required */ + if (skb_needs_linearize(skb, dev) && __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not support -- cgit v1.2.2 From 81c1ebfc4379f529b001e23164dd5c2282bdc0ec Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 22 Jan 2010 10:16:05 +0000 Subject: neigh: simplify seq_file code Simpily pass 'struct neigh_table' with seq_file private pointer, and save one dereference. Proc entry itself isn't interesting. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/core/neighbour.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f35377b643e4..f2efd72da799 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2417,8 +2417,7 @@ EXPORT_SYMBOL(neigh_seq_stop); static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { - struct proc_dir_entry *pde = seq->private; - struct neigh_table *tbl = pde->data; + struct neigh_table *tbl = seq->private; int cpu; if (*pos == 0) @@ -2435,8 +2434,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct proc_dir_entry *pde = seq->private; - struct neigh_table *tbl = pde->data; + struct neigh_table *tbl = seq->private; int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { @@ -2455,8 +2453,7 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v) static int neigh_stat_seq_show(struct seq_file *seq, void *v) { - struct proc_dir_entry *pde = seq->private; - struct neigh_table *tbl = pde->data; + struct neigh_table *tbl = seq->private; struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { @@ -2501,7 +2498,7 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - sf->private = PDE(inode); + sf->private = PDE(inode)->data; } return ret; }; -- cgit v1.2.2 From 32e7bfc41110bc8f29ec0f293c3bcee6645fef34 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Jan 2010 13:36:10 -0800 Subject: net: use helpers to access uc list V2 This patch introduces three macros to work with uc list from net drivers. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4fad9db417b1..2cba5c521e56 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3665,10 +3665,10 @@ void __dev_set_rx_mode(struct net_device *dev) /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ - if (dev->uc.count > 0 && !dev->uc_promisc) { + if (!netdev_uc_empty(dev) && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1); dev->uc_promisc = 1; - } else if (dev->uc.count == 0 && dev->uc_promisc) { + } else if (netdev_uc_empty(dev) && dev->uc_promisc) { __dev_set_promiscuity(dev, -1); dev->uc_promisc = 0; } -- cgit v1.2.2 From 8a83a00b0735190384a348156837918271034144 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 30 Jan 2010 12:23:03 +0000 Subject: net: maintain namespace isolation between vlan and real device In the vlan and macvlan drivers, the start_xmit function forwards data to the dev_queue_xmit function for another device, which may potentially belong to a different namespace. To make sure that classification stays within a single namespace, this resets the potentially critical fields. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/core/dev.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 2cba5c521e56..94c1eeed25e5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1448,13 +1448,10 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) if (skb->len > (dev->mtu + dev->hard_header_len)) return NET_RX_DROP; - skb_dst_drop(skb); + skb_set_dev(skb, dev); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, dev); - skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); return netif_rx(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); @@ -1614,6 +1611,36 @@ static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) return false; } +/** + * skb_dev_set -- assign a new device to a buffer + * @skb: buffer for the new device + * @dev: network device + * + * If an skb is owned by a device already, we have to reset + * all data private to the namespace a device belongs to + * before assigning it a new device. + */ +#ifdef CONFIG_NET_NS +void skb_set_dev(struct sk_buff *skb, struct net_device *dev) +{ + skb_dst_drop(skb); + if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) { + secpath_reset(skb); + nf_reset(skb); + skb_init_secmark(skb); + skb->mark = 0; + skb->priority = 0; + skb->nf_trace = 0; + skb->ipvs_property = 0; +#ifdef CONFIG_NET_SCHED + skb->tc_index = 0; +#endif + } + skb->dev = dev; +} +EXPORT_SYMBOL(skb_set_dev); +#endif /* CONFIG_NET_NS */ + /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. -- cgit v1.2.2 From 1b3f720bf033fde1fbb6231f9b156b918c5f68d8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 4 Feb 2010 14:00:41 -0800 Subject: pktgen: Fix freezing problem Add missing try_to_freeze() to one of the pktgen_thread_worker() code paths so that it doesn't block suspend/hibernation. Fixes http://bugzilla.kernel.org/show_bug.cgi?id=15006 Signed-off-by: Rafael J. Wysocki Reported-and-tested-by: Ciprian Dorin Craciun Signed-off-by: David S. Miller --- net/core/pktgen.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index de0c2c726420..2e692afdc55d 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3524,6 +3524,7 @@ static int pktgen_thread_worker(void *arg) wait_event_interruptible_timeout(t->queue, t->control != 0, HZ/10); + try_to_freeze(); continue; } -- cgit v1.2.2 From 2fc1b5dd99f66d93ffc23fd8df82d384c1a354c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Feb 2010 15:00:39 -0800 Subject: dst: call cond_resched() in dst_gc_task() Kernel bugzilla #15239 On some workloads, it is quite possible to get a huge dst list to process in dst_gc_task(), and trigger soft lockup detection. Fix is to call cond_resched(), as we run in process context. Reported-by: Pawel Staszewski Tested-by: Pawel Staszewski Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dst.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index 57bc4d5b8d08..cb1b3488b739 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -79,6 +80,7 @@ loop: while ((dst = next) != NULL) { next = dst->next; prefetch(&next->next); + cond_resched(); if (likely(atomic_read(&dst->__refcnt))) { last->next = dst; last = dst; -- cgit v1.2.2 From 15682bc488d4af8c9bb998844a94281025e0a333 Mon Sep 17 00:00:00 2001 From: Peter P Waskiewicz Jr Date: Wed, 10 Feb 2010 20:03:05 -0800 Subject: ethtool: Introduce n-tuple filter programming support This patchset enables the ethtool layer to program n-tuple filters to an underlying device. The idea is to allow capable hardware to have static rules applied that can assist steering flows into appropriate queues. Hardware that is known to support these types of filters today are ixgbe and niu. Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/core/dev.c | 5 + net/core/ethtool.c | 329 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 333 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 94c1eeed25e5..ae75f25ac0a5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5419,6 +5419,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, netdev_init_queues(dev); + INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); + dev->ethtool_ntuple_list.count = 0; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->link_watch_list); @@ -5455,6 +5457,9 @@ void free_netdev(struct net_device *dev) /* Flush device addresses */ dev_addr_flush(dev); + /* Clear ethtool n-tuple list */ + ethtool_ntuple_flush(dev); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d8aee584e8d1..6ec73d3983a3 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -120,7 +120,7 @@ int ethtool_op_set_ufo(struct net_device *dev, u32 data) * NETIF_F_xxx values in include/linux/netdevice.h */ static const u32 flags_dup_features = - ETH_FLAG_LRO; + (ETH_FLAG_LRO | ETH_FLAG_NTUPLE); u32 ethtool_op_get_flags(struct net_device *dev) { @@ -139,9 +139,26 @@ int ethtool_op_set_flags(struct net_device *dev, u32 data) else dev->features &= ~NETIF_F_LRO; + if (data & ETH_FLAG_NTUPLE) + dev->features |= NETIF_F_NTUPLE; + else + dev->features &= ~NETIF_F_NTUPLE; + return 0; } +void ethtool_ntuple_flush(struct net_device *dev) +{ + struct ethtool_rx_ntuple_flow_spec_container *fsc, *f; + + list_for_each_entry_safe(fsc, f, &dev->ethtool_ntuple_list.list, list) { + list_del(&fsc->list); + kfree(fsc); + } + dev->ethtool_ntuple_list.count = 0; +} +EXPORT_SYMBOL(ethtool_ntuple_flush); + /* Handlers for each ethtool command */ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) @@ -266,6 +283,307 @@ err_out: return ret; } +static int __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, + struct ethtool_rx_ntuple_flow_spec *spec) +{ + struct ethtool_rx_ntuple_flow_spec_container *fsc; + + /* don't add filters forever */ + if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY) + return 0; + + fsc = kmalloc(sizeof(*fsc), GFP_ATOMIC); + if (!fsc) + return -ENOMEM; + + /* Copy the whole filter over */ + fsc->fs.flow_type = spec->flow_type; + memcpy(&fsc->fs.h_u, &spec->h_u, sizeof(spec->h_u)); + memcpy(&fsc->fs.m_u, &spec->m_u, sizeof(spec->m_u)); + + fsc->fs.vlan_tag = spec->vlan_tag; + fsc->fs.vlan_tag_mask = spec->vlan_tag_mask; + fsc->fs.data = spec->data; + fsc->fs.data_mask = spec->data_mask; + fsc->fs.action = spec->action; + + /* add to the list */ + list_add_tail_rcu(&fsc->list, &list->list); + list->count++; + + return 0; +} + +static int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_rx_ntuple cmd; + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + + if (!ops->set_rx_ntuple) + return -EOPNOTSUPP; + + if (!(dev->features & NETIF_F_NTUPLE)) + return -EINVAL; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + ret = ops->set_rx_ntuple(dev, &cmd); + + /* + * Cache filter in dev struct for GET operation only if + * the underlying driver doesn't have its own GET operation, and + * only if the filter was added successfully. + */ + if (!ops->get_rx_ntuple && !ret) + if (__rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &cmd.fs)) + return -ENOMEM; + + return ret; +} + +static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gstrings gstrings; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rx_ntuple_flow_spec_container *fsc; + u8 *data; + char *p; + int ret, i, num_strings = 0; + + if (!ops->get_sset_count) + return -EOPNOTSUPP; + + if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) + return -EFAULT; + + ret = ops->get_sset_count(dev, gstrings.string_set); + if (ret < 0) + return ret; + + gstrings.len = ret; + + data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + if (!data) + return -ENOMEM; + + if (ops->get_rx_ntuple) { + /* driver-specific filter grab */ + ret = ops->get_rx_ntuple(dev, gstrings.string_set, data); + goto copy; + } + + /* default ethtool filter grab */ + i = 0; + p = (char *)data; + list_for_each_entry(fsc, &dev->ethtool_ntuple_list.list, list) { + sprintf(p, "Filter %d:\n", i); + p += ETH_GSTRING_LEN; + num_strings++; + + switch (fsc->fs.flow_type) { + case TCP_V4_FLOW: + sprintf(p, "\tFlow Type: TCP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case UDP_V4_FLOW: + sprintf(p, "\tFlow Type: UDP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case SCTP_V4_FLOW: + sprintf(p, "\tFlow Type: SCTP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case AH_ESP_V4_FLOW: + sprintf(p, "\tFlow Type: AH ESP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case ESP_V4_FLOW: + sprintf(p, "\tFlow Type: ESP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case IP_USER_FLOW: + sprintf(p, "\tFlow Type: Raw IP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case IPV4_FLOW: + sprintf(p, "\tFlow Type: IPv4\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + default: + sprintf(p, "\tFlow Type: Unknown\n"); + p += ETH_GSTRING_LEN; + num_strings++; + goto unknown_filter; + }; + + /* now the rest of the filters */ + switch (fsc->fs.flow_type) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + sprintf(p, "\tSrc IP addr: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc IP mask: 0x%x\n", + fsc->fs.m_u.tcp_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP addr: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP mask: 0x%x\n", + fsc->fs.m_u.tcp_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc Port: %d, mask: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.psrc, + fsc->fs.m_u.tcp_ip4_spec.psrc); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest Port: %d, mask: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.pdst, + fsc->fs.m_u.tcp_ip4_spec.pdst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tTOS: %d, mask: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.tos, + fsc->fs.m_u.tcp_ip4_spec.tos); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case AH_ESP_V4_FLOW: + case ESP_V4_FLOW: + sprintf(p, "\tSrc IP addr: 0x%x\n", + fsc->fs.h_u.ah_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc IP mask: 0x%x\n", + fsc->fs.m_u.ah_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP addr: 0x%x\n", + fsc->fs.h_u.ah_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP mask: 0x%x\n", + fsc->fs.m_u.ah_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSPI: %d, mask: 0x%x\n", + fsc->fs.h_u.ah_ip4_spec.spi, + fsc->fs.m_u.ah_ip4_spec.spi); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tTOS: %d, mask: 0x%x\n", + fsc->fs.h_u.ah_ip4_spec.tos, + fsc->fs.m_u.ah_ip4_spec.tos); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case IP_USER_FLOW: + sprintf(p, "\tSrc IP addr: 0x%x\n", + fsc->fs.h_u.raw_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc IP mask: 0x%x\n", + fsc->fs.m_u.raw_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP addr: 0x%x\n", + fsc->fs.h_u.raw_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP mask: 0x%x\n", + fsc->fs.m_u.raw_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case IPV4_FLOW: + sprintf(p, "\tSrc IP addr: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc IP mask: 0x%x\n", + fsc->fs.m_u.usr_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP addr: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP mask: 0x%x\n", + fsc->fs.m_u.usr_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tL4 bytes: 0x%x, mask: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.l4_4_bytes, + fsc->fs.m_u.usr_ip4_spec.l4_4_bytes); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tTOS: %d, mask: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.tos, + fsc->fs.m_u.usr_ip4_spec.tos); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tIP Version: %d, mask: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip_ver, + fsc->fs.m_u.usr_ip4_spec.ip_ver); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tProtocol: %d, mask: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.proto, + fsc->fs.m_u.usr_ip4_spec.proto); + p += ETH_GSTRING_LEN; + num_strings++; + break; + }; + sprintf(p, "\tVLAN: %d, mask: 0x%x\n", + fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tUser-defined: 0x%Lx\n", fsc->fs.data); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tUser-defined mask: 0x%Lx\n", fsc->fs.data_mask); + p += ETH_GSTRING_LEN; + num_strings++; + if (fsc->fs.action == ETHTOOL_RXNTUPLE_ACTION_DROP) + sprintf(p, "\tAction: Drop\n"); + else + sprintf(p, "\tAction: Direct to queue %d\n", + fsc->fs.action); + p += ETH_GSTRING_LEN; + num_strings++; +unknown_filter: + i++; + } +copy: + /* indicate to userspace how many strings we actually have */ + gstrings.len = num_strings; + ret = -EFAULT; + if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) + goto out; + useraddr += sizeof(gstrings); + if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) { struct ethtool_regs regs; @@ -313,6 +631,9 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr) if (copy_from_user(&reset, useraddr, sizeof(reset))) return -EFAULT; + /* Clear ethtool n-tuple list */ + ethtool_ntuple_flush(dev); + ret = dev->ethtool_ops->reset(dev, &reset.data); if (ret) return ret; @@ -1112,6 +1433,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_RESET: rc = ethtool_reset(dev, useraddr); break; + case ETHTOOL_SRXNTUPLE: + rc = ethtool_set_rx_ntuple(dev, useraddr); + break; + case ETHTOOL_GRXNTUPLE: + rc = ethtool_get_rx_ntuple(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.2 From 8e5574211d96c0552f84c757718475fdb4021be7 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 11 Feb 2010 12:14:23 -0800 Subject: ethtool: Use explicit designated initializers for .cmd Initialize the .cmd member of various ethtool using a designated struct initializer rather. This makes things a teeny bit more robust, although the chance of a struct layout changing is extremely remote, and also makes the code a little easier to read. Signed-off-by: Roland Dreier Signed-off-by: David S. Miller --- net/core/ethtool.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 6ec73d3983a3..a1280f643bf4 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -163,7 +163,7 @@ EXPORT_SYMBOL(ethtool_ntuple_flush); static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) { - struct ethtool_cmd cmd = { ETHTOOL_GSET }; + struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET }; int err; if (!dev->ethtool_ops->get_settings) @@ -645,7 +645,7 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr) static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) { - struct ethtool_wolinfo wol = { ETHTOOL_GWOL }; + struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; if (!dev->ethtool_ops->get_wol) return -EOPNOTSUPP; @@ -779,7 +779,7 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) { - struct ethtool_coalesce coalesce = { ETHTOOL_GCOALESCE }; + struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; @@ -806,7 +806,7 @@ static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) { - struct ethtool_ringparam ringparam = { ETHTOOL_GRINGPARAM }; + struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM }; if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; @@ -1160,7 +1160,7 @@ static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) static int ethtool_get_value(struct net_device *dev, char __user *useraddr, u32 cmd, u32 (*actor)(struct net_device *)) { - struct ethtool_value edata = { cmd }; + struct ethtool_value edata = { .cmd = cmd }; if (!actor) return -EOPNOTSUPP; -- cgit v1.2.2 From 4cd24eaf0c6ee7f0242e34ee77ec899f255e66b5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 8 Feb 2010 04:30:35 +0000 Subject: net: use netdev_mc_count and netdev_mc_empty when appropriate This patch replaces dev->mc_count in all drivers (hopefully I didn't miss anything). Used spatch and did small tweaks and conding style changes when it was suitable. Jirka Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ae75f25ac0a5..d1cf53d0d597 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4263,7 +4263,7 @@ static void dev_addr_discard(struct net_device *dev) netif_addr_lock_bh(dev); __dev_addr_discard(&dev->mc_list); - dev->mc_count = 0; + netdev_mc_count(dev) = 0; netif_addr_unlock_bh(dev); } -- cgit v1.2.2 From ebc08a6f47ee76ecad8e9f26c26e6ec9b46ca659 Mon Sep 17 00:00:00 2001 From: "Williams, Mitch A" Date: Wed, 10 Feb 2010 01:44:05 +0000 Subject: rtnetlink: Add VF config code to rtnetlink Add code to allow rtnetlink clients to query and set VF information through the PF driver. Signed-off-by: Mitch Williams Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 62f3878a6010..42da96a4eeee 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -580,6 +581,15 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->tx_compressed = b->tx_compressed; }; +static inline int rtnl_vfinfo_size(const struct net_device *dev) +{ + if (dev->dev.parent && dev_is_pci(dev->dev.parent)) + return dev_num_vf(dev->dev.parent) * + sizeof(struct ifla_vf_info); + else + return 0; +} + static inline size_t if_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) @@ -597,6 +607,8 @@ static inline size_t if_nlmsg_size(const struct net_device *dev) + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + + nla_total_size(4) /* IFLA_NUM_VF */ + + nla_total_size(rtnl_vfinfo_size(dev)) /* IFLA_VFINFO */ + rtnl_link_get_size(dev); /* IFLA_LINKINFO */ } @@ -665,6 +677,17 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, stats = dev_get_stats(dev); copy_rtnl_link_stats(nla_data(attr), stats); + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { + int i; + struct ifla_vf_info ivi; + + NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); + for (i = 0; i < dev_num_vf(dev->dev.parent); i++) { + if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) + break; + NLA_PUT(skb, IFLA_VFINFO, sizeof(ivi), &ivi); + } + } if (dev->rtnl_link_ops) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; @@ -725,6 +748,12 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, + [IFLA_VF_MAC] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_mac) }, + [IFLA_VF_VLAN] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_TX_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_tx_rate) }, }; EXPORT_SYMBOL(ifla_policy); @@ -898,6 +927,44 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, write_unlock_bh(&dev_base_lock); } + if (tb[IFLA_VF_MAC]) { + struct ifla_vf_mac *ivm; + ivm = nla_data(tb[IFLA_VF_MAC]); + write_lock_bh(&dev_base_lock); + if (ops->ndo_set_vf_mac) + err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac); + write_unlock_bh(&dev_base_lock); + if (err < 0) + goto errout; + modified = 1; + } + + if (tb[IFLA_VF_VLAN]) { + struct ifla_vf_vlan *ivv; + ivv = nla_data(tb[IFLA_VF_VLAN]); + write_lock_bh(&dev_base_lock); + if (ops->ndo_set_vf_vlan) + err = ops->ndo_set_vf_vlan(dev, ivv->vf, + (u16)ivv->vlan, + (u8)ivv->qos); + write_unlock_bh(&dev_base_lock); + if (err < 0) + goto errout; + modified = 1; + } + err = 0; + + if (tb[IFLA_VF_TX_RATE]) { + struct ifla_vf_tx_rate *ivt; + ivt = nla_data(tb[IFLA_VF_TX_RATE]); + write_lock_bh(&dev_base_lock); + if (ops->ndo_set_vf_tx_rate) + err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, ivt->rate); + write_unlock_bh(&dev_base_lock); + if (err < 0) + goto errout; + modified = 1; + } err = 0; errout: -- cgit v1.2.2 From e858911804f5ecadb41afd61582a11f68d416328 Mon Sep 17 00:00:00 2001 From: Peter Waskiewicz Date: Fri, 12 Feb 2010 13:48:05 +0000 Subject: ethtool: Fix filter addition when caching n-tuple filters We can allow a filter to be added successfully to the underlying hardware, but still return an error if the cached list memory allocation fails. This patch fixes that condition. Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/core/ethtool.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index a1280f643bf4..fbbe4b49116b 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -283,18 +283,17 @@ err_out: return ret; } -static int __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, - struct ethtool_rx_ntuple_flow_spec *spec) +static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, + struct ethtool_rx_ntuple_flow_spec *spec, + struct ethtool_rx_ntuple_flow_spec_container *fsc) { - struct ethtool_rx_ntuple_flow_spec_container *fsc; /* don't add filters forever */ - if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY) - return 0; - - fsc = kmalloc(sizeof(*fsc), GFP_ATOMIC); - if (!fsc) - return -ENOMEM; + if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY) { + /* free the container */ + kfree(fsc); + return; + } /* Copy the whole filter over */ fsc->fs.flow_type = spec->flow_type; @@ -310,14 +309,13 @@ static int __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, /* add to the list */ list_add_tail_rcu(&fsc->list, &list->list); list->count++; - - return 0; } static int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr) { struct ethtool_rx_ntuple cmd; const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL; int ret; if (!ops->set_rx_ntuple) @@ -329,16 +327,26 @@ static int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr) if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; - ret = ops->set_rx_ntuple(dev, &cmd); - /* * Cache filter in dev struct for GET operation only if * the underlying driver doesn't have its own GET operation, and - * only if the filter was added successfully. + * only if the filter was added successfully. First make sure we + * can allocate the filter, then continue if successful. */ - if (!ops->get_rx_ntuple && !ret) - if (__rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &cmd.fs)) + if (!ops->get_rx_ntuple) { + fsc = kmalloc(sizeof(*fsc), GFP_ATOMIC); + if (!fsc) return -ENOMEM; + } + + ret = ops->set_rx_ntuple(dev, &cmd); + if (ret) { + kfree(fsc); + return ret; + } + + if (!ops->get_rx_ntuple) + __rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &cmd.fs, fsc); return ret; } -- cgit v1.2.2 From 0d643e1fb4207711d9c148b5c6a2820550a4a154 Mon Sep 17 00:00:00 2001 From: Peter Waskiewicz Date: Fri, 12 Feb 2010 13:48:25 +0000 Subject: ethtool: Move n-tuple capability check into set_flags set_flags should check if the underlying device supports n-tuple filter programming before setting the device flags on the netdevice. Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/core/ethtool.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index fbbe4b49116b..794cf57078cd 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -134,15 +134,21 @@ u32 ethtool_op_get_flags(struct net_device *dev) int ethtool_op_set_flags(struct net_device *dev, u32 data) { + const struct ethtool_ops *ops = dev->ethtool_ops; + if (data & ETH_FLAG_LRO) dev->features |= NETIF_F_LRO; else dev->features &= ~NETIF_F_LRO; - if (data & ETH_FLAG_NTUPLE) + if (data & ETH_FLAG_NTUPLE) { + if (!ops->set_rx_ntuple) + return -EOPNOTSUPP; dev->features |= NETIF_F_NTUPLE; - else + } else { + /* safe to clear regardless */ dev->features &= ~NETIF_F_NTUPLE; + } return 0; } @@ -318,9 +324,6 @@ static int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr) struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL; int ret; - if (!ops->set_rx_ntuple) - return -EOPNOTSUPP; - if (!(dev->features & NETIF_F_NTUPLE)) return -EINVAL; -- cgit v1.2.2 From 339c6e99853d2ef1f02ad8a313e079050a300427 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Feb 2010 21:51:33 -0800 Subject: ethtool: reduce stack usage dev_ethtool() is currently using 604 bytes of stack, even with gcc-4.4.2 objdump -d vmlinux | scripts/checkstack.pl ... 0xc04bbc33 dev_ethtool [vmlinux]: 604 ... Adding noinline attributes to selected functions can reduce stack usage. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/ethtool.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 794cf57078cd..82cae3bca78d 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -197,7 +197,10 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) return dev->ethtool_ops->set_settings(dev, &cmd); } -static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) +/* + * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() + */ +static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) { struct ethtool_drvinfo info; const struct ethtool_ops *ops = dev->ethtool_ops; @@ -232,7 +235,10 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) return 0; } -static int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr) +/* + * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() + */ +static noinline int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr) { struct ethtool_rxnfc cmd; @@ -245,7 +251,10 @@ static int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr) return dev->ethtool_ops->set_rxnfc(dev, &cmd); } -static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) +/* + * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() + */ +static noinline int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) { struct ethtool_rxnfc info; const struct ethtool_ops *ops = dev->ethtool_ops; @@ -317,7 +326,10 @@ static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, list->count++; } -static int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr) +/* + * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() + */ +static noinline int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr) { struct ethtool_rx_ntuple cmd; const struct ethtool_ops *ops = dev->ethtool_ops; @@ -788,7 +800,10 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) return ret; } -static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) +/* + * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() + */ +static noinline int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; @@ -802,7 +817,10 @@ static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) return 0; } -static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) +/* + * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() + */ +static noinline int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce; @@ -1212,7 +1230,10 @@ static int ethtool_set_value(struct net_device *dev, char __user *useraddr, return actor(dev, edata.data); } -static int ethtool_flash_device(struct net_device *dev, char __user *useraddr) +/* + * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() + */ +static noinline int ethtool_flash_device(struct net_device *dev, char __user *useraddr) { struct ethtool_flash efl; -- cgit v1.2.2 From 1cab819b5e244e1b853c7b440981e6a960da3bfb Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 11 Feb 2010 13:48:29 +0000 Subject: ethtool: allow non-admin user to read GRO settings. Looks like an oversight in GRO design. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/ethtool.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d8aee584e8d1..236a9988ea91 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -927,6 +927,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: case ETHTOOL_GGSO: + case ETHTOOL_GGRO: case ETHTOOL_GFLAGS: case ETHTOOL_GPFLAGS: case ETHTOOL_GRXFH: -- cgit v1.2.2 From 54716e3beb0ab20c49471348dfe399a71bfc8fd3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 14 Feb 2010 03:27:03 +0000 Subject: net neigh: Decouple per interface neighbour table controls from binary sysctls Stop computing the number of neighbour table settings we have by counting the number of binary sysctls. This behaviour was silly and meant that we could not add another neighbour table setting without also adding another binary sysctl. Don't pass the binary sysctl path for neighour table entries into neigh_sysctl_register. These parameters are no longer used and so are just dead code. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/neighbour.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f2efd72da799..d102f6d9abdc 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2556,9 +2556,11 @@ EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL +#define NEIGH_VARS_MAX 19 + static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table neigh_vars[__NET_NEIGH_MAX]; + struct ctl_table neigh_vars[NEIGH_VARS_MAX]; char *dev_name; } neigh_sysctl_template __read_mostly = { .neigh_vars = { @@ -2675,8 +2677,7 @@ static struct neigh_sysctl_table { }; int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, - int p_id, int pdev_id, char *p_name, - proc_handler *handler) + char *p_name, proc_handler *handler) { struct neigh_sysctl_table *t; const char *dev_name_source = NULL; -- cgit v1.2.2 From dc4c2c31053ba5bf685d273cd62ecca406dddb2d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 12 Feb 2010 11:41:39 +0000 Subject: net: remove INIT_RCU_HEAD() usage call_rcu() will unconditionally reinitialize RCU head anyway. Signed-off-by: Alexey Dobriyan Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index b8e9d3a86887..f8c874975350 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -296,7 +296,6 @@ static int dropmon_net_event(struct notifier_block *ev_block, new_stat->dev = dev; new_stat->last_rx = jiffies; - INIT_RCU_HEAD(&new_stat->rcu); spin_lock(&trace_state_lock); list_add_rcu(&new_stat->list, &hw_stats_list); spin_unlock(&trace_state_lock); -- cgit v1.2.2 From faf234220fb79a05891477a75180e1d9f7ab4105 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 17 Feb 2010 09:34:12 +0000 Subject: net: use kasprintf() for socket cache names kasprintf() makes code smaller. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/core/sock.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index ceef50bd131b..472a59f205b0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2228,13 +2228,10 @@ int proto_register(struct proto *prot, int alloc_slab) } if (prot->rsk_prot != NULL) { - static const char mask[] = "request_sock_%s"; - - prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); + prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); if (prot->rsk_prot->slab_name == NULL) goto out_free_sock_slab; - sprintf(prot->rsk_prot->slab_name, mask, prot->name); prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, prot->rsk_prot->obj_size, 0, SLAB_HWCACHE_ALIGN, NULL); @@ -2247,14 +2244,11 @@ int proto_register(struct proto *prot, int alloc_slab) } if (prot->twsk_prot != NULL) { - static const char mask[] = "tw_sock_%s"; - - prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); + prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); if (prot->twsk_prot->twsk_slab_name == NULL) goto out_free_request_sock_slab; - sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name); prot->twsk_prot->twsk_slab = kmem_cache_create(prot->twsk_prot->twsk_slab_name, prot->twsk_prot->twsk_obj_size, -- cgit v1.2.2 From 7af3351f71f4b3b5dbccb66cdc9b097052760a7f Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 17 Feb 2010 09:26:54 +0000 Subject: ethtool: Don't flush n-tuple list from ethtool_reset() The n-tuple list should be flushed if and only if the ETH_RESET_FILTER flag is set and the driver is able to reset filtering/flow direction hardware without also resetting a component whose flag is not set. This test is best left to the driver. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d08a0c7675bf..31b1eddc1b84 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -654,9 +654,6 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr) if (copy_from_user(&reset, useraddr, sizeof(reset))) return -EFAULT; - /* Clear ethtool n-tuple list */ - ethtool_ntuple_flush(dev); - ret = dev->ethtool_ops->reset(dev, &reset.data); if (ret) return ret; -- cgit v1.2.2 From e76b69cc0133952c98aa1ad6330cacacd269fd64 Mon Sep 17 00:00:00 2001 From: Ajit Khaparde Date: Tue, 16 Feb 2010 20:25:43 +0000 Subject: net: bug fix for vlan + gro issue Traffic (tcp) doesnot start on a vlan interface when gro is enabled. Even the tcp handshake was not taking place. This is because, the eth_type_trans call before the netif_receive_skb in napi_gro_finish() resets the skb->dev to napi->dev from the previously set vlan netdev interface. This causes the ip_route_input to drop the incoming packet considering it as a packet coming from a martian source. I could repro this on 2.6.32.7 (stable) and 2.6.33-rc7. With this fix, the traffic starts and the test runs fine on both vlan and non-vlan interfaces. CC: Herbert Xu CC: Patrick McHardy Signed-off-by: Ajit Khaparde Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d1cf53d0d597..1968980f513a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2813,7 +2813,7 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, switch (ret) { case GRO_NORMAL: case GRO_HELD: - skb->protocol = eth_type_trans(skb, napi->dev); + skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_HELD) skb_gro_pull(skb, -ETH_HLEN); -- cgit v1.2.2 From 5ff3f073670b544a9c0547cc6fef1f7eed5762ed Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 14 Feb 2010 01:01:00 +0000 Subject: net: export attach/detach filter routines Export sk_attach_filter/sk_detach_filter routines, so that tun module can use them. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 08db7b9143a3..7517110ff4ae 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -529,6 +529,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) sk_filter_delayed_uncharge(sk, old_fp); return 0; } +EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_detach_filter(struct sock *sk) { @@ -545,3 +546,4 @@ int sk_detach_filter(struct sock *sk) rcu_read_unlock_bh(); return ret; } +EXPORT_SYMBOL_GPL(sk_detach_filter); -- cgit v1.2.2 From b8afe6416101549e877f8470f2a160df69676166 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 19 Feb 2010 13:23:47 +0000 Subject: net-sysfs: Use rtnl_trylock in wireless sysfs methods. The wireless sysfs methods like the rest of the networking sysfs methods are removed with the rtnl_lock held and block until the existing methods stop executing. So use rtnl_trylock and restart_syscall so that the code continues to work. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fbc1c7472c5e..099c753c4213 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -410,7 +410,8 @@ static ssize_t wireless_show(struct device *d, char *buf, const struct iw_statistics *iw; ssize_t ret = -EINVAL; - rtnl_lock(); + if (!rtnl_trylock()) + return restart_syscall(); if (dev_isalive(dev)) { iw = get_wireless_stats(dev); if (iw) -- cgit v1.2.2 From bd55775c8dd656fc69b3a42a1c4ab32abb7e8af9 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 22 Feb 2010 16:20:22 -0800 Subject: xfrm: SA lookups signature with mark pass mark to all SA lookups to prepare them for when we add code to have them search. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/core/pktgen.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 2e692afdc55d..43923811bd6a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2188,12 +2188,13 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) /* If there was already an IPSEC SA, we keep it as is, else * we go look for it ... */ +#define DUMMY_MARK 0 static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) { struct xfrm_state *x = pkt_dev->flows[flow].x; if (!x) { /*slow path: we dont already have xfrm_state*/ - x = xfrm_stateonly_find(&init_net, + x = xfrm_stateonly_find(&init_net, DUMMY_MARK, (xfrm_address_t *)&pkt_dev->cur_daddr, (xfrm_address_t *)&pkt_dev->cur_saddr, AF_INET, -- cgit v1.2.2 From c4d49794ff2838038fd9756eae39c39a5a685833 Mon Sep 17 00:00:00 2001 From: Ajit Khaparde Date: Tue, 16 Feb 2010 20:25:43 +0000 Subject: net: bug fix for vlan + gro issue Traffic (tcp) doesnot start on a vlan interface when gro is enabled. Even the tcp handshake was not taking place. This is because, the eth_type_trans call before the netif_receive_skb in napi_gro_finish() resets the skb->dev to napi->dev from the previously set vlan netdev interface. This causes the ip_route_input to drop the incoming packet considering it as a packet coming from a martian source. I could repro this on 2.6.32.7 (stable) and 2.6.33-rc7. With this fix, the traffic starts and the test runs fine on both vlan and non-vlan interfaces. CC: Herbert Xu CC: Patrick McHardy Signed-off-by: Ajit Khaparde Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index be9924f60ec3..ec874218b206 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2761,7 +2761,7 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, switch (ret) { case GRO_NORMAL: case GRO_HELD: - skb->protocol = eth_type_trans(skb, napi->dev); + skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_HELD) skb_gro_pull(skb, -ETH_HLEN); -- cgit v1.2.2 From a898def29e4119bc01ebe7ca97423181f4c0ea2d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Feb 2010 17:04:49 -0800 Subject: net: Add checking to rcu_dereference() primitives Update rcu_dereference() primitives to use new lockdep-based checking. The rcu_dereference() in __in6_dev_get() may be protected either by rcu_read_lock() or RTNL, per Eric Dumazet. The rcu_dereference() in __sk_free() is protected by the fact that it is never reached if an update could change it. Check for this by using rcu_dereference_check() to verify that the struct sock's ->sk_wmem_alloc counter is zero. Acked-by: Eric Dumazet Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1266887105-1528-5-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- net/core/dev.c | 2 +- net/core/filter.c | 6 +++--- net/core/rtnetlink.c | 8 ++++++++ net/core/sock.c | 3 ++- 4 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ec874218b206..bb1f1da2b8a7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2041,7 +2041,7 @@ gso: rcu_read_lock_bh(); txq = dev_pick_tx(dev, skb); - q = rcu_dereference(txq->qdisc); + q = rcu_dereference_bh(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); diff --git a/net/core/filter.c b/net/core/filter.c index 08db7b9143a3..3541aa48d21d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -86,7 +86,7 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) return err; rcu_read_lock_bh(); - filter = rcu_dereference(sk->sk_filter); + filter = rcu_dereference_bh(sk->sk_filter); if (filter) { unsigned int pkt_len = sk_run_filter(skb, filter->insns, filter->len); @@ -521,7 +521,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) } rcu_read_lock_bh(); - old_fp = rcu_dereference(sk->sk_filter); + old_fp = rcu_dereference_bh(sk->sk_filter); rcu_assign_pointer(sk->sk_filter, fp); rcu_read_unlock_bh(); @@ -536,7 +536,7 @@ int sk_detach_filter(struct sock *sk) struct sk_filter *filter; rcu_read_lock_bh(); - filter = rcu_dereference(sk->sk_filter); + filter = rcu_dereference_bh(sk->sk_filter); if (filter) { rcu_assign_pointer(sk->sk_filter, NULL); sk_filter_delayed_uncharge(sk, filter); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 794bcb897ff0..4c7d3f635ba7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -89,6 +89,14 @@ int rtnl_is_locked(void) } EXPORT_SYMBOL(rtnl_is_locked); +#ifdef CONFIG_PROVE_LOCKING +int lockdep_rtnl_is_held(void) +{ + return lockdep_is_held(&rtnl_mutex); +} +EXPORT_SYMBOL(lockdep_rtnl_is_held); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ + static struct rtnl_link *rtnl_msg_handlers[NPROTO]; static inline int rtm_msgindex(int msgtype) diff --git a/net/core/sock.c b/net/core/sock.c index e1f6f225f012..305cba401ae6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1073,7 +1073,8 @@ static void __sk_free(struct sock *sk) if (sk->sk_destruct) sk->sk_destruct(sk); - filter = rcu_dereference(sk->sk_filter); + filter = rcu_dereference_check(sk->sk_filter, + atomic_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); rcu_assign_pointer(sk->sk_filter, NULL); -- cgit v1.2.2 From 4edb246626be6e031950205c885bdf29fb2ff1eb Mon Sep 17 00:00:00 2001 From: "Williams, Mitch A" Date: Wed, 24 Feb 2010 21:59:56 +0000 Subject: rtnetlink: clean up SR-IOV config interface This patch consists of a few minor cleanups to the SR-IOV configurion code in rtnetlink. - Remove unneccesary lock - Remove unneccesary casts - Return correct error code for no driver support These changes are based on comments from Patrick McHardy Signed-off-by: Mitch Williams Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 42da96a4eeee..4dd4c3cdc442 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -930,10 +930,9 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, if (tb[IFLA_VF_MAC]) { struct ifla_vf_mac *ivm; ivm = nla_data(tb[IFLA_VF_MAC]); - write_lock_bh(&dev_base_lock); + err = -EOPNOTSUPP; if (ops->ndo_set_vf_mac) err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac); - write_unlock_bh(&dev_base_lock); if (err < 0) goto errout; modified = 1; @@ -942,12 +941,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, if (tb[IFLA_VF_VLAN]) { struct ifla_vf_vlan *ivv; ivv = nla_data(tb[IFLA_VF_VLAN]); - write_lock_bh(&dev_base_lock); + err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, - (u16)ivv->vlan, - (u8)ivv->qos); - write_unlock_bh(&dev_base_lock); + ivv->vlan, + ivv->qos); if (err < 0) goto errout; modified = 1; @@ -957,10 +955,9 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, if (tb[IFLA_VF_TX_RATE]) { struct ifla_vf_tx_rate *ivt; ivt = nla_data(tb[IFLA_VF_TX_RATE]); - write_lock_bh(&dev_base_lock); + err = -EOPNOTSUPP; if (ops->ndo_set_vf_tx_rate) err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, ivt->rate); - write_unlock_bh(&dev_base_lock); if (err < 0) goto errout; modified = 1; -- cgit v1.2.2 From e5e26d75f490d7d41f25a4b39ed6db1713beb417 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 24 Feb 2010 14:01:38 +0000 Subject: netdev: use list_first_entry macro Use list_first_entry macro; no longer any need to use 'next' directly in list to find first entry. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1968980f513a..eb7f1a4fefc6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3018,7 +3018,7 @@ static void net_rx_action(struct softirq_action *h) * entries to the tail of this list, and only ->poll() * calls can remove this head entry from the list. */ - n = list_entry(list->next, struct napi_struct, poll_list); + n = list_first_entry(list, struct napi_struct, poll_list); have = netpoll_poll_lock(n); @@ -4882,7 +4882,7 @@ static void rollback_registered_many(struct list_head *head) } /* Process any work delayed until the end of the batch */ - dev = list_entry(head->next, struct net_device, unreg_list); + dev = list_first_entry(head, struct net_device, unreg_list); call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); synchronize_net(); @@ -5268,7 +5268,7 @@ void netdev_run_todo(void) while (!list_empty(&list)) { struct net_device *dev - = list_entry(list.next, struct net_device, todo_list); + = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { -- cgit v1.2.2 From c79c5ffdce14abb4de3878c5aa8c3c6e5093c69b Mon Sep 17 00:00:00 2001 From: Peter Waskiewicz Date: Fri, 26 Feb 2010 01:54:20 +0000 Subject: ethtool: Add n-tuple string length to drvinfo and return it The drvinfo struct should include the number of strings that get_rx_ntuple will return. It will be variable if an underlying driver implements its own get_rx_ntuple routine, so userspace needs to know how much data is coming. Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/core/ethtool.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 31b1eddc1b84..1c94f48e27b5 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -224,6 +224,9 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (rc >= 0) info.n_priv_flags = rc; + rc = ops->get_sset_count(dev, ETH_SS_NTUPLE_FILTERS); + if (rc >= 0) + info.n_ntuples = rc; } if (ops->get_regs_len) info.regdump_len = ops->get_regs_len(dev); -- cgit v1.2.2 From 6e17d45ae310758ab30623a42ad070858c9a48de Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 24 Feb 2010 22:49:15 +0000 Subject: net: add addr len check to dev_mc_add Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev_mcast.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index 9e2fa39f22a3..fd91569e2394 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -96,6 +96,8 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) int err; netif_addr_lock_bh(dev); + if (alen != dev->addr_len) + return -EINVAL; err = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl); if (!err) __dev_set_rx_mode(dev); -- cgit v1.2.2 From 738b0343e73604750feb107e063c28b3ca36cb84 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 26 Feb 2010 05:12:02 -0800 Subject: Revert "ethtool: Add n-tuple string length to drvinfo and return it" This reverts commit c79c5ffdce14abb4de3878c5aa8c3c6e5093c69b. As Jeff points out we can't break the user visible interface like this, we need to add this into the reserved[] thing. Signed-off-by: David S. Miller --- net/core/ethtool.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 1c94f48e27b5..31b1eddc1b84 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -224,9 +224,6 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (rc >= 0) info.n_priv_flags = rc; - rc = ops->get_sset_count(dev, ETH_SS_NTUPLE_FILTERS); - if (rc >= 0) - info.n_ntuples = rc; } if (ops->get_regs_len) info.regdump_len = ops->get_regs_len(dev); -- cgit v1.2.2 From 10de05afe01c12cedc42eb9ce05b111eed6c8210 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 26 Feb 2010 06:34:50 +0000 Subject: rtnetlink: ignore NETDEV_PRE_UP notifier in rtnetlink_event() Commit 3b8bcfd (net: introduce pre-up netdev notifier) added a new notifier which is run before a device is set UP for use by cfg80211. The patch missed to add the new notifier to the ignore list in rtnetlink_event(), so we currently get an unnecessary netlink notification before a device is set UP. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4dd4c3cdc442..b7c7dfd86507 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1432,6 +1432,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_DOWN: rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); break; + case NETDEV_PRE_UP: case NETDEV_POST_INIT: case NETDEV_REGISTER: case NETDEV_CHANGE: -- cgit v1.2.2 From a2835763e130c343ace5320c20d33c281e7097b7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 26 Feb 2010 06:34:51 +0000 Subject: rtnetlink: handle rtnl_link netlink notifications manually In order to support specifying device flags during device creation, we must be able to roll back device registration in case setting the flags fails without sending any notifications related to the device to userspace. This patch changes rollback_registered_many() and register_netdevice() to manually send netlink notifications for devices not handled by rtnl_link and allows to defer notifications for devices handled by rtnl_link until setup is complete. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 8 +++++++- net/core/rtnetlink.c | 4 +--- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index eb7f1a4fefc6..75332b089529 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4865,6 +4865,10 @@ static void rollback_registered_many(struct list_head *head) */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + /* * Flush the unicast and multicast chains */ @@ -5091,7 +5095,9 @@ int register_netdevice(struct net_device *dev) * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); out: return ret; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b7c7dfd86507..020e43bfef5f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1425,9 +1425,6 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi struct net_device *dev = ptr; switch (event) { - case NETDEV_UNREGISTER: - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); - break; case NETDEV_UP: case NETDEV_DOWN: rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); @@ -1437,6 +1434,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_REGISTER: case NETDEV_CHANGE: case NETDEV_GOING_DOWN: + case NETDEV_UNREGISTER: case NETDEV_UNREGISTER_BATCH: break; default: -- cgit v1.2.2 From bd38081160bb3d036db98472e537b6a7dd4da51a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 26 Feb 2010 06:34:53 +0000 Subject: dev: support deferring device flag change notifications Split dev_change_flags() into two functions: __dev_change_flags() to perform the actual changes and __dev_notify_flags() to invoke netdevice notifiers. This will be used by rtnl_link to defer netlink notifications until the device has been fully configured. This changes ordering of some operations, in particular: - netlink notifications are sent after all changes have been performed. As a side effect this surpresses one unnecessary netlink message when the IFF_UP and other flags are changed simultaneously. - The NETDEV_UP/NETDEV_DOWN and NETDEV_CHANGE notifiers are invoked after all changes have been performed. Their relative is unchanged. - net_dmaengine_put() is invoked before the NETDEV_DOWN notifier instead of afterwards. This should not make any difference since both RX and TX are already shut down at this point. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 163 ++++++++++++++++++++++++++++++++------------------- net/core/rtnetlink.c | 2 - 2 files changed, 104 insertions(+), 61 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 75332b089529..e5972f7f7e1b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1113,32 +1113,13 @@ void dev_load(struct net *net, const char *name) } EXPORT_SYMBOL(dev_load); -/** - * dev_open - prepare an interface for use. - * @dev: device to open - * - * Takes a device from down to up state. The device's private open - * function is invoked and then the multicast lists are loaded. Finally - * the device is moved into the up state and a %NETDEV_UP message is - * sent to the netdev notifier chain. - * - * Calling this function on an active interface is a nop. On a failure - * a negative errno code is returned. - */ -int dev_open(struct net_device *dev) +static int __dev_open(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; int ret; ASSERT_RTNL(); - /* - * Is it already up? - */ - - if (dev->flags & IFF_UP) - return 0; - /* * Is it even present? */ @@ -1187,36 +1168,57 @@ int dev_open(struct net_device *dev) * Wakeup transmit queue engine */ dev_activate(dev); - - /* - * ... and announce new interface. - */ - call_netdevice_notifiers(NETDEV_UP, dev); } return ret; } -EXPORT_SYMBOL(dev_open); /** - * dev_close - shutdown an interface. - * @dev: device to shutdown + * dev_open - prepare an interface for use. + * @dev: device to open * - * This function moves an active device into down state. A - * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device - * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier - * chain. + * Takes a device from down to up state. The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. */ -int dev_close(struct net_device *dev) +int dev_open(struct net_device *dev) +{ + int ret; + + /* + * Is it already up? + */ + if (dev->flags & IFF_UP) + return 0; + + /* + * Open device + */ + ret = __dev_open(dev); + if (ret < 0) + return ret; + + /* + * ... and announce new interface. + */ + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + call_netdevice_notifiers(NETDEV_UP, dev); + + return ret; +} +EXPORT_SYMBOL(dev_open); + +static int __dev_close(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; - ASSERT_RTNL(); + ASSERT_RTNL(); might_sleep(); - if (!(dev->flags & IFF_UP)) - return 0; - /* * Tell people we are going down, so that they can * prepare to death, when device is still operating. @@ -1252,14 +1254,34 @@ int dev_close(struct net_device *dev) dev->flags &= ~IFF_UP; /* - * Tell people we are down + * Shutdown NET_DMA */ - call_netdevice_notifiers(NETDEV_DOWN, dev); + net_dmaengine_put(); + + return 0; +} + +/** + * dev_close - shutdown an interface. + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. + */ +int dev_close(struct net_device *dev) +{ + if (!(dev->flags & IFF_UP)) + return 0; + + __dev_close(dev); /* - * Shutdown NET_DMA + * Tell people we are down */ - net_dmaengine_put(); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + call_netdevice_notifiers(NETDEV_DOWN, dev); return 0; } @@ -4299,18 +4321,10 @@ unsigned dev_get_flags(const struct net_device *dev) } EXPORT_SYMBOL(dev_get_flags); -/** - * dev_change_flags - change device settings - * @dev: device - * @flags: device state flags - * - * Change settings on device based state flags. The flags are - * in the userspace exported format. - */ -int dev_change_flags(struct net_device *dev, unsigned flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags) { - int ret, changes; int old_flags = dev->flags; + int ret; ASSERT_RTNL(); @@ -4341,17 +4355,12 @@ int dev_change_flags(struct net_device *dev, unsigned flags) ret = 0; if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ - ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); + ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); if (!ret) dev_set_rx_mode(dev); } - if (dev->flags & IFF_UP && - ((old_flags ^ dev->flags) & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | - IFF_VOLATILE))) - call_netdevice_notifiers(NETDEV_CHANGE, dev); - if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; @@ -4370,11 +4379,47 @@ int dev_change_flags(struct net_device *dev, unsigned flags) dev_set_allmulti(dev, inc); } - /* Exclude state transition flags, already notified */ - changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING); + return ret; +} + +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +{ + unsigned int changes = dev->flags ^ old_flags; + + if (changes & IFF_UP) { + if (dev->flags & IFF_UP) + call_netdevice_notifiers(NETDEV_UP, dev); + else + call_netdevice_notifiers(NETDEV_DOWN, dev); + } + + if (dev->flags & IFF_UP && + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) + call_netdevice_notifiers(NETDEV_CHANGE, dev); +} + +/** + * dev_change_flags - change device settings + * @dev: device + * @flags: device state flags + * + * Change settings on device based state flags. The flags are + * in the userspace exported format. + */ +int dev_change_flags(struct net_device *dev, unsigned flags) +{ + int ret, changes; + int old_flags = dev->flags; + + ret = __dev_change_flags(dev, flags); + if (ret < 0) + return ret; + + changes = old_flags ^ dev->flags; if (changes) rtmsg_ifinfo(RTM_NEWLINK, dev, changes); + __dev_notify_flags(dev, old_flags); return ret; } EXPORT_SYMBOL(dev_change_flags); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 020e43bfef5f..c21ec4236dd0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1427,8 +1427,6 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi switch (event) { case NETDEV_UP: case NETDEV_DOWN: - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); - break; case NETDEV_PRE_UP: case NETDEV_POST_INIT: case NETDEV_REGISTER: -- cgit v1.2.2 From 3729d5021257b283f7fce33d957893162ccb2c9d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 26 Feb 2010 06:34:54 +0000 Subject: rtnetlink: support specifying device flags on device creation commit e8469ed959c373c2ff9e6f488aa5a14971aebe1f Author: Patrick McHardy Date: Tue Feb 23 20:41:30 2010 +0100 Support specifying the initial device flags when creating a device though rtnl_link. Devices allocated by rtnl_create_link() are marked as INITIALIZING in order to surpress netlink registration notifications. To complete setup, rtnl_configure_link() must be called, which performs the device flag changes and invokes the deferred notifiers if everything went well. Two examples: # add macvlan to eth0 # $ ip link add link eth0 up allmulticast on type macvlan [LINK]11: macvlan0@eth0: mtu 1500 qdisc noqueue state UNKNOWN link/ether 26:f8:84:02:f9:2a brd ff:ff:ff:ff:ff:ff [ROUTE]ff00::/8 dev macvlan0 table local metric 256 mtu 1500 advmss 1440 hoplimit 0 [ROUTE]fe80::/64 dev macvlan0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit 0 [LINK]11: macvlan0@eth0: mtu 1500 link/ether 26:f8:84:02:f9:2a [ADDR]11: macvlan0 inet6 fe80::24f8:84ff:fe02:f92a/64 scope link valid_lft forever preferred_lft forever [ROUTE]local fe80::24f8:84ff:fe02:f92a via :: dev lo table local proto none metric 0 mtu 16436 advmss 16376 hoplimit 0 [ROUTE]default via fe80::215:e9ff:fef0:10f8 dev macvlan0 proto kernel metric 1024 mtu 1500 advmss 1440 hoplimit 0 [NEIGH]fe80::215:e9ff:fef0:10f8 dev macvlan0 lladdr 00:15:e9:f0:10:f8 router STALE [ROUTE]2001:6f8:974::/64 dev macvlan0 proto kernel metric 256 expires 0sec mtu 1500 advmss 1440 hoplimit 0 [PREFIX]prefix 2001:6f8:974::/64 dev macvlan0 onlink autoconf valid 14400 preferred 131084 [ADDR]11: macvlan0 inet6 2001:6f8:974:0:24f8:84ff:fe02:f92a/64 scope global dynamic valid_lft 86399sec preferred_lft 14399sec # add VLAN to eth1, eth1 is down # $ ip link add link eth1 up type vlan id 1000 RTNETLINK answers: Network is down Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 52 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c21ec4236dd0..d1472a423323 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -549,6 +549,19 @@ static void set_operstate(struct net_device *dev, unsigned char transition) } } +static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, + const struct ifinfomsg *ifm) +{ + unsigned int flags = ifm->ifi_flags; + + /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ + if (ifm->ifi_change) + flags = (flags & ifm->ifi_change) | + (dev->flags & ~ifm->ifi_change); + + return flags; +} + static void copy_rtnl_link_stats(struct rtnl_link_stats *a, const struct net_device_stats *b) { @@ -904,13 +917,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, } if (ifm->ifi_flags || ifm->ifi_change) { - unsigned int flags = ifm->ifi_flags; - - /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ - if (ifm->ifi_change) - flags = (flags & ifm->ifi_change) | - (dev->flags & ~ifm->ifi_change); - err = dev_change_flags(dev, flags); + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); if (err < 0) goto errout; } @@ -1053,6 +1060,26 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return 0; } +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) +{ + unsigned int old_flags; + int err; + + old_flags = dev->flags; + if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + if (err < 0) + return err; + } + + dev->rtnl_link_state = RTNL_LINK_INITIALIZED; + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + + __dev_notify_flags(dev, old_flags); + return 0; +} +EXPORT_SYMBOL(rtnl_configure_link); + struct net_device *rtnl_create_link(struct net *src_net, struct net *net, char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) { @@ -1074,6 +1101,7 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net, dev_net_set(dev, net); dev->rtnl_link_ops = ops; + dev->rtnl_link_state = RTNL_LINK_INITIALIZING; dev->real_num_tx_queues = real_num_queues; if (strchr(dev->name, '%')) { @@ -1203,7 +1231,7 @@ replay: if (!(nlh->nlmsg_flags & NLM_F_CREATE)) return -ENODEV; - if (ifm->ifi_index || ifm->ifi_flags || ifm->ifi_change) + if (ifm->ifi_index) return -EOPNOTSUPP; if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO]) return -EOPNOTSUPP; @@ -1234,9 +1262,15 @@ replay: err = ops->newlink(net, dev, tb, data); else err = register_netdevice(dev); - if (err < 0 && !IS_ERR(dev)) + if (err < 0 && !IS_ERR(dev)) { free_netdev(dev); + goto out; + } + err = rtnl_configure_link(dev, ifm); + if (err < 0) + unregister_netdevice(dev); +out: put_net(dest_net); return err; } -- cgit v1.2.2 From 9675478bbafed08848bf8d7e28400d5e46330b23 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Fri, 26 Feb 2010 21:43:38 +0000 Subject: ethtool: do not set some flags, if others failed NETIF_F_NTUPLE flag setting introduced a bug: non-ntuple flags like LRO may be successfully set, before ioctl(2) returns failure to userspace. The set-flags operation should be all-or-none, rather than leaving things in an inconsistent state prior to reporting failure to userspace. Signed-off-by: Jeff Garzik Signed-off-by: David S. Miller --- net/core/ethtool.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 31b1eddc1b84..0f2f82185ec4 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -135,21 +135,23 @@ u32 ethtool_op_get_flags(struct net_device *dev) int ethtool_op_set_flags(struct net_device *dev, u32 data) { const struct ethtool_ops *ops = dev->ethtool_ops; + unsigned long features = dev->features; if (data & ETH_FLAG_LRO) - dev->features |= NETIF_F_LRO; + features |= NETIF_F_LRO; else - dev->features &= ~NETIF_F_LRO; + features &= ~NETIF_F_LRO; if (data & ETH_FLAG_NTUPLE) { if (!ops->set_rx_ntuple) return -EOPNOTSUPP; - dev->features |= NETIF_F_NTUPLE; + features |= NETIF_F_NTUPLE; } else { /* safe to clear regardless */ - dev->features &= ~NETIF_F_NTUPLE; + features &= ~NETIF_F_NTUPLE; } + dev->features = features; return 0; } -- cgit v1.2.2 From 76dadd76c265a0cdb5a76aa4eef03fcc9639b388 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 28 Feb 2010 01:20:36 +0000 Subject: scm: Only support SCM_RIGHTS on unix domain sockets. We use scm_send and scm_recv on both unix domain and netlink sockets, but only unix domain sockets support everything required for file descriptor passing, so error if someone attempts to pass file descriptors over netlink sockets. Cc: stable@kernel.org Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/scm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/scm.c b/net/core/scm.c index b7ba91b074b3..9b264634acfd 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -156,6 +156,8 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) switch (cmsg->cmsg_type) { case SCM_RIGHTS: + if (!sock->ops || sock->ops->family != PF_UNIX) + goto error; err=scm_fp_copy(cmsg, &p->fp); if (err<0) goto error; -- cgit v1.2.2 From 8eae939f1400326b06d0c9afe53d2a484a326871 Mon Sep 17 00:00:00 2001 From: Zhu Yi Date: Thu, 4 Mar 2010 18:01:40 +0000 Subject: net: add limit for socket backlog We got system OOM while running some UDP netperf testing on the loopback device. The case is multiple senders sent stream UDP packets to a single receiver via loopback on local host. Of course, the receiver is not able to handle all the packets in time. But we surprisingly found that these packets were not discarded due to the receiver's sk->sk_rcvbuf limit. Instead, they are kept queuing to sk->sk_backlog and finally ate up all the memory. We believe this is a secure hole that a none privileged user can crash the system. The root cause for this problem is, when the receiver is doing __release_sock() (i.e. after userspace recv, kernel udp_recvmsg -> skb_free_datagram_locked -> release_sock), it moves skbs from backlog to sk_receive_queue with the softirq enabled. In the above case, multiple busy senders will almost make it an endless loop. The skbs in the backlog end up eat all the system memory. The issue is not only for UDP. Any protocols using socket backlog is potentially affected. The patch adds limit for socket backlog so that the backlog size cannot be expanded endlessly. Reported-by: Alex Shi Cc: David Miller Cc: Arnaldo Carvalho de Melo Cc: Alexey Kuznetsov Cc: Patrick McHardy Cc: Vlad Yasevich Cc: Sridhar Samudrala Cc: Jon Maloy Cc: Allan Stephens Cc: Andrew Hendry Signed-off-by: Zhu Yi Signed-off-by: Eric Dumazet Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/core/sock.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index fcd397a762ff..6e22dc973d23 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -340,8 +340,12 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - } else - sk_add_backlog(sk, skb); + } else if (sk_add_backlog_limited(sk, skb)) { + bh_unlock_sock(sk); + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } + bh_unlock_sock(sk); out: sock_put(sk); @@ -1139,6 +1143,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) sock_lock_init(newsk); bh_lock_sock(newsk); newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); /* @@ -1542,6 +1547,12 @@ static void __release_sock(struct sock *sk) bh_lock_sock(sk); } while ((skb = sk->sk_backlog.head) != NULL); + + /* + * Doing the zeroing here guarantee we can not loop forever + * while a wild producer attempts to flood us. + */ + sk->sk_backlog.len = 0; } /** @@ -1874,6 +1885,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = sysctl_rmem_default; sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_backlog.limit = sk->sk_rcvbuf << 1; sk->sk_state = TCP_CLOSE; sk_set_socket(sk, sock); -- cgit v1.2.2 From a3a858ff18a72a8d388e31ab0d98f7e944841a62 Mon Sep 17 00:00:00 2001 From: Zhu Yi Date: Thu, 4 Mar 2010 18:01:47 +0000 Subject: net: backlog functions rename sk_add_backlog -> __sk_add_backlog sk_add_backlog_limited -> sk_add_backlog Signed-off-by: Zhu Yi Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 6e22dc973d23..61a65a2e0455 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -340,7 +340,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - } else if (sk_add_backlog_limited(sk, skb)) { + } else if (sk_add_backlog(sk, skb)) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); goto discard_and_relse; -- cgit v1.2.2 From 723b2f57ad83ee7087acf9a95e8e289414b1f521 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Wed, 3 Mar 2010 22:51:50 +0000 Subject: ethtool: Add direct access to ops->get_sset_count This patch is an alternative approach for accessing string counts, vs. the drvinfo indirect approach. This way the drvinfo space doesn't run out, and we don't break ABI later. Signed-off-by: Jeff Garzik Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/core/ethtool.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 0f2f82185ec4..70075c47ada8 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -214,6 +214,10 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use info.cmd = ETHTOOL_GDRVINFO; ops->get_drvinfo(dev, &info); + /* + * this method of obtaining string set info is deprecated; + * consider using ETHTOOL_GSSET_INFO instead + */ if (ops->get_sset_count) { int rc; @@ -237,6 +241,71 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use return 0; } +/* + * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() + */ +static noinline int ethtool_get_sset_info(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_sset_info info; + const struct ethtool_ops *ops = dev->ethtool_ops; + u64 sset_mask; + int i, idx = 0, n_bits = 0, ret, rc; + u32 *info_buf = NULL; + + if (!ops->get_sset_count) + return -EOPNOTSUPP; + + if (copy_from_user(&info, useraddr, sizeof(info))) + return -EFAULT; + + /* store copy of mask, because we zero struct later on */ + sset_mask = info.sset_mask; + if (!sset_mask) + return 0; + + /* calculate size of return buffer */ + for (i = 0; i < 64; i++) + if (sset_mask & (1ULL << i)) + n_bits++; + + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GSSET_INFO; + + info_buf = kzalloc(n_bits * sizeof(u32), GFP_USER); + if (!info_buf) + return -ENOMEM; + + /* + * fill return buffer based on input bitmask and successful + * get_sset_count return + */ + for (i = 0; i < 64; i++) { + if (!(sset_mask & (1ULL << i))) + continue; + + rc = ops->get_sset_count(dev, i); + if (rc >= 0) { + info.sset_mask |= (1ULL << i); + info_buf[idx++] = rc; + } + } + + ret = -EFAULT; + if (copy_to_user(useraddr, &info, sizeof(info))) + goto out; + + useraddr += offsetof(struct ethtool_sset_info, data); + if (copy_to_user(useraddr, info_buf, idx * sizeof(u32))) + goto out; + + ret = 0; + +out: + kfree(info_buf); + return ret; +} + /* * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() */ @@ -1471,6 +1540,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXNTUPLE: rc = ethtool_get_rx_ntuple(dev, useraddr); break; + case ETHTOOL_GSSET_INFO: + rc = ethtool_get_sset_info(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.2 From d17792ebdf90289c9fd1bce888076d3d60ecd53b Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Thu, 4 Mar 2010 08:21:53 +0000 Subject: ethtool: Add direct access to ops->get_sset_count On 03/04/2010 09:26 AM, Ben Hutchings wrote: > On Thu, 2010-03-04 at 00:51 -0800, Jeff Kirsher wrote: >> From: Jeff Garzik >> >> This patch is an alternative approach for accessing string >> counts, vs. the drvinfo indirect approach. This way the drvinfo >> space doesn't run out, and we don't break ABI later. > [...] >> --- a/net/core/ethtool.c >> +++ b/net/core/ethtool.c >> @@ -214,6 +214,10 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use >> info.cmd = ETHTOOL_GDRVINFO; >> ops->get_drvinfo(dev,&info); >> >> + /* >> + * this method of obtaining string set info is deprecated; >> + * consider using ETHTOOL_GSSET_INFO instead >> + */ > > This comment belongs on the interface (ethtool.h) not the > implementation. Debatable -- the current comment is located at the callsite of ops->get_sset_count(), which is where an implementor might think to add a new call. Not all the numeric fields in ethtool_drvinfo are obtained from ->get_sset_count(). Hence the "some" in the attached patch to include/linux/ethtool.h, addressing your comment. > [...] >> +static noinline int ethtool_get_sset_info(struct net_device *dev, >> + void __user *useraddr) >> +{ > [...] >> + /* calculate size of return buffer */ >> + for (i = 0; i< 64; i++) >> + if (sset_mask& (1ULL<< i)) >> + n_bits++; > [...] > > We have a function for this: > > n_bits = hweight64(sset_mask); Agreed. I've attached a follow-up patch, which should enable my/Jeff's kernel patch to be applied, followed by this one. Signed-off-by: Jeff Garzik Signed-off-by: David S. Miller --- net/core/ethtool.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 70075c47ada8..33d2ded50f84 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -17,6 +17,7 @@ #include #include #include +#include #include /* @@ -216,7 +217,7 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use /* * this method of obtaining string set info is deprecated; - * consider using ETHTOOL_GSSET_INFO instead + * Use ETHTOOL_GSSET_INFO instead. */ if (ops->get_sset_count) { int rc; @@ -265,9 +266,7 @@ static noinline int ethtool_get_sset_info(struct net_device *dev, return 0; /* calculate size of return buffer */ - for (i = 0; i < 64; i++) - if (sset_mask & (1ULL << i)) - n_bits++; + n_bits = hweight64(sset_mask); memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GSSET_INFO; -- cgit v1.2.2 From 72150e9b7fec217fbd646a29ea2f65a3d4d55ea9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 6 Mar 2010 01:04:45 +0000 Subject: sock.c: potential null dereference We test that "prot->rsk_prot" is non-null right before we dereference it on this line. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/core/sock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 61a65a2e0455..c5812bbc2cc9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2288,7 +2288,8 @@ out_free_request_sock_slab: prot->rsk_prot->slab = NULL; } out_free_request_sock_slab_name: - kfree(prot->rsk_prot->slab_name); + if (prot->rsk_prot) + kfree(prot->rsk_prot->slab_name); out_free_sock_slab: kmem_cache_destroy(prot->slab); prot->slab = NULL; -- cgit v1.2.2 From f5c445ed4148434f142be0263a8ad7cb58503e8a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Mar 2010 12:17:04 -0800 Subject: ethtool: Use noinline_for_stack Use self documenting noinline_for_stack instead of duplicated comments. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/ethtool.c | 40 ++++++++-------------------------------- 1 file changed, 8 insertions(+), 32 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 33d2ded50f84..f4cb6b6299d9 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -200,10 +200,7 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) return dev->ethtool_ops->set_settings(dev, &cmd); } -/* - * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() - */ -static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) { struct ethtool_drvinfo info; const struct ethtool_ops *ops = dev->ethtool_ops; @@ -242,10 +239,7 @@ static noinline int ethtool_get_drvinfo(struct net_device *dev, void __user *use return 0; } -/* - * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() - */ -static noinline int ethtool_get_sset_info(struct net_device *dev, +static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, void __user *useraddr) { struct ethtool_sset_info info; @@ -305,10 +299,7 @@ out: return ret; } -/* - * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() - */ -static noinline int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr) { struct ethtool_rxnfc cmd; @@ -321,10 +312,7 @@ static noinline int ethtool_set_rxnfc(struct net_device *dev, void __user *usera return dev->ethtool_ops->set_rxnfc(dev, &cmd); } -/* - * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() - */ -static noinline int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) { struct ethtool_rxnfc info; const struct ethtool_ops *ops = dev->ethtool_ops; @@ -396,10 +384,7 @@ static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, list->count++; } -/* - * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() - */ -static noinline int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr) { struct ethtool_rx_ntuple cmd; const struct ethtool_ops *ops = dev->ethtool_ops; @@ -867,10 +852,7 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) return ret; } -/* - * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() - */ -static noinline int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; @@ -884,10 +866,7 @@ static noinline int ethtool_get_coalesce(struct net_device *dev, void __user *us return 0; } -/* - * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() - */ -static noinline int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce; @@ -1297,10 +1276,7 @@ static int ethtool_set_value(struct net_device *dev, char __user *useraddr, return actor(dev, edata.data); } -/* - * noinline attribute so that gcc doesnt use too much stack in dev_ethtool() - */ -static noinline int ethtool_flash_device(struct net_device *dev, char __user *useraddr) +static noinline_for_stack int ethtool_flash_device(struct net_device *dev, char __user *useraddr) { struct ethtool_flash efl; -- cgit v1.2.2 From 0a141509ede48ac33ef756ac1640f4d3f46fa2db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Mar 2010 19:40:54 +0000 Subject: net: Annotates neigh_invalidate() Annotates neigh_invalidate() with __releases() and __acquires() for sparse sake. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d102f6d9abdc..6cee6434da67 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -771,6 +771,8 @@ static __inline__ int neigh_max_probes(struct neighbour *n) } static void neigh_invalidate(struct neighbour *neigh) + __releases(neigh->lock) + __acquires(neigh->lock) { struct sk_buff *skb; -- cgit v1.2.2 From 3041f5170751e3522aa1bd6e8ca5d98e846720b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Mar 2010 19:09:08 +0000 Subject: net: Fix dev_mc_add() Commit 6e17d45a (net: add addr len check to dev_mc_add) added a bug in dev_mc_add(), since it can now exit with a lock imbalance. Signed-off-by: Eric Dumazet CC: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev_mcast.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index fd91569e2394..3dc295beb483 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -97,8 +97,9 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) netif_addr_lock_bh(dev); if (alen != dev->addr_len) - return -EINVAL; - err = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl); + err = -EINVAL; + else + err = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); -- cgit v1.2.2 From 21edbb223ed2af88b090e7945af7d91d672e3aa6 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 16 Mar 2010 05:29:54 +0000 Subject: NET: netpoll, fix potential NULL ptr dereference Stanse found that one error path in netpoll_setup dereferences npinfo even though it is NULL. Avoid that by adding new label and go to that instead. Signed-off-by: Jiri Slaby Cc: Daniel Borkmann Cc: David S. Miller Acked-by: chavey@google.com Acked-by: Matt Mackall Signed-off-by: David S. Miller --- net/core/netpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 7aa697253765..d4ec38fa64e6 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -735,7 +735,7 @@ int netpoll_setup(struct netpoll *np) npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; - goto release; + goto put; } npinfo->rx_flags = 0; @@ -845,7 +845,7 @@ int netpoll_setup(struct netpoll *np) kfree(npinfo); } - +put: dev_put(ndev); return err; } -- cgit v1.2.2 From 0641e4fbf2f824faee00ea74c459a088d94905fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Mar 2010 21:16:45 -0700 Subject: net: Potential null skb->dev dereference When doing "ifenslave -d bond0 eth0", there is chance to get NULL dereference in netif_receive_skb(), because dev->master suddenly becomes NULL after we tested it. We should use ACCESS_ONCE() to avoid this (or rcu_dereference()) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bcc490cc9452..59d4394d2ce8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2483,6 +2483,7 @@ int netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; struct net_device *orig_dev; + struct net_device *master; struct net_device *null_or_orig; struct net_device *null_or_bond; int ret = NET_RX_DROP; @@ -2503,11 +2504,12 @@ int netif_receive_skb(struct sk_buff *skb) null_or_orig = NULL; orig_dev = skb->dev; - if (orig_dev->master) { - if (skb_bond_should_drop(skb)) + master = ACCESS_ONCE(orig_dev->master); + if (master) { + if (skb_bond_should_drop(skb, master)) null_or_orig = orig_dev; /* deliver only exact match */ else - skb->dev = orig_dev->master; + skb->dev = master; } __get_cpu_var(netdev_rx_stat).total++; -- cgit v1.2.2 From 5fc05f8764f301138003ff562a31ad3721f1675f Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Sun, 21 Mar 2010 22:59:58 +0000 Subject: netpoll: warn when there are spaces in parameters v2: update according to Frans' comments. Currently, if we leave spaces before dst port, netconsole will silently accept it as 0. Warn about this. Also, when spaces appear in other places, make them visible in error messages. Signed-off-by: WANG Cong Cc: David Miller Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netpoll.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index d4ec38fa64e6..6f9206b36dc2 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -614,7 +614,7 @@ void netpoll_print_options(struct netpoll *np) np->name, np->local_port); printk(KERN_INFO "%s: local IP %pI4\n", np->name, &np->local_ip); - printk(KERN_INFO "%s: interface %s\n", + printk(KERN_INFO "%s: interface '%s'\n", np->name, np->dev_name); printk(KERN_INFO "%s: remote port %d\n", np->name, np->remote_port); @@ -661,6 +661,9 @@ int netpoll_parse_options(struct netpoll *np, char *opt) if ((delim = strchr(cur, '@')) == NULL) goto parse_failed; *delim = 0; + if (*cur == ' ' || *cur == '\t') + printk(KERN_INFO "%s: warning: whitespace" + "is not allowed\n", np->name); np->remote_port = simple_strtol(cur, NULL, 10); cur = delim; } @@ -708,7 +711,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) return 0; parse_failed: - printk(KERN_INFO "%s: couldn't parse config at %s!\n", + printk(KERN_INFO "%s: couldn't parse config at '%s'!\n", np->name, cur); return -1; } -- cgit v1.2.2 From 5a0e3ad6af8660be21ca98a971cd00f331318c05 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Mar 2010 17:04:11 +0900 Subject: include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo Guess-its-ok-by: Christoph Lameter Cc: Ingo Molnar Cc: Lee Schermerhorn --- net/core/datagram.c | 1 + net/core/dev.c | 1 + net/core/drop_monitor.c | 1 + net/core/dst.c | 1 + net/core/ethtool.c | 1 + net/core/fib_rules.c | 1 + net/core/filter.c | 1 + net/core/gen_estimator.c | 1 + net/core/iovec.c | 1 - net/core/link_watch.c | 1 - net/core/neighbour.c | 1 + net/core/net-sysfs.c | 1 + net/core/net-traces.c | 1 + net/core/netpoll.c | 1 + net/core/scm.c | 1 + net/core/sysctl_net_core.c | 1 + 16 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 95c2e0840d0d..2dccd4ee591b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include diff --git a/net/core/dev.c b/net/core/dev.c index 59d4394d2ce8..1c8a0ce473a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include #include diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index f8c874975350..cf208d8042b1 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/net/core/dst.c b/net/core/dst.c index cb1b3488b739..f307bc18f6a0 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f4cb6b6299d9..9d55c57f318a 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 9a24377146bf..d2c3e7dc2e5f 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/net/core/filter.c b/net/core/filter.c index d38ef7fd50f0..ff943bed21af 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 493775f4f2f1..cf8e70392fe0 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include diff --git a/net/core/iovec.c b/net/core/iovec.c index 16ad45d4882b..1e7f4e91a935 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 5910b555a54a..bdbce2f5875b 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6cee6434da67..bff37908bd55 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -15,6 +15,7 @@ * Harald Welte Add neighbour cache statistics like rtstat */ +#include #include #include #include diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 099c753c4213..59cfc7d8fc45 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/net/core/net-traces.c b/net/core/net-traces.c index f1e982c508bb..afa6380ed88a 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 6f9206b36dc2..a58f59b97597 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/net/core/scm.c b/net/core/scm.c index 9b264634acfd..b88f6f9d0b97 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 06124872af5b..b7b6b8208f75 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include -- cgit v1.2.2 From 8728c544a9cbdcb0034aa5c45706c5f953f030ee Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 11 Apr 2010 21:18:17 +0000 Subject: net: dev_pick_tx() fix When dev_pick_tx() caches tx queue_index on a socket, we must check socket dst_entry matches skb one, or risk a crash later, as reported by Denys Fedorysychenko, if old packets are in flight during a route change, involving devices with different number of queues. Bug introduced by commit a4ee3ce3 (net: Use sk_tx_queue_mapping for connected sockets) Reported-by: Denys Fedorysychenko Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1c8a0ce473a8..92584bfef09b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1989,8 +1989,12 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, if (dev->real_num_tx_queues > 1) queue_index = skb_tx_hash(dev, skb); - if (sk && sk->sk_dst_cache) - sk_tx_queue_set(sk, queue_index); + if (sk) { + struct dst_entry *dst = rcu_dereference(sk->sk_dst_cache); + + if (dst && skb_dst(skb) == dst) + sk_tx_queue_set(sk, queue_index); + } } } -- cgit v1.2.2 From 05d17608a69b3ae653ea5c9857283bef3439c733 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 20 Apr 2010 00:25:58 +0000 Subject: net: Fix an RCU warning in dev_pick_tx() Fix the following RCU warning in dev_pick_tx(): =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- net/core/dev.c:1993 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 2 locks held by swapper/0: #0: (&idev->mc_ifc_timer){+.-...}, at: [] run_timer_softirq+0x17b/0x278 #1: (rcu_read_lock_bh){.+....}, at: [] dev_queue_xmit+0x14e/0x4dc stack backtrace: Pid: 0, comm: swapper Not tainted 2.6.34-rc5-cachefs #4 Call Trace: [] lockdep_rcu_dereference+0xaa/0xb2 [] dev_queue_xmit+0x259/0x4dc [] ? dev_queue_xmit+0x14e/0x4dc [] ? trace_hardirqs_on+0xd/0xf [] ? local_bh_enable_ip+0xbc/0xc1 [] neigh_resolve_output+0x24b/0x27c [] ip6_output_finish+0x7c/0xb4 [] ip6_output2+0x256/0x261 [] ? trace_hardirqs_on+0xd/0xf [] ip6_output+0xbbc/0xbcb [] ? fib6_force_start_gc+0x2b/0x2d [] mld_sendpack+0x273/0x39d [] ? mld_sendpack+0x0/0x39d [] ? mark_held_locks+0x52/0x70 [] mld_ifc_timer_expire+0x24f/0x288 [] run_timer_softirq+0x1ec/0x278 [] ? run_timer_softirq+0x17b/0x278 [] ? mld_ifc_timer_expire+0x0/0x288 [] ? __do_softirq+0x69/0x140 [] __do_softirq+0xa2/0x140 [] call_softirq+0x1c/0x28 [] do_softirq+0x38/0x80 [] irq_exit+0x45/0x47 [] smp_apic_timer_interrupt+0x88/0x96 [] apic_timer_interrupt+0x13/0x20 [] ? __atomic_notifier_call_chain+0x0/0x86 [] ? mwait_idle+0x6e/0x78 [] ? mwait_idle+0x65/0x78 [] cpu_idle+0x4d/0x83 [] rest_init+0xb9/0xc0 [] ? rest_init+0x0/0xc0 [] start_kernel+0x392/0x39d [] x86_64_start_reservations+0xb3/0xb7 [] x86_64_start_kernel+0xe4/0xeb An rcu_dereference() should be an rcu_dereference_bh(). Signed-off-by: David Howells Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 92584bfef09b..f769098774b7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1990,7 +1990,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, queue_index = skb_tx_hash(dev, skb); if (sk) { - struct dst_entry *dst = rcu_dereference(sk->sk_dst_cache); + struct dst_entry *dst = rcu_dereference_bh(sk->sk_dst_cache); if (dst && skb_dst(skb) == dst) sk_tx_queue_set(sk, queue_index); -- cgit v1.2.2 From 80032cffb95edff4fc216b1cb21682257be326b7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 21 Apr 2010 23:53:27 +0000 Subject: rtnetlink: potential ERR_PTR dereference In the original code, if rtnl_create_link() returned an ERR_PTR then that would get passed to rtnl_configure_link() which dereferences it. Signed-off-by: Dan Carpenter Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4568120d8533..fe776c9ddeca 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1270,10 +1270,11 @@ replay: err = ops->newlink(net, dev, tb, data); else err = register_netdevice(dev); - if (err < 0 && !IS_ERR(dev)) { + + if (err < 0 && !IS_ERR(dev)) free_netdev(dev); + if (err < 0) goto out; - } err = rtnl_configure_link(dev, ifm); if (err < 0) -- cgit v1.2.2 From 6ec82562ffc6f297d0de36d65776cff8e5704867 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 May 2010 00:53:53 -0700 Subject: veth: Dont kfree_skb() after dev_forward_skb() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case of congestion, netif_rx() frees the skb, so we must assume dev_forward_skb() also consume skb. Bug introduced by commit 445409602c092 (veth: move loopback logic to common location) We must change dev_forward_skb() to always consume skb, and veth to not double free it. Bug report : http://marc.info/?l=linux-netdev&m=127310770900442&w=3 Reported-by: Martín Ferrari Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f769098774b7..264137fce3a2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1451,7 +1451,7 @@ static inline void net_timestamp(struct sk_buff *skb) * * return values: * NET_RX_SUCCESS (no congestion) - * NET_RX_DROP (packet was dropped) + * NET_RX_DROP (packet was dropped, but freed) * * dev_forward_skb can be used for injecting an skb from the * start_xmit function of one device into the receive queue @@ -1465,12 +1465,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { skb_orphan(skb); - if (!(dev->flags & IFF_UP)) - return NET_RX_DROP; - - if (skb->len > (dev->mtu + dev->hard_header_len)) + if (!(dev->flags & IFF_UP) || + (skb->len > (dev->mtu + dev->hard_header_len))) { + kfree_skb(skb); return NET_RX_DROP; - + } skb_set_dev(skb, dev); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; -- cgit v1.2.2 From c02db8c6290bb992442fec1407643c94cc414375 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Sun, 16 May 2010 01:05:45 -0700 Subject: rtnetlink: make SR-IOV VF interface symmetric Now we have a set of nested attributes: IFLA_VFINFO_LIST (NESTED) IFLA_VF_INFO (NESTED) IFLA_VF_MAC IFLA_VF_VLAN IFLA_VF_TX_RATE This allows a single set to operate on multiple attributes if desired. Among other things, it means a dump can be replayed to set state. The current interface has yet to be released, so this seems like something to consider for 2.6.34. Signed-off-by: Chris Wright Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 159 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 110 insertions(+), 49 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fe776c9ddeca..31e85d327aa2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -602,12 +602,19 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->tx_compressed = b->tx_compressed; }; +/* All VF info */ static inline int rtnl_vfinfo_size(const struct net_device *dev) { - if (dev->dev.parent && dev_is_pci(dev->dev.parent)) - return dev_num_vf(dev->dev.parent) * - sizeof(struct ifla_vf_info); - else + if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { + + int num_vfs = dev_num_vf(dev->dev.parent); + size_t size = nlmsg_total_size(sizeof(struct nlattr)); + size += nlmsg_total_size(num_vfs * sizeof(struct nlattr)); + size += num_vfs * (sizeof(struct ifla_vf_mac) + + sizeof(struct ifla_vf_vlan) + + sizeof(struct ifla_vf_tx_rate)); + return size; + } else return 0; } @@ -629,7 +636,7 @@ static inline size_t if_nlmsg_size(const struct net_device *dev) + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_NUM_VF */ - + nla_total_size(rtnl_vfinfo_size(dev)) /* IFLA_VFINFO */ + + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ + rtnl_link_get_size(dev); /* IFLA_LINKINFO */ } @@ -700,14 +707,37 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { int i; - struct ifla_vf_info ivi; - NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); - for (i = 0; i < dev_num_vf(dev->dev.parent); i++) { + struct nlattr *vfinfo, *vf; + int num_vfs = dev_num_vf(dev->dev.parent); + + NLA_PUT_U32(skb, IFLA_NUM_VF, num_vfs); + vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); + if (!vfinfo) + goto nla_put_failure; + for (i = 0; i < num_vfs; i++) { + struct ifla_vf_info ivi; + struct ifla_vf_mac vf_mac; + struct ifla_vf_vlan vf_vlan; + struct ifla_vf_tx_rate vf_tx_rate; if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) break; - NLA_PUT(skb, IFLA_VFINFO, sizeof(ivi), &ivi); + vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf; + memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + vf_vlan.vlan = ivi.vlan; + vf_vlan.qos = ivi.qos; + vf_tx_rate.rate = ivi.tx_rate; + vf = nla_nest_start(skb, IFLA_VF_INFO); + if (!vf) { + nla_nest_cancel(skb, vfinfo); + goto nla_put_failure; + } + NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac); + NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan); + NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate); + nla_nest_end(skb, vf); } + nla_nest_end(skb, vfinfo); } if (dev->rtnl_link_ops) { if (rtnl_link_fill(skb, dev) < 0) @@ -769,12 +799,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, - [IFLA_VF_MAC] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_vf_mac) }, - [IFLA_VF_VLAN] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_vf_vlan) }, - [IFLA_VF_TX_RATE] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_vf_tx_rate) }, + [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, }; EXPORT_SYMBOL(ifla_policy); @@ -783,6 +808,19 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_DATA] = { .type = NLA_NESTED }, }; +static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = { + [IFLA_VF_INFO] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { + [IFLA_VF_MAC] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_mac) }, + [IFLA_VF_VLAN] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_TX_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_tx_rate) }, +}; + struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { struct net *net; @@ -812,6 +850,52 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return 0; } +static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) +{ + int rem, err = -EINVAL; + struct nlattr *vf; + const struct net_device_ops *ops = dev->netdev_ops; + + nla_for_each_nested(vf, attr, rem) { + switch (nla_type(vf)) { + case IFLA_VF_MAC: { + struct ifla_vf_mac *ivm; + ivm = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_mac) + err = ops->ndo_set_vf_mac(dev, ivm->vf, + ivm->mac); + break; + } + case IFLA_VF_VLAN: { + struct ifla_vf_vlan *ivv; + ivv = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_vlan) + err = ops->ndo_set_vf_vlan(dev, ivv->vf, + ivv->vlan, + ivv->qos); + break; + } + case IFLA_VF_TX_RATE: { + struct ifla_vf_tx_rate *ivt; + ivt = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_tx_rate) + err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, + ivt->rate); + break; + } + default: + err = -EINVAL; + break; + } + if (err) + break; + } + return err; +} + static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, struct nlattr **tb, char *ifname, int modified) { @@ -942,40 +1026,17 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, write_unlock_bh(&dev_base_lock); } - if (tb[IFLA_VF_MAC]) { - struct ifla_vf_mac *ivm; - ivm = nla_data(tb[IFLA_VF_MAC]); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_mac) - err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac); - if (err < 0) - goto errout; - modified = 1; - } - - if (tb[IFLA_VF_VLAN]) { - struct ifla_vf_vlan *ivv; - ivv = nla_data(tb[IFLA_VF_VLAN]); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_vlan) - err = ops->ndo_set_vf_vlan(dev, ivv->vf, - ivv->vlan, - ivv->qos); - if (err < 0) - goto errout; - modified = 1; - } - err = 0; - - if (tb[IFLA_VF_TX_RATE]) { - struct ifla_vf_tx_rate *ivt; - ivt = nla_data(tb[IFLA_VF_TX_RATE]); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_tx_rate) - err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, ivt->rate); - if (err < 0) - goto errout; - modified = 1; + if (tb[IFLA_VFINFO_LIST]) { + struct nlattr *attr; + int rem; + nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { + if (nla_type(attr) != IFLA_VF_INFO) + goto errout; + err = do_setvfinfo(dev, attr); + if (err < 0) + goto errout; + modified = 1; + } } err = 0; -- cgit v1.2.2