From bfb564e7391340638afe4ad67744a8f3858e7566 Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Wed, 4 Aug 2010 06:15:52 +0000 Subject: core: Factor out flow calculation from get_rps_cpu Factor out flow calculation code from get_rps_cpu, since other functions can use the same code. Revisions: v2 (Ben): Separate flow calcuation out and use in select queue. v3 (Arnd): Don't re-implement MIN. v4 (Changli): skb->data points to ethernet header in macvtap, and make a fast path. Tested macvtap with this patch. v5 (Changli): - Cache skb->rxhash in skb_get_rxhash - macvtap may not have pow(2) queues, so change code for queue selection. (Arnd): - Use first available queue if all fails. Signed-off-by: Krishna Kumar Signed-off-by: David S. Miller --- net/core/dev.c | 106 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 62 insertions(+), 44 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 1ae65439144..586a11cb439 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2259,69 +2259,41 @@ static inline void ____napi_schedule(struct softnet_data *sd, __raise_softirq_irqoff(NET_RX_SOFTIRQ); } -#ifdef CONFIG_RPS - -/* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; -EXPORT_SYMBOL(rps_sock_flow_table); - /* - * get_rps_cpu is called from netif_receive_skb and returns the target - * CPU from the RPS map of the receiving queue for a given skb. - * rcu_read_lock must be held on entry. + * __skb_get_rxhash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers. Returns a non-zero hash number on success + * and 0 on failure. */ -static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, - struct rps_dev_flow **rflowp) +__u32 __skb_get_rxhash(struct sk_buff *skb) { + int nhoff, hash = 0; struct ipv6hdr *ip6; struct iphdr *ip; - struct netdev_rx_queue *rxqueue; - struct rps_map *map; - struct rps_dev_flow_table *flow_table; - struct rps_sock_flow_table *sock_flow_table; - int cpu = -1; u8 ip_proto; - u16 tcpu; u32 addr1, addr2, ihl; union { u32 v32; u16 v16[2]; } ports; - if (skb_rx_queue_recorded(skb)) { - u16 index = skb_get_rx_queue(skb); - if (unlikely(index >= dev->num_rx_queues)) { - WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " - "on queue %u, but number of RX queues is %u\n", - dev->name, index, dev->num_rx_queues); - goto done; - } - rxqueue = dev->_rx + index; - } else - rxqueue = dev->_rx; - - if (!rxqueue->rps_map && !rxqueue->rps_flow_table) - goto done; - - if (skb->rxhash) - goto got_hash; /* Skip hash computation on packet header */ + nhoff = skb_network_offset(skb); switch (skb->protocol) { case __constant_htons(ETH_P_IP): - if (!pskb_may_pull(skb, sizeof(*ip))) + if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) goto done; - ip = (struct iphdr *) skb->data; + ip = (struct iphdr *) skb->data + nhoff; ip_proto = ip->protocol; addr1 = (__force u32) ip->saddr; addr2 = (__force u32) ip->daddr; ihl = ip->ihl; break; case __constant_htons(ETH_P_IPV6): - if (!pskb_may_pull(skb, sizeof(*ip6))) + if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) goto done; - ip6 = (struct ipv6hdr *) skb->data; + ip6 = (struct ipv6hdr *) skb->data + nhoff; ip_proto = ip6->nexthdr; addr1 = (__force u32) ip6->saddr.s6_addr32[3]; addr2 = (__force u32) ip6->daddr.s6_addr32[3]; @@ -2330,6 +2302,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, default: goto done; } + switch (ip_proto) { case IPPROTO_TCP: case IPPROTO_UDP: @@ -2338,8 +2311,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, case IPPROTO_AH: case IPPROTO_SCTP: case IPPROTO_UDPLITE: - if (pskb_may_pull(skb, (ihl * 4) + 4)) { - ports.v32 = * (__force u32 *) (skb->data + (ihl * 4)); + if (pskb_may_pull(skb, (ihl * 4) + 4 + nhoff)) { + ports.v32 = * (__force u32 *) (skb->data + nhoff + + (ihl * 4)); if (ports.v16[1] < ports.v16[0]) swap(ports.v16[0], ports.v16[1]); break; @@ -2352,11 +2326,55 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* get a consistent hash (same value on both flow directions) */ if (addr2 < addr1) swap(addr1, addr2); - skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd); - if (!skb->rxhash) - skb->rxhash = 1; -got_hash: + hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); + if (!hash) + hash = 1; + +done: + return hash; +} +EXPORT_SYMBOL(__skb_get_rxhash); + +#ifdef CONFIG_RPS + +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; +EXPORT_SYMBOL(rps_sock_flow_table); + +/* + * get_rps_cpu is called from netif_receive_skb and returns the target + * CPU from the RPS map of the receiving queue for a given skb. + * rcu_read_lock must be held on entry. + */ +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow **rflowp) +{ + struct netdev_rx_queue *rxqueue; + struct rps_map *map; + struct rps_dev_flow_table *flow_table; + struct rps_sock_flow_table *sock_flow_table; + int cpu = -1; + u16 tcpu; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->num_rx_queues)) { + WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " + "on queue %u, but number of RX queues is %u\n", + dev->name, index, dev->num_rx_queues); + goto done; + } + rxqueue = dev->_rx + index; + } else + rxqueue = dev->_rx; + + if (!rxqueue->rps_map && !rxqueue->rps_flow_table) + goto done; + + if (!skb_get_rxhash(skb)) + goto done; + flow_table = rcu_dereference(rxqueue->rps_flow_table); sock_flow_table = rcu_dereference(rps_sock_flow_table); if (flow_table && sock_flow_table) { -- cgit v1.2.2 From 2244d07bfa2097cb00600da91c715a8aa547917e Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 17 Aug 2010 08:59:14 +0000 Subject: net: simplify flags for tx timestamping This patch removes the abstraction introduced by the union skb_shared_tx in the shared skb data. The access of the different union elements at several places led to some confusion about accessing the shared tx_flags e.g. in skb_orphan_try(). http://marc.info/?l=linux-netdev&m=128084897415886&w=2 Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 586a11cb439..c1dc8a95f6f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1902,14 +1902,14 @@ static int dev_gso_segment(struct sk_buff *skb) /* * Try to orphan skb early, right before transmission by the device. - * We cannot orphan skb if tx timestamp is requested, since - * drivers need to call skb_tstamp_tx() to send the timestamp. + * We cannot orphan skb if tx timestamp is requested or the sk-reference + * is needed on driver level for other reasons, e.g. see net/can/raw.c */ static inline void skb_orphan_try(struct sk_buff *skb) { struct sock *sk = skb->sk; - if (sk && !skb_tx(skb)->flags) { + if (sk && !skb_shinfo(skb)->tx_flags) { /* skb_tx_hash() wont be able to get sk. * We copy sk_hash into skb->rxhash */ -- cgit v1.2.2 From 2d47b45951af087c1a4439c559309b0bf90a0718 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 17 Aug 2010 19:00:56 +0000 Subject: net: rps: reset network header before calling skb_get_rxhash() skb_get_rxhash() assumes the network header pointer of the skb is set properly after the commit: commit bfb564e7391340638afe4ad67744a8f3858e7566 Author: Krishna Kumar Date: Wed Aug 4 06:15:52 2010 +0000 core: Factor out flow calculation from get_rps_cpu Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index c1dc8a95f6f..cf87fde3a29 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2372,6 +2372,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (!rxqueue->rps_map && !rxqueue->rps_flow_table) goto done; + skb_reset_network_header(skb); if (!skb_get_rxhash(skb)) goto done; -- cgit v1.2.2 From dbe5775bbc00116ed5699babfe17c54f32eb34c3 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 17 Aug 2010 19:01:38 +0000 Subject: net: rps: skip fragment when computing rxhash Fragmented IP packets may have no transfer header, so when computing rxhash, we should skip them. Signed-off-by: Changli Gao Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index cf87fde3a29..7e97e891636 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2284,7 +2284,10 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) goto done; ip = (struct iphdr *) skb->data + nhoff; - ip_proto = ip->protocol; + if (ip->frag_off & htons(IP_MF | IP_OFFSET)) + ip_proto = 0; + else + ip_proto = ip->protocol; addr1 = (__force u32) ip->saddr; addr2 = (__force u32) ip->daddr; ihl = ip->ihl; -- cgit v1.2.2 From 12fcdefb3643607c47f39906a49056cf608bb545 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 17 Aug 2010 19:04:32 +0000 Subject: net: rps: use proto_ports_offset() to handle the AH message correctly The SPI isn't at the beginning of an AH message. Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 7e97e891636..da584f5ab3d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2266,7 +2266,7 @@ static inline void ____napi_schedule(struct softnet_data *sd, */ __u32 __skb_get_rxhash(struct sk_buff *skb) { - int nhoff, hash = 0; + int nhoff, hash = 0, poff; struct ipv6hdr *ip6; struct iphdr *ip; u8 ip_proto; @@ -2306,24 +2306,15 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) goto done; } - switch (ip_proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_DCCP: - case IPPROTO_ESP: - case IPPROTO_AH: - case IPPROTO_SCTP: - case IPPROTO_UDPLITE: - if (pskb_may_pull(skb, (ihl * 4) + 4 + nhoff)) { - ports.v32 = * (__force u32 *) (skb->data + nhoff + - (ihl * 4)); + ports.v32 = 0; + poff = proto_ports_offset(ip_proto); + if (poff >= 0) { + nhoff += ihl * 4 + poff; + if (pskb_may_pull(skb, nhoff + 4)) { + ports.v32 = * (__force u32 *) (skb->data + nhoff); if (ports.v16[1] < ports.v16[0]) swap(ports.v16[0], ports.v16[1]); - break; } - default: - ports.v32 = 0; - break; } /* get a consistent hash (same value on both flow directions) */ -- cgit v1.2.2 From 1003489e06c04d807c783a8958f2ccc9aed7a244 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sat, 21 Aug 2010 06:13:28 +0000 Subject: net: rps: fix the wrong network header pointer __skb_get_rxhash() was broken after the commit: commit bfb564e7391340638afe4ad67744a8f3858e7566 Author: Krishna Kumar Date: Wed Aug 4 06:15:52 2010 +0000 core: Factor out flow calculation from get_rps_cpu Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index da584f5ab3d..4d74d26b8e1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2283,7 +2283,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) goto done; - ip = (struct iphdr *) skb->data + nhoff; + ip = (struct iphdr *) (skb->data + nhoff); if (ip->frag_off & htons(IP_MF | IP_OFFSET)) ip_proto = 0; else @@ -2296,7 +2296,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) goto done; - ip6 = (struct ipv6hdr *) skb->data + nhoff; + ip6 = (struct ipv6hdr *) (skb->data + nhoff); ip_proto = ip6->nexthdr; addr1 = (__force u32) ip6->saddr.s6_addr32[3]; addr2 = (__force u32) ip6->daddr.s6_addr32[3]; -- cgit v1.2.2 From 05532121da0728eaedac2a0a5c3cecad3a95d765 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sun, 22 Aug 2010 21:03:33 -0700 Subject: net: 802.1q: make vlan_hwaccel_do_receive() return void vlan_hwaccel_do_receive() always returns 0, so make it return void. Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 7cd5237d982..d569f88bcf8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2841,8 +2841,8 @@ static int __netif_receive_skb(struct sk_buff *skb) if (!netdev_tstamp_prequeue) net_timestamp_check(skb); - if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) - return NET_RX_SUCCESS; + if (vlan_tx_tag_present(skb)) + vlan_hwaccel_do_receive(skb); /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) -- cgit v1.2.2 From 21dc330157454046dd7c494961277d76e1c957fe Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 23 Aug 2010 00:13:46 -0700 Subject: net: Rename skb_has_frags to skb_has_frag_list SKBs can be "fragmented" in two ways, via a page array (called skb_shinfo(skb)->frags[]) and via a list of SKBs (called skb_shinfo(skb)->frag_list). Since skb_has_frags() tests the latter, it's name is confusing since it sounds more like it's testing the former. Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d569f88bcf8..859e30ff044 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1930,7 +1930,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb, struct net_device *dev) { return skb_is_nonlinear(skb) && - ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) || + ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) || (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)))); } @@ -3090,7 +3090,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) goto normal; - if (skb_is_gso(skb) || skb_has_frags(skb)) + if (skb_is_gso(skb) || skb_has_frag_list(skb)) goto normal; rcu_read_lock(); -- cgit v1.2.2 From 40d0802b3eb47d57e2d57a5244a18cbbe9632e13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Aug 2010 22:03:08 -0700 Subject: gro: __napi_gro_receive() optimizations compare_ether_header() can have a special implementation on 64 bit arches if CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined. __napi_gro_receive() and vlan_gro_common() can avoid a conditional branch to perform device match. On x86_64, __napi_gro_receive() has now 38 instructions instead of 53 As gcc-4.4.3 still choose to not inline it, add inline keyword to this performance critical function. Signed-off-by: Eric Dumazet CC: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 859e30ff044..63bd20a7592 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3169,16 +3169,18 @@ normal: } EXPORT_SYMBOL(dev_gro_receive); -static gro_result_t +static inline gro_result_t __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = - (p->dev == skb->dev) && - !compare_ether_header(skb_mac_header(p), + unsigned long diffs; + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= compare_ether_header(skb_mac_header(p), skb_gro_mac_header(skb)); + NAPI_GRO_CB(p)->same_flow = !diffs; NAPI_GRO_CB(p)->flush = 0; } -- cgit v1.2.2 From 86cac58b71227cc34a3d0e78f19585c0eff49ea3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Aug 2010 18:25:32 +0000 Subject: skge: add GRO support - napi_gro_flush() is exported from net/core/dev.c, to avoid an irq_save/irq_restore in the packet receive path. - use napi_gro_receive() instead of netif_receive_skb() - use napi_gro_flush() before calling __napi_complete() - turn on NETIF_F_GRO by default - Tested on a Marvell 88E8001 Gigabit NIC Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 63bd20a7592..d8c43e73f0b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3063,7 +3063,7 @@ out: return netif_receive_skb(skb); } -static void napi_gro_flush(struct napi_struct *napi) +inline void napi_gro_flush(struct napi_struct *napi) { struct sk_buff *skb, *next; @@ -3076,6 +3076,7 @@ static void napi_gro_flush(struct napi_struct *napi) napi->gro_count = 0; napi->gro_list = NULL; } +EXPORT_SYMBOL(napi_gro_flush); enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { -- cgit v1.2.2 From c07b68e841bd737e2ebeb57268d251c89ccc5010 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Sep 2010 03:53:46 +0000 Subject: net: dev_add_pack() & __dev_remove_pack() changes Add a small helper ptype_head() to get the head to manipulate dev_add_pack() & __dev_remove_pack() can use a spinlock without blocking BH, since softirq use RCU, and these functions are run from process context only. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d8c43e73f0b..efd318db11a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -371,6 +371,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) * --ANK (980803) */ +static inline struct list_head *ptype_head(const struct packet_type *pt) +{ + if (pt->type == htons(ETH_P_ALL)) + return &ptype_all; + else + return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; +} + /** * dev_add_pack - add packet handler * @pt: packet type declaration @@ -386,16 +394,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) void dev_add_pack(struct packet_type *pt) { - int hash; + struct list_head *head = ptype_head(pt); - spin_lock_bh(&ptype_lock); - if (pt->type == htons(ETH_P_ALL)) - list_add_rcu(&pt->list, &ptype_all); - else { - hash = ntohs(pt->type) & PTYPE_HASH_MASK; - list_add_rcu(&pt->list, &ptype_base[hash]); - } - spin_unlock_bh(&ptype_lock); + spin_lock(&ptype_lock); + list_add_rcu(&pt->list, head); + spin_unlock(&ptype_lock); } EXPORT_SYMBOL(dev_add_pack); @@ -414,15 +417,10 @@ EXPORT_SYMBOL(dev_add_pack); */ void __dev_remove_pack(struct packet_type *pt) { - struct list_head *head; + struct list_head *head = ptype_head(pt); struct packet_type *pt1; - spin_lock_bh(&ptype_lock); - - if (pt->type == htons(ETH_P_ALL)) - head = &ptype_all; - else - head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; + spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { if (pt == pt1) { @@ -433,7 +431,7 @@ void __dev_remove_pack(struct packet_type *pt) printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); out: - spin_unlock_bh(&ptype_lock); + spin_unlock(&ptype_lock); } EXPORT_SYMBOL(__dev_remove_pack); -- cgit v1.2.2 From 6febfca98f25c7ee5c3ff7fc85e048bf82230ad5 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Fri, 3 Sep 2010 23:12:37 +0000 Subject: net: rps: add the shortcut for one rps_cpus When there is only one rps_cpus, skb_get_rxhash() can be eliminated. Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index efd318db11a..cdbbea39c54 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2343,7 +2343,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { struct netdev_rx_queue *rxqueue; - struct rps_map *map; + struct rps_map *map = NULL; struct rps_dev_flow_table *flow_table; struct rps_sock_flow_table *sock_flow_table; int cpu = -1; @@ -2361,8 +2361,17 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } else rxqueue = dev->_rx; - if (!rxqueue->rps_map && !rxqueue->rps_flow_table) + if (rxqueue->rps_map) { + map = rcu_dereference(rxqueue->rps_map); + if (map && map->len == 1) { + tcpu = map->cpus[0]; + if (cpu_online(tcpu)) + cpu = tcpu; + goto done; + } + } else if (!rxqueue->rps_flow_table) { goto done; + } skb_reset_network_header(skb); if (!skb_get_rxhash(skb)) @@ -2407,7 +2416,6 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } } - map = rcu_dereference(rxqueue->rps_map); if (map) { tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; -- cgit v1.2.2 From 95ae6b228f814fc0528d0506ee9f18ac333d6851 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Sep 2010 04:04:31 +0000 Subject: ipv4: ip_ptr cleanups dev->ip_ptr is protected by rtnl and rcu. Yet some places dont use appropriate primitives and/or locking rules. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index fc2dc933bee..5bdce97b817 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5286,7 +5286,7 @@ void netdev_run_todo(void) /* paranoia */ BUG_ON(atomic_read(&dev->refcnt)); - WARN_ON(dev->ip_ptr); + WARN_ON(rcu_dereference_raw(dev->ip_ptr)); WARN_ON(dev->ip6_ptr); WARN_ON(dev->dn_ptr); -- cgit v1.2.2 From 16c3ea785fe4a383c6675dfe7316f3c815755bdd Mon Sep 17 00:00:00 2001 From: Brandon Philips Date: Wed, 15 Sep 2010 09:24:24 +0000 Subject: net: enable GRO by default for vlan devices Currently vlan devices don't have GRO by default as none of the Ethernet drivers add NETIF_F_GRO to their vlan_features. As GRO is a software feature add GRO to dev->vlan_features in register_netdevice() and let vlan_dev_init() take care that it gets enabled only when dev->features has NETIF_F_GRO too. Signed-off-by: Brandon Philips Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 5bdce97b817..09b3742c4c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5057,6 +5057,11 @@ int register_netdevice(struct net_device *dev) if (dev->features & NETIF_F_SG) dev->features |= NETIF_F_GSO; + /* Enable GRO for vlans by default if dev->features has GRO also. + * vlan_dev_init() will do the dev->features check. + */ + dev->vlan_features |= NETIF_F_GRO; + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) -- cgit v1.2.2 From caeda9b926c608702c99f3432aae2c24298c3c1d Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 16 Sep 2010 21:39:16 -0700 Subject: net: include inetdevice.h for rcu_dereference_raw api change rcu_dereference_raw() now needs to know the type of its argument. Signed-off-by: Stephen Rothwell Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 09b3742c4c8..c09ff096525 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -129,6 +129,7 @@ #include #include #include +#include #include "net-sysfs.h" -- cgit v1.2.2 From 3b27e105550f7c4a79ecb6d6a9c49c651c59ae9b Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Fri, 17 Sep 2010 03:22:19 +0000 Subject: netns: keep vlan slaves on master netns move previously, if a vlan master device was moved from one network namespace to another, all 802.1q and macvlan slaves were deleted. we can use dev->reg_state to figure out whether dev_change_net_namespace is happening, since that won't set dev->reg_state NETREG_UNREGISTERING. so, this changes 8021q and macvlan to ignore NETDEV_UNREGISTER when reg_state is not NETREG_UNREGISTERING. Signed-off-by: David Lamparter Reviewed-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index c09ff096525..2c7934f8cf3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5686,6 +5686,10 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Notify protocols, that we are about to destroy this device. They should clean all the things. + + Note that dev->reg_state stays at NETREG_REGISTERED. + This is wanted because this way 8021q and macvlan know + the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); -- cgit v1.2.2 From c5256c51232d8312755e8de2b514c426b19b101a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Sep 2010 00:46:11 +0000 Subject: net: propagate NETIF_F_HIGHDMA to vlans Automatically allows vlans to get NETIF_F_HIGHDMA if underlying device supports it. On 32bit arches (and more precisely if CONFIG_HIGHMEM is enabled), it can help to reduce cost of illegal_highdma() and __skb_linearize() calls. Tested on tg3 , bnx2, bonding, this worked very well. This is a generalization of a patch provided by Yi Zou & Jeff Kirsher. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 2c7934f8cf3..e0c0b86f57a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5058,10 +5058,11 @@ int register_netdevice(struct net_device *dev) if (dev->features & NETIF_F_SG) dev->features |= NETIF_F_GSO; - /* Enable GRO for vlans by default if dev->features has GRO also. - * vlan_dev_init() will do the dev->features check. + /* Enable GRO and NETIF_F_HIGHDMA for vlans by default, + * vlan_dev_init() will do the dev->features check, so these features + * are enabled only if supported by underlying device. */ - dev->vlan_features |= NETIF_F_GRO; + dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA); ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); -- cgit v1.2.2 From 1b4bf461f05d56ced6d6b8f3b4831adc7076f565 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Sep 2010 17:26:35 +0000 Subject: rps: allocate rx queues in register_netdevice only Instead of having two places were we allocate dev->_rx, introduce netif_alloc_rx_queues() helper and call it only from register_netdevice(), not from alloc_netdev_mq() Goal is to let drivers change dev->num_rx_queues after allocating netdev and before registering it. This also removes a lot of ifdefs in net/core/dev.c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 76 +++++++++++++++++++++++++--------------------------------- 1 file changed, 32 insertions(+), 44 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index e0c0b86f57a..72e99835e5b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4964,6 +4964,34 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); +static int netif_alloc_rx_queues(struct net_device *dev) +{ +#ifdef CONFIG_RPS + unsigned int i, count = dev->num_rx_queues; + + if (count) { + struct netdev_rx_queue *rx; + + rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + if (!rx) { + pr_err("netdev: Unable to allocate %u rx queues.\n", + count); + return -ENOMEM; + } + dev->_rx = rx; + atomic_set(&rx->count, count); + + /* + * Set a pointer to first element in the array which holds the + * reference count. + */ + for (i = 0; i < count; i++) + rx[i].first = rx; + } +#endif + return 0; +} + /** * register_netdevice - register a network device * @dev: device to register @@ -5001,24 +5029,10 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; -#ifdef CONFIG_RPS - if (!dev->num_rx_queues) { - /* - * Allocate a single RX queue if driver never called - * alloc_netdev_mq - */ - - dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!dev->_rx) { - ret = -ENOMEM; - goto out; - } + ret = netif_alloc_rx_queues(dev); + if (ret) + goto out; - dev->_rx->first = dev->_rx; - atomic_set(&dev->_rx->count, 1); - dev->num_rx_queues = 1; - } -#endif /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -5415,10 +5429,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, struct net_device *dev; size_t alloc_size; struct net_device *p; -#ifdef CONFIG_RPS - struct netdev_rx_queue *rx; - int i; -#endif BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -5444,29 +5454,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, goto free_p; } -#ifdef CONFIG_RPS - rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!rx) { - printk(KERN_ERR "alloc_netdev: Unable to allocate " - "rx queues.\n"); - goto free_tx; - } - - atomic_set(&rx->count, queue_count); - - /* - * Set a pointer to first element in the array which holds the - * reference count. - */ - for (i = 0; i < queue_count; i++) - rx[i].first = rx; -#endif dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; if (dev_addr_init(dev)) - goto free_rx; + goto free_tx; dev_mc_init(dev); dev_uc_init(dev); @@ -5478,7 +5471,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->real_num_tx_queues = queue_count; #ifdef CONFIG_RPS - dev->_rx = rx; dev->num_rx_queues = queue_count; #endif @@ -5496,11 +5488,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, strcpy(dev->name, name); return dev; -free_rx: -#ifdef CONFIG_RPS - kfree(rx); free_tx: -#endif kfree(tx); free_p: kfree(p); -- cgit v1.2.2 From 62fe0b40abb3484413800edaef9b087a20059acf Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 27 Sep 2010 08:24:33 +0000 Subject: net: Allow changing number of RX queues after device allocation For RPS, we create a kobject for each RX queue based on the number of queues passed to alloc_netdev_mq(). However, drivers generally do not determine the numbers of hardware queues to use until much later, so this usually represents the maximum number the driver may use and not the actual number in use. For TX queues, drivers can update the actual number using netif_set_real_num_tx_queues(). Add a corresponding function for RX queues, netif_set_real_num_rx_queues(). Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 45 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 42b200fdf12..48ad47f402a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1567,6 +1567,41 @@ void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) } EXPORT_SYMBOL(netif_set_real_num_tx_queues); +#ifdef CONFIG_RPS +/** + * netif_set_real_num_rx_queues - set actual number of RX queues used + * @dev: Network device + * @rxq: Actual number of RX queues + * + * This must be called either with the rtnl_lock held or before + * registration of the net device. Returns 0 on success, or a + * negative error code. If called before registration, it also + * sets the maximum number of queues, and always succeeds. + */ +int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) +{ + int rc; + + if (dev->reg_state == NETREG_REGISTERED) { + ASSERT_RTNL(); + + if (rxq > dev->num_rx_queues) + return -EINVAL; + + rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, + rxq); + if (rc) + return rc; + } else { + dev->num_rx_queues = rxq; + } + + dev->real_num_rx_queues = rxq; + return 0; +} +EXPORT_SYMBOL(netif_set_real_num_rx_queues); +#endif + static inline void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; @@ -2352,10 +2387,11 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); - if (unlikely(index >= dev->num_rx_queues)) { - WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " - "on queue %u, but number of RX queues is %u\n", - dev->name, index, dev->num_rx_queues); + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); goto done; } rxqueue = dev->_rx + index; @@ -5472,6 +5508,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, #ifdef CONFIG_RPS dev->num_rx_queues = queue_count; + dev->real_num_rx_queues = queue_count; #endif dev->gso_max_size = GSO_MAX_SIZE; -- cgit v1.2.2 From 745e20f1b626b1be4b100af5d4bf7b3439392f8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Sep 2010 13:23:09 -0700 Subject: net: add a recursion limit in xmit path As tunnel devices are going to be lockless, we need to make sure a misconfigured machine wont enter an infinite loop. Add a percpu variable, and limit to three the number of stacked xmits. Reported-by: Jesse Gross Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 48ad47f402a..50daccad6a5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2177,6 +2177,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } +static DEFINE_PER_CPU(int, xmit_recursion); +#define RECURSION_LIMIT 3 + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -2242,10 +2245,15 @@ int dev_queue_xmit(struct sk_buff *skb) if (txq->xmit_lock_owner != cpu) { + if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + goto recursion_alert; + HARD_TX_LOCK(dev, txq, cpu); if (!netif_tx_queue_stopped(txq)) { + __this_cpu_inc(xmit_recursion); rc = dev_hard_start_xmit(skb, dev, txq); + __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; @@ -2257,7 +2265,9 @@ int dev_queue_xmit(struct sk_buff *skb) "queue packet!\n", dev->name); } else { /* Recursion is detected! It is possible, - * unfortunately */ + * unfortunately + */ +recursion_alert: if (net_ratelimit()) printk(KERN_CRIT "Dead loop on virtual device " "%s, fix it urgently!\n", dev->name); -- cgit v1.2.2 From bfa5ae63b823f4ffd3483a05f60a93a4a7b7d680 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Sep 2010 05:58:37 +0000 Subject: net: rename netdev rx_queue to ingress_queue There is some confusion with rx_queue name after RPS, and net drivers private rx_queue fields. I suggest to rename "struct net_device"->rx_queue to ingress_queue. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 50daccad6a5..a313bab1b75 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2720,7 +2720,7 @@ static int ing_filter(struct sk_buff *skb) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - rxq = &dev->rx_queue; + rxq = &dev->ingress_queue; q = rxq->qdisc; if (q != &noop_qdisc) { @@ -2737,7 +2737,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - if (skb->dev->rx_queue.qdisc == &noop_qdisc) + if (skb->dev->ingress_queue.qdisc == &noop_qdisc) goto out; if (*pt_prev) { @@ -4940,7 +4940,7 @@ static void __netdev_init_queue_locks_one(struct net_device *dev, static void netdev_init_queue_locks(struct net_device *dev) { netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); - __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); + __netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL); } unsigned long netdev_fix_features(unsigned long features, const char *name) @@ -5452,7 +5452,7 @@ static void netdev_init_one_queue(struct net_device *dev, static void netdev_init_queues(struct net_device *dev) { - netdev_init_one_queue(dev, &dev->rx_queue, NULL); + netdev_init_one_queue(dev, &dev->ingress_queue, NULL); netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); } -- cgit v1.2.2 From 24824a09e35402b8d58dcc5be803a5ad3937bdba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 Oct 2010 06:11:55 +0000 Subject: net: dynamic ingress_queue allocation ingress being not used very much, and net_device->ingress_queue being quite a big object (128 or 256 bytes), use a dynamic allocation if needed (tc qdisc add dev eth0 ingress ...) dev_ingress_queue(dev) helper should be used only with RTNL taken. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a313bab1b75..ce6ad88c980 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2702,11 +2702,10 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); * the ingress scheduler, you just cant add policies on ingress. * */ -static int ing_filter(struct sk_buff *skb) +static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) { struct net_device *dev = skb->dev; u32 ttl = G_TC_RTTL(skb->tc_verd); - struct netdev_queue *rxq; int result = TC_ACT_OK; struct Qdisc *q; @@ -2720,8 +2719,6 @@ static int ing_filter(struct sk_buff *skb) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - rxq = &dev->ingress_queue; - q = rxq->qdisc; if (q != &noop_qdisc) { spin_lock(qdisc_lock(q)); @@ -2737,7 +2734,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - if (skb->dev->ingress_queue.qdisc == &noop_qdisc) + struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); + + if (!rxq || rxq->qdisc == &noop_qdisc) goto out; if (*pt_prev) { @@ -2745,7 +2744,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, *pt_prev = NULL; } - switch (ing_filter(skb)) { + switch (ing_filter(skb, rxq)) { case TC_ACT_SHOT: case TC_ACT_STOLEN: kfree_skb(skb); @@ -4940,7 +4939,6 @@ static void __netdev_init_queue_locks_one(struct net_device *dev, static void netdev_init_queue_locks(struct net_device *dev) { netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); - __netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL); } unsigned long netdev_fix_features(unsigned long features, const char *name) @@ -5452,11 +5450,29 @@ static void netdev_init_one_queue(struct net_device *dev, static void netdev_init_queues(struct net_device *dev) { - netdev_init_one_queue(dev, &dev->ingress_queue, NULL); netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); } +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) +{ + struct netdev_queue *queue = dev_ingress_queue(dev); + +#ifdef CONFIG_NET_CLS_ACT + if (queue) + return queue; + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return NULL; + netdev_init_one_queue(dev, queue, NULL); + __netdev_init_queue_locks_one(dev, queue, NULL); + queue->qdisc = &noop_qdisc; + queue->qdisc_sleeping = &noop_qdisc; + rcu_assign_pointer(dev->ingress_queue, queue); +#endif + return queue; +} + /** * alloc_netdev_mq - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -5559,6 +5575,8 @@ void free_netdev(struct net_device *dev) kfree(dev->_tx); + kfree(rcu_dereference_raw(dev->ingress_queue)); + /* Flush device addresses */ dev_addr_flush(dev); -- cgit v1.2.2 From caf586e5f23cebb2a68cbaf288d59dbbf2d74052 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Sep 2010 21:06:55 +0000 Subject: net: add a core netdev->rx_dropped counter In various situations, a device provides a packet to our stack and we drop it before it enters protocol stack : - softnet backlog full (accounted in /proc/net/softnet_stat) - bad vlan tag (not accounted) - unknown/unregistered protocol (not accounted) We can handle a per-device counter of such dropped frames at core level, and automatically adds it to the device provided stats (rx_dropped), so that standard tools can be used (ifconfig, ip link, cat /proc/net/dev) This is a generalization of commit 8990f468a (net: rx_dropped accounting), thus reverting it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index ce6ad88c980..7d149550e8d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1483,8 +1483,9 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb_orphan(skb); nf_reset(skb); - if (!(dev->flags & IFF_UP) || - (skb->len > (dev->mtu + dev->hard_header_len))) { + if (unlikely(!(dev->flags & IFF_UP) || + (skb->len > (dev->mtu + dev->hard_header_len)))) { + atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } @@ -2548,6 +2549,7 @@ enqueue: local_irq_restore(flags); + atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } @@ -2995,6 +2997,7 @@ ncls: if (pt_prev) { ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { + atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) @@ -5429,14 +5432,14 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); - return ops->ndo_get_stats64(dev, storage); - } - if (ops->ndo_get_stats) { + ops->ndo_get_stats64(dev, storage); + } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); - return storage; + } else { + netdev_stats_to_stats64(storage, &dev->stats); + dev_txq_stats_fold(dev, storage); } - netdev_stats_to_stats64(storage, &dev->stats); - dev_txq_stats_fold(dev, storage); + storage->rx_dropped += atomic_long_read(&dev->rx_dropped); return storage; } EXPORT_SYMBOL(dev_get_stats); -- cgit v1.2.2 From 3d3211ef5cf7558d289d1a1efbef8c45595639aa Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 6 Oct 2010 23:35:15 -0700 Subject: net: netif_set_real_num_rx_queues may cap num_rx_queues at init time Do not set num_rx_queues in netif_set_real_num_rx_queues() some drivers will increase the real_num_rx_queues later due to a feature changes or available interrupts increasing. By setting num_rx_queues here this ends up creating a cap on the number of rx queues available. For example the ixgbe driver sets the max number of queues it intends to use ever then sets the current number in use with the netif_set_num_{rx|tx}_queues calls. With the current implementation the number of rx queues gets limited so when a feature such as DCB or FCoE is enabled the queues are no longer available. kobjects will only be allocated for real_num_rx_queues so the waste in memory is minimal. Signed-off-by: John Fastabend Signed-off-by: Jeff Kirsher Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 7d149550e8d..fd1b75a47e8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1593,8 +1593,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) rxq); if (rc) return rc; - } else { - dev->num_rx_queues = rxq; } dev->real_num_rx_queues = rxq; -- cgit v1.2.2 From 4e7f79511e7332ae4056eda9156a0299511ea41e Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 8 Oct 2010 10:33:39 -0700 Subject: net: Update kernel-doc for netif_set_real_num_rx_queues() Synchronise the comment with the preceding implementation change. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index fd1b75a47e8..4962c8afd60 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1576,8 +1576,8 @@ EXPORT_SYMBOL(netif_set_real_num_tx_queues); * * This must be called either with the rtnl_lock held or before * registration of the net device. Returns 0 on success, or a - * negative error code. If called before registration, it also - * sets the maximum number of queues, and always succeeds. + * negative error code. If called before registration, it always + * succeeds. */ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { -- cgit v1.2.2 From 4315d834c1496ddca977e9e22002b77c85bfec2c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 7 Oct 2010 10:09:10 +0000 Subject: net: Fix rxq ref counting The rx->count reference is used to track reference counts to the number of rx-queue kobjects created for the device. This patch eliminates initialization of the counter in netif_alloc_rx_queues and instead increments the counter each time a kobject is created. This is now symmetric with the decrement that is done when an object is released. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 4962c8afd60..193eafaabd8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5024,7 +5024,6 @@ static int netif_alloc_rx_queues(struct net_device *dev) return -ENOMEM; } dev->_rx = rx; - atomic_set(&rx->count, count); /* * Set a pointer to first element in the array which holds the -- cgit v1.2.2 From 29b4433d991c88d86ca48a4c1cc33c671475be4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Oct 2010 10:22:12 +0000 Subject: net: percpu net_device refcount We tried very hard to remove all possible dev_hold()/dev_put() pairs in network stack, using RCU conversions. There is still an unavoidable device refcount change for every dst we create/destroy, and this can slow down some workloads (routers or some app servers, mmap af_packet) We can switch to a percpu refcount implementation, now dynamic per_cpu infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes per device. On x86, dev_hold(dev) code : before lock incl 0x280(%ebx) after: movl 0x260(%ebx),%eax incl fs:(%eax) Stress bench : (Sending 160.000.000 UDP frames, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_TRIE) Before: real 1m1.662s user 0m14.373s sys 12m55.960s After: real 0m51.179s user 0m15.329s sys 10m15.942s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 193eafaabd8..04972a4783e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev) */ dev->reg_state = NETREG_DUMMY; - /* initialize the ref count */ - atomic_set(&dev->refcnt, 1); - /* NAPI wants this */ INIT_LIST_HEAD(&dev->napi_list); @@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); + /* Note : We dont allocate pcpu_refcnt for dummy devices, + * because users of this 'device' dont need to change + * its refcount. + */ + return 0; } EXPORT_SYMBOL_GPL(init_dummy_netdev); @@ -5243,6 +5245,16 @@ out: } EXPORT_SYMBOL(register_netdev); +int netdev_refcnt_read(const struct net_device *dev) +{ + int i, refcnt = 0; + + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); + return refcnt; +} +EXPORT_SYMBOL(netdev_refcnt_read); + /* * netdev_wait_allrefs - wait until all references are gone. * @@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev); static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; + int refcnt; linkwatch_forget_dev(dev); rebroadcast_time = warning_time = jiffies; - while (atomic_read(&dev->refcnt) != 0) { + refcnt = netdev_refcnt_read(dev); + + while (refcnt != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); @@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev) msleep(250); + refcnt = netdev_refcnt_read(dev); + if (time_after(jiffies, warning_time + 10 * HZ)) { printk(KERN_EMERG "unregister_netdevice: " "waiting for %s to become free. Usage " "count = %d\n", - dev->name, atomic_read(&dev->refcnt)); + dev->name, refcnt); warning_time = jiffies; } } @@ -5350,7 +5367,7 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(atomic_read(&dev->refcnt)); + BUG_ON(netdev_refcnt_read(dev)); WARN_ON(rcu_dereference_raw(dev->ip_ptr)); WARN_ON(dev->ip6_ptr); WARN_ON(dev->dn_ptr); @@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; - if (dev_addr_init(dev)) + dev->pcpu_refcnt = alloc_percpu(int); + if (!dev->pcpu_refcnt) goto free_tx; + if (dev_addr_init(dev)) + goto free_pcpu; + dev_mc_init(dev); dev_uc_init(dev); @@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, free_tx: kfree(tx); +free_pcpu: + free_percpu(dev->pcpu_refcnt); free_p: kfree(p); return NULL; @@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + free_percpu(dev->pcpu_refcnt); + dev->pcpu_refcnt = NULL; + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); -- cgit v1.2.2 From 55513fb4281464e97aa1ff2b9c906ca5aed917c5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 18 Oct 2010 17:55:58 +0000 Subject: net: fail alloc_netdev_mq if queue count < 1 In alloc_netdev_mq fail if requested queue_count < 1. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 04972a4783e..f44d29ae5d6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5511,6 +5511,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, BUG_ON(strlen(name) >= sizeof(dev->name)); + if (queue_count < 1) { + pr_err("alloc_netdev: Unable to allocate device " + "with zero queues.\n"); + return NULL; + } + alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ -- cgit v1.2.2 From bd25fa7ba59cd26094319dfba0011b48465f7355 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 18 Oct 2010 18:00:16 +0000 Subject: net: cleanups in RX queue allocation Clean up in RX queue allocation. In netif_set_real_num_rx_queues return error on attempt to set zero queues, or requested number is greater than number of allocated queues. In netif_alloc_rx_queues, do BUG_ON if queue_count is zero. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index f44d29ae5d6..d33adecec44 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1583,12 +1583,12 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { int rc; + if (rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); - if (rxq > dev->num_rx_queues) - return -EINVAL; - rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); if (rc) @@ -5013,25 +5013,23 @@ static int netif_alloc_rx_queues(struct net_device *dev) { #ifdef CONFIG_RPS unsigned int i, count = dev->num_rx_queues; + struct netdev_rx_queue *rx; - if (count) { - struct netdev_rx_queue *rx; - - rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!rx) { - pr_err("netdev: Unable to allocate %u rx queues.\n", - count); - return -ENOMEM; - } - dev->_rx = rx; + BUG_ON(count < 1); - /* - * Set a pointer to first element in the array which holds the - * reference count. - */ - for (i = 0; i < count; i++) - rx[i].first = rx; + rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + if (!rx) { + pr_err("netdev: Unable to allocate %u rx queues.\n", count); + return -ENOMEM; } + dev->_rx = rx; + + /* + * Set a pointer to first element in the array which holds the + * reference count. + */ + for (i = 0; i < count; i++) + rx[i].first = rx; #endif return 0; } -- cgit v1.2.2 From e6484930d7c73d324bccda7d43d131088da697b9 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 18 Oct 2010 18:04:39 +0000 Subject: net: allocate tx queues in register_netdevice This patch introduces netif_alloc_netdev_queues which is called from register_device instead of alloc_netdev_mq. This makes TX queue allocation symmetric with RX allocation. Also, queue locks allocation is done in netdev_init_one_queue. Change set_real_num_tx_queues to fail if requested number < 1 or greater than number of allocated queues. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 106 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d33adecec44..4c3ac53e4b1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1553,18 +1553,20 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. */ -void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) +int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { - unsigned int real_num = dev->real_num_tx_queues; + if (txq < 1 || txq > dev->num_tx_queues) + return -EINVAL; - if (unlikely(txq > dev->num_tx_queues)) - ; - else if (txq > real_num) - dev->real_num_tx_queues = txq; - else if (txq < real_num) { - dev->real_num_tx_queues = txq; - qdisc_reset_all_tx_gt(dev, txq); + if (dev->reg_state == NETREG_REGISTERED) { + ASSERT_RTNL(); + + if (txq < dev->real_num_tx_queues) + qdisc_reset_all_tx_gt(dev, txq); } + + dev->real_num_tx_queues = txq; + return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); @@ -4928,20 +4930,6 @@ static void rollback_registered(struct net_device *dev) rollback_registered_many(&single); } -static void __netdev_init_queue_locks_one(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_unused) -{ - spin_lock_init(&dev_queue->_xmit_lock); - netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); - dev_queue->xmit_lock_owner = -1; -} - -static void netdev_init_queue_locks(struct net_device *dev) -{ - netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); -} - unsigned long netdev_fix_features(unsigned long features, const char *name) { /* Fix illegal SG+CSUM combinations. */ @@ -5034,6 +5022,41 @@ static int netif_alloc_rx_queues(struct net_device *dev) return 0; } +static int netif_alloc_netdev_queues(struct net_device *dev) +{ + unsigned int count = dev->num_tx_queues; + struct netdev_queue *tx; + + BUG_ON(count < 1); + + tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); + if (!tx) { + pr_err("netdev: Unable to allocate %u tx queues.\n", + count); + return -ENOMEM; + } + dev->_tx = tx; + return 0; +} + +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue, + void *_unused) +{ + queue->dev = dev; + + /* Initialize queue lock */ + spin_lock_init(&queue->_xmit_lock); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); + queue->xmit_lock_owner = -1; +} + +static void netdev_init_queues(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); + spin_lock_init(&dev->tx_global_lock); +} + /** * register_netdevice - register a network device * @dev: device to register @@ -5067,7 +5090,6 @@ int register_netdevice(struct net_device *dev) spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); - netdev_init_queue_locks(dev); dev->iflink = -1; @@ -5075,6 +5097,12 @@ int register_netdevice(struct net_device *dev) if (ret) goto out; + ret = netif_alloc_netdev_queues(dev); + if (ret) + goto out; + + netdev_init_queues(dev); + /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -5456,19 +5484,6 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } EXPORT_SYMBOL(dev_get_stats); -static void netdev_init_one_queue(struct net_device *dev, - struct netdev_queue *queue, - void *_unused) -{ - queue->dev = dev; -} - -static void netdev_init_queues(struct net_device *dev) -{ - netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); - spin_lock_init(&dev->tx_global_lock); -} - struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); @@ -5480,7 +5495,6 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); - __netdev_init_queue_locks_one(dev, queue, NULL); queue->qdisc = &noop_qdisc; queue->qdisc_sleeping = &noop_qdisc; rcu_assign_pointer(dev->ingress_queue, queue); @@ -5502,7 +5516,6 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, void (*setup)(struct net_device *), unsigned int queue_count) { - struct netdev_queue *tx; struct net_device *dev; size_t alloc_size; struct net_device *p; @@ -5530,20 +5543,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, return NULL; } - tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL); - if (!tx) { - printk(KERN_ERR "alloc_netdev: Unable to allocate " - "tx qdiscs.\n"); - goto free_p; - } - - dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) - goto free_tx; + goto free_p; if (dev_addr_init(dev)) goto free_pcpu; @@ -5553,7 +5558,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->_tx = tx; dev->num_tx_queues = queue_count; dev->real_num_tx_queues = queue_count; @@ -5564,8 +5568,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; - netdev_init_queues(dev); - INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); dev->ethtool_ntuple_list.count = 0; INIT_LIST_HEAD(&dev->napi_list); @@ -5576,8 +5578,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, strcpy(dev->name, name); return dev; -free_tx: - kfree(tx); free_pcpu: free_percpu(dev->pcpu_refcnt); free_p: -- cgit v1.2.2 From 7b9c60903714bf0a19d746b228864bad3497284e Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:04 +0000 Subject: vlan: Enable software emulation for vlan accleration. Currently users of hardware vlan accleration need to know whether the device supports it before generating packets. However, vlan acceleration will soon be available in a more flexible manner so knowing ahead of time becomes much more difficult. This adds a software fallback path for vlan packets on devices without the necessary offloading support, similar to other types of hardware accleration. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/core/dev.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 4c3ac53e4b1..1bfd96b1fbd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1694,7 +1694,12 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol) static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) { - if (can_checksum_protocol(dev->features, skb->protocol)) + int features = dev->features; + + if (vlan_tx_tag_present(skb)) + features &= dev->vlan_features; + + if (can_checksum_protocol(features, skb->protocol)) return true; if (skb->protocol == htons(ETH_P_8021Q)) { @@ -1793,6 +1798,16 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) __be16 type = skb->protocol; int err; + if (type == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veh; + + if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) + return ERR_PTR(-EINVAL); + + veh = (struct vlan_ethhdr *)skb->data; + type = veh->h_vlan_encapsulated_proto; + } + skb_reset_mac_header(skb); skb->mac_len = skb->network_header - skb->mac_header; __skb_pull(skb, skb->mac_len); @@ -1964,9 +1979,14 @@ static inline void skb_orphan_try(struct sk_buff *skb) static inline int skb_needs_linearize(struct sk_buff *skb, struct net_device *dev) { + int features = dev->features; + + if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb)) + features &= dev->vlan_features; + return skb_is_nonlinear(skb) && - ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) || - (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || + ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) || + (skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) || illegal_highdma(dev, skb)))); } @@ -1989,6 +2009,15 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_orphan_try(skb); + if (vlan_tx_tag_present(skb) && + !(dev->features & NETIF_F_HW_VLAN_TX)) { + skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + if (unlikely(!skb)) + goto out; + + skb->vlan_tci = 0; + } + if (netif_needs_gso(dev, skb)) { if (unlikely(dev_gso_segment(skb))) goto out_kfree_skb; @@ -2050,6 +2079,7 @@ out_kfree_gso_skb: skb->destructor = DEV_GSO_CB(skb)->destructor; out_kfree_skb: kfree_skb(skb); +out: return rc; } -- cgit v1.2.2 From 3701e51382a026cba10c60b03efabe534fba4ca4 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:06 +0000 Subject: vlan: Centralize handling of hardware acceleration. Currently each driver that is capable of vlan hardware acceleration must be aware of the vlan groups that are configured and then pass the stripped tag to a specialized receive function. This is different from other types of hardware offload in that it places a significant amount of knowledge in the driver itself rather keeping it in the networking core. This makes vlan offloading function more similarly to other forms of offloading (such as checksum offloading or TSO) by doing the following: * On receive, stripped vlans are passed directly to the network core, without attempting to check for vlan groups or reconstructing the header if no group * vlans are made less special by folding the logic into the main receive routines * On transmit, the device layer will add the vlan header in software if the hardware doesn't support it, instead of spreading that logic out in upper layers, such as bonding. There are a number of advantages to this: * Fixes all bugs with drivers incorrectly dropping vlan headers at once. * Avoids having to disable VLAN acceleration when in promiscuous mode (good for bridging since it always puts devices in promiscuous mode). * Keeps VLAN tag separate until given to ultimate consumer, which avoids needing to do header reconstruction as in tg3 unless absolutely necessary. * Consolidates common code in core networking. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/core/dev.c | 47 +++++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 32 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 1bfd96b1fbd..97fd6bc2004 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2789,33 +2789,6 @@ out: } #endif -/* - * netif_nit_deliver - deliver received packets to network taps - * @skb: buffer - * - * This function is used to deliver incoming packets to network - * taps. It should be used when the normal netif_receive_skb path - * is bypassed, for example because of VLAN acceleration. - */ -void netif_nit_deliver(struct sk_buff *skb) -{ - struct packet_type *ptype; - - if (list_empty(&ptype_all)) - return; - - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - skb->mac_len = skb->network_header - skb->mac_header; - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (!ptype->dev || ptype->dev == skb->dev) - deliver_skb(skb, ptype, skb->dev); - } - rcu_read_unlock(); -} - /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for @@ -2925,9 +2898,6 @@ static int __netif_receive_skb(struct sk_buff *skb) if (!netdev_tstamp_prequeue) net_timestamp_check(skb); - if (vlan_tx_tag_present(skb)) - vlan_hwaccel_do_receive(skb); - /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) return NET_RX_DROP; @@ -2940,8 +2910,7 @@ static int __netif_receive_skb(struct sk_buff *skb) * be delivered to pkt handlers that are exact matches. Also * the deliver_no_wcard flag will be set. If packet handlers * are sensitive to duplicate packets these skbs will need to - * be dropped at the handler. The vlan accel path may have - * already set the deliver_no_wcard flag. + * be dropped at the handler. */ null_or_orig = NULL; orig_dev = skb->dev; @@ -3000,6 +2969,18 @@ ncls: goto out; } + if (vlan_tx_tag_present(skb)) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + if (vlan_hwaccel_do_receive(&skb)) { + ret = __netif_receive_skb(skb); + goto out; + } else if (unlikely(!skb)) + goto out; + } + /* * Make sure frames received on VLAN interfaces stacked on * bonding interfaces still make their way to any base bonding @@ -3264,6 +3245,7 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) unsigned long diffs; diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= compare_ether_header(skb_mac_header(p), skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->same_flow = !diffs; @@ -3323,6 +3305,7 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { __skb_pull(skb, skb_headlen(skb)); skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); + skb->vlan_tci = 0; napi->skb = skb; } -- cgit v1.2.2 From d2ed817766987fd05e69b7da65d4861b38f1aa2a Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 21 Oct 2010 04:06:29 -0700 Subject: net/core: Allow tagged VLAN packets to flow through VETH devices. When there are VLANs on a VETH device, the packets being transmitted through the VETH device may be 4 bytes bigger than MTU. A check in dev_forward_skb did not take this into account and so dropped these packets. This patch is needed at least as far back as 2.6.34.7 and should be considered for -stable. Signed-off-by: Ben Greear Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 660dd41aaaa..8e07109cc0e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1485,7 +1485,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) nf_reset(skb); if (!(dev->flags & IFF_UP) || - (skb->len > (dev->mtu + dev->hard_header_len))) { + (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN))) { kfree_skb(skb); return NET_RX_DROP; } -- cgit v1.2.2 From d0c2b0d265a0f1f92922a99a31def9da582197ac Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 19 Oct 2010 07:12:10 +0000 Subject: napi: unexport napi_reuse_skb The function napi_reuse_skb is only used inside core. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 97fd6bc2004..b2269ac04cb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3301,7 +3301,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) } EXPORT_SYMBOL(napi_gro_receive); -void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { __skb_pull(skb, skb_headlen(skb)); skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); @@ -3309,7 +3309,6 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) napi->skb = skb; } -EXPORT_SYMBOL(napi_reuse_skb); struct sk_buff *napi_get_frags(struct napi_struct *napi) { -- cgit v1.2.2 From 11a766ce915fc9f8663714eac6d59239388534ea Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 25 Oct 2010 12:51:55 -0700 Subject: net: Increase xmit RECURSION_LIMIT to 10. Three is definitely too low, and we know from reports that GRE tunnels stacked as deeply as 37 levels cause stack overflows, so pick some reasonable value between those two. Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 78b5a89b0f4..2c7da3ab468 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2213,7 +2213,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, } static DEFINE_PER_CPU(int, xmit_recursion); -#define RECURSION_LIMIT 3 +#define RECURSION_LIMIT 10 /** * dev_queue_xmit - transmit a buffer -- cgit v1.2.2 From 198caeca3eb4c81bbdbeb34a870868002f89b3d2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 Oct 2010 21:32:05 +0000 Subject: ipv6: ip6_ptr rcu annotations (struct net_device)->ip6_ptr is rcu protected : add __rcu annotation and proper rcu primitives. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 2c7da3ab468..365f7f5077e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5416,7 +5416,7 @@ void netdev_run_todo(void) /* paranoia */ BUG_ON(netdev_refcnt_read(dev)); WARN_ON(rcu_dereference_raw(dev->ip_ptr)); - WARN_ON(dev->ip6_ptr); + WARN_ON(rcu_dereference_raw(dev->ip6_ptr)); WARN_ON(dev->dn_ptr); if (dev->destructor) -- cgit v1.2.2 From 6e3f7faf3e8a3e226b1a6b955aac12abf2f2e1b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 Oct 2010 03:02:02 +0000 Subject: rps: add __rcu annotations Add __rcu annotations to : (struct netdev_rx_queue)->rps_map (struct netdev_rx_queue)->rps_flow_table struct rps_sock_flow_table *rps_sock_flow_table; And use appropriate rcu primitives. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 365f7f5077e..e8a8dc19365 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2413,7 +2413,7 @@ EXPORT_SYMBOL(__skb_get_rxhash); #ifdef CONFIG_RPS /* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; +struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); /* @@ -2425,7 +2425,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { struct netdev_rx_queue *rxqueue; - struct rps_map *map = NULL; + struct rps_map *map; struct rps_dev_flow_table *flow_table; struct rps_sock_flow_table *sock_flow_table; int cpu = -1; @@ -2444,15 +2444,15 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } else rxqueue = dev->_rx; - if (rxqueue->rps_map) { - map = rcu_dereference(rxqueue->rps_map); - if (map && map->len == 1) { + map = rcu_dereference(rxqueue->rps_map); + if (map) { + if (map->len == 1) { tcpu = map->cpus[0]; if (cpu_online(tcpu)) cpu = tcpu; goto done; } - } else if (!rxqueue->rps_flow_table) { + } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) { goto done; } -- cgit v1.2.2 From af1905dbec44445d75851996819ac2203670bd0f Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 22 Oct 2010 04:12:19 +0000 Subject: net: Fix some corner cases in dev_can_checksum() dev_can_checksum() incorrectly returns true in these cases: 1. The skb has both out-of-band and in-band VLAN tags and the device supports checksum offload for the encapsulated protocol but only with one layer of encapsulation. 2. The skb has a VLAN tag and the device supports generic checksumming but not in conjunction with VLAN encapsulation. Rearrange the VLAN tag checks to avoid these. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index e8a8dc19365..3c5fbfbb318 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1696,22 +1696,18 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol) static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) { + __be16 protocol = skb->protocol; int features = dev->features; - if (vlan_tx_tag_present(skb)) + if (vlan_tx_tag_present(skb)) { features &= dev->vlan_features; - - if (can_checksum_protocol(features, skb->protocol)) - return true; - - if (skb->protocol == htons(ETH_P_8021Q)) { + } else if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - if (can_checksum_protocol(dev->features & dev->vlan_features, - veh->h_vlan_encapsulated_proto)) - return true; + protocol = veh->h_vlan_encapsulated_proto; + features &= dev->vlan_features; } - return false; + return can_checksum_protocol(features, protocol); } /** -- cgit v1.2.2 From 66c68bcc489fadd4f5e8839e966e3a366e50d1d5 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 22 Oct 2010 04:38:26 +0000 Subject: net: NETIF_F_HW_CSUM does not imply FCoE CRC offload NETIF_F_HW_CSUM indicates the ability to update an TCP/IP-style 16-bit checksum with the checksum of an arbitrary part of the packet data, whereas the FCoE CRC is something entirely different. Signed-off-by: Ben Hutchings Cc: stable@kernel.org [2.6.32+] Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 3c5fbfbb318..35dfb831848 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1685,10 +1685,10 @@ EXPORT_SYMBOL(netif_device_attach); static bool can_checksum_protocol(unsigned long features, __be16 protocol) { - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_IP_CSUM) && + return ((features & NETIF_F_NO_CSUM) || + ((features & NETIF_F_V4_CSUM) && protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_IPV6_CSUM) && + ((features & NETIF_F_V6_CSUM) && protocol == htons(ETH_P_IPV6)) || ((features & NETIF_F_FCOE_CRC) && protocol == htons(ETH_P_FCOE))); -- cgit v1.2.2