From 58e998c6d23988490162cef0784b19ea274d90bb Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 29 Oct 2010 12:14:55 +0000 Subject: offloading: Force software GSO for multiple vlan tags. We currently use vlan_features to check for TSO support if there is a vlan tag. However, it's quite likely that the NIC is not able to do TSO when there is an arbitrary number of tags. Therefore if there is more than one tag (in-band or out-of-band), fall back to software emulation. Signed-off-by: Jesse Gross CC: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 578debb801f4..6e4cfbc53d4c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2239,6 +2239,8 @@ unsigned long netdev_fix_features(unsigned long features, const char *name); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); +int netif_get_vlan_features(struct sk_buff *skb, struct net_device *dev); + static inline int net_gso_ok(int features, int gso_type) { int feature = gso_type << NETIF_F_GSO_SHIFT; @@ -2254,10 +2256,7 @@ static inline int skb_gso_ok(struct sk_buff *skb, int features) static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) { if (skb_is_gso(skb)) { - int features = dev->features; - - if (skb->protocol == htons(ETH_P_8021Q) || skb->vlan_tci) - features &= dev->vlan_features; + int features = netif_get_vlan_features(skb, dev); return (!skb_gso_ok(skb, features) || unlikely(skb->ip_summed != CHECKSUM_PARTIAL)); -- cgit v1.2.2 From fe8222406c8277a21172479d3a8283d31c209028 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 9 Nov 2010 10:47:38 +0000 Subject: net: Simplify RX queue allocation This patch move RX queue allocation to alloc_netdev_mq and freeing of the queues to free_netdev (symmetric to TX queue allocation). Each kobject RX queue takes a reference to the queue's device so that the device can't be freed before all the kobjects have been released-- this obviates the need for reference counts specific to RX queues. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6e4cfbc53d4c..fccb11f879e5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -592,8 +592,7 @@ struct netdev_rx_queue { struct rps_map __rcu *rps_map; struct rps_dev_flow_table __rcu *rps_flow_table; struct kobject kobj; - struct netdev_rx_queue *first; - atomic_t count; + struct net_device *dev; } ____cacheline_aligned_in_smp; #endif /* CONFIG_RPS */ -- cgit v1.2.2 From 61391cde9eefac5cfcf6d214aa80c77e58b1626b Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 15 Nov 2010 06:38:12 +0000 Subject: netdev: add rcu annotations to receive handler hook Suggested by Eric's bridge RCU changes. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fccb11f879e5..b45c1b8b1d19 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -994,8 +994,8 @@ struct net_device { unsigned int real_num_rx_queues; #endif - rx_handler_func_t *rx_handler; - void *rx_handler_data; + rx_handler_func_t __rcu *rx_handler; + void __rcu *rx_handler_data; struct netdev_queue __rcu *ingress_queue; -- cgit v1.2.2 From 1d24eb4815d1e0e8b451ecc546645f8ef1176d4f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 21 Nov 2010 13:17:27 +0000 Subject: xps: Transmit Packet Steering This patch implements transmit packet steering (XPS) for multiqueue devices. XPS selects a transmit queue during packet transmission based on configuration. This is done by mapping the CPU transmitting the packet to a queue. This is the transmit side analogue to RPS-- where RPS is selecting a CPU based on receive queue, XPS selects a queue based on the CPU (previously there was an XPS patch from Eric Dumazet, but that might more appropriately be called transmit completion steering). Each transmit queue can be associated with a number of CPUs which will use the queue to send packets. This is configured as a CPU mask on a per queue basis in: /sys/class/net/eth/queues/tx-/xps_cpus The mappings are stored per device in an inverted data structure that maps CPUs to queues. In the netdevice structure this is an array of num_possible_cpu structures where each structure holds and array of queue_indexes for queues which that CPU can use. The benefits of XPS are improved locality in the per queue data structures. Also, transmit completions are more likely to be done nearer to the sending thread, so this should promote locality back to the socket on free (e.g. UDP). The benefits of XPS are dependent on cache hierarchy, application load, and other factors. XPS would nominally be configured so that a queue would only be shared by CPUs which are sharing a cache, the degenerative configuration woud be that each CPU has it's own queue. Below are some benchmark results which show the potential benfit of this patch. The netperf test has 500 instances of netperf TCP_RR test with 1 byte req. and resp. bnx2x on 16 core AMD XPS (16 queues, 1 TX queue per CPU) 1234K at 100% CPU No XPS (16 queues) 996K at 100% CPU Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b45c1b8b1d19..badf9285fe0d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -503,6 +503,10 @@ struct netdev_queue { struct Qdisc *qdisc; unsigned long state; struct Qdisc *qdisc_sleeping; +#ifdef CONFIG_RPS + struct kobject kobj; +#endif + /* * write mostly part */ @@ -529,6 +533,30 @@ struct rps_map { }; #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16))) +/* + * This structure holds an XPS map which can be of variable length. The + * map is an array of queues. + */ +struct xps_map { + unsigned int len; + unsigned int alloc_len; + struct rcu_head rcu; + u16 queues[0]; +}; +#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + (_num * sizeof(u16))) +#define XPS_MIN_MAP_ALLOC ((L1_CACHE_BYTES - sizeof(struct xps_map)) \ + / sizeof(u16)) + +/* + * This structure holds all XPS maps for device. Maps are indexed by CPU. + */ +struct xps_dev_maps { + struct rcu_head rcu; + struct xps_map *cpu_map[0]; +}; +#define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) + \ + (nr_cpu_ids * sizeof(struct xps_map *))) + /* * The rps_dev_flow structure contains the mapping of a flow to a CPU and the * tail pointer for that CPU's input queue at the time of last enqueue. @@ -1016,6 +1044,8 @@ struct net_device { unsigned long tx_queue_len; /* Max frames per queue allowed */ spinlock_t tx_global_lock; + struct xps_dev_maps *xps_maps; + /* These may be needed for future network-power-down code. */ /* -- cgit v1.2.2 From 5a0d2268d259886f0c87131639d19eb4a67b4532 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Nov 2010 10:42:02 +0000 Subject: net: add netif_tx_queue_frozen_or_stopped When testing struct netdev_queue state against FROZEN bit, we also test XOFF bit. We can test both bits at once and save some cycles. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index badf9285fe0d..7c6ae2f4b9ab 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -493,6 +493,8 @@ static inline void napi_synchronize(const struct napi_struct *n) enum netdev_queue_state_t { __QUEUE_STATE_XOFF, __QUEUE_STATE_FROZEN, +#define QUEUE_STATE_XOFF_OR_FROZEN ((1 << __QUEUE_STATE_XOFF) | \ + (1 << __QUEUE_STATE_FROZEN)) }; struct netdev_queue { @@ -1629,9 +1631,9 @@ static inline int netif_queue_stopped(const struct net_device *dev) return netif_tx_queue_stopped(netdev_get_tx_queue(dev, 0)); } -static inline int netif_tx_queue_frozen(const struct netdev_queue *dev_queue) +static inline int netif_tx_queue_frozen_or_stopped(const struct netdev_queue *dev_queue) { - return test_bit(__QUEUE_STATE_FROZEN, &dev_queue->state); + return dev_queue->state & QUEUE_STATE_XOFF_OR_FROZEN; } /** -- cgit v1.2.2 From bf26414510103448ad3dc069c7422462f03ea3d7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 26 Nov 2010 08:36:09 +0000 Subject: xps: Add CONFIG_XPS This patch adds XPS_CONFIG option to enable and disable XPS. This is done in the same manner as RPS_CONFIG. This is also fixes build failure in XPS code when SMP is not enabled. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 52 +++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7c6ae2f4b9ab..9ae4544f0cf0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -535,30 +535,6 @@ struct rps_map { }; #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16))) -/* - * This structure holds an XPS map which can be of variable length. The - * map is an array of queues. - */ -struct xps_map { - unsigned int len; - unsigned int alloc_len; - struct rcu_head rcu; - u16 queues[0]; -}; -#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + (_num * sizeof(u16))) -#define XPS_MIN_MAP_ALLOC ((L1_CACHE_BYTES - sizeof(struct xps_map)) \ - / sizeof(u16)) - -/* - * This structure holds all XPS maps for device. Maps are indexed by CPU. - */ -struct xps_dev_maps { - struct rcu_head rcu; - struct xps_map *cpu_map[0]; -}; -#define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) + \ - (nr_cpu_ids * sizeof(struct xps_map *))) - /* * The rps_dev_flow structure contains the mapping of a flow to a CPU and the * tail pointer for that CPU's input queue at the time of last enqueue. @@ -626,6 +602,32 @@ struct netdev_rx_queue { } ____cacheline_aligned_in_smp; #endif /* CONFIG_RPS */ +#ifdef CONFIG_XPS +/* + * This structure holds an XPS map which can be of variable length. The + * map is an array of queues. + */ +struct xps_map { + unsigned int len; + unsigned int alloc_len; + struct rcu_head rcu; + u16 queues[0]; +}; +#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + (_num * sizeof(u16))) +#define XPS_MIN_MAP_ALLOC ((L1_CACHE_BYTES - sizeof(struct xps_map)) \ + / sizeof(u16)) + +/* + * This structure holds all XPS maps for device. Maps are indexed by CPU. + */ +struct xps_dev_maps { + struct rcu_head rcu; + struct xps_map *cpu_map[0]; +}; +#define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) + \ + (nr_cpu_ids * sizeof(struct xps_map *))) +#endif /* CONFIG_XPS */ + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1046,7 +1048,9 @@ struct net_device { unsigned long tx_queue_len; /* Max frames per queue allowed */ spinlock_t tx_global_lock; +#ifdef CONFIG_XPS struct xps_dev_maps *xps_maps; +#endif /* These may be needed for future network-power-down code. */ -- cgit v1.2.2 From a41778694806ac1ccd4b1dafed1abef8d5ba98ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Nov 2010 21:43:02 +0000 Subject: xps: add __rcu annotations Avoid sparse warnings : add __rcu annotations and use rcu_dereference_protected() where necessary. Signed-off-by: Eric Dumazet Cc: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9ae4544f0cf0..4b0c7f3aa32b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -622,7 +622,7 @@ struct xps_map { */ struct xps_dev_maps { struct rcu_head rcu; - struct xps_map *cpu_map[0]; + struct xps_map __rcu *cpu_map[0]; }; #define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) + \ (nr_cpu_ids * sizeof(struct xps_map *))) @@ -1049,7 +1049,7 @@ struct net_device { spinlock_t tx_global_lock; #ifdef CONFIG_XPS - struct xps_dev_maps *xps_maps; + struct xps_dev_maps __rcu *xps_maps; #endif /* These may be needed for future network-power-down code. */ -- cgit v1.2.2 From f2cd2d3e9b3ef960612e362f0ad129d735452df2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Nov 2010 08:14:37 +0000 Subject: net sched: use xps information for qdisc NUMA affinity Allocate qdisc memory according to NUMA properties of cpus included in xps map. To be effective, qdisc should be (re)setup after changes of /sys/class/net/eth/queues/tx-/xps_cpus I added a numa_node field in struct netdev_queue, containing NUMA node if all cpus included in xps_cpus share same node, else -1. Signed-off-by: Eric Dumazet Cc: Ben Hutchings Cc: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4b0c7f3aa32b..a9ac5dc26e3c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -508,7 +508,9 @@ struct netdev_queue { #ifdef CONFIG_RPS struct kobject kobj; #endif - +#if defined(CONFIG_XPS) && defined(CONFIG_NUMA) + int numa_node; +#endif /* * write mostly part */ @@ -523,6 +525,22 @@ struct netdev_queue { u64 tx_dropped; } ____cacheline_aligned_in_smp; +static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) +{ +#if defined(CONFIG_XPS) && defined(CONFIG_NUMA) + return q->numa_node; +#else + return -1; +#endif +} + +static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node) +{ +#if defined(CONFIG_XPS) && defined(CONFIG_NUMA) + q->numa_node = node; +#endif +} + #ifdef CONFIG_RPS /* * This structure holds an RPS map which can be of variable length. The -- cgit v1.2.2 From 941666c2e3e0f9f6a1cb5808d02352d445bd702c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Dec 2010 01:23:53 +0000 Subject: net: RCU conversion of dev_getbyhwaddr() and arp_ioctl() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Le dimanche 05 décembre 2010 à 09:19 +0100, Eric Dumazet a écrit : > Hmm.. > > If somebody can explain why RTNL is held in arp_ioctl() (and therefore > in arp_req_delete()), we might first remove RTNL use in arp_ioctl() so > that your patch can be applied. > > Right now it is not good, because RTNL wont be necessarly held when you > are going to call arp_invalidate() ? While doing this analysis, I found a refcount bug in llc, I'll send a patch for net-2.6 Meanwhile, here is the patch for net-next-2.6 Your patch then can be applied after mine. Thanks [PATCH] net: RCU conversion of dev_getbyhwaddr() and arp_ioctl() dev_getbyhwaddr() was called under RTNL. Rename it to dev_getbyhwaddr_rcu() and change all its caller to now use RCU locking instead of RTNL. Change arp_ioctl() to use RCU instead of RTNL locking. Note: this fix a dev refcount bug in llc Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a9ac5dc26e3c..d31bc3c94717 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1360,7 +1360,8 @@ static inline struct net_device *first_net_device(struct net *net) extern int netdev_boot_setup_check(struct net_device *dev); extern unsigned long netdev_boot_base(const char *prefix, int unit); -extern struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *hwaddr); +extern struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, + const char *hwaddr); extern struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); extern struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type); extern void dev_add_pack(struct packet_type *pt); -- cgit v1.2.2 From a3d22a68d752ccc1a01bb0a64dd70b7a98bf9e23 Mon Sep 17 00:00:00 2001 From: Vladislav Zolotarov Date: Mon, 13 Dec 2010 06:27:10 +0000 Subject: bnx2x: Take the distribution range definition out of skb_tx_hash() Move the calcualation of the Tx hash for a given hash range into a separate function and define the skb_tx_hash(), which calculates a Tx hash for a [0; dev->real_num_tx_queues - 1] hash values range, using this function (__skb_tx_hash()). Signed-off-by: Vladislav Zolotarov Signed-off-by: Eilon Greenstein Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d31bc3c94717..445e6825f8eb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1747,6 +1747,16 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) __netif_schedule(txq->qdisc); } +/* + * Returns a Tx hash for the given packet when dev->real_num_tx_queues is used + * as a distribution range limit for the returned value. + */ +static inline u16 skb_tx_hash(const struct net_device *dev, + const struct sk_buff *skb) +{ + return __skb_tx_hash(dev, skb, dev->real_num_tx_queues); +} + /** * netif_is_multiqueue - test if device has multiple transmit queues * @dev: network device -- cgit v1.2.2 From b236da6931e2482bfe44a7865dd4e7bb036f3496 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 14 Dec 2010 03:09:15 +0000 Subject: net: use NUMA_NO_NODE instead of the magic number -1 Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 445e6825f8eb..cc916c5c3279 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -530,7 +530,7 @@ static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) return q->numa_node; #else - return -1; + return NUMA_NO_NODE; #endif } -- cgit v1.2.2 From 68763c890eb2a60f9b50a061502f94e0cf20fdfe Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Sun, 2 Jan 2011 22:54:09 +0000 Subject: trivial: Fix typo fault in netdevice.h Signed-off-by: Michal Simek Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cc916c5c3279..0f6b1c965815 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -732,7 +732,7 @@ struct xps_dev_maps { * neither operation. * * void (*ndo_vlan_rx_register)(struct net_device *dev, struct vlan_group *grp); - * If device support VLAN receive accleration + * If device support VLAN receive acceleration * (ie. dev->features & NETIF_F_HW_VLAN_RX), then this function is called * when vlan groups for the device changes. Note: grp is NULL * if no vlan's groups are being used. -- cgit v1.2.2