author     Tom Herbert <therbert@google.com>        2010-03-16 04:03:29 -0400
committer  David S. Miller <davem@davemloft.net>    2010-03-17 00:23:18 -0400
commit     0a9627f2649a02bea165cfd529d7bcb625c2fcad (patch)
tree       e5d4424b99208c78e2b2fe6ff5a158fc21bdf782
parent     768bbedf9ca4cc4784eae2003f37abe0818fe0b0 (diff)
rps: Receive Packet Steering
This patch implements software receive side packet steering (RPS).  RPS
distributes the load of received packet processing across multiple CPUs.

Problem statement: Protocol processing done in the NAPI context for received
packets is serialized per device queue and becomes a bottleneck under high
packet load.  This substantially limits the pps that can be achieved on a
single queue NIC and provides no scaling with multiple cores.

This solution queues packets early on in the receive path on the backlog
queues of other CPUs.  This allows protocol processing (e.g. IP and TCP) to
be performed on packets in parallel.  For each device (or each receive queue
in a multi-queue device) a mask of CPUs is set to indicate the CPUs that can
process packets.  A CPU is selected on a per packet basis by hashing the
contents of the packet header (e.g. the TCP or UDP 4-tuple) and using the
result to index into the CPU mask.  The IPI mechanism is used to raise
networking receive softirqs between CPUs.  This effectively emulates in
software what a multi-queue NIC can provide, but is generic, requiring no
device support.

Many devices now provide a hash over the 4-tuple on a per packet basis
(e.g. the Toeplitz hash).  This patch allows drivers to set the HW reported
hash in an skb field, and that value in turn is used to index into the RPS
maps.  Using the HW generated hash can avoid cache misses on the packet when
steering it to a remote CPU.

The CPU mask is set on a per device and per queue basis in the sysfs
variable /sys/class/net/<device>/queues/rx-<n>/rps_cpus.  This is a set of
canonical bit maps for receive queues in the device (numbered by <n>).  If a
device does not support multi-queue, a single variable is used for the
device (rx-0).

Generally, we have found this technique increases the pps capabilities of a
single queue device with good CPU utilization.  Optimal settings for the CPU
mask seem to depend on architecture and cache hierarchy.  Below are some
results running 500 instances of netperf TCP_RR test with 1 byte req. and
resp.  Results show cumulative transaction rate and system CPU utilization.

e1000e on 8 core Intel
   Without RPS: 108K tps at 33% CPU
   With RPS:    311K tps at 64% CPU

forcedeth on 16 core AMD
   Without RPS: 156K tps at 15% CPU
   With RPS:    404K tps at 49% CPU

bnx2x on 16 core AMD
   Without RPS: 567K tps at 61% CPU (4 HW RX queues)
   Without RPS: 738K tps at 96% CPU (8 HW RX queues)
   With RPS:    854K tps at 76% CPU (4 HW RX queues)

Caveats:
- The benefits of this patch are dependent on architecture and cache
  hierarchy.  Tuning the masks to get the best performance is probably
  necessary.
- This patch adds overhead in the path for processing a single packet.  In a
  lightly loaded server this overhead may eliminate the advantages of
  increased parallelism, and possibly cause some relative performance
  degradation.  We have found that masks that are cache aware (share the
  same caches with the interrupting CPU) mitigate much of this.
- The RPS masks can be changed dynamically; however, whenever the mask is
  changed this introduces the possibility of generating out of order
  packets.  It's probably best not to change the masks too frequently.

Signed-off-by: Tom Herbert <therbert@google.com>

 include/linux/netdevice.h |   32 +++-
 include/linux/skbuff.h    |    3 +
 net/core/dev.c            |  335 ++++++++++++++++++++++++++++++++++--------
 net/core/net-sysfs.c      |  225 ++++++++++++++++++++++++++++++-
 net/core/skbuff.c         |    2 +
 5 files changed, 538 insertions(+), 59 deletions(-)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--  include/linux/netdevice.h  |  32
-rw-r--r--  include/linux/skbuff.h     |   3
-rw-r--r--  net/core/dev.c             | 335
-rw-r--r--  net/core/net-sysfs.c       | 225
-rw-r--r--  net/core/skbuff.c          |   2

5 files changed, 538 insertions(+), 59 deletions(-)
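The per-packet CPU selection described in the commit message comes down to scaling a 32-bit
flow hash into an index of the configured CPU list.  The snippet below is a minimal userspace
sketch, not kernel code: struct rps_map_sketch and pick_cpu() are illustrative names, and the
sample hash values stand in for the kernel's jhash_3words() over (saddr, daddr, ports) or a
driver-supplied skb->rxhash.  It only demonstrates the multiply-and-shift indexing that
get_rps_cpu() performs with map->cpus[((u64) skb->rxhash * map->len) >> 32].

/* Minimal userspace sketch of RPS CPU selection (illustrative only). */
#include <stdint.h>
#include <stdio.h>

struct rps_map_sketch {
	unsigned int len;	/* number of entries in cpus[] */
	uint16_t cpus[8];	/* fixed size here; the kernel map is variable length */
};

/* Scale a 32-bit hash into [0, map->len) with a multiply and shift,
 * avoiding a modulo, exactly as get_rps_cpu() indexes map->cpus[]. */
static uint16_t pick_cpu(const struct rps_map_sketch *map, uint32_t hash)
{
	return map->cpus[((uint64_t)hash * map->len) >> 32];
}

int main(void)
{
	/* Pretend rps_cpus was configured to CPUs 0, 2, 4 and 6. */
	struct rps_map_sketch map = { .len = 4, .cpus = { 0, 2, 4, 6 } };
	/* Stand-ins for per-packet flow hashes (jhash_3words or a HW rxhash). */
	uint32_t hashes[] = { 0x12345678, 0x9abcdef0, 0xdeadbeef, 0x00000001 };

	for (unsigned int i = 0; i < 4; i++)
		printf("hash %08x -> cpu %u\n", (unsigned)hashes[i],
		       (unsigned)pick_cpu(&map, hashes[i]));
	return 0;
}

Because all packets of a flow produce the same hash, they keep landing on the same CPU, which
is what preserves per-flow packet ordering as long as the mask is not changed.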
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c79a88be7c33..de1a52bcb9e0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -223,6 +223,7 @@ struct netif_rx_stats {
 	unsigned dropped;
 	unsigned time_squeeze;
 	unsigned cpu_collision;
+	unsigned received_rps;
 };
 
 DECLARE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
@@ -530,6 +531,24 @@ struct netdev_queue {
 	unsigned long		tx_dropped;
 } ____cacheline_aligned_in_smp;
 
+/*
+ * This structure holds an RPS map which can be of variable length.  The
+ * map is an array of CPUs.
+ */
+struct rps_map {
+	unsigned int len;
+	struct rcu_head rcu;
+	u16 cpus[0];
+};
+#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
+
+/* This structure contains an instance of an RX queue. */
+struct netdev_rx_queue {
+	struct rps_map *rps_map;
+	struct kobject kobj;
+	struct netdev_rx_queue *first;
+	atomic_t count;
+} ____cacheline_aligned_in_smp;
 
 /*
  * This structure defines the management hooks for network devices.
@@ -878,6 +897,13 @@ struct net_device {
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
+	struct kset		*queues_kset;
+
+	struct netdev_rx_queue	*_rx;
+
+	/* Number of RX queues allocated at alloc_netdev_mq() time */
+	unsigned int		num_rx_queues;
+
 	struct netdev_queue	rx_queue;
 
 	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
@@ -1311,14 +1337,16 @@ static inline int unregister_gifconf(unsigned int family)
  */
 struct softnet_data {
 	struct Qdisc		*output_queue;
-	struct sk_buff_head	input_pkt_queue;
 	struct list_head	poll_list;
 	struct sk_buff		*completion_queue;
 
+	/* Elements below can be accessed between CPUs for RPS */
+	struct call_single_data	csd ____cacheline_aligned_in_smp;
+	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
 };
 
-DECLARE_PER_CPU(struct softnet_data,softnet_data);
+DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
 #define HAVE_NETIF_QUEUE
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 03f816a9b659..def10b064f29 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -300,6 +300,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@nfct_reasm: netfilter conntrack re-assembly pointer
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
+ *	@rxhash: the packet hash computed on receive
  *	@queue_mapping: Queue mapping for multiqueue devices
  *	@tc_index: Traffic control index
  *	@tc_verd: traffic control verdict
@@ -375,6 +376,8 @@ struct sk_buff {
 #endif
 #endif
 
+	__u32			rxhash;
+
 	kmemcheck_bitfield_begin(flags2);
 	__u16			queue_mapping:16;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
diff --git a/net/core/dev.c b/net/core/dev.c
index bcc490cc9452..17b168671501 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1931,7 +1931,7 @@ out_kfree_skb:
 	return rc;
 }
 
-static u32 skb_tx_hashrnd;
+static u32 hashrnd __read_mostly;
 
 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
 {
@@ -1949,7 +1949,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
 	else
 		hash = skb->protocol;
 
-	hash = jhash_1word(hash, skb_tx_hashrnd);
+	hash = jhash_1word(hash, hashrnd);
 
 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
 }
@@ -1959,10 +1959,9 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
 {
 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
 		if (net_ratelimit()) {
-			WARN(1, "%s selects TX queue %d, but "
-			     "real number of TX queues is %d\n",
-			     dev->name, queue_index,
-			     dev->real_num_tx_queues);
+			netdev_warn(dev, "selects TX queue %d, but "
+				    "real number of TX queues is %d\n",
+				    queue_index, dev->real_num_tx_queues);
 		}
 		return 0;
 	}
@@ -2175,6 +2174,172 @@ int weight_p __read_mostly = 64;            /* old backlog weight */
 
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+{
+	struct ipv6hdr *ip6;
+	struct iphdr *ip;
+	struct netdev_rx_queue *rxqueue;
+	struct rps_map *map;
+	int cpu = -1;
+	u8 ip_proto;
+	u32 addr1, addr2, ports, ihl;
+
+	rcu_read_lock();
+
+	if (skb_rx_queue_recorded(skb)) {
+		u16 index = skb_get_rx_queue(skb);
+		if (unlikely(index >= dev->num_rx_queues)) {
+			if (net_ratelimit()) {
+				netdev_warn(dev, "received packet on queue "
+				    "%u, but number of RX queues is %u\n",
+				    index, dev->num_rx_queues);
+			}
+			goto done;
+		}
+		rxqueue = dev->_rx + index;
+	} else
+		rxqueue = dev->_rx;
+
+	if (!rxqueue->rps_map)
+		goto done;
+
+	if (skb->rxhash)
+		goto got_hash; /* Skip hash computation on packet header */
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		if (!pskb_may_pull(skb, sizeof(*ip)))
+			goto done;
+
+		ip = (struct iphdr *) skb->data;
+		ip_proto = ip->protocol;
+		addr1 = ip->saddr;
+		addr2 = ip->daddr;
+		ihl = ip->ihl;
+		break;
+	case __constant_htons(ETH_P_IPV6):
+		if (!pskb_may_pull(skb, sizeof(*ip6)))
+			goto done;
+
+		ip6 = (struct ipv6hdr *) skb->data;
+		ip_proto = ip6->nexthdr;
+		addr1 = ip6->saddr.s6_addr32[3];
+		addr2 = ip6->daddr.s6_addr32[3];
+		ihl = (40 >> 2);
+		break;
+	default:
+		goto done;
+	}
+	ports = 0;
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE:
+		if (pskb_may_pull(skb, (ihl * 4) + 4))
+			ports = *((u32 *) (skb->data + (ihl * 4)));
+		break;
+
+	default:
+		break;
+	}
+
+	skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
+	if (!skb->rxhash)
+		skb->rxhash = 1;
+
+got_hash:
+	map = rcu_dereference(rxqueue->rps_map);
+	if (map) {
+		u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+
+		if (cpu_online(tcpu)) {
+			cpu = tcpu;
+			goto done;
+		}
+	}
+
+done:
+	rcu_read_unlock();
+	return cpu;
+}
+
+/*
+ * This structure holds the per-CPU mask of CPUs for which IPIs are scheduled
+ * to be sent to kick remote softirq processing.  There are two masks since
+ * the sending of IPIs must be done with interrupts enabled.  The select field
+ * indicates the current mask that enqueue_backlog uses to schedule IPIs.
+ * select is flipped before net_rps_action is called while still under lock,
+ * net_rps_action then uses the non-selected mask to send the IPIs and clears
+ * it without conflicting with enqueue_backlog operation.
+ */
+struct rps_remote_softirq_cpus {
+	cpumask_t mask[2];
+	int select;
+};
+static DEFINE_PER_CPU(struct rps_remote_softirq_cpus, rps_remote_softirq_cpus);
+
+/* Called from hardirq (IPI) context */
+static void trigger_softirq(void *data)
+{
+	struct softnet_data *queue = data;
+	__napi_schedule(&queue->backlog);
+	__get_cpu_var(netdev_rx_stat).received_rps++;
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+{
+	struct softnet_data *queue;
+	unsigned long flags;
+
+	queue = &per_cpu(softnet_data, cpu);
+
+	local_irq_save(flags);
+	__get_cpu_var(netdev_rx_stat).total++;
+
+	spin_lock(&queue->input_pkt_queue.lock);
+	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+		if (queue->input_pkt_queue.qlen) {
+enqueue:
+			__skb_queue_tail(&queue->input_pkt_queue, skb);
+			spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+			    flags);
+			return NET_RX_SUCCESS;
+		}
+
+		/* Schedule NAPI for backlog device */
+		if (napi_schedule_prep(&queue->backlog)) {
+			if (cpu != smp_processor_id()) {
+				struct rps_remote_softirq_cpus *rcpus =
+				    &__get_cpu_var(rps_remote_softirq_cpus);
+
+				cpu_set(cpu, rcpus->mask[rcpus->select]);
+				__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+			} else
+				__napi_schedule(&queue->backlog);
+		}
+		goto enqueue;
+	}
+
+	spin_unlock(&queue->input_pkt_queue.lock);
+
+	__get_cpu_var(netdev_rx_stat).dropped++;
+	local_irq_restore(flags);
+
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
 
 /**
  *	netif_rx	-	post buffer to the network code
@@ -2193,8 +2358,7 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 int netif_rx(struct sk_buff *skb)
 {
-	struct softnet_data *queue;
-	unsigned long flags;
+	int cpu;
 
 	/* if netpoll wants it, pretend we never saw it */
 	if (netpoll_rx(skb))
@@ -2203,31 +2367,11 @@ int netif_rx(struct sk_buff *skb)
 	if (!skb->tstamp.tv64)
 		net_timestamp(skb);
 
-	/*
-	 * The code is rearranged so that the path is the most
-	 * short when CPU is congested, but is still operating.
-	 */
-	local_irq_save(flags);
-	queue = &__get_cpu_var(softnet_data);
-
-	__get_cpu_var(netdev_rx_stat).total++;
-	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
-		if (queue->input_pkt_queue.qlen) {
-enqueue:
-			__skb_queue_tail(&queue->input_pkt_queue, skb);
-			local_irq_restore(flags);
-			return NET_RX_SUCCESS;
-		}
-
-		napi_schedule(&queue->backlog);
-		goto enqueue;
-	}
-
-	__get_cpu_var(netdev_rx_stat).dropped++;
-	local_irq_restore(flags);
+	cpu = get_rps_cpu(skb->dev, skb);
+	if (cpu < 0)
+		cpu = smp_processor_id();
 
-	kfree_skb(skb);
-	return NET_RX_DROP;
+	return enqueue_to_backlog(skb, cpu);
 }
 EXPORT_SYMBOL(netif_rx);
 
@@ -2464,22 +2608,7 @@ void netif_nit_deliver(struct sk_buff *skb)
 	rcu_read_unlock();
 }
 
-/**
- *	netif_receive_skb - process receive buffer from network
- *	@skb: buffer to process
- *
- *	netif_receive_skb() is the main receive data processing function.
- *	It always succeeds. The buffer may be dropped during processing
- *	for congestion control or by the protocol layers.
- *
- *	This function may only be called from softirq context and interrupts
- *	should be enabled.
- *
- *	Return values (usually ignored):
- *	NET_RX_SUCCESS: no congestion
- *	NET_RX_DROP: packet was dropped
- */
-int netif_receive_skb(struct sk_buff *skb)
+int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
 	struct net_device *orig_dev;
@@ -2588,6 +2717,33 @@ out:
 	rcu_read_unlock();
 	return ret;
 }
+
+/**
+ *	netif_receive_skb - process receive buffer from network
+ *	@skb: buffer to process
+ *
+ *	netif_receive_skb() is the main receive data processing function.
+ *	It always succeeds. The buffer may be dropped during processing
+ *	for congestion control or by the protocol layers.
+ *
+ *	This function may only be called from softirq context and interrupts
+ *	should be enabled.
+ *
+ *	Return values (usually ignored):
+ *	NET_RX_SUCCESS: no congestion
+ *	NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb(struct sk_buff *skb)
+{
+	int cpu;
+
+	cpu = get_rps_cpu(skb->dev, skb);
+
+	if (cpu < 0)
+		return __netif_receive_skb(skb);
+	else
+		return enqueue_to_backlog(skb, cpu);
+}
 EXPORT_SYMBOL(netif_receive_skb);
 
 /* Network device is going away, flush any packets still pending */
@@ -2914,16 +3070,16 @@ static int process_backlog(struct napi_struct *napi, int quota)
 	do {
 		struct sk_buff *skb;
 
-		local_irq_disable();
+		spin_lock_irq(&queue->input_pkt_queue.lock);
 		skb = __skb_dequeue(&queue->input_pkt_queue);
 		if (!skb) {
 			__napi_complete(napi);
-			local_irq_enable();
+			spin_unlock_irq(&queue->input_pkt_queue.lock);
 			break;
 		}
-		local_irq_enable();
+		spin_unlock_irq(&queue->input_pkt_queue.lock);
 
-		netif_receive_skb(skb);
+		__netif_receive_skb(skb);
 	} while (++work < quota && jiffies == start_time);
 
 	return work;
@@ -3012,6 +3168,22 @@ void netif_napi_del(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(netif_napi_del);
 
+/*
+ * net_rps_action sends any pending IPI's for rps.  This is only called from
+ * softirq and interrupts must be enabled.
+ */
+static void net_rps_action(cpumask_t *mask)
+{
+	int cpu;
+
+	/* Send pending IPI's to kick RPS processing on remote cpus. */
+	for_each_cpu_mask_nr(cpu, *mask) {
+		struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+		if (cpu_online(cpu))
+			__smp_call_function_single(cpu, &queue->csd, 0);
+	}
+	cpus_clear(*mask);
+}
 
 static void net_rx_action(struct softirq_action *h)
 {
@@ -3019,6 +3191,8 @@ static void net_rx_action(struct softirq_action *h)
 	unsigned long time_limit = jiffies + 2;
 	int budget = netdev_budget;
 	void *have;
+	int select;
+	struct rps_remote_softirq_cpus *rcpus;
 
 	local_irq_disable();
 
@@ -3081,8 +3255,14 @@ static void net_rx_action(struct softirq_action *h)
 		netpoll_poll_unlock(have);
 	}
 out:
+	rcpus = &__get_cpu_var(rps_remote_softirq_cpus);
+	select = rcpus->select;
+	rcpus->select ^= 1;
+
 	local_irq_enable();
 
+	net_rps_action(&rcpus->mask[select]);
+
 #ifdef CONFIG_NET_DMA
 	/*
 	 * There may not be any more sk_buffs coming right now, so push
@@ -3327,10 +3507,10 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct netif_rx_stats *s = v;
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   s->total, s->dropped, s->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   s->cpu_collision);
+		   s->cpu_collision, s->received_rps);
 	return 0;
 }
 
@@ -5067,6 +5247,23 @@ int register_netdevice(struct net_device *dev)
 
 	dev->iflink = -1;
 
+	if (!dev->num_rx_queues) {
+		/*
+		 * Allocate a single RX queue if driver never called
+		 * alloc_netdev_mq
+		 */
+
+		dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
+		if (!dev->_rx) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		dev->_rx->first = dev->_rx;
+		atomic_set(&dev->_rx->count, 1);
+		dev->num_rx_queues = 1;
+	}
+
 	/* Init, if this function is available */
 	if (dev->netdev_ops->ndo_init) {
 		ret = dev->netdev_ops->ndo_init(dev);
@@ -5424,9 +5621,11 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		void (*setup)(struct net_device *), unsigned int queue_count)
 {
 	struct netdev_queue *tx;
+	struct netdev_rx_queue *rx;
 	struct net_device *dev;
 	size_t alloc_size;
 	struct net_device *p;
+	int i;
 
 	BUG_ON(strlen(name) >= sizeof(dev->name));
 
@@ -5452,11 +5651,27 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		goto free_p;
 	}
 
+	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
+	if (!rx) {
+		printk(KERN_ERR "alloc_netdev: Unable to allocate "
+		       "rx queues.\n");
+		goto free_tx;
+	}
+
+	atomic_set(&rx->count, queue_count);
+
+	/*
+	 * Set a pointer to first element in the array which holds the
+	 * reference count.
+	 */
+	for (i = 0; i < queue_count; i++)
+		rx[i].first = rx;
+
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;
 
 	if (dev_addr_init(dev))
-		goto free_tx;
+		goto free_rx;
 
 	dev_unicast_init(dev);
 
@@ -5466,6 +5681,9 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	dev->num_tx_queues = queue_count;
 	dev->real_num_tx_queues = queue_count;
 
+	dev->_rx = rx;
+	dev->num_rx_queues = queue_count;
+
 	dev->gso_max_size = GSO_MAX_SIZE;
 
 	netdev_init_queues(dev);
@@ -5480,9 +5698,10 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	strcpy(dev->name, name);
 	return dev;
 
+free_rx:
+	kfree(rx);
 free_tx:
 	kfree(tx);
-
 free_p:
 	kfree(p);
 	return NULL;
@@ -5985,6 +6204,10 @@ static int __init net_dev_init(void)
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);
 
+		queue->csd.func = trigger_softirq;
+		queue->csd.info = queue;
+		queue->csd.flags = 0;
+
 		queue->backlog.poll = process_backlog;
 		queue->backlog.weight = weight_p;
 		queue->backlog.gro_list = NULL;
@@ -6023,7 +6246,7 @@ subsys_initcall(net_dev_init);
 
 static int __init initialize_hashrnd(void)
 {
-	get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
+	get_random_bytes(&hashrnd, sizeof(hashrnd));
 	return 0;
 }
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 099c753c4213..7a46343d5ae3 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -466,6 +466,216 @@ static struct attribute_group wireless_group = {
 };
 #endif
 
+/*
+ * RX queue sysfs structures and functions.
+ */
+struct rx_queue_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, char *buf);
+	ssize_t (*store)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, const char *buf, size_t len);
+};
+#define to_rx_queue_attr(_attr) container_of(_attr, \
+    struct rx_queue_attribute, attr)
+
+#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)
+
+static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
+				  char *buf)
+{
+	struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+	struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	return attribute->show(queue, attribute, buf);
+}
+
+static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
+				   const char *buf, size_t count)
+{
+	struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+	struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	return attribute->store(queue, attribute, buf, count);
+}
+
+static struct sysfs_ops rx_queue_sysfs_ops = {
+	.show = rx_queue_attr_show,
+	.store = rx_queue_attr_store,
+};
+
+static ssize_t show_rps_map(struct netdev_rx_queue *queue,
+			    struct rx_queue_attribute *attribute, char *buf)
+{
+	struct rps_map *map;
+	cpumask_var_t mask;
+	size_t len = 0;
+	int i;
+
+	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	rcu_read_lock();
+	map = rcu_dereference(queue->rps_map);
+	if (map)
+		for (i = 0; i < map->len; i++)
+			cpumask_set_cpu(map->cpus[i], mask);
+
+	len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
+	if (PAGE_SIZE - len < 3) {
+		rcu_read_unlock();
+		free_cpumask_var(mask);
+		return -EINVAL;
+	}
+	rcu_read_unlock();
+
+	free_cpumask_var(mask);
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+
+static void rps_map_release(struct rcu_head *rcu)
+{
+	struct rps_map *map = container_of(rcu, struct rps_map, rcu);
+
+	kfree(map);
+}
+
+ssize_t store_rps_map(struct netdev_rx_queue *queue,
+		      struct rx_queue_attribute *attribute,
+		      const char *buf, size_t len)
+{
+	struct rps_map *old_map, *map;
+	cpumask_var_t mask;
+	int err, cpu, i;
+	static DEFINE_SPINLOCK(rps_map_lock);
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+	if (err) {
+		free_cpumask_var(mask);
+		return err;
+	}
+
+	map = kzalloc(max_t(unsigned,
+	    RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
+	    GFP_KERNEL);
+	if (!map) {
+		free_cpumask_var(mask);
+		return -ENOMEM;
+	}
+
+	i = 0;
+	for_each_cpu_and(cpu, mask, cpu_online_mask)
+		map->cpus[i++] = cpu;
+
+	if (i)
+		map->len = i;
+	else {
+		kfree(map);
+		map = NULL;
+	}
+
+	spin_lock(&rps_map_lock);
+	old_map = queue->rps_map;
+	rcu_assign_pointer(queue->rps_map, map);
+	spin_unlock(&rps_map_lock);
+
+	if (old_map)
+		call_rcu(&old_map->rcu, rps_map_release);
+
+	free_cpumask_var(mask);
+	return len;
+}
+
+static struct rx_queue_attribute rps_cpus_attribute =
+	__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+
+static struct attribute *rx_queue_default_attrs[] = {
+	&rps_cpus_attribute.attr,
+	NULL
+};
+
+static void rx_queue_release(struct kobject *kobj)
+{
+	struct netdev_rx_queue *queue = to_rx_queue(kobj);
+	struct rps_map *map = queue->rps_map;
+	struct netdev_rx_queue *first = queue->first;
+
+	if (map)
+		call_rcu(&map->rcu, rps_map_release);
+
+	if (atomic_dec_and_test(&first->count))
+		kfree(first);
+}
+
+static struct kobj_type rx_queue_ktype = {
+	.sysfs_ops = &rx_queue_sysfs_ops,
+	.release = rx_queue_release,
+	.default_attrs = rx_queue_default_attrs,
+};
+
+static int rx_queue_add_kobject(struct net_device *net, int index)
+{
+	struct netdev_rx_queue *queue = net->_rx + index;
+	struct kobject *kobj = &queue->kobj;
+	int error = 0;
+
+	kobj->kset = net->queues_kset;
+	error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
+	    "rx-%u", index);
+	if (error) {
+		kobject_put(kobj);
+		return error;
+	}
+
+	kobject_uevent(kobj, KOBJ_ADD);
+
+	return error;
+}
+
+static int rx_queue_register_kobjects(struct net_device *net)
+{
+	int i;
+	int error = 0;
+
+	net->queues_kset = kset_create_and_add("queues",
+	    NULL, &net->dev.kobj);
+	if (!net->queues_kset)
+		return -ENOMEM;
+	for (i = 0; i < net->num_rx_queues; i++) {
+		error = rx_queue_add_kobject(net, i);
+		if (error)
+			break;
+	}
+
+	if (error)
+		while (--i >= 0)
+			kobject_put(&net->_rx[i].kobj);
+
+	return error;
+}
+
+static void rx_queue_remove_kobjects(struct net_device *net)
+{
+	int i;
+
+	for (i = 0; i < net->num_rx_queues; i++)
+		kobject_put(&net->_rx[i].kobj);
+	kset_unregister(net->queues_kset);
+}
+
 #endif /* CONFIG_SYSFS */
 
 #ifdef CONFIG_HOTPLUG
@@ -529,6 +739,8 @@ void netdev_unregister_kobject(struct net_device * net)
 	if (!net_eq(dev_net(net), &init_net))
 		return;
 
+	rx_queue_remove_kobjects(net);
+
 	device_del(dev);
 }
 
@@ -537,6 +749,7 @@ int netdev_register_kobject(struct net_device *net)
 {
 	struct device *dev = &(net->dev);
 	const struct attribute_group **groups = net->sysfs_groups;
+	int error = 0;
 
 	dev->class = &net_class;
 	dev->platform_data = net;
@@ -563,7 +776,17 @@ int netdev_register_kobject(struct net_device *net)
 	if (!net_eq(dev_net(net), &init_net))
 		return 0;
 
-	return device_add(dev);
+	error = device_add(dev);
+	if (error)
+		return error;
+
+	error = rx_queue_register_kobjects(net);
+	if (error) {
+		device_del(dev);
+		return error;
+	}
+
+	return error;
 }
 
 int netdev_class_create_file(struct class_attribute *class_attr)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 93c4e060c91e..bdea0efdf8cb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -534,6 +534,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->network_header	= old->network_header;
 	new->mac_header		= old->mac_header;
 	skb_dst_set(new, dst_clone(skb_dst(old)));
+	new->rxhash		= old->rxhash;
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
 #endif
@@ -581,6 +582,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	C(len);
 	C(data_len);
 	C(mac_len);
+	C(rxhash);
 	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
 	n->cloned = 1;
 	n->nohdr = 0;