-rw-r--r--   include/linux/netdevice.h    32
-rw-r--r--   include/linux/skbuff.h        3
-rw-r--r--   net/core/dev.c              335
-rw-r--r--   net/core/net-sysfs.c        225
-rw-r--r--   net/core/skbuff.c             2
5 files changed, 538 insertions, 59 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c79a88be7c33..de1a52bcb9e0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -223,6 +223,7 @@ struct netif_rx_stats {
 	unsigned dropped;
 	unsigned time_squeeze;
 	unsigned cpu_collision;
+	unsigned received_rps;
 };

 DECLARE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
@@ -530,6 +531,24 @@ struct netdev_queue {
 	unsigned long		tx_dropped;
 } ____cacheline_aligned_in_smp;

+/*
+ * This structure holds an RPS map which can be of variable length.  The
+ * map is an array of CPUs.
+ */
+struct rps_map {
+	unsigned int len;
+	struct rcu_head rcu;
+	u16 cpus[0];
+};
+#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
+
+/* This structure contains an instance of an RX queue. */
+struct netdev_rx_queue {
+	struct rps_map *rps_map;
+	struct kobject kobj;
+	struct netdev_rx_queue *first;
+	atomic_t count;
+} ____cacheline_aligned_in_smp;

 /*
  * This structure defines the management hooks for network devices.
@@ -878,6 +897,13 @@ struct net_device {

 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/

+	struct kset		*queues_kset;
+
+	struct netdev_rx_queue	*_rx;
+
+	/* Number of RX queues allocated at alloc_netdev_mq() time */
+	unsigned int		num_rx_queues;
+
 	struct netdev_queue	rx_queue;

 	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
@@ -1311,14 +1337,16 @@ static inline int unregister_gifconf(unsigned int family)
 	 */
 struct softnet_data {
 	struct Qdisc		*output_queue;
-	struct sk_buff_head	input_pkt_queue;
 	struct list_head	poll_list;
 	struct sk_buff		*completion_queue;

+	/* Elements below can be accessed between CPUs for RPS */
+	struct call_single_data	csd ____cacheline_aligned_in_smp;
+	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
 };

-DECLARE_PER_CPU(struct softnet_data,softnet_data);
+DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);

 #define HAVE_NETIF_QUEUE

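[Note on the rps_map added above: it is a length-prefixed flexible array of CPU ids, sized with RPS_MAP_SIZE() and indexed by scaling a 32-bit flow hash with a multiply-and-shift (((u64) hash * len) >> 32), the same trick skb_tx_hash() already uses. The following standalone sketch, which is not part of the patch, shows that sizing and indexing; the userspace typedef and the example hash value are illustrative assumptions, and the in-kernel struct additionally embeds an rcu_head for deferred freeing.]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint16_t u16;

/* Mirrors the patch: a length-prefixed, variable-length array of CPU ids. */
struct rps_map {
	unsigned int len;
	u16 cpus[];
};
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))

int main(void)
{
	/* Steer flows to CPUs 2 and 3 (an arbitrary example mask). */
	u16 selected[] = { 2, 3 };
	unsigned int n = sizeof(selected) / sizeof(selected[0]);
	struct rps_map *map = malloc(RPS_MAP_SIZE(n));
	unsigned int i;

	if (!map)
		return 1;
	map->len = n;
	for (i = 0; i < n; i++)
		map->cpus[i] = selected[i];

	/* Same scaling used by get_rps_cpu(): hash * len / 2^32. */
	uint32_t rxhash = 0x9e3779b9;	/* example flow hash */
	u16 cpu = map->cpus[((uint64_t)rxhash * map->len) >> 32];
	printf("flow hash %#x -> cpu %u\n", rxhash, cpu);

	free(map);
	return 0;
}
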
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 03f816a9b659..def10b064f29 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -300,6 +300,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@nfct_reasm: netfilter conntrack re-assembly pointer
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
+ *	@rxhash: the packet hash computed on receive
  *	@queue_mapping: Queue mapping for multiqueue devices
  *	@tc_index: Traffic control index
  *	@tc_verd: traffic control verdict
@@ -375,6 +376,8 @@ struct sk_buff {
 #endif
 #endif

+	__u32			rxhash;
+
 	kmemcheck_bitfield_begin(flags2);
 	__u16			queue_mapping:16;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
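[The rxhash field added above caches the receive flow hash in the skb so that get_rps_cpu() in net/core/dev.c only parses the IP and transport headers once per packet; zero means "not computed yet", which is why the patch bumps a computed hash of zero up to one. A small userspace sketch of that convention follows; mix() is an illustrative stand-in, not the kernel's jhash_3words().]

#include <stdint.h>
#include <stdio.h>

/* Stand-in mixer; the patch uses jhash_3words(addr1, addr2, ports, hashrnd). */
static uint32_t mix(uint32_t a, uint32_t b, uint32_t c)
{
	return (a * 2654435761u) ^ (b * 2246822519u) ^ (c * 3266489917u);
}

static uint32_t flow_hash(uint32_t cached, uint32_t a, uint32_t b, uint32_t ports)
{
	if (cached)
		return cached;		/* non-zero means "already computed" */

	uint32_t h = mix(a, b, ports);
	return h ? h : 1;		/* never store 0: it marks "unset" */
}

int main(void)
{
	uint32_t rxhash = 0;	/* as on a freshly received packet */

	rxhash = flow_hash(rxhash, 0x0a000001, 0x0a000002, (5000u << 16) | 80u);
	printf("rxhash = %#x\n", rxhash);
	return 0;
}
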
diff --git a/net/core/dev.c b/net/core/dev.c
index bcc490cc9452..17b168671501 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1931,7 +1931,7 @@ out_kfree_skb:
 	return rc;
 }

-static u32 skb_tx_hashrnd;
+static u32 hashrnd __read_mostly;

 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
 {
@@ -1949,7 +1949,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
 	else
 		hash = skb->protocol;

-	hash = jhash_1word(hash, skb_tx_hashrnd);
+	hash = jhash_1word(hash, hashrnd);

 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
 }
@@ -1959,10 +1959,9 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
 {
 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
 		if (net_ratelimit()) {
-			WARN(1, "%s selects TX queue %d, but "
-			     "real number of TX queues is %d\n",
-			     dev->name, queue_index,
-			     dev->real_num_tx_queues);
+			netdev_warn(dev, "selects TX queue %d, but "
+				    "real number of TX queues is %d\n",
+				    queue_index, dev->real_num_tx_queues);
 		}
 		return 0;
 	}
@@ -2175,6 +2174,172 @@ int weight_p __read_mostly = 64; /* old backlog weight */

 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };

+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+{
+	struct ipv6hdr *ip6;
+	struct iphdr *ip;
+	struct netdev_rx_queue *rxqueue;
+	struct rps_map *map;
+	int cpu = -1;
+	u8 ip_proto;
+	u32 addr1, addr2, ports, ihl;
+
+	rcu_read_lock();
+
+	if (skb_rx_queue_recorded(skb)) {
+		u16 index = skb_get_rx_queue(skb);
+		if (unlikely(index >= dev->num_rx_queues)) {
+			if (net_ratelimit()) {
+				netdev_warn(dev, "received packet on queue "
+					"%u, but number of RX queues is %u\n",
+					index, dev->num_rx_queues);
+			}
+			goto done;
+		}
+		rxqueue = dev->_rx + index;
+	} else
+		rxqueue = dev->_rx;
+
+	if (!rxqueue->rps_map)
+		goto done;
+
+	if (skb->rxhash)
+		goto got_hash; /* Skip hash computation on packet header */
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		if (!pskb_may_pull(skb, sizeof(*ip)))
+			goto done;
+
+		ip = (struct iphdr *) skb->data;
+		ip_proto = ip->protocol;
+		addr1 = ip->saddr;
+		addr2 = ip->daddr;
+		ihl = ip->ihl;
+		break;
+	case __constant_htons(ETH_P_IPV6):
+		if (!pskb_may_pull(skb, sizeof(*ip6)))
+			goto done;
+
+		ip6 = (struct ipv6hdr *) skb->data;
+		ip_proto = ip6->nexthdr;
+		addr1 = ip6->saddr.s6_addr32[3];
+		addr2 = ip6->daddr.s6_addr32[3];
+		ihl = (40 >> 2);
+		break;
+	default:
+		goto done;
+	}
+	ports = 0;
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE:
+		if (pskb_may_pull(skb, (ihl * 4) + 4))
+			ports = *((u32 *) (skb->data + (ihl * 4)));
+		break;
+
+	default:
+		break;
+	}
+
+	skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
+	if (!skb->rxhash)
+		skb->rxhash = 1;
+
+got_hash:
+	map = rcu_dereference(rxqueue->rps_map);
+	if (map) {
+		u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+
+		if (cpu_online(tcpu)) {
+			cpu = tcpu;
+			goto done;
+		}
+	}
+
+done:
+	rcu_read_unlock();
+	return cpu;
+}
+
+/*
+ * This structure holds the per-CPU mask of CPUs for which IPIs are scheduled
+ * to be sent to kick remote softirq processing.  There are two masks since
+ * the sending of IPIs must be done with interrupts enabled.  The select field
+ * indicates the current mask that enqueue_backlog uses to schedule IPIs.
+ * select is flipped before net_rps_action is called while still under lock,
+ * net_rps_action then uses the non-selected mask to send the IPIs and clears
+ * it without conflicting with enqueue_backlog operation.
+ */
+struct rps_remote_softirq_cpus {
+	cpumask_t mask[2];
+	int select;
+};
+static DEFINE_PER_CPU(struct rps_remote_softirq_cpus, rps_remote_softirq_cpus);
+
+/* Called from hardirq (IPI) context */
+static void trigger_softirq(void *data)
+{
+	struct softnet_data *queue = data;
+	__napi_schedule(&queue->backlog);
+	__get_cpu_var(netdev_rx_stat).received_rps++;
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+{
+	struct softnet_data *queue;
+	unsigned long flags;
+
+	queue = &per_cpu(softnet_data, cpu);
+
+	local_irq_save(flags);
+	__get_cpu_var(netdev_rx_stat).total++;
+
+	spin_lock(&queue->input_pkt_queue.lock);
+	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+		if (queue->input_pkt_queue.qlen) {
+enqueue:
+			__skb_queue_tail(&queue->input_pkt_queue, skb);
+			spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+					       flags);
+			return NET_RX_SUCCESS;
+		}
+
+		/* Schedule NAPI for backlog device */
+		if (napi_schedule_prep(&queue->backlog)) {
+			if (cpu != smp_processor_id()) {
+				struct rps_remote_softirq_cpus *rcpus =
+				    &__get_cpu_var(rps_remote_softirq_cpus);
+
+				cpu_set(cpu, rcpus->mask[rcpus->select]);
+				__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+			} else
+				__napi_schedule(&queue->backlog);
+		}
+		goto enqueue;
+	}
+
+	spin_unlock(&queue->input_pkt_queue.lock);
+
+	__get_cpu_var(netdev_rx_stat).dropped++;
+	local_irq_restore(flags);
+
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}

 /**
  *	netif_rx	-	post buffer to the network code
@@ -2193,8 +2358,7 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };

 int netif_rx(struct sk_buff *skb)
 {
-	struct softnet_data *queue;
-	unsigned long flags;
+	int cpu;

 	/* if netpoll wants it, pretend we never saw it */
 	if (netpoll_rx(skb))
@@ -2203,31 +2367,11 @@ int netif_rx(struct sk_buff *skb)
 	if (!skb->tstamp.tv64)
 		net_timestamp(skb);

-	/*
-	 * The code is rearranged so that the path is the most
-	 * short when CPU is congested, but is still operating.
-	 */
-	local_irq_save(flags);
-	queue = &__get_cpu_var(softnet_data);
-
-	__get_cpu_var(netdev_rx_stat).total++;
-	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
-		if (queue->input_pkt_queue.qlen) {
-enqueue:
-			__skb_queue_tail(&queue->input_pkt_queue, skb);
-			local_irq_restore(flags);
-			return NET_RX_SUCCESS;
-		}
-
-		napi_schedule(&queue->backlog);
-		goto enqueue;
-	}
-
-	__get_cpu_var(netdev_rx_stat).dropped++;
-	local_irq_restore(flags);
+	cpu = get_rps_cpu(skb->dev, skb);
+	if (cpu < 0)
+		cpu = smp_processor_id();

-	kfree_skb(skb);
-	return NET_RX_DROP;
+	return enqueue_to_backlog(skb, cpu);
 }
 EXPORT_SYMBOL(netif_rx);

@@ -2464,22 +2608,7 @@ void netif_nit_deliver(struct sk_buff *skb)
 	rcu_read_unlock();
 }

-/**
- *	netif_receive_skb - process receive buffer from network
- *	@skb: buffer to process
- *
- *	netif_receive_skb() is the main receive data processing function.
- *	It always succeeds. The buffer may be dropped during processing
- *	for congestion control or by the protocol layers.
- *
- *	This function may only be called from softirq context and interrupts
- *	should be enabled.
- *
- *	Return values (usually ignored):
- *	NET_RX_SUCCESS: no congestion
- *	NET_RX_DROP: packet was dropped
- */
-int netif_receive_skb(struct sk_buff *skb)
+int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
 	struct net_device *orig_dev;
@@ -2588,6 +2717,33 @@ out:
 	rcu_read_unlock();
 	return ret;
 }
+
+/**
+ *	netif_receive_skb - process receive buffer from network
+ *	@skb: buffer to process
+ *
+ *	netif_receive_skb() is the main receive data processing function.
+ *	It always succeeds. The buffer may be dropped during processing
+ *	for congestion control or by the protocol layers.
+ *
+ *	This function may only be called from softirq context and interrupts
+ *	should be enabled.
+ *
+ *	Return values (usually ignored):
+ *	NET_RX_SUCCESS: no congestion
+ *	NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb(struct sk_buff *skb)
+{
+	int cpu;
+
+	cpu = get_rps_cpu(skb->dev, skb);
+
+	if (cpu < 0)
+		return __netif_receive_skb(skb);
+	else
+		return enqueue_to_backlog(skb, cpu);
+}
 EXPORT_SYMBOL(netif_receive_skb);

 /* Network device is going away, flush any packets still pending */
@@ -2914,16 +3070,16 @@ static int process_backlog(struct napi_struct *napi, int quota)
 	do {
 		struct sk_buff *skb;

-		local_irq_disable();
+		spin_lock_irq(&queue->input_pkt_queue.lock);
 		skb = __skb_dequeue(&queue->input_pkt_queue);
 		if (!skb) {
 			__napi_complete(napi);
-			local_irq_enable();
+			spin_unlock_irq(&queue->input_pkt_queue.lock);
 			break;
 		}
-		local_irq_enable();
+		spin_unlock_irq(&queue->input_pkt_queue.lock);

-		netif_receive_skb(skb);
+		__netif_receive_skb(skb);
 	} while (++work < quota && jiffies == start_time);

 	return work;
@@ -3012,6 +3168,22 @@ void netif_napi_del(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(netif_napi_del);

+/*
+ * net_rps_action sends any pending IPI's for rps.  This is only called from
+ * softirq and interrupts must be enabled.
+ */
+static void net_rps_action(cpumask_t *mask)
+{
+	int cpu;
+
+	/* Send pending IPI's to kick RPS processing on remote cpus. */
+	for_each_cpu_mask_nr(cpu, *mask) {
+		struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+		if (cpu_online(cpu))
+			__smp_call_function_single(cpu, &queue->csd, 0);
+	}
+	cpus_clear(*mask);
+}

 static void net_rx_action(struct softirq_action *h)
 {
@@ -3019,6 +3191,8 @@ static void net_rx_action(struct softirq_action *h)
 	unsigned long time_limit = jiffies + 2;
 	int budget = netdev_budget;
 	void *have;
+	int select;
+	struct rps_remote_softirq_cpus *rcpus;

 	local_irq_disable();

@@ -3081,8 +3255,14 @@ static void net_rx_action(struct softirq_action *h)
 		netpoll_poll_unlock(have);
 	}
 out:
+	rcpus = &__get_cpu_var(rps_remote_softirq_cpus);
+	select = rcpus->select;
+	rcpus->select ^= 1;
+
 	local_irq_enable();

+	net_rps_action(&rcpus->mask[select]);
+
 #ifdef CONFIG_NET_DMA
 	/*
 	 * There may not be any more sk_buffs coming right now, so push
@@ -3327,10 +3507,10 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct netif_rx_stats *s = v;

-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   s->total, s->dropped, s->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   s->cpu_collision);
+		   s->cpu_collision, s->received_rps);
 	return 0;
 }

@@ -5067,6 +5247,23 @@ int register_netdevice(struct net_device *dev)

 	dev->iflink = -1;

+	if (!dev->num_rx_queues) {
+		/*
+		 * Allocate a single RX queue if driver never called
+		 * alloc_netdev_mq
+		 */
+
+		dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
+		if (!dev->_rx) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		dev->_rx->first = dev->_rx;
+		atomic_set(&dev->_rx->count, 1);
+		dev->num_rx_queues = 1;
+	}
+
 	/* Init, if this function is available */
 	if (dev->netdev_ops->ndo_init) {
 		ret = dev->netdev_ops->ndo_init(dev);
@@ -5424,9 +5621,11 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		void (*setup)(struct net_device *), unsigned int queue_count)
 {
 	struct netdev_queue *tx;
+	struct netdev_rx_queue *rx;
 	struct net_device *dev;
 	size_t alloc_size;
 	struct net_device *p;
+	int i;

 	BUG_ON(strlen(name) >= sizeof(dev->name));

@@ -5452,11 +5651,27 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		goto free_p;
 	}

+	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
+	if (!rx) {
+		printk(KERN_ERR "alloc_netdev: Unable to allocate "
+		       "rx queues.\n");
+		goto free_tx;
+	}
+
+	atomic_set(&rx->count, queue_count);
+
+	/*
+	 * Set a pointer to first element in the array which holds the
+	 * reference count.
+	 */
+	for (i = 0; i < queue_count; i++)
+		rx[i].first = rx;
+
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;

 	if (dev_addr_init(dev))
-		goto free_tx;
+		goto free_rx;

 	dev_unicast_init(dev);

@@ -5466,6 +5681,9 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	dev->num_tx_queues = queue_count;
 	dev->real_num_tx_queues = queue_count;

+	dev->_rx = rx;
+	dev->num_rx_queues = queue_count;
+
 	dev->gso_max_size = GSO_MAX_SIZE;

 	netdev_init_queues(dev);
@@ -5480,9 +5698,10 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	strcpy(dev->name, name);
 	return dev;

+free_rx:
+	kfree(rx);
 free_tx:
 	kfree(tx);
-
 free_p:
 	kfree(p);
 	return NULL;
@@ -5985,6 +6204,10 @@ static int __init net_dev_init(void)
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);

+		queue->csd.func = trigger_softirq;
+		queue->csd.info = queue;
+		queue->csd.flags = 0;
+
 		queue->backlog.poll = process_backlog;
 		queue->backlog.weight = weight_p;
 		queue->backlog.gro_list = NULL;
@@ -6023,7 +6246,7 @@ subsys_initcall(net_dev_init);

 static int __init initialize_hashrnd(void)
 {
-	get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
+	get_random_bytes(&hashrnd, sizeof(hashrnd));
 	return 0;
 }

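[The rps_remote_softirq_cpus comment above describes a double-buffered mask: enqueue_to_backlog() marks remote CPUs in mask[select] with interrupts disabled, net_rx_action() flips select while interrupts are still off, and net_rps_action() then drains the other mask with interrupts enabled, so the two paths never race on the same mask. The following minimal single-threaded sketch, which is not kernel code, illustrates the pattern; it assumes at most 64 CPUs so a plain uint64_t can stand in for cpumask_t.]

#include <stdint.h>
#include <stdio.h>

/* Minimal analogue of struct rps_remote_softirq_cpus: two CPU bitmaps plus a
 * selector that says which one the enqueue path currently writes into. */
struct remote_cpus {
	uint64_t mask[2];
	int select;
};

static struct remote_cpus rcpus;

/* Enqueue path (runs with interrupts off in the kernel): record that a remote
 * CPU needs an IPI by setting its bit in the currently selected mask. */
static void mark_remote(int cpu)
{
	rcpus.mask[rcpus.select] |= 1ULL << cpu;
}

/* Softirq path: flip the selector while "interrupts are still off", then send
 * IPIs from the no-longer-selected mask with interrupts enabled and clear it. */
static void send_pending_ipis(void)
{
	int select = rcpus.select;
	rcpus.select ^= 1;		/* done under local_irq_disable() in the patch */

	/* interrupts would be re-enabled here */
	uint64_t pending = rcpus.mask[select];
	for (int cpu = 0; cpu < 64; cpu++)
		if (pending & (1ULL << cpu))
			printf("IPI -> cpu %d\n", cpu);	/* __smp_call_function_single() in the patch */
	rcpus.mask[select] = 0;
}

int main(void)
{
	mark_remote(2);
	mark_remote(5);
	send_pending_ipis();
	return 0;
}
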
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 099c753c4213..7a46343d5ae3 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -466,6 +466,216 @@ static struct attribute_group wireless_group = {
 };
 #endif

+/*
+ * RX queue sysfs structures and functions.
+ */
+struct rx_queue_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, char *buf);
+	ssize_t (*store)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, const char *buf, size_t len);
+};
+#define to_rx_queue_attr(_attr) container_of(_attr,		\
+    struct rx_queue_attribute, attr)
+
+#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)
+
+static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
+				  char *buf)
+{
+	struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+	struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	return attribute->show(queue, attribute, buf);
+}
+
+static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
+				   const char *buf, size_t count)
+{
+	struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+	struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	return attribute->store(queue, attribute, buf, count);
+}
+
+static struct sysfs_ops rx_queue_sysfs_ops = {
+	.show = rx_queue_attr_show,
+	.store = rx_queue_attr_store,
+};
+
+static ssize_t show_rps_map(struct netdev_rx_queue *queue,
+			    struct rx_queue_attribute *attribute, char *buf)
+{
+	struct rps_map *map;
+	cpumask_var_t mask;
+	size_t len = 0;
+	int i;
+
+	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	rcu_read_lock();
+	map = rcu_dereference(queue->rps_map);
+	if (map)
+		for (i = 0; i < map->len; i++)
+			cpumask_set_cpu(map->cpus[i], mask);
+
+	len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
+	if (PAGE_SIZE - len < 3) {
+		rcu_read_unlock();
+		free_cpumask_var(mask);
+		return -EINVAL;
+	}
+	rcu_read_unlock();
+
+	free_cpumask_var(mask);
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+
+static void rps_map_release(struct rcu_head *rcu)
+{
+	struct rps_map *map = container_of(rcu, struct rps_map, rcu);
+
+	kfree(map);
+}
+
+ssize_t store_rps_map(struct netdev_rx_queue *queue,
+		      struct rx_queue_attribute *attribute,
+		      const char *buf, size_t len)
+{
+	struct rps_map *old_map, *map;
+	cpumask_var_t mask;
+	int err, cpu, i;
+	static DEFINE_SPINLOCK(rps_map_lock);
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+	if (err) {
+		free_cpumask_var(mask);
+		return err;
+	}
+
+	map = kzalloc(max_t(unsigned,
+	    RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
+	    GFP_KERNEL);
+	if (!map) {
+		free_cpumask_var(mask);
+		return -ENOMEM;
+	}
+
+	i = 0;
+	for_each_cpu_and(cpu, mask, cpu_online_mask)
+		map->cpus[i++] = cpu;
+
+	if (i)
+		map->len = i;
+	else {
+		kfree(map);
+		map = NULL;
+	}
+
+	spin_lock(&rps_map_lock);
+	old_map = queue->rps_map;
+	rcu_assign_pointer(queue->rps_map, map);
+	spin_unlock(&rps_map_lock);
+
+	if (old_map)
+		call_rcu(&old_map->rcu, rps_map_release);
+
+	free_cpumask_var(mask);
+	return len;
+}
+
+static struct rx_queue_attribute rps_cpus_attribute =
+	__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+
+static struct attribute *rx_queue_default_attrs[] = {
+	&rps_cpus_attribute.attr,
+	NULL
+};
+
+static void rx_queue_release(struct kobject *kobj)
+{
+	struct netdev_rx_queue *queue = to_rx_queue(kobj);
+	struct rps_map *map = queue->rps_map;
+	struct netdev_rx_queue *first = queue->first;
+
+	if (map)
+		call_rcu(&map->rcu, rps_map_release);
+
+	if (atomic_dec_and_test(&first->count))
+		kfree(first);
+}
+
+static struct kobj_type rx_queue_ktype = {
+	.sysfs_ops = &rx_queue_sysfs_ops,
+	.release = rx_queue_release,
+	.default_attrs = rx_queue_default_attrs,
+};
+
+static int rx_queue_add_kobject(struct net_device *net, int index)
+{
+	struct netdev_rx_queue *queue = net->_rx + index;
+	struct kobject *kobj = &queue->kobj;
+	int error = 0;
+
+	kobj->kset = net->queues_kset;
+	error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
+	    "rx-%u", index);
+	if (error) {
+		kobject_put(kobj);
+		return error;
+	}
+
+	kobject_uevent(kobj, KOBJ_ADD);
+
+	return error;
+}
+
+static int rx_queue_register_kobjects(struct net_device *net)
+{
+	int i;
+	int error = 0;
+
+	net->queues_kset = kset_create_and_add("queues",
+	    NULL, &net->dev.kobj);
+	if (!net->queues_kset)
+		return -ENOMEM;
+	for (i = 0; i < net->num_rx_queues; i++) {
+		error = rx_queue_add_kobject(net, i);
+		if (error)
+			break;
+	}
+
+	if (error)
+		while (--i >= 0)
+			kobject_put(&net->_rx[i].kobj);
+
+	return error;
+}
+
+static void rx_queue_remove_kobjects(struct net_device *net)
+{
+	int i;
+
+	for (i = 0; i < net->num_rx_queues; i++)
+		kobject_put(&net->_rx[i].kobj);
+	kset_unregister(net->queues_kset);
+}
+
 #endif /* CONFIG_SYSFS */

 #ifdef CONFIG_HOTPLUG
@@ -529,6 +739,8 @@ void netdev_unregister_kobject(struct net_device * net)
 	if (!net_eq(dev_net(net), &init_net))
 		return;

+	rx_queue_remove_kobjects(net);
+
 	device_del(dev);
 }

@@ -537,6 +749,7 @@ int netdev_register_kobject(struct net_device *net)
 {
 	struct device *dev = &(net->dev);
 	const struct attribute_group **groups = net->sysfs_groups;
+	int error = 0;

 	dev->class = &net_class;
 	dev->platform_data = net;
@@ -563,7 +776,17 @@ int netdev_register_kobject(struct net_device *net)
 	if (!net_eq(dev_net(net), &init_net))
 		return 0;

-	return device_add(dev);
+	error = device_add(dev);
+	if (error)
+		return error;
+
+	error = rx_queue_register_kobjects(net);
+	if (error) {
+		device_del(dev);
+		return error;
+	}
+
+	return error;
 }

 int netdev_class_create_file(struct class_attribute *class_attr)
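[The sysfs layout registered above is a "queues" kset under each net device, with one "rx-<n>" kobject per RX queue and an "rps_cpus" attribute whose store routine parses a hexadecimal CPU bitmap via bitmap_parse(). The following small userspace sketch programs the map for one queue; the device name "eth0", queue index 0 and the mask value are example choices, and the path layout is derived from the code above.]

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* queues/rx-0/rps_cpus is created by rx_queue_register_kobjects(). */
	const char *path = "/sys/class/net/eth0/queues/rx-0/rps_cpus";
	const char *mask = "0c\n";	/* hex CPU bitmap: CPUs 2 and 3 */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, mask, strlen(mask)) < 0)
		perror("write");	/* store_rps_map() rejects non-root or bad masks */
	close(fd);
	return 0;
}
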
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 93c4e060c91e..bdea0efdf8cb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -534,6 +534,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->network_header	= old->network_header;
 	new->mac_header		= old->mac_header;
 	skb_dst_set(new, dst_clone(skb_dst(old)));
+	new->rxhash		= old->rxhash;
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
 #endif
@@ -581,6 +582,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	C(len);
 	C(data_len);
 	C(mac_len);
+	C(rxhash);
 	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
 	n->cloned = 1;
 	n->nohdr = 0;