 include/linux/netdevice.h  |  69
 include/net/inet_sock.h    |  38
 net/core/dev.c             | 111
 net/core/net-sysfs.c       |  94
 net/core/sysctl_net_core.c |  68
 net/ipv4/af_inet.c         |  29
 net/ipv4/tcp_ipv4.c        |   2
 net/ipv4/udp.c             |   7
 8 files changed, 389 insertions(+), 29 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 55c2086e1f06..649a0252686e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -530,14 +530,73 @@ struct rps_map {
 };
 #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
 
+/*
+ * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
+ * tail pointer for that CPU's input queue at the time of last enqueue.
+ */
+struct rps_dev_flow {
+	u16 cpu;
+	u16 fill;
+	unsigned int last_qtail;
+};
+
+/*
+ * The rps_dev_flow_table structure contains a table of flow mappings.
+ */
+struct rps_dev_flow_table {
+	unsigned int mask;
+	struct rcu_head rcu;
+	struct work_struct free_work;
+	struct rps_dev_flow flows[0];
+};
+#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
+		(_num * sizeof(struct rps_dev_flow)))
+
+/*
+ * The rps_sock_flow_table contains mappings of flows to the last CPU
+ * on which they were processed by the application (set in recvmsg).
+ */
+struct rps_sock_flow_table {
+	unsigned int mask;
+	u16 ents[0];
+};
+#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \
+		(_num * sizeof(u16)))
+
+#define RPS_NO_CPU 0xffff
+
+static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
+					u32 hash)
+{
+	if (table && hash) {
+		unsigned int cpu, index = hash & table->mask;
+
+		/* We only give a hint, preemption can change cpu under us */
+		cpu = raw_smp_processor_id();
+
+		if (table->ents[index] != cpu)
+			table->ents[index] = cpu;
+	}
+}
+
+static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
+				       u32 hash)
+{
+	if (table && hash)
+		table->ents[hash & table->mask] = RPS_NO_CPU;
+}
+
+extern struct rps_sock_flow_table *rps_sock_flow_table;
+
 /* This structure contains an instance of an RX queue. */
 struct netdev_rx_queue {
 	struct rps_map *rps_map;
+	struct rps_dev_flow_table *rps_flow_table;
 	struct kobject kobj;
 	struct netdev_rx_queue *first;
 	atomic_t count;
 } ____cacheline_aligned_in_smp;
-#endif
+#endif /* CONFIG_RPS */
 
 /*
  * This structure defines the management hooks for network devices.
@@ -1333,11 +1392,19 @@ struct softnet_data {
 	/* Elements below can be accessed between CPUs for RPS */
 #ifdef CONFIG_RPS
 	struct call_single_data csd ____cacheline_aligned_in_smp;
+	unsigned int input_queue_head;
 #endif
 	struct sk_buff_head input_pkt_queue;
 	struct napi_struct backlog;
 };
 
+static inline void incr_input_queue_head(struct softnet_data *queue)
+{
+#ifdef CONFIG_RPS
+	queue->input_queue_head++;
+#endif
+}
+
 DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
 #define HAVE_NETIF_QUEUE
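Note: both tables added above are sized to a power of two and indexed by masking the flow hash, so a bucket lookup is a single AND. A minimal user-space sketch of that indexing (illustrative only, not part of the patch; the names here are hypothetical):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct sock_flow_tbl {
        unsigned int mask;   /* entries - 1, where entries is a power of two */
        uint16_t ents[];     /* desired CPU per hash bucket */
    };

    int main(void)
    {
        unsigned int entries = 4096;          /* must be a power of two */
        uint32_t hash = 0x12345678;
        struct sock_flow_tbl *t;

        /* mirrors RPS_SOCK_FLOW_TABLE_SIZE(): header plus flexible array */
        t = calloc(1, sizeof(*t) + entries * sizeof(t->ents[0]));
        if (!t)
            return 1;
        t->mask = entries - 1;

        /* hash & mask selects a bucket without a modulo operation */
        printf("hash %#x -> bucket %u of %u\n", hash, hash & t->mask, entries);
        free(t);
        return 0;
    }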
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 83fd34437cf1..b487bc1b99ab 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -21,6 +21,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/jhash.h>
+#include <linux/netdevice.h>
 
 #include <net/flow.h>
 #include <net/sock.h>
@@ -101,6 +102,7 @@ struct rtable;
  * @uc_ttl - Unicast TTL
  * @inet_sport - Source port
  * @inet_id - ID counter for DF pkts
+ * @rxhash - flow hash received from netif layer
  * @tos - TOS
  * @mc_ttl - Multicasting TTL
  * @is_icsk - is this an inet_connection_sock?
@@ -124,6 +126,9 @@ struct inet_sock {
 	__u16			cmsg_flags;
 	__be16			inet_sport;
 	__u16			inet_id;
+#ifdef CONFIG_RPS
+	__u32			rxhash;
+#endif
 
 	struct ip_options	*opt;
 	__u8			tos;
@@ -219,4 +224,37 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
 	return inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0;
 }
 
+static inline void inet_rps_record_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+	struct rps_sock_flow_table *sock_flow_table;
+
+	rcu_read_lock();
+	sock_flow_table = rcu_dereference(rps_sock_flow_table);
+	rps_record_sock_flow(sock_flow_table, inet_sk(sk)->rxhash);
+	rcu_read_unlock();
+#endif
+}
+
+static inline void inet_rps_reset_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+	struct rps_sock_flow_table *sock_flow_table;
+
+	rcu_read_lock();
+	sock_flow_table = rcu_dereference(rps_sock_flow_table);
+	rps_reset_sock_flow(sock_flow_table, inet_sk(sk)->rxhash);
+	rcu_read_unlock();
+#endif
+}
+
+static inline void inet_rps_save_rxhash(const struct sock *sk, u32 rxhash)
+{
+#ifdef CONFIG_RPS
+	if (unlikely(inet_sk(sk)->rxhash != rxhash)) {
+		inet_rps_reset_flow(sk);
+		inet_sk(sk)->rxhash = rxhash;
+	}
+#endif
+}
+
 #endif /* _INET_SOCK_H */
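Taken together, these three helpers are the socket side of receive flow steering: inet_rps_save_rxhash() stashes the hash computed on the receive path, inet_rps_record_flow() publishes the CPU currently doing recvmsg/sendmsg for that hash, and inet_rps_reset_flow() clears the entry when the socket goes away. The call sites are wired up in the af_inet.c, tcp_ipv4.c and udp.c hunks further down.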
diff --git a/net/core/dev.c b/net/core/dev.c
index e8041eb76ac1..d7107ac835fa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2203,19 +2203,28 @@ int weight_p __read_mostly = 64; /* old backlog weight */
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 #ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
 /*
  * get_rps_cpu is called from netif_receive_skb and returns the target
  * CPU from the RPS map of the receiving queue for a given skb.
  * rcu_read_lock must be held on entry.
  */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+		       struct rps_dev_flow **rflowp)
 {
 	struct ipv6hdr *ip6;
 	struct iphdr *ip;
 	struct netdev_rx_queue *rxqueue;
 	struct rps_map *map;
+	struct rps_dev_flow_table *flow_table;
+	struct rps_sock_flow_table *sock_flow_table;
 	int cpu = -1;
 	u8 ip_proto;
+	u16 tcpu;
 	u32 addr1, addr2, ports, ihl;
 
 	if (skb_rx_queue_recorded(skb)) {
@@ -2232,7 +2241,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
 	} else
 		rxqueue = dev->_rx;
 
-	if (!rxqueue->rps_map)
+	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
 		goto done;
 
 	if (skb->rxhash)
@@ -2284,9 +2293,48 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
 		skb->rxhash = 1;
 
 got_hash:
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
+	sock_flow_table = rcu_dereference(rps_sock_flow_table);
+	if (flow_table && sock_flow_table) {
+		u16 next_cpu;
+		struct rps_dev_flow *rflow;
+
+		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
+		tcpu = rflow->cpu;
+
+		next_cpu = sock_flow_table->ents[skb->rxhash &
+		    sock_flow_table->mask];
+
+		/*
+		 * If the desired CPU (where last recvmsg was done) is
+		 * different from current CPU (one in the rx-queue flow
+		 * table entry), switch if one of the following holds:
+		 *   - Current CPU is unset (equal to RPS_NO_CPU).
+		 *   - Current CPU is offline.
+		 *   - The current CPU's queue tail has advanced beyond the
+		 *     last packet that was enqueued using this table entry.
+		 *     This guarantees that all previous packets for the flow
+		 *     have been dequeued, thus preserving in order delivery.
+		 */
+		if (unlikely(tcpu != next_cpu) &&
+		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+		      rflow->last_qtail)) >= 0)) {
+			tcpu = rflow->cpu = next_cpu;
+			if (tcpu != RPS_NO_CPU)
+				rflow->last_qtail = per_cpu(softnet_data,
+				    tcpu).input_queue_head;
+		}
+		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+			*rflowp = rflow;
+			cpu = tcpu;
+			goto done;
+		}
+	}
+
 	map = rcu_dereference(rxqueue->rps_map);
 	if (map) {
-		u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
 
 		if (cpu_online(tcpu)) {
 			cpu = tcpu;
@@ -2320,13 +2368,14 @@ static void trigger_softirq(void *data)
 	__napi_schedule(&queue->backlog);
 	__get_cpu_var(netdev_rx_stat).received_rps++;
 }
-#endif /* CONFIG_SMP */
+#endif /* CONFIG_RPS */
 
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
 */
-static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
+			      unsigned int *qtail)
 {
 	struct softnet_data *queue;
 	unsigned long flags;
@@ -2341,6 +2390,10 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
 	if (queue->input_pkt_queue.qlen) {
 enqueue:
 		__skb_queue_tail(&queue->input_pkt_queue, skb);
+#ifdef CONFIG_RPS
+		*qtail = queue->input_queue_head +
+				queue->input_pkt_queue.qlen;
+#endif
 		rps_unlock(queue);
 		local_irq_restore(flags);
 		return NET_RX_SUCCESS;
@@ -2355,11 +2408,10 @@ enqueue:
 
 			cpu_set(cpu, rcpus->mask[rcpus->select]);
 			__raise_softirq_irqoff(NET_RX_SOFTIRQ);
-		} else
-			__napi_schedule(&queue->backlog);
-#else
-		__napi_schedule(&queue->backlog);
+			goto enqueue;
+		}
 #endif
+		__napi_schedule(&queue->backlog);
 	}
 	goto enqueue;
 }
@@ -2401,18 +2453,25 @@ int netif_rx(struct sk_buff *skb)
 
 #ifdef CONFIG_RPS
 	{
+		struct rps_dev_flow voidflow, *rflow = &voidflow;
 		int cpu;
 
 		rcu_read_lock();
-		cpu = get_rps_cpu(skb->dev, skb);
+
+		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 		if (cpu < 0)
 			cpu = smp_processor_id();
-		ret = enqueue_to_backlog(skb, cpu);
+
+		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+
 		rcu_read_unlock();
 	}
 #else
-	ret = enqueue_to_backlog(skb, get_cpu());
-	put_cpu();
+	{
+		unsigned int qtail;
+		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
+		put_cpu();
+	}
 #endif
 	return ret;
 }
@@ -2830,14 +2889,22 @@ out:
 int netif_receive_skb(struct sk_buff *skb)
 {
 #ifdef CONFIG_RPS
-	int cpu;
+	struct rps_dev_flow voidflow, *rflow = &voidflow;
+	int cpu, ret;
+
+	rcu_read_lock();
 
-	cpu = get_rps_cpu(skb->dev, skb);
+	cpu = get_rps_cpu(skb->dev, skb, &rflow);
 
-	if (cpu < 0)
-		return __netif_receive_skb(skb);
-	else
-		return enqueue_to_backlog(skb, cpu);
+	if (cpu >= 0) {
+		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+		rcu_read_unlock();
+	} else {
+		rcu_read_unlock();
+		ret = __netif_receive_skb(skb);
+	}
+
+	return ret;
 #else
 	return __netif_receive_skb(skb);
 #endif
@@ -2856,6 +2923,7 @@ static void flush_backlog(void *arg)
 		if (skb->dev == dev) {
 			__skb_unlink(skb, &queue->input_pkt_queue);
 			kfree_skb(skb);
+			incr_input_queue_head(queue);
 		}
 	rps_unlock(queue);
 }
@@ -3179,6 +3247,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 			local_irq_enable();
 			break;
 		}
+		incr_input_queue_head(queue);
 		rps_unlock(queue);
 		local_irq_enable();
 
@@ -5542,8 +5611,10 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 	local_irq_enable();
 
 	/* Process offline CPU's input_pkt_queue */
-	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
+	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
+		incr_input_queue_head(oldsd);
+	}
 
 	return NOTIFY_OK;
 }
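The steering decision in get_rps_cpu() above hinges on two free-running counters: input_queue_head, advanced each time the backlog is drained, and last_qtail, recorded at the flow's last enqueue. Casting their difference to a signed int keeps the test correct even after the unsigned counters wrap. A stand-alone sketch of that test (illustrative only, not part of the patch):

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Same wrap-safe test as in get_rps_cpu(): true once the per-CPU queue
     * head has advanced past the tail recorded at the flow's last enqueue,
     * i.e. the old CPU has drained every packet of this flow and the flow
     * can be steered to a new CPU without reordering.
     */
    static bool flow_drained(unsigned int queue_head, unsigned int last_qtail)
    {
        return (int)(queue_head - last_qtail) >= 0;
    }

    int main(void)
    {
        /* works across wraparound: 5 counts as "after" 0xfffffffd */
        printf("%d\n", flow_drained(5u, 0xfffffffdu));   /* prints 1 */
        printf("%d\n", flow_drained(0xfffffffdu, 5u));   /* prints 0 */
        return 0;
    }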
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 96ed6905b823..143052a22b9b 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -17,6 +17,7 @@
 #include <net/sock.h>
 #include <linux/rtnetlink.h>
 #include <linux/wireless.h>
+#include <linux/vmalloc.h>
 #include <net/wext.h>
 
 #include "net-sysfs.h"
@@ -601,22 +602,109 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
 	return len;
 }
 
+static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+					   struct rx_queue_attribute *attr,
+					   char *buf)
+{
+	struct rps_dev_flow_table *flow_table;
+	unsigned int val = 0;
+
+	rcu_read_lock();
+	flow_table = rcu_dereference(queue->rps_flow_table);
+	if (flow_table)
+		val = flow_table->mask + 1;
+	rcu_read_unlock();
+
+	return sprintf(buf, "%u\n", val);
+}
+
+static void rps_dev_flow_table_release_work(struct work_struct *work)
+{
+	struct rps_dev_flow_table *table = container_of(work,
+	    struct rps_dev_flow_table, free_work);
+
+	vfree(table);
+}
+
+static void rps_dev_flow_table_release(struct rcu_head *rcu)
+{
+	struct rps_dev_flow_table *table = container_of(rcu,
+	    struct rps_dev_flow_table, rcu);
+
+	INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
+	schedule_work(&table->free_work);
+}
+
+ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+				     struct rx_queue_attribute *attr,
+				     const char *buf, size_t len)
+{
+	unsigned int count;
+	char *endp;
+	struct rps_dev_flow_table *table, *old_table;
+	static DEFINE_SPINLOCK(rps_dev_flow_lock);
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	count = simple_strtoul(buf, &endp, 0);
+	if (endp == buf)
+		return -EINVAL;
+
+	if (count) {
+		int i;
+
+		if (count > 1<<30) {
+			/* Enforce a limit to prevent overflow */
+			return -EINVAL;
+		}
+		count = roundup_pow_of_two(count);
+		table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
+		if (!table)
+			return -ENOMEM;
+
+		table->mask = count - 1;
+		for (i = 0; i < count; i++)
+			table->flows[i].cpu = RPS_NO_CPU;
+	} else
+		table = NULL;
+
+	spin_lock(&rps_dev_flow_lock);
+	old_table = queue->rps_flow_table;
+	rcu_assign_pointer(queue->rps_flow_table, table);
+	spin_unlock(&rps_dev_flow_lock);
+
+	if (old_table)
+		call_rcu(&old_table->rcu, rps_dev_flow_table_release);
+
+	return len;
+}
+
 static struct rx_queue_attribute rps_cpus_attribute =
 	__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
 
+
+static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
+	__ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
+	    show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+
 static struct attribute *rx_queue_default_attrs[] = {
 	&rps_cpus_attribute.attr,
+	&rps_dev_flow_table_cnt_attribute.attr,
 	NULL
 };
 
 static void rx_queue_release(struct kobject *kobj)
 {
 	struct netdev_rx_queue *queue = to_rx_queue(kobj);
-	struct rps_map *map = queue->rps_map;
 	struct netdev_rx_queue *first = queue->first;
 
-	if (map)
-		call_rcu(&map->rcu, rps_map_release);
+	if (queue->rps_map)
+		call_rcu(&queue->rps_map->rcu, rps_map_release);
+
+	if (queue->rps_flow_table)
+		call_rcu(&queue->rps_flow_table->rcu,
+		    rps_dev_flow_table_release);
 
 	if (atomic_dec_and_test(&first->count))
 		kfree(first);
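With this hunk each receive queue gains an rps_flow_cnt attribute next to the existing rps_cpus: writing a non-zero count (rounded up to a power of two and capped at 2^30) allocates that queue's flow table with every entry set to RPS_NO_CPU, and writing 0 releases the old table through the RCU/workqueue path above. On a typical system the file would appear as, e.g., /sys/class/net/eth0/queues/rx-0/rps_flow_cnt (example path only; the patch itself just defines the attribute).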
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index b7b6b8208f75..dcc7d25996ab 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -11,12 +11,72 @@
 #include <linux/socket.h>
 #include <linux/netdevice.h>
 #include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 
 #include <net/ip.h>
 #include <net/sock.h>
 
+#ifdef CONFIG_RPS
+static int rps_sock_flow_sysctl(ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	unsigned int orig_size, size;
+	int ret, i;
+	ctl_table tmp = {
+		.data = &size,
+		.maxlen = sizeof(size),
+		.mode = table->mode
+	};
+	struct rps_sock_flow_table *orig_sock_table, *sock_table;
+	static DEFINE_MUTEX(sock_flow_mutex);
+
+	mutex_lock(&sock_flow_mutex);
+
+	orig_sock_table = rps_sock_flow_table;
+	size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
+
+	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+
+	if (write) {
+		if (size) {
+			if (size > 1<<30) {
+				/* Enforce limit to prevent overflow */
+				mutex_unlock(&sock_flow_mutex);
+				return -EINVAL;
+			}
+			size = roundup_pow_of_two(size);
+			if (size != orig_size) {
+				sock_table =
+				    vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
+				if (!sock_table) {
+					mutex_unlock(&sock_flow_mutex);
+					return -ENOMEM;
+				}
+
+				sock_table->mask = size - 1;
+			} else
+				sock_table = orig_sock_table;
+
+			for (i = 0; i < size; i++)
+				sock_table->ents[i] = RPS_NO_CPU;
+		} else
+			sock_table = NULL;
+
+		if (sock_table != orig_sock_table) {
+			rcu_assign_pointer(rps_sock_flow_table, sock_table);
+			synchronize_rcu();
+			vfree(orig_sock_table);
+		}
+	}
+
+	mutex_unlock(&sock_flow_mutex);
+
+	return ret;
+}
+#endif /* CONFIG_RPS */
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -82,6 +142,14 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+#ifdef CONFIG_RPS
+	{
+		.procname	= "rps_sock_flow_entries",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= rps_sock_flow_sysctl
+	},
+#endif
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
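Together with the per-queue rps_flow_cnt attribute, this sysctl is the second knob RFS needs: writing net.core.rps_sock_flow_entries (for example `sysctl -w net.core.rps_sock_flow_entries=32768`, or via /proc/sys/net/core/rps_sock_flow_entries) sizes the single global socket flow table, rounded up to a power of two, and publishes it with rcu_assign_pointer() so readers in get_rps_cpu() never see a half-initialized table. The value shown is only an example; the patch does not prescribe a default.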
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 193dcd6ed64f..c5376c725503 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -419,6 +419,8 @@ int inet_release(struct socket *sock)
 	if (sk) {
 		long timeout;
 
+		inet_rps_reset_flow(sk);
+
 		/* Applications forget to leave groups before exiting */
 		ip_mc_drop_socket(sk);
 
@@ -720,6 +722,8 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 {
 	struct sock *sk = sock->sk;
 
+	inet_rps_record_flow(sk);
+
 	/* We may need to bind the socket. */
 	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
@@ -728,12 +732,13 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 }
 EXPORT_SYMBOL(inet_sendmsg);
 
-
 static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 			     size_t size, int flags)
 {
 	struct sock *sk = sock->sk;
 
+	inet_rps_record_flow(sk);
+
 	/* We may need to bind the socket. */
 	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
@@ -743,6 +748,22 @@ static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 	return sock_no_sendpage(sock, page, offset, size, flags);
 }
 
+int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		 size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	int addr_len = 0;
+	int err;
+
+	inet_rps_record_flow(sk);
+
+	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+				   flags & ~MSG_DONTWAIT, &addr_len);
+	if (err >= 0)
+		msg->msg_namelen = addr_len;
+	return err;
+}
+EXPORT_SYMBOL(inet_recvmsg);
 
 int inet_shutdown(struct socket *sock, int how)
 {
@@ -872,7 +893,7 @@ const struct proto_ops inet_stream_ops = {
 	.setsockopt	= sock_common_setsockopt,
 	.getsockopt	= sock_common_getsockopt,
 	.sendmsg	= tcp_sendmsg,
-	.recvmsg	= sock_common_recvmsg,
+	.recvmsg	= inet_recvmsg,
 	.mmap		= sock_no_mmap,
 	.sendpage	= tcp_sendpage,
 	.splice_read	= tcp_splice_read,
@@ -899,7 +920,7 @@ const struct proto_ops inet_dgram_ops = {
 	.setsockopt	= sock_common_setsockopt,
 	.getsockopt	= sock_common_getsockopt,
 	.sendmsg	= inet_sendmsg,
-	.recvmsg	= sock_common_recvmsg,
+	.recvmsg	= inet_recvmsg,
 	.mmap		= sock_no_mmap,
 	.sendpage	= inet_sendpage,
 #ifdef CONFIG_COMPAT
@@ -929,7 +950,7 @@ static const struct proto_ops inet_sockraw_ops = {
 	.setsockopt	= sock_common_setsockopt,
 	.getsockopt	= sock_common_getsockopt,
 	.sendmsg	= inet_sendmsg,
-	.recvmsg	= sock_common_recvmsg,
+	.recvmsg	= inet_recvmsg,
 	.mmap		= sock_no_mmap,
 	.sendpage	= inet_sendpage,
 #ifdef CONFIG_COMPAT
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a24995cdc4b6..ad08392a738c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1672,6 +1672,8 @@ process:
 
 	skb->dev = NULL;
 
+	inet_rps_save_rxhash(sk, skb->rxhash);
+
 	bh_lock_sock_nested(sk);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8fef859db35d..666b963496ff 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1217,6 +1217,7 @@ int udp_disconnect(struct sock *sk, int flags)
 	sk->sk_state = TCP_CLOSE;
 	inet->inet_daddr = 0;
 	inet->inet_dport = 0;
+	inet_rps_save_rxhash(sk, 0);
 	sk->sk_bound_dev_if = 0;
 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
 		inet_reset_saddr(sk);
@@ -1258,8 +1259,12 @@ EXPORT_SYMBOL(udp_lib_unhash);
 
 static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
-	int rc = sock_queue_rcv_skb(sk, skb);
+	int rc;
+
+	if (inet_sk(sk)->inet_daddr)
+		inet_rps_save_rxhash(sk, skb->rxhash);
 
+	rc = sock_queue_rcv_skb(sk, skb);
 	if (rc < 0) {
 		int is_udplite = IS_UDPLITE(sk);
 