aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/netdevice.h69
-rw-r--r--include/net/inet_sock.h38
-rw-r--r--net/core/dev.c111
-rw-r--r--net/core/net-sysfs.c94
-rw-r--r--net/core/sysctl_net_core.c68
-rw-r--r--net/ipv4/af_inet.c29
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/udp.c7
8 files changed, 389 insertions, 29 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 55c2086e1f06..649a0252686e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -530,14 +530,73 @@ struct rps_map {
530}; 530};
531#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16))) 531#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
532 532
533/*
534 * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
535 * tail pointer for that CPU's input queue at the time of last enqueue.
536 */
537struct rps_dev_flow {
538 u16 cpu;
539 u16 fill;
540 unsigned int last_qtail;
541};
542
543/*
544 * The rps_dev_flow_table structure contains a table of flow mappings.
545 */
546struct rps_dev_flow_table {
547 unsigned int mask;
548 struct rcu_head rcu;
549 struct work_struct free_work;
550 struct rps_dev_flow flows[0];
551};
/*
 * Bytes needed for an rps_dev_flow_table header followed by _num flow
 * entries. The argument is parenthesized so that expressions such as
 * (a + b) expand correctly.
 */
#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
    ((_num) * sizeof(struct rps_dev_flow)))
554
555/*
556 * The rps_sock_flow_table contains mappings of flows to the last CPU
557 * on which they were processed by the application (set in recvmsg).
558 */
559struct rps_sock_flow_table {
560 unsigned int mask;
561 u16 ents[0];
562};
/*
 * Bytes needed for an rps_sock_flow_table header followed by _num u16
 * entries. The argument is parenthesized so that expressions such as
 * (a + b) expand correctly.
 */
#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \
    ((_num) * sizeof(u16)))
565
566#define RPS_NO_CPU 0xffff
567
568static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
569 u32 hash)
570{
571 if (table && hash) {
572 unsigned int cpu, index = hash & table->mask;
573
574 /* We only give a hint, preemption can change cpu under us */
575 cpu = raw_smp_processor_id();
576
577 if (table->ents[index] != cpu)
578 table->ents[index] = cpu;
579 }
580}
581
582static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
583 u32 hash)
584{
585 if (table && hash)
586 table->ents[hash & table->mask] = RPS_NO_CPU;
587}
588
589extern struct rps_sock_flow_table *rps_sock_flow_table;
590
533/* This structure contains an instance of an RX queue. */ 591/* This structure contains an instance of an RX queue. */
534struct netdev_rx_queue { 592struct netdev_rx_queue {
535 struct rps_map *rps_map; 593 struct rps_map *rps_map;
594 struct rps_dev_flow_table *rps_flow_table;
536 struct kobject kobj; 595 struct kobject kobj;
537 struct netdev_rx_queue *first; 596 struct netdev_rx_queue *first;
538 atomic_t count; 597 atomic_t count;
539} ____cacheline_aligned_in_smp; 598} ____cacheline_aligned_in_smp;
540#endif 599#endif /* CONFIG_RPS */
541 600
542/* 601/*
543 * This structure defines the management hooks for network devices. 602 * This structure defines the management hooks for network devices.
@@ -1333,11 +1392,19 @@ struct softnet_data {
1333 /* Elements below can be accessed between CPUs for RPS */ 1392 /* Elements below can be accessed between CPUs for RPS */
1334#ifdef CONFIG_RPS 1393#ifdef CONFIG_RPS
1335 struct call_single_data csd ____cacheline_aligned_in_smp; 1394 struct call_single_data csd ____cacheline_aligned_in_smp;
1395 unsigned int input_queue_head;
1336#endif 1396#endif
1337 struct sk_buff_head input_pkt_queue; 1397 struct sk_buff_head input_pkt_queue;
1338 struct napi_struct backlog; 1398 struct napi_struct backlog;
1339}; 1399};
1340 1400
/*
 * Advance the input-queue head counter of @queue by one. The counter only
 * exists under CONFIG_RPS, so without it this function is empty.
 */
static inline void incr_input_queue_head(struct softnet_data *queue)
{
#ifdef CONFIG_RPS
	queue->input_queue_head += 1;
#endif
}
1407
1341DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); 1408DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
1342 1409
1343#define HAVE_NETIF_QUEUE 1410#define HAVE_NETIF_QUEUE
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 83fd34437cf1..b487bc1b99ab 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -21,6 +21,7 @@
21#include <linux/string.h> 21#include <linux/string.h>
22#include <linux/types.h> 22#include <linux/types.h>
23#include <linux/jhash.h> 23#include <linux/jhash.h>
24#include <linux/netdevice.h>
24 25
25#include <net/flow.h> 26#include <net/flow.h>
26#include <net/sock.h> 27#include <net/sock.h>
@@ -101,6 +102,7 @@ struct rtable;
101 * @uc_ttl - Unicast TTL 102 * @uc_ttl - Unicast TTL
102 * @inet_sport - Source port 103 * @inet_sport - Source port
103 * @inet_id - ID counter for DF pkts 104 * @inet_id - ID counter for DF pkts
105 * @rxhash - flow hash received from netif layer
104 * @tos - TOS 106 * @tos - TOS
105 * @mc_ttl - Multicasting TTL 107 * @mc_ttl - Multicasting TTL
106 * @is_icsk - is this an inet_connection_sock? 108 * @is_icsk - is this an inet_connection_sock?
@@ -124,6 +126,9 @@ struct inet_sock {
124 __u16 cmsg_flags; 126 __u16 cmsg_flags;
125 __be16 inet_sport; 127 __be16 inet_sport;
126 __u16 inet_id; 128 __u16 inet_id;
129#ifdef CONFIG_RPS
130 __u32 rxhash;
131#endif
127 132
128 struct ip_options *opt; 133 struct ip_options *opt;
129 __u8 tos; 134 __u8 tos;
@@ -219,4 +224,37 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
219 return inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0; 224 return inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0;
220} 225}
221 226
/*
 * Record the current CPU as the hint for this socket's flow, keyed by the
 * socket's saved rxhash, in the global socket flow table. The table pointer
 * is read under rcu_read_lock(). Empty without CONFIG_RPS.
 */
static inline void inet_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
	struct rps_sock_flow_table *flow_tbl;

	rcu_read_lock();
	flow_tbl = rcu_dereference(rps_sock_flow_table);
	rps_record_sock_flow(flow_tbl, inet_sk(sk)->rxhash);
	rcu_read_unlock();
#endif
}
238
/*
 * Clear the CPU hint stored in the global socket flow table for this
 * socket's saved rxhash. The table pointer is read under rcu_read_lock().
 * Empty without CONFIG_RPS.
 */
static inline void inet_rps_reset_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
	struct rps_sock_flow_table *flow_tbl;

	rcu_read_lock();
	flow_tbl = rcu_dereference(rps_sock_flow_table);
	rps_reset_sock_flow(flow_tbl, inet_sk(sk)->rxhash);
	rcu_read_unlock();
#endif
}
250
251static inline void inet_rps_save_rxhash(const struct sock *sk, u32 rxhash)
252{
253#ifdef CONFIG_RPS
254 if (unlikely(inet_sk(sk)->rxhash != rxhash)) {
255 inet_rps_reset_flow(sk);
256 inet_sk(sk)->rxhash = rxhash;
257 }
258#endif
259}
222#endif /* _INET_SOCK_H */ 260#endif /* _INET_SOCK_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index e8041eb76ac1..d7107ac835fa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2203,19 +2203,28 @@ int weight_p __read_mostly = 64; /* old backlog weight */
2203DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; 2203DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2204 2204
2205#ifdef CONFIG_RPS 2205#ifdef CONFIG_RPS
2206
2207/* One global table that all flow-based protocols share. */
2208struct rps_sock_flow_table *rps_sock_flow_table;
2209EXPORT_SYMBOL(rps_sock_flow_table);
2210
2206/* 2211/*
2207 * get_rps_cpu is called from netif_receive_skb and returns the target 2212 * get_rps_cpu is called from netif_receive_skb and returns the target
2208 * CPU from the RPS map of the receiving queue for a given skb. 2213 * CPU from the RPS map of the receiving queue for a given skb.
2209 * rcu_read_lock must be held on entry. 2214 * rcu_read_lock must be held on entry.
2210 */ 2215 */
2211static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) 2216static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2217 struct rps_dev_flow **rflowp)
2212{ 2218{
2213 struct ipv6hdr *ip6; 2219 struct ipv6hdr *ip6;
2214 struct iphdr *ip; 2220 struct iphdr *ip;
2215 struct netdev_rx_queue *rxqueue; 2221 struct netdev_rx_queue *rxqueue;
2216 struct rps_map *map; 2222 struct rps_map *map;
2223 struct rps_dev_flow_table *flow_table;
2224 struct rps_sock_flow_table *sock_flow_table;
2217 int cpu = -1; 2225 int cpu = -1;
2218 u8 ip_proto; 2226 u8 ip_proto;
2227 u16 tcpu;
2219 u32 addr1, addr2, ports, ihl; 2228 u32 addr1, addr2, ports, ihl;
2220 2229
2221 if (skb_rx_queue_recorded(skb)) { 2230 if (skb_rx_queue_recorded(skb)) {
@@ -2232,7 +2241,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
2232 } else 2241 } else
2233 rxqueue = dev->_rx; 2242 rxqueue = dev->_rx;
2234 2243
2235 if (!rxqueue->rps_map) 2244 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
2236 goto done; 2245 goto done;
2237 2246
2238 if (skb->rxhash) 2247 if (skb->rxhash)
@@ -2284,9 +2293,48 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
2284 skb->rxhash = 1; 2293 skb->rxhash = 1;
2285 2294
2286got_hash: 2295got_hash:
2296 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2297 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2298 if (flow_table && sock_flow_table) {
2299 u16 next_cpu;
2300 struct rps_dev_flow *rflow;
2301
2302 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2303 tcpu = rflow->cpu;
2304
2305 next_cpu = sock_flow_table->ents[skb->rxhash &
2306 sock_flow_table->mask];
2307
2308 /*
2309 * If the desired CPU (where last recvmsg was done) is
2310 * different from current CPU (one in the rx-queue flow
2311 * table entry), switch if one of the following holds:
2312 * - Current CPU is unset (equal to RPS_NO_CPU).
2313 * - Current CPU is offline.
2314 * - The current CPU's queue tail has advanced beyond the
2315 * last packet that was enqueued using this table entry.
2316 * This guarantees that all previous packets for the flow
2317 * have been dequeued, thus preserving in order delivery.
2318 */
2319 if (unlikely(tcpu != next_cpu) &&
2320 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2321 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2322 rflow->last_qtail)) >= 0)) {
2323 tcpu = rflow->cpu = next_cpu;
2324 if (tcpu != RPS_NO_CPU)
2325 rflow->last_qtail = per_cpu(softnet_data,
2326 tcpu).input_queue_head;
2327 }
2328 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2329 *rflowp = rflow;
2330 cpu = tcpu;
2331 goto done;
2332 }
2333 }
2334
2287 map = rcu_dereference(rxqueue->rps_map); 2335 map = rcu_dereference(rxqueue->rps_map);
2288 if (map) { 2336 if (map) {
2289 u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; 2337 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2290 2338
2291 if (cpu_online(tcpu)) { 2339 if (cpu_online(tcpu)) {
2292 cpu = tcpu; 2340 cpu = tcpu;
@@ -2320,13 +2368,14 @@ static void trigger_softirq(void *data)
2320 __napi_schedule(&queue->backlog); 2368 __napi_schedule(&queue->backlog);
2321 __get_cpu_var(netdev_rx_stat).received_rps++; 2369 __get_cpu_var(netdev_rx_stat).received_rps++;
2322} 2370}
2323#endif /* CONFIG_SMP */ 2371#endif /* CONFIG_RPS */
2324 2372
2325/* 2373/*
2326 * enqueue_to_backlog is called to queue an skb to a per CPU backlog 2374 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2327 * queue (may be a remote CPU queue). 2375 * queue (may be a remote CPU queue).
2328 */ 2376 */
2329static int enqueue_to_backlog(struct sk_buff *skb, int cpu) 2377static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2378 unsigned int *qtail)
2330{ 2379{
2331 struct softnet_data *queue; 2380 struct softnet_data *queue;
2332 unsigned long flags; 2381 unsigned long flags;
@@ -2341,6 +2390,10 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
2341 if (queue->input_pkt_queue.qlen) { 2390 if (queue->input_pkt_queue.qlen) {
2342enqueue: 2391enqueue:
2343 __skb_queue_tail(&queue->input_pkt_queue, skb); 2392 __skb_queue_tail(&queue->input_pkt_queue, skb);
2393#ifdef CONFIG_RPS
2394 *qtail = queue->input_queue_head +
2395 queue->input_pkt_queue.qlen;
2396#endif
2344 rps_unlock(queue); 2397 rps_unlock(queue);
2345 local_irq_restore(flags); 2398 local_irq_restore(flags);
2346 return NET_RX_SUCCESS; 2399 return NET_RX_SUCCESS;
@@ -2355,11 +2408,10 @@ enqueue:
2355 2408
2356 cpu_set(cpu, rcpus->mask[rcpus->select]); 2409 cpu_set(cpu, rcpus->mask[rcpus->select]);
2357 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2410 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2358 } else 2411 goto enqueue;
2359 __napi_schedule(&queue->backlog); 2412 }
2360#else
2361 __napi_schedule(&queue->backlog);
2362#endif 2413#endif
2414 __napi_schedule(&queue->backlog);
2363 } 2415 }
2364 goto enqueue; 2416 goto enqueue;
2365 } 2417 }
@@ -2401,18 +2453,25 @@ int netif_rx(struct sk_buff *skb)
2401 2453
2402#ifdef CONFIG_RPS 2454#ifdef CONFIG_RPS
2403 { 2455 {
2456 struct rps_dev_flow voidflow, *rflow = &voidflow;
2404 int cpu; 2457 int cpu;
2405 2458
2406 rcu_read_lock(); 2459 rcu_read_lock();
2407 cpu = get_rps_cpu(skb->dev, skb); 2460
2461 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2408 if (cpu < 0) 2462 if (cpu < 0)
2409 cpu = smp_processor_id(); 2463 cpu = smp_processor_id();
2410 ret = enqueue_to_backlog(skb, cpu); 2464
2465 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2466
2411 rcu_read_unlock(); 2467 rcu_read_unlock();
2412 } 2468 }
2413#else 2469#else
2414 ret = enqueue_to_backlog(skb, get_cpu()); 2470 {
2415 put_cpu(); 2471 unsigned int qtail;
2472 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2473 put_cpu();
2474 }
2416#endif 2475#endif
2417 return ret; 2476 return ret;
2418} 2477}
@@ -2830,14 +2889,22 @@ out:
2830int netif_receive_skb(struct sk_buff *skb) 2889int netif_receive_skb(struct sk_buff *skb)
2831{ 2890{
2832#ifdef CONFIG_RPS 2891#ifdef CONFIG_RPS
2833 int cpu; 2892 struct rps_dev_flow voidflow, *rflow = &voidflow;
2893 int cpu, ret;
2894
2895 rcu_read_lock();
2834 2896
2835 cpu = get_rps_cpu(skb->dev, skb); 2897 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2836 2898
2837 if (cpu < 0) 2899 if (cpu >= 0) {
2838 return __netif_receive_skb(skb); 2900 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2839 else 2901 rcu_read_unlock();
2840 return enqueue_to_backlog(skb, cpu); 2902 } else {
2903 rcu_read_unlock();
2904 ret = __netif_receive_skb(skb);
2905 }
2906
2907 return ret;
2841#else 2908#else
2842 return __netif_receive_skb(skb); 2909 return __netif_receive_skb(skb);
2843#endif 2910#endif
@@ -2856,6 +2923,7 @@ static void flush_backlog(void *arg)
2856 if (skb->dev == dev) { 2923 if (skb->dev == dev) {
2857 __skb_unlink(skb, &queue->input_pkt_queue); 2924 __skb_unlink(skb, &queue->input_pkt_queue);
2858 kfree_skb(skb); 2925 kfree_skb(skb);
2926 incr_input_queue_head(queue);
2859 } 2927 }
2860 rps_unlock(queue); 2928 rps_unlock(queue);
2861} 2929}
@@ -3179,6 +3247,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
3179 local_irq_enable(); 3247 local_irq_enable();
3180 break; 3248 break;
3181 } 3249 }
3250 incr_input_queue_head(queue);
3182 rps_unlock(queue); 3251 rps_unlock(queue);
3183 local_irq_enable(); 3252 local_irq_enable();
3184 3253
@@ -5542,8 +5611,10 @@ static int dev_cpu_callback(struct notifier_block *nfb,
5542 local_irq_enable(); 5611 local_irq_enable();
5543 5612
5544 /* Process offline CPU's input_pkt_queue */ 5613 /* Process offline CPU's input_pkt_queue */
5545 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) 5614 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5546 netif_rx(skb); 5615 netif_rx(skb);
5616 incr_input_queue_head(oldsd);
5617 }
5547 5618
5548 return NOTIFY_OK; 5619 return NOTIFY_OK;
5549} 5620}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 96ed6905b823..143052a22b9b 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -17,6 +17,7 @@
17#include <net/sock.h> 17#include <net/sock.h>
18#include <linux/rtnetlink.h> 18#include <linux/rtnetlink.h>
19#include <linux/wireless.h> 19#include <linux/wireless.h>
20#include <linux/vmalloc.h>
20#include <net/wext.h> 21#include <net/wext.h>
21 22
22#include "net-sysfs.h" 23#include "net-sysfs.h"
@@ -601,22 +602,109 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
601 return len; 602 return len;
602} 603}
603 604
605static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
606 struct rx_queue_attribute *attr,
607 char *buf)
608{
609 struct rps_dev_flow_table *flow_table;
610 unsigned int val = 0;
611
612 rcu_read_lock();
613 flow_table = rcu_dereference(queue->rps_flow_table);
614 if (flow_table)
615 val = flow_table->mask + 1;
616 rcu_read_unlock();
617
618 return sprintf(buf, "%u\n", val);
619}
620
621static void rps_dev_flow_table_release_work(struct work_struct *work)
622{
623 struct rps_dev_flow_table *table = container_of(work,
624 struct rps_dev_flow_table, free_work);
625
626 vfree(table);
627}
628
629static void rps_dev_flow_table_release(struct rcu_head *rcu)
630{
631 struct rps_dev_flow_table *table = container_of(rcu,
632 struct rps_dev_flow_table, rcu);
633
634 INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
635 schedule_work(&table->free_work);
636}
637
638ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
639 struct rx_queue_attribute *attr,
640 const char *buf, size_t len)
641{
642 unsigned int count;
643 char *endp;
644 struct rps_dev_flow_table *table, *old_table;
645 static DEFINE_SPINLOCK(rps_dev_flow_lock);
646
647 if (!capable(CAP_NET_ADMIN))
648 return -EPERM;
649
650 count = simple_strtoul(buf, &endp, 0);
651 if (endp == buf)
652 return -EINVAL;
653
654 if (count) {
655 int i;
656
657 if (count > 1<<30) {
658 /* Enforce a limit to prevent overflow */
659 return -EINVAL;
660 }
661 count = roundup_pow_of_two(count);
662 table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
663 if (!table)
664 return -ENOMEM;
665
666 table->mask = count - 1;
667 for (i = 0; i < count; i++)
668 table->flows[i].cpu = RPS_NO_CPU;
669 } else
670 table = NULL;
671
672 spin_lock(&rps_dev_flow_lock);
673 old_table = queue->rps_flow_table;
674 rcu_assign_pointer(queue->rps_flow_table, table);
675 spin_unlock(&rps_dev_flow_lock);
676
677 if (old_table)
678 call_rcu(&old_table->rcu, rps_dev_flow_table_release);
679
680 return len;
681}
682
604static struct rx_queue_attribute rps_cpus_attribute = 683static struct rx_queue_attribute rps_cpus_attribute =
605 __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); 684 __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
606 685
686
687static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
688 __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
689 show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
690
607static struct attribute *rx_queue_default_attrs[] = { 691static struct attribute *rx_queue_default_attrs[] = {
608 &rps_cpus_attribute.attr, 692 &rps_cpus_attribute.attr,
693 &rps_dev_flow_table_cnt_attribute.attr,
609 NULL 694 NULL
610}; 695};
611 696
612static void rx_queue_release(struct kobject *kobj) 697static void rx_queue_release(struct kobject *kobj)
613{ 698{
614 struct netdev_rx_queue *queue = to_rx_queue(kobj); 699 struct netdev_rx_queue *queue = to_rx_queue(kobj);
615 struct rps_map *map = queue->rps_map;
616 struct netdev_rx_queue *first = queue->first; 700 struct netdev_rx_queue *first = queue->first;
617 701
618 if (map) 702 if (queue->rps_map)
619 call_rcu(&map->rcu, rps_map_release); 703 call_rcu(&queue->rps_map->rcu, rps_map_release);
704
705 if (queue->rps_flow_table)
706 call_rcu(&queue->rps_flow_table->rcu,
707 rps_dev_flow_table_release);
620 708
621 if (atomic_dec_and_test(&first->count)) 709 if (atomic_dec_and_test(&first->count))
622 kfree(first); 710 kfree(first);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index b7b6b8208f75..dcc7d25996ab 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -11,12 +11,72 @@
11#include <linux/socket.h> 11#include <linux/socket.h>
12#include <linux/netdevice.h> 12#include <linux/netdevice.h>
13#include <linux/ratelimit.h> 13#include <linux/ratelimit.h>
14#include <linux/vmalloc.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16 17
17#include <net/ip.h> 18#include <net/ip.h>
18#include <net/sock.h> 19#include <net/sock.h>
19 20
21#ifdef CONFIG_RPS
22static int rps_sock_flow_sysctl(ctl_table *table, int write,
23 void __user *buffer, size_t *lenp, loff_t *ppos)
24{
25 unsigned int orig_size, size;
26 int ret, i;
27 ctl_table tmp = {
28 .data = &size,
29 .maxlen = sizeof(size),
30 .mode = table->mode
31 };
32 struct rps_sock_flow_table *orig_sock_table, *sock_table;
33 static DEFINE_MUTEX(sock_flow_mutex);
34
35 mutex_lock(&sock_flow_mutex);
36
37 orig_sock_table = rps_sock_flow_table;
38 size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
39
40 ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
41
42 if (write) {
43 if (size) {
44 if (size > 1<<30) {
45 /* Enforce limit to prevent overflow */
46 mutex_unlock(&sock_flow_mutex);
47 return -EINVAL;
48 }
49 size = roundup_pow_of_two(size);
50 if (size != orig_size) {
51 sock_table =
52 vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
53 if (!sock_table) {
54 mutex_unlock(&sock_flow_mutex);
55 return -ENOMEM;
56 }
57
58 sock_table->mask = size - 1;
59 } else
60 sock_table = orig_sock_table;
61
62 for (i = 0; i < size; i++)
63 sock_table->ents[i] = RPS_NO_CPU;
64 } else
65 sock_table = NULL;
66
67 if (sock_table != orig_sock_table) {
68 rcu_assign_pointer(rps_sock_flow_table, sock_table);
69 synchronize_rcu();
70 vfree(orig_sock_table);
71 }
72 }
73
74 mutex_unlock(&sock_flow_mutex);
75
76 return ret;
77}
78#endif /* CONFIG_RPS */
79
20static struct ctl_table net_core_table[] = { 80static struct ctl_table net_core_table[] = {
21#ifdef CONFIG_NET 81#ifdef CONFIG_NET
22 { 82 {
@@ -82,6 +142,14 @@ static struct ctl_table net_core_table[] = {
82 .mode = 0644, 142 .mode = 0644,
83 .proc_handler = proc_dointvec 143 .proc_handler = proc_dointvec
84 }, 144 },
145#ifdef CONFIG_RPS
146 {
147 .procname = "rps_sock_flow_entries",
148 .maxlen = sizeof(int),
149 .mode = 0644,
150 .proc_handler = rps_sock_flow_sysctl
151 },
152#endif
85#endif /* CONFIG_NET */ 153#endif /* CONFIG_NET */
86 { 154 {
87 .procname = "netdev_budget", 155 .procname = "netdev_budget",
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 193dcd6ed64f..c5376c725503 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -419,6 +419,8 @@ int inet_release(struct socket *sock)
419 if (sk) { 419 if (sk) {
420 long timeout; 420 long timeout;
421 421
422 inet_rps_reset_flow(sk);
423
422 /* Applications forget to leave groups before exiting */ 424 /* Applications forget to leave groups before exiting */
423 ip_mc_drop_socket(sk); 425 ip_mc_drop_socket(sk);
424 426
@@ -720,6 +722,8 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
720{ 722{
721 struct sock *sk = sock->sk; 723 struct sock *sk = sock->sk;
722 724
725 inet_rps_record_flow(sk);
726
723 /* We may need to bind the socket. */ 727 /* We may need to bind the socket. */
724 if (!inet_sk(sk)->inet_num && inet_autobind(sk)) 728 if (!inet_sk(sk)->inet_num && inet_autobind(sk))
725 return -EAGAIN; 729 return -EAGAIN;
@@ -728,12 +732,13 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
728} 732}
729EXPORT_SYMBOL(inet_sendmsg); 733EXPORT_SYMBOL(inet_sendmsg);
730 734
731
732static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, 735static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
733 size_t size, int flags) 736 size_t size, int flags)
734{ 737{
735 struct sock *sk = sock->sk; 738 struct sock *sk = sock->sk;
736 739
740 inet_rps_record_flow(sk);
741
737 /* We may need to bind the socket. */ 742 /* We may need to bind the socket. */
738 if (!inet_sk(sk)->inet_num && inet_autobind(sk)) 743 if (!inet_sk(sk)->inet_num && inet_autobind(sk))
739 return -EAGAIN; 744 return -EAGAIN;
@@ -743,6 +748,22 @@ static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
743 return sock_no_sendpage(sock, page, offset, size, flags); 748 return sock_no_sendpage(sock, page, offset, size, flags);
744} 749}
745 750
751int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
752 size_t size, int flags)
753{
754 struct sock *sk = sock->sk;
755 int addr_len = 0;
756 int err;
757
758 inet_rps_record_flow(sk);
759
760 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
761 flags & ~MSG_DONTWAIT, &addr_len);
762 if (err >= 0)
763 msg->msg_namelen = addr_len;
764 return err;
765}
766EXPORT_SYMBOL(inet_recvmsg);
746 767
747int inet_shutdown(struct socket *sock, int how) 768int inet_shutdown(struct socket *sock, int how)
748{ 769{
@@ -872,7 +893,7 @@ const struct proto_ops inet_stream_ops = {
872 .setsockopt = sock_common_setsockopt, 893 .setsockopt = sock_common_setsockopt,
873 .getsockopt = sock_common_getsockopt, 894 .getsockopt = sock_common_getsockopt,
874 .sendmsg = tcp_sendmsg, 895 .sendmsg = tcp_sendmsg,
875 .recvmsg = sock_common_recvmsg, 896 .recvmsg = inet_recvmsg,
876 .mmap = sock_no_mmap, 897 .mmap = sock_no_mmap,
877 .sendpage = tcp_sendpage, 898 .sendpage = tcp_sendpage,
878 .splice_read = tcp_splice_read, 899 .splice_read = tcp_splice_read,
@@ -899,7 +920,7 @@ const struct proto_ops inet_dgram_ops = {
899 .setsockopt = sock_common_setsockopt, 920 .setsockopt = sock_common_setsockopt,
900 .getsockopt = sock_common_getsockopt, 921 .getsockopt = sock_common_getsockopt,
901 .sendmsg = inet_sendmsg, 922 .sendmsg = inet_sendmsg,
902 .recvmsg = sock_common_recvmsg, 923 .recvmsg = inet_recvmsg,
903 .mmap = sock_no_mmap, 924 .mmap = sock_no_mmap,
904 .sendpage = inet_sendpage, 925 .sendpage = inet_sendpage,
905#ifdef CONFIG_COMPAT 926#ifdef CONFIG_COMPAT
@@ -929,7 +950,7 @@ static const struct proto_ops inet_sockraw_ops = {
929 .setsockopt = sock_common_setsockopt, 950 .setsockopt = sock_common_setsockopt,
930 .getsockopt = sock_common_getsockopt, 951 .getsockopt = sock_common_getsockopt,
931 .sendmsg = inet_sendmsg, 952 .sendmsg = inet_sendmsg,
932 .recvmsg = sock_common_recvmsg, 953 .recvmsg = inet_recvmsg,
933 .mmap = sock_no_mmap, 954 .mmap = sock_no_mmap,
934 .sendpage = inet_sendpage, 955 .sendpage = inet_sendpage,
935#ifdef CONFIG_COMPAT 956#ifdef CONFIG_COMPAT
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a24995cdc4b6..ad08392a738c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1672,6 +1672,8 @@ process:
1672 1672
1673 skb->dev = NULL; 1673 skb->dev = NULL;
1674 1674
1675 inet_rps_save_rxhash(sk, skb->rxhash);
1676
1675 bh_lock_sock_nested(sk); 1677 bh_lock_sock_nested(sk);
1676 ret = 0; 1678 ret = 0;
1677 if (!sock_owned_by_user(sk)) { 1679 if (!sock_owned_by_user(sk)) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8fef859db35d..666b963496ff 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1217,6 +1217,7 @@ int udp_disconnect(struct sock *sk, int flags)
1217 sk->sk_state = TCP_CLOSE; 1217 sk->sk_state = TCP_CLOSE;
1218 inet->inet_daddr = 0; 1218 inet->inet_daddr = 0;
1219 inet->inet_dport = 0; 1219 inet->inet_dport = 0;
1220 inet_rps_save_rxhash(sk, 0);
1220 sk->sk_bound_dev_if = 0; 1221 sk->sk_bound_dev_if = 0;
1221 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 1222 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1222 inet_reset_saddr(sk); 1223 inet_reset_saddr(sk);
@@ -1258,8 +1259,12 @@ EXPORT_SYMBOL(udp_lib_unhash);
1258 1259
1259static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 1260static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1260{ 1261{
1261 int rc = sock_queue_rcv_skb(sk, skb); 1262 int rc;
1263
1264 if (inet_sk(sk)->inet_daddr)
1265 inet_rps_save_rxhash(sk, skb->rxhash);
1262 1266
1267 rc = sock_queue_rcv_skb(sk, skb);
1263 if (rc < 0) { 1268 if (rc < 0) {
1264 int is_udplite = IS_UDPLITE(sk); 1269 int is_udplite = IS_UDPLITE(sk);
1265 1270