author     Eric Dumazet <edumazet@google.com>        2015-02-06 15:59:01 -0500
committer  David S. Miller <davem@davemloft.net>     2015-02-08 19:53:57 -0500
commit     567e4b79731c352a17d73c483959f795d3593e03 (patch)
tree       4af65c205a8b65cfc5fd7b42e7b8750728230616
parent     096a4cfa5807aa89c78ce12309c0b1c10cf88184 (diff)
net: rfs: add hash collision detection
Receive Flow Steering is a nice solution, but it suffers from hash collisions when a mix of connected and unconnected traffic is received on the host and the flow hash table is populated.

Also, clearing the flow in inet_release() makes RFS not very good for short-lived flows, as many packets can follow close() (FIN, ACK packets, ...).

This patch extends the information stored in the global hash table so that each entry holds not only the cpu number but also the upper part of the hash value.

I use a 32bit value and dynamically split it in two parts. For a host with fewer than 64 possible cpus, this gives 6 bits for the cpu number and 26 (32-6) bits for the upper part of the hash.

Since hash bucket selection uses the low order bits of the hash, we have a full hash match if /proc/sys/net/core/rps_sock_flow_entries is big enough.

If the hash found in the flow table does not match, we fall back to RPS (if it is enabled for the rxqueue). This means that a packet for a non-connected flow can avoid the IPI through an unrelated/victim CPU.

This also means we no longer have to clear the table at socket close time, which helps short-lived flow performance.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--  drivers/net/tun.c             5
-rw-r--r--  include/linux/netdevice.h    34
-rw-r--r--  include/net/sock.h           24
-rw-r--r--  net/core/dev.c               48
-rw-r--r--  net/core/sysctl_net_core.c    2
-rw-r--r--  net/ipv4/af_inet.c            2
6 files changed, 47 insertions(+), 68 deletions(-)
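Before the diff itself, here is a minimal stand-alone C sketch (not part of the patch) of the entry encoding the commit message describes: the low rps_cpu_mask bits of each 32bit table entry hold the cpu number, the remaining upper bits hold the high part of the flow hash, and a lookup rejects an entry whose upper bits do not match before trusting its cpu hint. The names mirror the kernel code, but the 64-cpu mask, the helper functions, and the hash values are made up for illustration.

/* Illustrative user-space sketch of the rps_sock_flow_table entry encoding.
 * The real code lives in include/linux/netdevice.h and net/core/dev.c.
 */
#include <stdint.h>
#include <stdio.h>

/* Assuming 64 possible cpus: roundup_pow_of_two(64) - 1 = 0x3f,
 * which leaves the upper 26 bits of each entry for the flow hash. */
static const uint32_t rps_cpu_mask = 0x3f;

/* Record side: same idea as rps_record_sock_flow() after this patch. */
static uint32_t make_entry(uint32_t hash, uint32_t cpu)
{
	return (hash & ~rps_cpu_mask) | (cpu & rps_cpu_mask);
}

/* Lookup side: same check as get_rps_cpu(); returns -1 on a hash
 * collision (upper bits differ), otherwise the recorded cpu. */
static int lookup_cpu(uint32_t entry, uint32_t hash)
{
	if ((entry ^ hash) & ~rps_cpu_mask)
		return -1;	/* collision: fall back to plain RPS */
	return (int)(entry & rps_cpu_mask);
}

int main(void)
{
	uint32_t hash_a = 0x9e3779b9;	/* made-up flow hashes */
	uint32_t hash_b = 0x7f4a7c15;
	uint32_t entry  = make_entry(hash_a, 23);	/* recvmsg ran on cpu 23 */

	printf("flow A -> cpu %d\n", lookup_cpu(entry, hash_a));	/* 23 */
	printf("flow B -> cpu %d\n", lookup_cpu(entry, hash_b));	/* -1 */
	return 0;
}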
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ad7d3d5f3ee5..857dca47bf80 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -256,7 +256,6 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
 {
 	tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
 		  e->rxhash, e->queue_index);
-	sock_rps_reset_flow_hash(e->rps_rxhash);
 	hlist_del_rcu(&e->hash_link);
 	kfree_rcu(e, rcu);
 	--tun->flow_count;
@@ -373,10 +372,8 @@ unlock:
  */
 static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
 {
-	if (unlikely(e->rps_rxhash != hash)) {
-		sock_rps_reset_flow_hash(e->rps_rxhash);
+	if (unlikely(e->rps_rxhash != hash))
 		e->rps_rxhash = hash;
-	}
 }
 
 /* We try to identify a flow through its rxhash first. The reason that
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ce784d5018e0..ab3b7cef4638 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -644,39 +644,39 @@ struct rps_dev_flow_table {
 /*
  * The rps_sock_flow_table contains mappings of flows to the last CPU
  * on which they were processed by the application (set in recvmsg).
+ * Each entry is a 32bit value. Upper part is the high order bits
+ * of flow hash, lower part is cpu number.
+ * rps_cpu_mask is used to partition the space, depending on number of
+ * possible cpus : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
+ * For example, if 64 cpus are possible, rps_cpu_mask = 0x3f,
+ * meaning we use 32-6=26 bits for the hash.
  */
 struct rps_sock_flow_table {
-	unsigned int mask;
-	u16 ents[0];
+	u32 mask;
+	u32 ents[0];
 };
-#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \
-    ((_num) * sizeof(u16)))
+#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
 
 #define RPS_NO_CPU 0xffff
 
+extern u32 rps_cpu_mask;
+extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+
 static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
 					u32 hash)
 {
 	if (table && hash) {
-		unsigned int cpu, index = hash & table->mask;
+		unsigned int index = hash & table->mask;
+		u32 val = hash & ~rps_cpu_mask;
 
 		/* We only give a hint, preemption can change cpu under us */
-		cpu = raw_smp_processor_id();
+		val |= raw_smp_processor_id();
 
-		if (table->ents[index] != cpu)
-			table->ents[index] = cpu;
+		if (table->ents[index] != val)
+			table->ents[index] = val;
 	}
 }
 
-static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
-				       u32 hash)
-{
-	if (table && hash)
-		table->ents[hash & table->mask] = RPS_NO_CPU;
-}
-
-extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
-
 #ifdef CONFIG_RFS_ACCEL
 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
 			 u16 filter_id);
diff --git a/include/net/sock.h b/include/net/sock.h
index d28b8fededd6..e13824570b0f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -857,18 +857,6 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 #endif
 }
 
-static inline void sock_rps_reset_flow_hash(__u32 hash)
-{
-#ifdef CONFIG_RPS
-	struct rps_sock_flow_table *sock_flow_table;
-
-	rcu_read_lock();
-	sock_flow_table = rcu_dereference(rps_sock_flow_table);
-	rps_reset_sock_flow(sock_flow_table, hash);
-	rcu_read_unlock();
-#endif
-}
-
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
@@ -876,28 +864,18 @@ static inline void sock_rps_record_flow(const struct sock *sk)
 #endif
 }
 
-static inline void sock_rps_reset_flow(const struct sock *sk)
-{
-#ifdef CONFIG_RPS
-	sock_rps_reset_flow_hash(sk->sk_rxhash);
-#endif
-}
-
 static inline void sock_rps_save_rxhash(struct sock *sk,
 					const struct sk_buff *skb)
 {
 #ifdef CONFIG_RPS
-	if (unlikely(sk->sk_rxhash != skb->hash)) {
-		sock_rps_reset_flow(sk);
+	if (unlikely(sk->sk_rxhash != skb->hash))
 		sk->sk_rxhash = skb->hash;
-	}
 #endif
 }
 
 static inline void sock_rps_reset_rxhash(struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	sock_rps_reset_flow(sk);
 	sk->sk_rxhash = 0;
 #endif
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index a3a96ffc67f4..8be38675e1a8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3030,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 /* One global table that all flow-based protocols share. */
 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
+u32 rps_cpu_mask __read_mostly;
+EXPORT_SYMBOL(rps_cpu_mask);
 
 struct static_key rps_needed __read_mostly;
 
@@ -3086,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		       struct rps_dev_flow **rflowp)
 {
-	struct netdev_rx_queue *rxqueue;
-	struct rps_map *map;
+	const struct rps_sock_flow_table *sock_flow_table;
+	struct netdev_rx_queue *rxqueue = dev->_rx;
 	struct rps_dev_flow_table *flow_table;
-	struct rps_sock_flow_table *sock_flow_table;
+	struct rps_map *map;
 	int cpu = -1;
-	u16 tcpu;
+	u32 tcpu;
 	u32 hash;
 
 	if (skb_rx_queue_recorded(skb)) {
 		u16 index = skb_get_rx_queue(skb);
+
 		if (unlikely(index >= dev->real_num_rx_queues)) {
 			WARN_ONCE(dev->real_num_rx_queues > 1,
 				  "%s received packet on queue %u, but number "
@@ -3103,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 				  dev->name, index, dev->real_num_rx_queues);
 			goto done;
 		}
-		rxqueue = dev->_rx + index;
-	} else
-		rxqueue = dev->_rx;
+		rxqueue += index;
+	}
 
+	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
+
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	map = rcu_dereference(rxqueue->rps_map);
-	if (map) {
-		if (map->len == 1 &&
-		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
-			tcpu = map->cpus[0];
-			if (cpu_online(tcpu))
-				cpu = tcpu;
-			goto done;
-		}
-	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
+	if (!flow_table && !map)
 		goto done;
-	}
 
 	skb_reset_network_header(skb);
 	hash = skb_get_hash(skb);
 	if (!hash)
 		goto done;
 
-	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {
-		u16 next_cpu;
 		struct rps_dev_flow *rflow;
+		u32 next_cpu;
+		u32 ident;
+
+		/* First check into global flow table if there is a match */
+		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
+		if ((ident ^ hash) & ~rps_cpu_mask)
+			goto try_rps;
 
+		next_cpu = ident & rps_cpu_mask;
+
+		/* OK, now we know there is a match,
+		 * we can look at the local (per receive queue) flow table
+		 */
 		rflow = &flow_table->flows[hash & flow_table->mask];
 		tcpu = rflow->cpu;
 
-		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
-
 		/*
 		 * If the desired CPU (where last recvmsg was done) is
 		 * different from current CPU (one in the rx-queue flow
@@ -3162,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		}
 	}
 
+try_rps:
+
 	if (map) {
 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 		if (cpu_online(tcpu)) {
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index fde21d19e61b..7a31be5e361f 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -65,7 +65,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 			mutex_unlock(&sock_flow_mutex);
 			return -ENOMEM;
 		}
-
+		rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
 		sock_table->mask = size - 1;
 	} else
 		sock_table = orig_sock_table;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a44773c8346c..d2e49baaff63 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -395,8 +395,6 @@ int inet_release(struct socket *sock)
 	if (sk) {
 		long timeout;
 
-		sock_rps_reset_flow(sk);
-
 		/* Applications forget to leave groups before exiting */
 		ip_mc_drop_socket(sk);
 