diff options
author | Eric Dumazet <edumazet@google.com> | 2015-02-06 15:59:01 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-02-08 19:53:57 -0500 |
commit | 567e4b79731c352a17d73c483959f795d3593e03 (patch) | |
tree | 4af65c205a8b65cfc5fd7b42e7b8750728230616 | |
parent | 096a4cfa5807aa89c78ce12309c0b1c10cf88184 (diff) |
net: rfs: add hash collision detection
Receive Flow Steering is a nice solution but suffers from
hash collisions when a mix of connected and unconnected traffic
is received on the host, when the flow hash table is populated.
Also, clearing the flow in inet_release() makes RFS not very good
for short-lived flows, as many packets can follow close().
(FIN, ACK packets, ...)
This patch extends the information stored into global hash table
to not only include cpu number, but upper part of the hash value.
I use a 32bit value, and dynamically split it in two parts.
For hosts with fewer than 64 possible cpus, this gives 6 bits for the
cpu number, and 26 (32-6) bits for the upper part of the hash.
Since hash bucket selection uses the low-order bits of the hash, we have
a full hash match, provided /proc/sys/net/core/rps_sock_flow_entries is big
enough.
If the hash found in flow table does not match, we fallback to RPS (if
it is enabled for the rxqueue).
This means that a packet for a non-connected flow can avoid an
IPI through an unrelated/victim CPU.
This also means we no longer have to clear the table at socket
close time, and this helps short lived flows performance.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | drivers/net/tun.c | 5 | ||||
-rw-r--r-- | include/linux/netdevice.h | 34 | ||||
-rw-r--r-- | include/net/sock.h | 24 | ||||
-rw-r--r-- | net/core/dev.c | 48 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 2 | ||||
-rw-r--r-- | net/ipv4/af_inet.c | 2 |
6 files changed, 47 insertions, 68 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ad7d3d5f3ee5..857dca47bf80 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c | |||
@@ -256,7 +256,6 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) | |||
256 | { | 256 | { |
257 | tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", | 257 | tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", |
258 | e->rxhash, e->queue_index); | 258 | e->rxhash, e->queue_index); |
259 | sock_rps_reset_flow_hash(e->rps_rxhash); | ||
260 | hlist_del_rcu(&e->hash_link); | 259 | hlist_del_rcu(&e->hash_link); |
261 | kfree_rcu(e, rcu); | 260 | kfree_rcu(e, rcu); |
262 | --tun->flow_count; | 261 | --tun->flow_count; |
@@ -373,10 +372,8 @@ unlock: | |||
373 | */ | 372 | */ |
374 | static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) | 373 | static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) |
375 | { | 374 | { |
376 | if (unlikely(e->rps_rxhash != hash)) { | 375 | if (unlikely(e->rps_rxhash != hash)) |
377 | sock_rps_reset_flow_hash(e->rps_rxhash); | ||
378 | e->rps_rxhash = hash; | 376 | e->rps_rxhash = hash; |
379 | } | ||
380 | } | 377 | } |
381 | 378 | ||
382 | /* We try to identify a flow through its rxhash first. The reason that | 379 | /* We try to identify a flow through its rxhash first. The reason that |
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce784d5018e0..ab3b7cef4638 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h | |||
@@ -644,39 +644,39 @@ struct rps_dev_flow_table { | |||
644 | /* | 644 | /* |
645 | * The rps_sock_flow_table contains mappings of flows to the last CPU | 645 | * The rps_sock_flow_table contains mappings of flows to the last CPU |
646 | * on which they were processed by the application (set in recvmsg). | 646 | * on which they were processed by the application (set in recvmsg). |
647 | * Each entry is a 32bit value. Upper part is the high order bits | ||
648 | * of flow hash, lower part is cpu number. | ||
649 | * rps_cpu_mask is used to partition the space, depending on number of | ||
650 | * possible cpus : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 | ||
651 | * For example, if 64 cpus are possible, rps_cpu_mask = 0x3f, | ||
652 | * meaning we use 32-6=26 bits for the hash. | ||
647 | */ | 653 | */ |
648 | struct rps_sock_flow_table { | 654 | struct rps_sock_flow_table { |
649 | unsigned int mask; | 655 | u32 mask; |
650 | u16 ents[0]; | 656 | u32 ents[0]; |
651 | }; | 657 | }; |
652 | #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \ | 658 | #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) |
653 | ((_num) * sizeof(u16))) | ||
654 | 659 | ||
655 | #define RPS_NO_CPU 0xffff | 660 | #define RPS_NO_CPU 0xffff |
656 | 661 | ||
662 | extern u32 rps_cpu_mask; | ||
663 | extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; | ||
664 | |||
657 | static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, | 665 | static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, |
658 | u32 hash) | 666 | u32 hash) |
659 | { | 667 | { |
660 | if (table && hash) { | 668 | if (table && hash) { |
661 | unsigned int cpu, index = hash & table->mask; | 669 | unsigned int index = hash & table->mask; |
670 | u32 val = hash & ~rps_cpu_mask; | ||
662 | 671 | ||
663 | /* We only give a hint, preemption can change cpu under us */ | 672 | /* We only give a hint, preemption can change cpu under us */ |
664 | cpu = raw_smp_processor_id(); | 673 | val |= raw_smp_processor_id(); |
665 | 674 | ||
666 | if (table->ents[index] != cpu) | 675 | if (table->ents[index] != val) |
667 | table->ents[index] = cpu; | 676 | table->ents[index] = val; |
668 | } | 677 | } |
669 | } | 678 | } |
670 | 679 | ||
671 | static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table, | ||
672 | u32 hash) | ||
673 | { | ||
674 | if (table && hash) | ||
675 | table->ents[hash & table->mask] = RPS_NO_CPU; | ||
676 | } | ||
677 | |||
678 | extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; | ||
679 | |||
680 | #ifdef CONFIG_RFS_ACCEL | 680 | #ifdef CONFIG_RFS_ACCEL |
681 | bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, | 681 | bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, |
682 | u16 filter_id); | 682 | u16 filter_id); |
diff --git a/include/net/sock.h b/include/net/sock.h index d28b8fededd6..e13824570b0f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h | |||
@@ -857,18 +857,6 @@ static inline void sock_rps_record_flow_hash(__u32 hash) | |||
857 | #endif | 857 | #endif |
858 | } | 858 | } |
859 | 859 | ||
860 | static inline void sock_rps_reset_flow_hash(__u32 hash) | ||
861 | { | ||
862 | #ifdef CONFIG_RPS | ||
863 | struct rps_sock_flow_table *sock_flow_table; | ||
864 | |||
865 | rcu_read_lock(); | ||
866 | sock_flow_table = rcu_dereference(rps_sock_flow_table); | ||
867 | rps_reset_sock_flow(sock_flow_table, hash); | ||
868 | rcu_read_unlock(); | ||
869 | #endif | ||
870 | } | ||
871 | |||
872 | static inline void sock_rps_record_flow(const struct sock *sk) | 860 | static inline void sock_rps_record_flow(const struct sock *sk) |
873 | { | 861 | { |
874 | #ifdef CONFIG_RPS | 862 | #ifdef CONFIG_RPS |
@@ -876,28 +864,18 @@ static inline void sock_rps_record_flow(const struct sock *sk) | |||
876 | #endif | 864 | #endif |
877 | } | 865 | } |
878 | 866 | ||
879 | static inline void sock_rps_reset_flow(const struct sock *sk) | ||
880 | { | ||
881 | #ifdef CONFIG_RPS | ||
882 | sock_rps_reset_flow_hash(sk->sk_rxhash); | ||
883 | #endif | ||
884 | } | ||
885 | |||
886 | static inline void sock_rps_save_rxhash(struct sock *sk, | 867 | static inline void sock_rps_save_rxhash(struct sock *sk, |
887 | const struct sk_buff *skb) | 868 | const struct sk_buff *skb) |
888 | { | 869 | { |
889 | #ifdef CONFIG_RPS | 870 | #ifdef CONFIG_RPS |
890 | if (unlikely(sk->sk_rxhash != skb->hash)) { | 871 | if (unlikely(sk->sk_rxhash != skb->hash)) |
891 | sock_rps_reset_flow(sk); | ||
892 | sk->sk_rxhash = skb->hash; | 872 | sk->sk_rxhash = skb->hash; |
893 | } | ||
894 | #endif | 873 | #endif |
895 | } | 874 | } |
896 | 875 | ||
897 | static inline void sock_rps_reset_rxhash(struct sock *sk) | 876 | static inline void sock_rps_reset_rxhash(struct sock *sk) |
898 | { | 877 | { |
899 | #ifdef CONFIG_RPS | 878 | #ifdef CONFIG_RPS |
900 | sock_rps_reset_flow(sk); | ||
901 | sk->sk_rxhash = 0; | 879 | sk->sk_rxhash = 0; |
902 | #endif | 880 | #endif |
903 | } | 881 | } |
diff --git a/net/core/dev.c b/net/core/dev.c index a3a96ffc67f4..8be38675e1a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -3030,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd, | |||
3030 | /* One global table that all flow-based protocols share. */ | 3030 | /* One global table that all flow-based protocols share. */ |
3031 | struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; | 3031 | struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; |
3032 | EXPORT_SYMBOL(rps_sock_flow_table); | 3032 | EXPORT_SYMBOL(rps_sock_flow_table); |
3033 | u32 rps_cpu_mask __read_mostly; | ||
3034 | EXPORT_SYMBOL(rps_cpu_mask); | ||
3033 | 3035 | ||
3034 | struct static_key rps_needed __read_mostly; | 3036 | struct static_key rps_needed __read_mostly; |
3035 | 3037 | ||
@@ -3086,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, | |||
3086 | static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, | 3088 | static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, |
3087 | struct rps_dev_flow **rflowp) | 3089 | struct rps_dev_flow **rflowp) |
3088 | { | 3090 | { |
3089 | struct netdev_rx_queue *rxqueue; | 3091 | const struct rps_sock_flow_table *sock_flow_table; |
3090 | struct rps_map *map; | 3092 | struct netdev_rx_queue *rxqueue = dev->_rx; |
3091 | struct rps_dev_flow_table *flow_table; | 3093 | struct rps_dev_flow_table *flow_table; |
3092 | struct rps_sock_flow_table *sock_flow_table; | 3094 | struct rps_map *map; |
3093 | int cpu = -1; | 3095 | int cpu = -1; |
3094 | u16 tcpu; | 3096 | u32 tcpu; |
3095 | u32 hash; | 3097 | u32 hash; |
3096 | 3098 | ||
3097 | if (skb_rx_queue_recorded(skb)) { | 3099 | if (skb_rx_queue_recorded(skb)) { |
3098 | u16 index = skb_get_rx_queue(skb); | 3100 | u16 index = skb_get_rx_queue(skb); |
3101 | |||
3099 | if (unlikely(index >= dev->real_num_rx_queues)) { | 3102 | if (unlikely(index >= dev->real_num_rx_queues)) { |
3100 | WARN_ONCE(dev->real_num_rx_queues > 1, | 3103 | WARN_ONCE(dev->real_num_rx_queues > 1, |
3101 | "%s received packet on queue %u, but number " | 3104 | "%s received packet on queue %u, but number " |
@@ -3103,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, | |||
3103 | dev->name, index, dev->real_num_rx_queues); | 3106 | dev->name, index, dev->real_num_rx_queues); |
3104 | goto done; | 3107 | goto done; |
3105 | } | 3108 | } |
3106 | rxqueue = dev->_rx + index; | 3109 | rxqueue += index; |
3107 | } else | 3110 | } |
3108 | rxqueue = dev->_rx; | ||
3109 | 3111 | ||
3112 | /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ | ||
3113 | |||
3114 | flow_table = rcu_dereference(rxqueue->rps_flow_table); | ||
3110 | map = rcu_dereference(rxqueue->rps_map); | 3115 | map = rcu_dereference(rxqueue->rps_map); |
3111 | if (map) { | 3116 | if (!flow_table && !map) |
3112 | if (map->len == 1 && | ||
3113 | !rcu_access_pointer(rxqueue->rps_flow_table)) { | ||
3114 | tcpu = map->cpus[0]; | ||
3115 | if (cpu_online(tcpu)) | ||
3116 | cpu = tcpu; | ||
3117 | goto done; | ||
3118 | } | ||
3119 | } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { | ||
3120 | goto done; | 3117 | goto done; |
3121 | } | ||
3122 | 3118 | ||
3123 | skb_reset_network_header(skb); | 3119 | skb_reset_network_header(skb); |
3124 | hash = skb_get_hash(skb); | 3120 | hash = skb_get_hash(skb); |
3125 | if (!hash) | 3121 | if (!hash) |
3126 | goto done; | 3122 | goto done; |
3127 | 3123 | ||
3128 | flow_table = rcu_dereference(rxqueue->rps_flow_table); | ||
3129 | sock_flow_table = rcu_dereference(rps_sock_flow_table); | 3124 | sock_flow_table = rcu_dereference(rps_sock_flow_table); |
3130 | if (flow_table && sock_flow_table) { | 3125 | if (flow_table && sock_flow_table) { |
3131 | u16 next_cpu; | ||
3132 | struct rps_dev_flow *rflow; | 3126 | struct rps_dev_flow *rflow; |
3127 | u32 next_cpu; | ||
3128 | u32 ident; | ||
3129 | |||
3130 | /* First check into global flow table if there is a match */ | ||
3131 | ident = sock_flow_table->ents[hash & sock_flow_table->mask]; | ||
3132 | if ((ident ^ hash) & ~rps_cpu_mask) | ||
3133 | goto try_rps; | ||
3133 | 3134 | ||
3135 | next_cpu = ident & rps_cpu_mask; | ||
3136 | |||
3137 | /* OK, now we know there is a match, | ||
3138 | * we can look at the local (per receive queue) flow table | ||
3139 | */ | ||
3134 | rflow = &flow_table->flows[hash & flow_table->mask]; | 3140 | rflow = &flow_table->flows[hash & flow_table->mask]; |
3135 | tcpu = rflow->cpu; | 3141 | tcpu = rflow->cpu; |
3136 | 3142 | ||
3137 | next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask]; | ||
3138 | |||
3139 | /* | 3143 | /* |
3140 | * If the desired CPU (where last recvmsg was done) is | 3144 | * If the desired CPU (where last recvmsg was done) is |
3141 | * different from current CPU (one in the rx-queue flow | 3145 | * different from current CPU (one in the rx-queue flow |
@@ -3162,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, | |||
3162 | } | 3166 | } |
3163 | } | 3167 | } |
3164 | 3168 | ||
3169 | try_rps: | ||
3170 | |||
3165 | if (map) { | 3171 | if (map) { |
3166 | tcpu = map->cpus[reciprocal_scale(hash, map->len)]; | 3172 | tcpu = map->cpus[reciprocal_scale(hash, map->len)]; |
3167 | if (cpu_online(tcpu)) { | 3173 | if (cpu_online(tcpu)) { |
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index fde21d19e61b..7a31be5e361f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c | |||
@@ -65,7 +65,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, | |||
65 | mutex_unlock(&sock_flow_mutex); | 65 | mutex_unlock(&sock_flow_mutex); |
66 | return -ENOMEM; | 66 | return -ENOMEM; |
67 | } | 67 | } |
68 | 68 | rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; | |
69 | sock_table->mask = size - 1; | 69 | sock_table->mask = size - 1; |
70 | } else | 70 | } else |
71 | sock_table = orig_sock_table; | 71 | sock_table = orig_sock_table; |
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a44773c8346c..d2e49baaff63 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
@@ -395,8 +395,6 @@ int inet_release(struct socket *sock) | |||
395 | if (sk) { | 395 | if (sk) { |
396 | long timeout; | 396 | long timeout; |
397 | 397 | ||
398 | sock_rps_reset_flow(sk); | ||
399 | |||
400 | /* Applications forget to leave groups before exiting */ | 398 | /* Applications forget to leave groups before exiting */ |
401 | ip_mc_drop_socket(sk); | 399 | ip_mc_drop_socket(sk); |
402 | 400 | ||