author		Eric Dumazet <edumazet@google.com>	2015-02-06 15:59:01 -0500
committer	David S. Miller <davem@davemloft.net>	2015-02-08 19:53:57 -0500
commit		567e4b79731c352a17d73c483959f795d3593e03 (patch)
tree		4af65c205a8b65cfc5fd7b42e7b8750728230616 /net/core
parent		096a4cfa5807aa89c78ce12309c0b1c10cf88184 (diff)
net: rfs: add hash collision detection
Receive Flow Steering is a nice solution but suffers from hash
collisions when a mix of connected and unconnected traffic is received
on the host and the flow hash table is populated.

Also, clearing the flow in inet_release() makes RFS not very good for
short-lived flows, as many packets can follow close()
(FIN, ACK packets, ...).

This patch extends the information stored in the global hash table to
include not only the cpu number but also the upper part of the hash
value.

I use a 32bit value and dynamically split it into two parts.

For hosts with fewer than 64 possible cpus, this gives 6 bits for the
cpu number and 26 (32-6) bits for the upper part of the hash.

Since hash bucket selection uses the low order bits of the hash, we
get a full hash match if /proc/sys/net/core/rps_sock_flow_entries is
big enough.

If the hash found in the flow table does not match, we fall back to
RPS (if it is enabled for the rxqueue).

This means that a packet for a non-connected flow can avoid an IPI
through an unrelated/victim CPU.

This also means we no longer have to clear the table at socket close
time, which helps short-lived flow performance.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
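For reference, below is a minimal user-space sketch of the entry encoding and
collision check described in the commit message. It is not the kernel
implementation: NR_CPU_IDS and TABLE_SIZE are made-up stand-ins for
nr_cpu_ids and rps_sock_flow_entries, and record_flow()/lookup_flow() are
illustrative helpers only.

/*
 * Sketch of RFS hash collision detection: each table entry stores
 * "upper hash bits | cpu number", split by a dynamically computed mask.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_CPU_IDS	64		/* assumed number of possible CPUs */
#define TABLE_SIZE	4096		/* assumed rps_sock_flow_entries (power of two) */

static uint32_t rps_cpu_mask;		/* low bits of an entry hold the cpu number */
static uint32_t ents[TABLE_SIZE];	/* global sock flow table */

/* smallest power of two >= nr_cpu_ids, minus one: 64 -> 0x3f (6 cpu bits) */
static uint32_t cpu_mask(uint32_t nr_cpu_ids)
{
	uint32_t m = 1;

	while (m < nr_cpu_ids)
		m <<= 1;
	return m - 1;
}

/* recvmsg path: record "upper hash bits | cpu" for this flow */
static void record_flow(uint32_t hash, uint32_t cpu)
{
	ents[hash & (TABLE_SIZE - 1)] = (hash & ~rps_cpu_mask) | cpu;
}

/* rx path: return the desired cpu, or -1 to fall back to plain RPS */
static int lookup_flow(uint32_t hash)
{
	uint32_t ident = ents[hash & (TABLE_SIZE - 1)];

	if ((ident ^ hash) & ~rps_cpu_mask)	/* upper bits differ: miss or collision */
		return -1;
	return ident & rps_cpu_mask;
}

int main(void)
{
	rps_cpu_mask = cpu_mask(NR_CPU_IDS);	/* 0x3f for 64 cpus */

	record_flow(0xdeadbeef, 3);
	printf("match -> cpu %d\n", lookup_flow(0xdeadbeef));	/* prints 3 */
	printf("miss  -> %d\n", lookup_flow(0x12345678));	/* prints -1, use RPS */
	return 0;
}

With 64 possible cpus the mask is 0x3f, so an entry carries 6 cpu bits and 26
upper hash bits; any mismatch in those upper bits is treated as a collision
and the lookup falls back to plain RPS instead of steering to an unrelated CPU.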
Diffstat (limited to 'net/core')
-rw-r--r--	net/core/dev.c			| 48
-rw-r--r--	net/core/sysctl_net_core.c	|  2
2 files changed, 28 insertions, 22 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index a3a96ffc67f4..8be38675e1a8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3030,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 /* One global table that all flow-based protocols share. */
 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
+u32 rps_cpu_mask __read_mostly;
+EXPORT_SYMBOL(rps_cpu_mask);
 
 struct static_key rps_needed __read_mostly;
 
@@ -3086,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		       struct rps_dev_flow **rflowp)
 {
-	struct netdev_rx_queue *rxqueue;
-	struct rps_map *map;
+	const struct rps_sock_flow_table *sock_flow_table;
+	struct netdev_rx_queue *rxqueue = dev->_rx;
 	struct rps_dev_flow_table *flow_table;
-	struct rps_sock_flow_table *sock_flow_table;
+	struct rps_map *map;
 	int cpu = -1;
-	u16 tcpu;
+	u32 tcpu;
 	u32 hash;
 
 	if (skb_rx_queue_recorded(skb)) {
 		u16 index = skb_get_rx_queue(skb);
+
 		if (unlikely(index >= dev->real_num_rx_queues)) {
 			WARN_ONCE(dev->real_num_rx_queues > 1,
 				  "%s received packet on queue %u, but number "
@@ -3103,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 				  dev->name, index, dev->real_num_rx_queues);
 			goto done;
 		}
-		rxqueue = dev->_rx + index;
-	} else
-		rxqueue = dev->_rx;
+		rxqueue += index;
+	}
 
+	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
+
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	map = rcu_dereference(rxqueue->rps_map);
-	if (map) {
-		if (map->len == 1 &&
-		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
-			tcpu = map->cpus[0];
-			if (cpu_online(tcpu))
-				cpu = tcpu;
-			goto done;
-		}
-	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
+	if (!flow_table && !map)
 		goto done;
-	}
 
 	skb_reset_network_header(skb);
 	hash = skb_get_hash(skb);
 	if (!hash)
 		goto done;
 
-	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {
-		u16 next_cpu;
 		struct rps_dev_flow *rflow;
+		u32 next_cpu;
+		u32 ident;
+
+		/* First check into global flow table if there is a match */
+		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
+		if ((ident ^ hash) & ~rps_cpu_mask)
+			goto try_rps;
 
+		next_cpu = ident & rps_cpu_mask;
+
+		/* OK, now we know there is a match,
+		 * we can look at the local (per receive queue) flow table
+		 */
 		rflow = &flow_table->flows[hash & flow_table->mask];
 		tcpu = rflow->cpu;
 
-		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
-
 		/*
 		 * If the desired CPU (where last recvmsg was done) is
 		 * different from current CPU (one in the rx-queue flow
@@ -3162,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		}
 	}
 
+try_rps:
+
 	if (map) {
 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 		if (cpu_online(tcpu)) {
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index fde21d19e61b..7a31be5e361f 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -65,7 +65,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 					mutex_unlock(&sock_flow_mutex);
 					return -ENOMEM;
 				}
-
+				rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
 				sock_table->mask = size - 1;
 			} else
 				sock_table = orig_sock_table;