diff options
author | Tom Herbert <therbert@google.com> | 2010-04-16 19:01:27 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-04-16 19:01:27 -0400 |
commit | fec5e652e58fa6017b2c9e06466cb2a6538de5b4 (patch) | |
tree | e034f2a1e7930a0a225bd30896f834ec5e09c084 /net/core/dev.c | |
parent | b5d43998234331b9c01bd2165fdbb25115f4387f (diff) |
rfs: Receive Flow Steering
This patch implements receive flow steering (RFS). RFS steers
received packets for layer 3 and 4 processing to the CPU where
the application for the corresponding flow is running. RFS is an
extension of Receive Packet Steering (RPS).
The basic idea of RFS is that when an application calls recvmsg
(or sendmsg) the application's running CPU is stored in a hash
table that is indexed by the connection's rxhash which is stored in
the socket structure. The rxhash is passed in skb's received on
the connection from netif_receive_skb. For each received packet,
the associated rxhash is used to look up the CPU in the hash table,
if a valid CPU is set then the packet is steered to that CPU using
the RPS mechanisms.
The complication of the simple approach is that it would potentially
allow OOO packets. If threads are thrashing around CPUs or multiple
threads are trying to read from the same sockets, a quickly changing
CPU value in the hash table could cause rampant OOO packets--
we consider this a non-starter.
To avoid OOO packets, this solution implements two types of hash
tables: rps_sock_flow_table and rps_dev_flow_table.
rps_sock_flow_table is a global hash table. Each entry is just a CPU
number and it is populated in recvmsg and sendmsg as described above.
This table contains the "desired" CPUs for flows.
rps_dev_flow_table is specific to each device queue. Each entry
contains a CPU and a tail queue counter. The CPU is the "current"
CPU for a matching flow. The tail queue counter holds the value
of a tail queue counter for the associated CPU's backlog queue at
the time of last enqueue for a flow matching the entry.
Each backlog queue has a queue head counter which is incremented
on dequeue, and so a queue tail counter is computed as queue head
count + queue length. When a packet is enqueued on a backlog queue,
the current value of the queue tail counter is saved in the hash
entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu)
the rps_sock_flow table and the rps_dev_flow table for the RX queue
are consulted. When the desired CPU for the flow (found in the
rps_sock_flow table) does not match the current CPU (found in the
rps_dev_flow table), the current CPU is changed to the desired CPU
if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the
rps_dev_flow table. This checks if the queue head has advanced
beyond the last packet that was enqueued using this table entry.
This guarantees that all packets queued using this entry have been
dequeued, thus preserving in order delivery.
Making each queue have its own rps_dev_flow table has two advantages:
1) the tail queue counters will be written on each receive, so
keeping the table local to the interrupting CPU is good for locality. 2)
this allows lockless access to the table-- the CPU number and queue
tail counter need to be accessed together under mutual exclusion
from netif_receive_skb, we assume that this is only called from
device napi_poll which is non-reentrant.
This patch implements RFS for TCP and connected UDP sockets.
It should be usable for other flow oriented protocols.
There are two configuration parameters for RFS. The
"rps_flow_entries" kernel init parameter sets the number of
entries in the rps_sock_flow_table, the per rxqueue sysfs entry
"rps_flow_cnt" contains the number of entries in the rps_dev_flow
table for the rxqueue. Both are rounded to power of two.
The obvious benefit of RFS (over just RPS) is that it achieves
CPU locality between the receive processing for a flow and the
applications processing; this can result in increased performance
(higher pps, lower latency).
The benefits of RFS are dependent on cache hierarchy, application
load, and other factors. On simple benchmarks, we don't necessarily
see improvement and sometimes see degradation. However, for more
complex benchmarks and for applications where cache pressure is
much higher this technique seems to perform very well.
Below are some benchmark results which show the potential benefit of
this patch. The netperf test has 500 instances of netperf TCP_RR
test with 1 byte req. and resp. The RPC test is a request/response
test similar in structure to netperf RR test with 100 threads on
each host, but does more work in userspace than netperf.
e1000e on 8 core Intel
No RFS or RPS 104K tps at 30% CPU
No RFS (best RPS config): 290K tps at 63% CPU
RFS 303K tps at 61% CPU
RPC test tps CPU% 50/90/99% usec latency Latency StdDev
No RFS/RPS 103K 48% 757/900/3185 4472.35
RPS only: 174K 73% 415/993/2468 491.66
RFS 223K 73% 379/651/1382 315.61
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/core/dev.c')
-rw-r--r-- | net/core/dev.c | 111 |
1 files changed, 91 insertions, 20 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index e8041eb76ac1..d7107ac835fa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -2203,19 +2203,28 @@ int weight_p __read_mostly = 64; /* old backlog weight */ | |||
2203 | DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; | 2203 | DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; |
2204 | 2204 | ||
2205 | #ifdef CONFIG_RPS | 2205 | #ifdef CONFIG_RPS |
2206 | |||
2207 | /* One global table that all flow-based protocols share. */ | ||
2208 | struct rps_sock_flow_table *rps_sock_flow_table; | ||
2209 | EXPORT_SYMBOL(rps_sock_flow_table); | ||
2210 | |||
2206 | /* | 2211 | /* |
2207 | * get_rps_cpu is called from netif_receive_skb and returns the target | 2212 | * get_rps_cpu is called from netif_receive_skb and returns the target |
2208 | * CPU from the RPS map of the receiving queue for a given skb. | 2213 | * CPU from the RPS map of the receiving queue for a given skb. |
2209 | * rcu_read_lock must be held on entry. | 2214 | * rcu_read_lock must be held on entry. |
2210 | */ | 2215 | */ |
2211 | static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) | 2216 | static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, |
2217 | struct rps_dev_flow **rflowp) | ||
2212 | { | 2218 | { |
2213 | struct ipv6hdr *ip6; | 2219 | struct ipv6hdr *ip6; |
2214 | struct iphdr *ip; | 2220 | struct iphdr *ip; |
2215 | struct netdev_rx_queue *rxqueue; | 2221 | struct netdev_rx_queue *rxqueue; |
2216 | struct rps_map *map; | 2222 | struct rps_map *map; |
2223 | struct rps_dev_flow_table *flow_table; | ||
2224 | struct rps_sock_flow_table *sock_flow_table; | ||
2217 | int cpu = -1; | 2225 | int cpu = -1; |
2218 | u8 ip_proto; | 2226 | u8 ip_proto; |
2227 | u16 tcpu; | ||
2219 | u32 addr1, addr2, ports, ihl; | 2228 | u32 addr1, addr2, ports, ihl; |
2220 | 2229 | ||
2221 | if (skb_rx_queue_recorded(skb)) { | 2230 | if (skb_rx_queue_recorded(skb)) { |
@@ -2232,7 +2241,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) | |||
2232 | } else | 2241 | } else |
2233 | rxqueue = dev->_rx; | 2242 | rxqueue = dev->_rx; |
2234 | 2243 | ||
2235 | if (!rxqueue->rps_map) | 2244 | if (!rxqueue->rps_map && !rxqueue->rps_flow_table) |
2236 | goto done; | 2245 | goto done; |
2237 | 2246 | ||
2238 | if (skb->rxhash) | 2247 | if (skb->rxhash) |
@@ -2284,9 +2293,48 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) | |||
2284 | skb->rxhash = 1; | 2293 | skb->rxhash = 1; |
2285 | 2294 | ||
2286 | got_hash: | 2295 | got_hash: |
2296 | flow_table = rcu_dereference(rxqueue->rps_flow_table); | ||
2297 | sock_flow_table = rcu_dereference(rps_sock_flow_table); | ||
2298 | if (flow_table && sock_flow_table) { | ||
2299 | u16 next_cpu; | ||
2300 | struct rps_dev_flow *rflow; | ||
2301 | |||
2302 | rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; | ||
2303 | tcpu = rflow->cpu; | ||
2304 | |||
2305 | next_cpu = sock_flow_table->ents[skb->rxhash & | ||
2306 | sock_flow_table->mask]; | ||
2307 | |||
2308 | /* | ||
2309 | * If the desired CPU (where last recvmsg was done) is | ||
2310 | * different from current CPU (one in the rx-queue flow | ||
2311 | * table entry), switch if one of the following holds: | ||
2312 | * - Current CPU is unset (equal to RPS_NO_CPU). | ||
2313 | * - Current CPU is offline. | ||
2314 | * - The current CPU's queue tail has advanced beyond the | ||
2315 | * last packet that was enqueued using this table entry. | ||
2316 | * This guarantees that all previous packets for the flow | ||
2317 | * have been dequeued, thus preserving in order delivery. | ||
2318 | */ | ||
2319 | if (unlikely(tcpu != next_cpu) && | ||
2320 | (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || | ||
2321 | ((int)(per_cpu(softnet_data, tcpu).input_queue_head - | ||
2322 | rflow->last_qtail)) >= 0)) { | ||
2323 | tcpu = rflow->cpu = next_cpu; | ||
2324 | if (tcpu != RPS_NO_CPU) | ||
2325 | rflow->last_qtail = per_cpu(softnet_data, | ||
2326 | tcpu).input_queue_head; | ||
2327 | } | ||
2328 | if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { | ||
2329 | *rflowp = rflow; | ||
2330 | cpu = tcpu; | ||
2331 | goto done; | ||
2332 | } | ||
2333 | } | ||
2334 | |||
2287 | map = rcu_dereference(rxqueue->rps_map); | 2335 | map = rcu_dereference(rxqueue->rps_map); |
2288 | if (map) { | 2336 | if (map) { |
2289 | u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; | 2337 | tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; |
2290 | 2338 | ||
2291 | if (cpu_online(tcpu)) { | 2339 | if (cpu_online(tcpu)) { |
2292 | cpu = tcpu; | 2340 | cpu = tcpu; |
@@ -2320,13 +2368,14 @@ static void trigger_softirq(void *data) | |||
2320 | __napi_schedule(&queue->backlog); | 2368 | __napi_schedule(&queue->backlog); |
2321 | __get_cpu_var(netdev_rx_stat).received_rps++; | 2369 | __get_cpu_var(netdev_rx_stat).received_rps++; |
2322 | } | 2370 | } |
2323 | #endif /* CONFIG_SMP */ | 2371 | #endif /* CONFIG_RPS */ |
2324 | 2372 | ||
2325 | /* | 2373 | /* |
2326 | * enqueue_to_backlog is called to queue an skb to a per CPU backlog | 2374 | * enqueue_to_backlog is called to queue an skb to a per CPU backlog |
2327 | * queue (may be a remote CPU queue). | 2375 | * queue (may be a remote CPU queue). |
2328 | */ | 2376 | */ |
2329 | static int enqueue_to_backlog(struct sk_buff *skb, int cpu) | 2377 | static int enqueue_to_backlog(struct sk_buff *skb, int cpu, |
2378 | unsigned int *qtail) | ||
2330 | { | 2379 | { |
2331 | struct softnet_data *queue; | 2380 | struct softnet_data *queue; |
2332 | unsigned long flags; | 2381 | unsigned long flags; |
@@ -2341,6 +2390,10 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu) | |||
2341 | if (queue->input_pkt_queue.qlen) { | 2390 | if (queue->input_pkt_queue.qlen) { |
2342 | enqueue: | 2391 | enqueue: |
2343 | __skb_queue_tail(&queue->input_pkt_queue, skb); | 2392 | __skb_queue_tail(&queue->input_pkt_queue, skb); |
2393 | #ifdef CONFIG_RPS | ||
2394 | *qtail = queue->input_queue_head + | ||
2395 | queue->input_pkt_queue.qlen; | ||
2396 | #endif | ||
2344 | rps_unlock(queue); | 2397 | rps_unlock(queue); |
2345 | local_irq_restore(flags); | 2398 | local_irq_restore(flags); |
2346 | return NET_RX_SUCCESS; | 2399 | return NET_RX_SUCCESS; |
@@ -2355,11 +2408,10 @@ enqueue: | |||
2355 | 2408 | ||
2356 | cpu_set(cpu, rcpus->mask[rcpus->select]); | 2409 | cpu_set(cpu, rcpus->mask[rcpus->select]); |
2357 | __raise_softirq_irqoff(NET_RX_SOFTIRQ); | 2410 | __raise_softirq_irqoff(NET_RX_SOFTIRQ); |
2358 | } else | 2411 | goto enqueue; |
2359 | __napi_schedule(&queue->backlog); | 2412 | } |
2360 | #else | ||
2361 | __napi_schedule(&queue->backlog); | ||
2362 | #endif | 2413 | #endif |
2414 | __napi_schedule(&queue->backlog); | ||
2363 | } | 2415 | } |
2364 | goto enqueue; | 2416 | goto enqueue; |
2365 | } | 2417 | } |
@@ -2401,18 +2453,25 @@ int netif_rx(struct sk_buff *skb) | |||
2401 | 2453 | ||
2402 | #ifdef CONFIG_RPS | 2454 | #ifdef CONFIG_RPS |
2403 | { | 2455 | { |
2456 | struct rps_dev_flow voidflow, *rflow = &voidflow; | ||
2404 | int cpu; | 2457 | int cpu; |
2405 | 2458 | ||
2406 | rcu_read_lock(); | 2459 | rcu_read_lock(); |
2407 | cpu = get_rps_cpu(skb->dev, skb); | 2460 | |
2461 | cpu = get_rps_cpu(skb->dev, skb, &rflow); | ||
2408 | if (cpu < 0) | 2462 | if (cpu < 0) |
2409 | cpu = smp_processor_id(); | 2463 | cpu = smp_processor_id(); |
2410 | ret = enqueue_to_backlog(skb, cpu); | 2464 | |
2465 | ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); | ||
2466 | |||
2411 | rcu_read_unlock(); | 2467 | rcu_read_unlock(); |
2412 | } | 2468 | } |
2413 | #else | 2469 | #else |
2414 | ret = enqueue_to_backlog(skb, get_cpu()); | 2470 | { |
2415 | put_cpu(); | 2471 | unsigned int qtail; |
2472 | ret = enqueue_to_backlog(skb, get_cpu(), &qtail); | ||
2473 | put_cpu(); | ||
2474 | } | ||
2416 | #endif | 2475 | #endif |
2417 | return ret; | 2476 | return ret; |
2418 | } | 2477 | } |
@@ -2830,14 +2889,22 @@ out: | |||
2830 | int netif_receive_skb(struct sk_buff *skb) | 2889 | int netif_receive_skb(struct sk_buff *skb) |
2831 | { | 2890 | { |
2832 | #ifdef CONFIG_RPS | 2891 | #ifdef CONFIG_RPS |
2833 | int cpu; | 2892 | struct rps_dev_flow voidflow, *rflow = &voidflow; |
2893 | int cpu, ret; | ||
2894 | |||
2895 | rcu_read_lock(); | ||
2834 | 2896 | ||
2835 | cpu = get_rps_cpu(skb->dev, skb); | 2897 | cpu = get_rps_cpu(skb->dev, skb, &rflow); |
2836 | 2898 | ||
2837 | if (cpu < 0) | 2899 | if (cpu >= 0) { |
2838 | return __netif_receive_skb(skb); | 2900 | ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); |
2839 | else | 2901 | rcu_read_unlock(); |
2840 | return enqueue_to_backlog(skb, cpu); | 2902 | } else { |
2903 | rcu_read_unlock(); | ||
2904 | ret = __netif_receive_skb(skb); | ||
2905 | } | ||
2906 | |||
2907 | return ret; | ||
2841 | #else | 2908 | #else |
2842 | return __netif_receive_skb(skb); | 2909 | return __netif_receive_skb(skb); |
2843 | #endif | 2910 | #endif |
@@ -2856,6 +2923,7 @@ static void flush_backlog(void *arg) | |||
2856 | if (skb->dev == dev) { | 2923 | if (skb->dev == dev) { |
2857 | __skb_unlink(skb, &queue->input_pkt_queue); | 2924 | __skb_unlink(skb, &queue->input_pkt_queue); |
2858 | kfree_skb(skb); | 2925 | kfree_skb(skb); |
2926 | incr_input_queue_head(queue); | ||
2859 | } | 2927 | } |
2860 | rps_unlock(queue); | 2928 | rps_unlock(queue); |
2861 | } | 2929 | } |
@@ -3179,6 +3247,7 @@ static int process_backlog(struct napi_struct *napi, int quota) | |||
3179 | local_irq_enable(); | 3247 | local_irq_enable(); |
3180 | break; | 3248 | break; |
3181 | } | 3249 | } |
3250 | incr_input_queue_head(queue); | ||
3182 | rps_unlock(queue); | 3251 | rps_unlock(queue); |
3183 | local_irq_enable(); | 3252 | local_irq_enable(); |
3184 | 3253 | ||
@@ -5542,8 +5611,10 @@ static int dev_cpu_callback(struct notifier_block *nfb, | |||
5542 | local_irq_enable(); | 5611 | local_irq_enable(); |
5543 | 5612 | ||
5544 | /* Process offline CPU's input_pkt_queue */ | 5613 | /* Process offline CPU's input_pkt_queue */ |
5545 | while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) | 5614 | while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { |
5546 | netif_rx(skb); | 5615 | netif_rx(skb); |
5616 | incr_input_queue_head(oldsd); | ||
5617 | } | ||
5547 | 5618 | ||
5548 | return NOTIFY_OK; | 5619 | return NOTIFY_OK; |
5549 | } | 5620 | } |