diff options
author | Eric Dumazet <edumazet@google.com> | 2014-11-11 08:54:28 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-11-11 13:00:06 -0500 |
commit | 2c8c56e15df3d4c2af3d656e44feb18789f75837 (patch) | |
tree | e3c81c868a7c14ca2bac7efd69b6b21e25c355d4 /net | |
parent | 3d97379a67486bc481ab5b8f7aa5b7ceb6154a95 (diff) |
net: introduce SO_INCOMING_CPU
An alternative to RPS/RFS is to use hardware support for multiple
queues.
Then split a set of millions of sockets across worker threads, each
one using epoll() to manage events on its own socket pool.
Ideally, we want one thread per RX/TX queue/cpu, but we have no way to
know after accept() or connect() on which queue/cpu a socket is managed.
We normally use one cpu per RX queue (IRQ smp_affinity being properly
set), so remembering in the socket structure which cpu delivered the
last packet is enough to solve the problem.
After accept(), connect(), or even passing a file descriptor between
processes, applications can use:
int cpu;
socklen_t len = sizeof(cpu);
getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);
And use this information to put the socket into the right silo
for optimal performance, as the whole networking stack should then run
on the appropriate cpu, without the need to send IPIs (as RPS/RFS do).
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/core/sock.c | 5 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 1 | ||||
-rw-r--r-- | net/ipv4/udp.c | 1 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 1 | ||||
-rw-r--r-- | net/ipv6/udp.c | 1 | ||||
-rw-r--r-- | net/sctp/ulpqueue.c | 5 |
6 files changed, 12 insertions, 2 deletions
diff --git a/net/core/sock.c b/net/core/sock.c index ac56dd06c306..0725cf0cb685 100644 --- a/net/core/sock.c +++ b/net/core/sock.c | |||
@@ -1213,6 +1213,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, | |||
1213 | v.val = sk->sk_max_pacing_rate; | 1213 | v.val = sk->sk_max_pacing_rate; |
1214 | break; | 1214 | break; |
1215 | 1215 | ||
1216 | case SO_INCOMING_CPU: | ||
1217 | v.val = sk->sk_incoming_cpu; | ||
1218 | break; | ||
1219 | |||
1216 | default: | 1220 | default: |
1217 | return -ENOPROTOOPT; | 1221 | return -ENOPROTOOPT; |
1218 | } | 1222 | } |
@@ -1517,6 +1521,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) | |||
1517 | 1521 | ||
1518 | newsk->sk_err = 0; | 1522 | newsk->sk_err = 0; |
1519 | newsk->sk_priority = 0; | 1523 | newsk->sk_priority = 0; |
1524 | newsk->sk_incoming_cpu = raw_smp_processor_id(); | ||
1520 | /* | 1525 | /* |
1521 | * Before updating sk_refcnt, we must commit prior changes to memory | 1526 | * Before updating sk_refcnt, we must commit prior changes to memory |
1522 | * (Documentation/RCU/rculist_nulls.txt for details) | 1527 | * (Documentation/RCU/rculist_nulls.txt for details) |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8893598a4124..2c6a955fd5c3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -1663,6 +1663,7 @@ process: | |||
1663 | if (sk_filter(sk, skb)) | 1663 | if (sk_filter(sk, skb)) |
1664 | goto discard_and_relse; | 1664 | goto discard_and_relse; |
1665 | 1665 | ||
1666 | sk_incoming_cpu_update(sk); | ||
1666 | skb->dev = NULL; | 1667 | skb->dev = NULL; |
1667 | 1668 | ||
1668 | bh_lock_sock_nested(sk); | 1669 | bh_lock_sock_nested(sk); |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5d0fdca8e965..d13751685f44 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -1445,6 +1445,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
1445 | if (inet_sk(sk)->inet_daddr) { | 1445 | if (inet_sk(sk)->inet_daddr) { |
1446 | sock_rps_save_rxhash(sk, skb); | 1446 | sock_rps_save_rxhash(sk, skb); |
1447 | sk_mark_napi_id(sk, skb); | 1447 | sk_mark_napi_id(sk, skb); |
1448 | sk_incoming_cpu_update(sk); | ||
1448 | } | 1449 | } |
1449 | 1450 | ||
1450 | rc = sock_queue_rcv_skb(sk, skb); | 1451 | rc = sock_queue_rcv_skb(sk, skb); |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fd8e50b380e7..1985b4933a6b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -1456,6 +1456,7 @@ process: | |||
1456 | if (sk_filter(sk, skb)) | 1456 | if (sk_filter(sk, skb)) |
1457 | goto discard_and_relse; | 1457 | goto discard_and_relse; |
1458 | 1458 | ||
1459 | sk_incoming_cpu_update(sk); | ||
1459 | skb->dev = NULL; | 1460 | skb->dev = NULL; |
1460 | 1461 | ||
1461 | bh_lock_sock_nested(sk); | 1462 | bh_lock_sock_nested(sk); |
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b756355e9739..d1fe36274906 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c | |||
@@ -577,6 +577,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
577 | if (!ipv6_addr_any(&sk->sk_v6_daddr)) { | 577 | if (!ipv6_addr_any(&sk->sk_v6_daddr)) { |
578 | sock_rps_save_rxhash(sk, skb); | 578 | sock_rps_save_rxhash(sk, skb); |
579 | sk_mark_napi_id(sk, skb); | 579 | sk_mark_napi_id(sk, skb); |
580 | sk_incoming_cpu_update(sk); | ||
580 | } | 581 | } |
581 | 582 | ||
582 | rc = sock_queue_rcv_skb(sk, skb); | 583 | rc = sock_queue_rcv_skb(sk, skb); |
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index d49dc2ed30ad..ce469d648ffb 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c | |||
@@ -205,9 +205,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) | |||
205 | if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) | 205 | if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) |
206 | goto out_free; | 206 | goto out_free; |
207 | 207 | ||
208 | if (!sctp_ulpevent_is_notification(event)) | 208 | if (!sctp_ulpevent_is_notification(event)) { |
209 | sk_mark_napi_id(sk, skb); | 209 | sk_mark_napi_id(sk, skb); |
210 | 210 | sk_incoming_cpu_update(sk); | |
211 | } | ||
211 | /* Check if the user wishes to receive this event. */ | 212 | /* Check if the user wishes to receive this event. */ |
212 | if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe)) | 213 | if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe)) |
213 | goto out_free; | 214 | goto out_free; |