summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/ip-sysctl.txt11
-rw-r--r--include/net/netns/ipv4.h2
-rw-r--r--include/net/protocol.h7
-rw-r--r--include/net/udp.h1
-rw-r--r--net/ipv4/af_inet.c8
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/protocol.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c67
-rw-r--r--net/ipv6/ip6_input.c6
-rw-r--r--net/ipv6/protocol.c2
-rw-r--r--net/ipv6/tcp_ipv6.c3
-rw-r--r--net/ipv6/udp.c3
12 files changed, 103 insertions, 14 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index eaee2c8d4c00..b1c6500e7a8d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -856,12 +856,21 @@ ip_dynaddr - BOOLEAN
856ip_early_demux - BOOLEAN 856ip_early_demux - BOOLEAN
857 Optimize input packet processing down to one demux for 857 Optimize input packet processing down to one demux for
858 certain kinds of local sockets. Currently we only do this 858 certain kinds of local sockets. Currently we only do this
859 for established TCP sockets. 859 for established TCP and connected UDP sockets.
860 860
861 It may add an additional cost for pure routing workloads that 861 It may add an additional cost for pure routing workloads that
862 reduces overall throughput, in such case you should disable it. 862 reduces overall throughput, in such case you should disable it.
863 Default: 1 863 Default: 1
864 864
865tcp_early_demux - BOOLEAN
866 Enable early demux for established TCP sockets.
867 Default: 1
868
869udp_early_demux - BOOLEAN
870 Enable early demux for connected UDP sockets. Disable this if
871 most of your UDP traffic is destined to unconnected sockets.
872 Default: 1
873
865icmp_echo_ignore_all - BOOLEAN 874icmp_echo_ignore_all - BOOLEAN
866 If set non-zero, then the kernel will ignore all ICMP ECHO 875 If set non-zero, then the kernel will ignore all ICMP ECHO
867 requests sent to it. 876 requests sent to it.
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index a0e89190a3e9..cd686c4fb32d 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -95,6 +95,8 @@ struct netns_ipv4 {
95 /* Shall we try to damage output packets if routing dev changes? */ 95 /* Shall we try to damage output packets if routing dev changes? */
96 int sysctl_ip_dynaddr; 96 int sysctl_ip_dynaddr;
97 int sysctl_ip_early_demux; 97 int sysctl_ip_early_demux;
98 int sysctl_tcp_early_demux;
99 int sysctl_udp_early_demux;
98 100
99 int sysctl_fwmark_reflect; 101 int sysctl_fwmark_reflect;
100 int sysctl_tcp_fwmark_accept; 102 int sysctl_tcp_fwmark_accept;
diff --git a/include/net/protocol.h b/include/net/protocol.h
index bf36ca34af7a..65ba335b0e7e 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -40,6 +40,7 @@
40/* This is used to register protocols. */ 40/* This is used to register protocols. */
41struct net_protocol { 41struct net_protocol {
42 void (*early_demux)(struct sk_buff *skb); 42 void (*early_demux)(struct sk_buff *skb);
43 void (*early_demux_handler)(struct sk_buff *skb);
43 int (*handler)(struct sk_buff *skb); 44 int (*handler)(struct sk_buff *skb);
44 void (*err_handler)(struct sk_buff *skb, u32 info); 45 void (*err_handler)(struct sk_buff *skb, u32 info);
45 unsigned int no_policy:1, 46 unsigned int no_policy:1,
@@ -54,7 +55,7 @@ struct net_protocol {
54#if IS_ENABLED(CONFIG_IPV6) 55#if IS_ENABLED(CONFIG_IPV6)
55struct inet6_protocol { 56struct inet6_protocol {
56 void (*early_demux)(struct sk_buff *skb); 57 void (*early_demux)(struct sk_buff *skb);
57 58 void (*early_demux_handler)(struct sk_buff *skb);
58 int (*handler)(struct sk_buff *skb); 59 int (*handler)(struct sk_buff *skb);
59 60
60 void (*err_handler)(struct sk_buff *skb, 61 void (*err_handler)(struct sk_buff *skb,
@@ -92,12 +93,12 @@ struct inet_protosw {
92#define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */ 93#define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */
93#define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */ 94#define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */
94 95
95extern const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS]; 96extern struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS];
96extern const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS]; 97extern const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS];
97extern const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS]; 98extern const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS];
98 99
99#if IS_ENABLED(CONFIG_IPV6) 100#if IS_ENABLED(CONFIG_IPV6)
100extern const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS]; 101extern struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS];
101#endif 102#endif
102 103
103int inet_add_protocol(const struct net_protocol *prot, unsigned char num); 104int inet_add_protocol(const struct net_protocol *prot, unsigned char num);
diff --git a/include/net/udp.h b/include/net/udp.h
index c9d8b8e848e0..3391dbd73959 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -372,4 +372,5 @@ void udp_encap_enable(void);
372#if IS_ENABLED(CONFIG_IPV6) 372#if IS_ENABLED(CONFIG_IPV6)
373void udpv6_encap_enable(void); 373void udpv6_encap_enable(void);
374#endif 374#endif
375
375#endif /* _UDP_H */ 376#endif /* _UDP_H */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6b1fc6e4278e..d1a11707a126 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1599,8 +1599,9 @@ static const struct net_protocol igmp_protocol = {
1599}; 1599};
1600#endif 1600#endif
1601 1601
1602static const struct net_protocol tcp_protocol = { 1602static struct net_protocol tcp_protocol = {
1603 .early_demux = tcp_v4_early_demux, 1603 .early_demux = tcp_v4_early_demux,
1604 .early_demux_handler = tcp_v4_early_demux,
1604 .handler = tcp_v4_rcv, 1605 .handler = tcp_v4_rcv,
1605 .err_handler = tcp_v4_err, 1606 .err_handler = tcp_v4_err,
1606 .no_policy = 1, 1607 .no_policy = 1,
@@ -1608,8 +1609,9 @@ static const struct net_protocol tcp_protocol = {
1608 .icmp_strict_tag_validation = 1, 1609 .icmp_strict_tag_validation = 1,
1609}; 1610};
1610 1611
1611static const struct net_protocol udp_protocol = { 1612static struct net_protocol udp_protocol = {
1612 .early_demux = udp_v4_early_demux, 1613 .early_demux = udp_v4_early_demux,
1614 .early_demux_handler = udp_v4_early_demux,
1613 .handler = udp_rcv, 1615 .handler = udp_rcv,
1614 .err_handler = udp_err, 1616 .err_handler = udp_err,
1615 .no_policy = 1, 1617 .no_policy = 1,
@@ -1720,6 +1722,8 @@ static __net_init int inet_init_net(struct net *net)
1720 net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; 1722 net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
1721 net->ipv4.sysctl_ip_dynaddr = 0; 1723 net->ipv4.sysctl_ip_dynaddr = 0;
1722 net->ipv4.sysctl_ip_early_demux = 1; 1724 net->ipv4.sysctl_ip_early_demux = 1;
1725 net->ipv4.sysctl_udp_early_demux = 1;
1726 net->ipv4.sysctl_tcp_early_demux = 1;
1723#ifdef CONFIG_SYSCTL 1727#ifdef CONFIG_SYSCTL
1724 net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; 1728 net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
1725#endif 1729#endif
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d6feabb03516..fa2dc8f692c6 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -313,6 +313,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
313 const struct iphdr *iph = ip_hdr(skb); 313 const struct iphdr *iph = ip_hdr(skb);
314 struct rtable *rt; 314 struct rtable *rt;
315 struct net_device *dev = skb->dev; 315 struct net_device *dev = skb->dev;
316 void (*edemux)(struct sk_buff *skb);
316 317
317 /* if ingress device is enslaved to an L3 master device pass the 318 /* if ingress device is enslaved to an L3 master device pass the
318 * skb to its handler for processing 319 * skb to its handler for processing
@@ -329,8 +330,8 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
329 int protocol = iph->protocol; 330 int protocol = iph->protocol;
330 331
331 ipprot = rcu_dereference(inet_protos[protocol]); 332 ipprot = rcu_dereference(inet_protos[protocol]);
332 if (ipprot && ipprot->early_demux) { 333 if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
333 ipprot->early_demux(skb); 334 edemux(skb);
334 /* must reload iph, skb->head might have changed */ 335 /* must reload iph, skb->head might have changed */
335 iph = ip_hdr(skb); 336 iph = ip_hdr(skb);
336 } 337 }
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 4b7c0ec65251..32a691b7ce2c 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -28,7 +28,7 @@
28#include <linux/spinlock.h> 28#include <linux/spinlock.h>
29#include <net/protocol.h> 29#include <net/protocol.h>
30 30
31const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; 31struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
32const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; 32const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
33EXPORT_SYMBOL(inet_offloads); 33EXPORT_SYMBOL(inet_offloads);
34 34
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 711c3e2e17b1..6fb25693c00b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -24,6 +24,7 @@
24#include <net/cipso_ipv4.h> 24#include <net/cipso_ipv4.h>
25#include <net/inet_frag.h> 25#include <net/inet_frag.h>
26#include <net/ping.h> 26#include <net/ping.h>
27#include <net/protocol.h>
27 28
28static int zero; 29static int zero;
29static int one = 1; 30static int one = 1;
@@ -294,6 +295,58 @@ bad_key:
294 return ret; 295 return ret;
295} 296}
296 297
298static void proc_configure_early_demux(int enabled, int protocol)
299{
300 struct net_protocol *ipprot;
301#if IS_ENABLED(CONFIG_IPV6)
302 struct inet6_protocol *ip6prot;
303#endif
304
305 ipprot = rcu_dereference(inet_protos[protocol]);
306 if (ipprot)
307 ipprot->early_demux = enabled ? ipprot->early_demux_handler :
308 NULL;
309
310#if IS_ENABLED(CONFIG_IPV6)
311 ip6prot = rcu_dereference(inet6_protos[protocol]);
312 if (ip6prot)
313 ip6prot->early_demux = enabled ? ip6prot->early_demux_handler :
314 NULL;
315#endif
316}
317
318static int proc_tcp_early_demux(struct ctl_table *table, int write,
319 void __user *buffer, size_t *lenp, loff_t *ppos)
320{
321 int ret = 0;
322
323 ret = proc_dointvec(table, write, buffer, lenp, ppos);
324
325 if (write && !ret) {
326 int enabled = init_net.ipv4.sysctl_tcp_early_demux;
327
328 proc_configure_early_demux(enabled, IPPROTO_TCP);
329 }
330
331 return ret;
332}
333
334static int proc_udp_early_demux(struct ctl_table *table, int write,
335 void __user *buffer, size_t *lenp, loff_t *ppos)
336{
337 int ret = 0;
338
339 ret = proc_dointvec(table, write, buffer, lenp, ppos);
340
341 if (write && !ret) {
342 int enabled = init_net.ipv4.sysctl_udp_early_demux;
343
344 proc_configure_early_demux(enabled, IPPROTO_UDP);
345 }
346
347 return ret;
348}
349
297static struct ctl_table ipv4_table[] = { 350static struct ctl_table ipv4_table[] = {
298 { 351 {
299 .procname = "tcp_timestamps", 352 .procname = "tcp_timestamps",
@@ -750,6 +803,20 @@ static struct ctl_table ipv4_net_table[] = {
750 .proc_handler = proc_dointvec 803 .proc_handler = proc_dointvec
751 }, 804 },
752 { 805 {
806 .procname = "udp_early_demux",
807 .data = &init_net.ipv4.sysctl_udp_early_demux,
808 .maxlen = sizeof(int),
809 .mode = 0644,
810 .proc_handler = proc_udp_early_demux
811 },
812 {
813 .procname = "tcp_early_demux",
814 .data = &init_net.ipv4.sysctl_tcp_early_demux,
815 .maxlen = sizeof(int),
816 .mode = 0644,
817 .proc_handler = proc_tcp_early_demux
818 },
819 {
753 .procname = "ip_default_ttl", 820 .procname = "ip_default_ttl",
754 .data = &init_net.ipv4.sysctl_ip_default_ttl, 821 .data = &init_net.ipv4.sysctl_ip_default_ttl,
755 .maxlen = sizeof(int), 822 .maxlen = sizeof(int),
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index aacfb4bce153..b04539dd4629 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -49,6 +49,8 @@
49 49
50int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) 50int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
51{ 51{
52 void (*edemux)(struct sk_buff *skb);
53
52 /* if ingress device is enslaved to an L3 master device pass the 54 /* if ingress device is enslaved to an L3 master device pass the
53 * skb to its handler for processing 55 * skb to its handler for processing
54 */ 56 */
@@ -60,8 +62,8 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
60 const struct inet6_protocol *ipprot; 62 const struct inet6_protocol *ipprot;
61 63
62 ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); 64 ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
63 if (ipprot && ipprot->early_demux) 65 if (ipprot && (edemux = READ_ONCE(ipprot->early_demux)))
64 ipprot->early_demux(skb); 66 edemux(skb);
65 } 67 }
66 if (!skb_valid_dst(skb)) 68 if (!skb_valid_dst(skb))
67 ip6_route_input(skb); 69 ip6_route_input(skb);
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index e3770abe688a..b5d54d4f995c 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -26,7 +26,7 @@
26#include <net/protocol.h> 26#include <net/protocol.h>
27 27
28#if IS_ENABLED(CONFIG_IPV6) 28#if IS_ENABLED(CONFIG_IPV6)
29const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; 29struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
30EXPORT_SYMBOL(inet6_protos); 30EXPORT_SYMBOL(inet6_protos);
31 31
32int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) 32int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0f08d718a002..031a8c019f7a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1925,8 +1925,9 @@ struct proto tcpv6_prot = {
1925 .diag_destroy = tcp_abort, 1925 .diag_destroy = tcp_abort,
1926}; 1926};
1927 1927
1928static const struct inet6_protocol tcpv6_protocol = { 1928static struct inet6_protocol tcpv6_protocol = {
1929 .early_demux = tcp_v6_early_demux, 1929 .early_demux = tcp_v6_early_demux,
1930 .early_demux_handler = tcp_v6_early_demux,
1930 .handler = tcp_v6_rcv, 1931 .handler = tcp_v6_rcv,
1931 .err_handler = tcp_v6_err, 1932 .err_handler = tcp_v6_err,
1932 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, 1933 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b793ed1d2a36..fd4b1c98a472 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1436,8 +1436,9 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
1436} 1436}
1437#endif 1437#endif
1438 1438
1439static const struct inet6_protocol udpv6_protocol = { 1439static struct inet6_protocol udpv6_protocol = {
1440 .early_demux = udp_v6_early_demux, 1440 .early_demux = udp_v6_early_demux,
1441 .early_demux_handler = udp_v6_early_demux,
1441 .handler = udpv6_rcv, 1442 .handler = udpv6_rcv,
1442 .err_handler = udpv6_err, 1443 .err_handler = udpv6_err,
1443 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, 1444 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,