diff options
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 11 | ||||
-rw-r--r-- | include/net/netns/ipv4.h | 2 | ||||
-rw-r--r-- | include/net/protocol.h | 7 | ||||
-rw-r--r-- | include/net/udp.h | 1 | ||||
-rw-r--r-- | net/ipv4/af_inet.c | 8 | ||||
-rw-r--r-- | net/ipv4/ip_input.c | 5 | ||||
-rw-r--r-- | net/ipv4/protocol.c | 2 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 67 | ||||
-rw-r--r-- | net/ipv6/ip6_input.c | 6 | ||||
-rw-r--r-- | net/ipv6/protocol.c | 2 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 3 | ||||
-rw-r--r-- | net/ipv6/udp.c | 3 |
12 files changed, 103 insertions, 14 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index eaee2c8d4c00..b1c6500e7a8d 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -856,12 +856,21 @@ ip_dynaddr - BOOLEAN | |||
856 | ip_early_demux - BOOLEAN | 856 | ip_early_demux - BOOLEAN |
857 | Optimize input packet processing down to one demux for | 857 | Optimize input packet processing down to one demux for |
858 | certain kinds of local sockets. Currently we only do this | 858 | certain kinds of local sockets. Currently we only do this |
859 | for established TCP sockets. | 859 | for established TCP and connected UDP sockets. |
860 | 860 | ||
861 | It may add an additional cost for pure routing workloads that | 861 | It may add an additional cost for pure routing workloads that |
862 | reduces overall throughput, in such case you should disable it. | 862 | reduces overall throughput, in such case you should disable it. |
863 | Default: 1 | 863 | Default: 1 |
864 | 864 | ||
865 | tcp_early_demux - BOOLEAN | ||
866 | Enable early demux for established TCP sockets. | ||
867 | Default: 1 | ||
868 | |||
869 | udp_early_demux - BOOLEAN | ||
870 | Enable early demux for connected UDP sockets. Disable this if | ||
871 | your system mostly receives traffic for unconnected UDP sockets. | ||
872 | Default: 1 | ||
873 | |||
865 | icmp_echo_ignore_all - BOOLEAN | 874 | icmp_echo_ignore_all - BOOLEAN |
866 | If set non-zero, then the kernel will ignore all ICMP ECHO | 875 | If set non-zero, then the kernel will ignore all ICMP ECHO |
867 | requests sent to it. | 876 | requests sent to it. |
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a0e89190a3e9..cd686c4fb32d 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h | |||
@@ -95,6 +95,8 @@ struct netns_ipv4 { | |||
95 | /* Shall we try to damage output packets if routing dev changes? */ | 95 | /* Shall we try to damage output packets if routing dev changes? */ |
96 | int sysctl_ip_dynaddr; | 96 | int sysctl_ip_dynaddr; |
97 | int sysctl_ip_early_demux; | 97 | int sysctl_ip_early_demux; |
98 | int sysctl_tcp_early_demux; | ||
99 | int sysctl_udp_early_demux; | ||
98 | 100 | ||
99 | int sysctl_fwmark_reflect; | 101 | int sysctl_fwmark_reflect; |
100 | int sysctl_tcp_fwmark_accept; | 102 | int sysctl_tcp_fwmark_accept; |
diff --git a/include/net/protocol.h b/include/net/protocol.h index bf36ca34af7a..65ba335b0e7e 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h | |||
@@ -40,6 +40,7 @@ | |||
40 | /* This is used to register protocols. */ | 40 | /* This is used to register protocols. */ |
41 | struct net_protocol { | 41 | struct net_protocol { |
42 | void (*early_demux)(struct sk_buff *skb); | 42 | void (*early_demux)(struct sk_buff *skb); |
43 | void (*early_demux_handler)(struct sk_buff *skb); | ||
43 | int (*handler)(struct sk_buff *skb); | 44 | int (*handler)(struct sk_buff *skb); |
44 | void (*err_handler)(struct sk_buff *skb, u32 info); | 45 | void (*err_handler)(struct sk_buff *skb, u32 info); |
45 | unsigned int no_policy:1, | 46 | unsigned int no_policy:1, |
@@ -54,7 +55,7 @@ struct net_protocol { | |||
54 | #if IS_ENABLED(CONFIG_IPV6) | 55 | #if IS_ENABLED(CONFIG_IPV6) |
55 | struct inet6_protocol { | 56 | struct inet6_protocol { |
56 | void (*early_demux)(struct sk_buff *skb); | 57 | void (*early_demux)(struct sk_buff *skb); |
57 | 58 | void (*early_demux_handler)(struct sk_buff *skb); | |
58 | int (*handler)(struct sk_buff *skb); | 59 | int (*handler)(struct sk_buff *skb); |
59 | 60 | ||
60 | void (*err_handler)(struct sk_buff *skb, | 61 | void (*err_handler)(struct sk_buff *skb, |
@@ -92,12 +93,12 @@ struct inet_protosw { | |||
92 | #define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */ | 93 | #define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */ |
93 | #define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */ | 94 | #define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */ |
94 | 95 | ||
95 | extern const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS]; | 96 | extern struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS]; |
96 | extern const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS]; | 97 | extern const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS]; |
97 | extern const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS]; | 98 | extern const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS]; |
98 | 99 | ||
99 | #if IS_ENABLED(CONFIG_IPV6) | 100 | #if IS_ENABLED(CONFIG_IPV6) |
100 | extern const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS]; | 101 | extern struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS]; |
101 | #endif | 102 | #endif |
102 | 103 | ||
103 | int inet_add_protocol(const struct net_protocol *prot, unsigned char num); | 104 | int inet_add_protocol(const struct net_protocol *prot, unsigned char num); |
diff --git a/include/net/udp.h b/include/net/udp.h index c9d8b8e848e0..3391dbd73959 100644 --- a/include/net/udp.h +++ b/include/net/udp.h | |||
@@ -372,4 +372,5 @@ void udp_encap_enable(void); | |||
372 | #if IS_ENABLED(CONFIG_IPV6) | 372 | #if IS_ENABLED(CONFIG_IPV6) |
373 | void udpv6_encap_enable(void); | 373 | void udpv6_encap_enable(void); |
374 | #endif | 374 | #endif |
375 | |||
375 | #endif /* _UDP_H */ | 376 | #endif /* _UDP_H */ |
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6b1fc6e4278e..d1a11707a126 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
@@ -1599,8 +1599,9 @@ static const struct net_protocol igmp_protocol = { | |||
1599 | }; | 1599 | }; |
1600 | #endif | 1600 | #endif |
1601 | 1601 | ||
1602 | static const struct net_protocol tcp_protocol = { | 1602 | static struct net_protocol tcp_protocol = { |
1603 | .early_demux = tcp_v4_early_demux, | 1603 | .early_demux = tcp_v4_early_demux, |
1604 | .early_demux_handler = tcp_v4_early_demux, | ||
1604 | .handler = tcp_v4_rcv, | 1605 | .handler = tcp_v4_rcv, |
1605 | .err_handler = tcp_v4_err, | 1606 | .err_handler = tcp_v4_err, |
1606 | .no_policy = 1, | 1607 | .no_policy = 1, |
@@ -1608,8 +1609,9 @@ static const struct net_protocol tcp_protocol = { | |||
1608 | .icmp_strict_tag_validation = 1, | 1609 | .icmp_strict_tag_validation = 1, |
1609 | }; | 1610 | }; |
1610 | 1611 | ||
1611 | static const struct net_protocol udp_protocol = { | 1612 | static struct net_protocol udp_protocol = { |
1612 | .early_demux = udp_v4_early_demux, | 1613 | .early_demux = udp_v4_early_demux, |
1614 | .early_demux_handler = udp_v4_early_demux, | ||
1613 | .handler = udp_rcv, | 1615 | .handler = udp_rcv, |
1614 | .err_handler = udp_err, | 1616 | .err_handler = udp_err, |
1615 | .no_policy = 1, | 1617 | .no_policy = 1, |
@@ -1720,6 +1722,8 @@ static __net_init int inet_init_net(struct net *net) | |||
1720 | net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; | 1722 | net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; |
1721 | net->ipv4.sysctl_ip_dynaddr = 0; | 1723 | net->ipv4.sysctl_ip_dynaddr = 0; |
1722 | net->ipv4.sysctl_ip_early_demux = 1; | 1724 | net->ipv4.sysctl_ip_early_demux = 1; |
1725 | net->ipv4.sysctl_udp_early_demux = 1; | ||
1726 | net->ipv4.sysctl_tcp_early_demux = 1; | ||
1723 | #ifdef CONFIG_SYSCTL | 1727 | #ifdef CONFIG_SYSCTL |
1724 | net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; | 1728 | net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; |
1725 | #endif | 1729 | #endif |
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d6feabb03516..fa2dc8f692c6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c | |||
@@ -313,6 +313,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) | |||
313 | const struct iphdr *iph = ip_hdr(skb); | 313 | const struct iphdr *iph = ip_hdr(skb); |
314 | struct rtable *rt; | 314 | struct rtable *rt; |
315 | struct net_device *dev = skb->dev; | 315 | struct net_device *dev = skb->dev; |
316 | void (*edemux)(struct sk_buff *skb); | ||
316 | 317 | ||
317 | /* if ingress device is enslaved to an L3 master device pass the | 318 | /* if ingress device is enslaved to an L3 master device pass the |
318 | * skb to its handler for processing | 319 | * skb to its handler for processing |
@@ -329,8 +330,8 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) | |||
329 | int protocol = iph->protocol; | 330 | int protocol = iph->protocol; |
330 | 331 | ||
331 | ipprot = rcu_dereference(inet_protos[protocol]); | 332 | ipprot = rcu_dereference(inet_protos[protocol]); |
332 | if (ipprot && ipprot->early_demux) { | 333 | if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { |
333 | ipprot->early_demux(skb); | 334 | edemux(skb); |
334 | /* must reload iph, skb->head might have changed */ | 335 | /* must reload iph, skb->head might have changed */ |
335 | iph = ip_hdr(skb); | 336 | iph = ip_hdr(skb); |
336 | } | 337 | } |
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 4b7c0ec65251..32a691b7ce2c 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c | |||
@@ -28,7 +28,7 @@ | |||
28 | #include <linux/spinlock.h> | 28 | #include <linux/spinlock.h> |
29 | #include <net/protocol.h> | 29 | #include <net/protocol.h> |
30 | 30 | ||
31 | const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; | 31 | struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; |
32 | const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; | 32 | const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; |
33 | EXPORT_SYMBOL(inet_offloads); | 33 | EXPORT_SYMBOL(inet_offloads); |
34 | 34 | ||
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 711c3e2e17b1..6fb25693c00b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <net/cipso_ipv4.h> | 24 | #include <net/cipso_ipv4.h> |
25 | #include <net/inet_frag.h> | 25 | #include <net/inet_frag.h> |
26 | #include <net/ping.h> | 26 | #include <net/ping.h> |
27 | #include <net/protocol.h> | ||
27 | 28 | ||
28 | static int zero; | 29 | static int zero; |
29 | static int one = 1; | 30 | static int one = 1; |
@@ -294,6 +295,58 @@ bad_key: | |||
294 | return ret; | 295 | return ret; |
295 | } | 296 | } |
296 | 297 | ||
298 | static void proc_configure_early_demux(int enabled, int protocol) | ||
299 | { | ||
300 | struct net_protocol *ipprot; | ||
301 | #if IS_ENABLED(CONFIG_IPV6) | ||
302 | struct inet6_protocol *ip6prot; | ||
303 | #endif | ||
304 | |||
305 | ipprot = rcu_dereference(inet_protos[protocol]); | ||
306 | if (ipprot) | ||
307 | ipprot->early_demux = enabled ? ipprot->early_demux_handler : | ||
308 | NULL; | ||
309 | |||
310 | #if IS_ENABLED(CONFIG_IPV6) | ||
311 | ip6prot = rcu_dereference(inet6_protos[protocol]); | ||
312 | if (ip6prot) | ||
313 | ip6prot->early_demux = enabled ? ip6prot->early_demux_handler : | ||
314 | NULL; | ||
315 | #endif | ||
316 | } | ||
317 | |||
318 | static int proc_tcp_early_demux(struct ctl_table *table, int write, | ||
319 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
320 | { | ||
321 | int ret = 0; | ||
322 | |||
323 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
324 | |||
325 | if (write && !ret) { | ||
326 | int enabled = init_net.ipv4.sysctl_tcp_early_demux; | ||
327 | |||
328 | proc_configure_early_demux(enabled, IPPROTO_TCP); | ||
329 | } | ||
330 | |||
331 | return ret; | ||
332 | } | ||
333 | |||
334 | static int proc_udp_early_demux(struct ctl_table *table, int write, | ||
335 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
336 | { | ||
337 | int ret = 0; | ||
338 | |||
339 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
340 | |||
341 | if (write && !ret) { | ||
342 | int enabled = init_net.ipv4.sysctl_udp_early_demux; | ||
343 | |||
344 | proc_configure_early_demux(enabled, IPPROTO_UDP); | ||
345 | } | ||
346 | |||
347 | return ret; | ||
348 | } | ||
349 | |||
297 | static struct ctl_table ipv4_table[] = { | 350 | static struct ctl_table ipv4_table[] = { |
298 | { | 351 | { |
299 | .procname = "tcp_timestamps", | 352 | .procname = "tcp_timestamps", |
@@ -750,6 +803,20 @@ static struct ctl_table ipv4_net_table[] = { | |||
750 | .proc_handler = proc_dointvec | 803 | .proc_handler = proc_dointvec |
751 | }, | 804 | }, |
752 | { | 805 | { |
806 | .procname = "udp_early_demux", | ||
807 | .data = &init_net.ipv4.sysctl_udp_early_demux, | ||
808 | .maxlen = sizeof(int), | ||
809 | .mode = 0644, | ||
810 | .proc_handler = proc_udp_early_demux | ||
811 | }, | ||
812 | { | ||
813 | .procname = "tcp_early_demux", | ||
814 | .data = &init_net.ipv4.sysctl_tcp_early_demux, | ||
815 | .maxlen = sizeof(int), | ||
816 | .mode = 0644, | ||
817 | .proc_handler = proc_tcp_early_demux | ||
818 | }, | ||
819 | { | ||
753 | .procname = "ip_default_ttl", | 820 | .procname = "ip_default_ttl", |
754 | .data = &init_net.ipv4.sysctl_ip_default_ttl, | 821 | .data = &init_net.ipv4.sysctl_ip_default_ttl, |
755 | .maxlen = sizeof(int), | 822 | .maxlen = sizeof(int), |
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index aacfb4bce153..b04539dd4629 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c | |||
@@ -49,6 +49,8 @@ | |||
49 | 49 | ||
50 | int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) | 50 | int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) |
51 | { | 51 | { |
52 | void (*edemux)(struct sk_buff *skb); | ||
53 | |||
52 | /* if ingress device is enslaved to an L3 master device pass the | 54 | /* if ingress device is enslaved to an L3 master device pass the |
53 | * skb to its handler for processing | 55 | * skb to its handler for processing |
54 | */ | 56 | */ |
@@ -60,8 +62,8 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) | |||
60 | const struct inet6_protocol *ipprot; | 62 | const struct inet6_protocol *ipprot; |
61 | 63 | ||
62 | ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); | 64 | ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); |
63 | if (ipprot && ipprot->early_demux) | 65 | if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) |
64 | ipprot->early_demux(skb); | 66 | edemux(skb); |
65 | } | 67 | } |
66 | if (!skb_valid_dst(skb)) | 68 | if (!skb_valid_dst(skb)) |
67 | ip6_route_input(skb); | 69 | ip6_route_input(skb); |
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index e3770abe688a..b5d54d4f995c 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c | |||
@@ -26,7 +26,7 @@ | |||
26 | #include <net/protocol.h> | 26 | #include <net/protocol.h> |
27 | 27 | ||
28 | #if IS_ENABLED(CONFIG_IPV6) | 28 | #if IS_ENABLED(CONFIG_IPV6) |
29 | const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; | 29 | struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; |
30 | EXPORT_SYMBOL(inet6_protos); | 30 | EXPORT_SYMBOL(inet6_protos); |
31 | 31 | ||
32 | int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) | 32 | int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0f08d718a002..031a8c019f7a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -1925,8 +1925,9 @@ struct proto tcpv6_prot = { | |||
1925 | .diag_destroy = tcp_abort, | 1925 | .diag_destroy = tcp_abort, |
1926 | }; | 1926 | }; |
1927 | 1927 | ||
1928 | static const struct inet6_protocol tcpv6_protocol = { | 1928 | static struct inet6_protocol tcpv6_protocol = { |
1929 | .early_demux = tcp_v6_early_demux, | 1929 | .early_demux = tcp_v6_early_demux, |
1930 | .early_demux_handler = tcp_v6_early_demux, | ||
1930 | .handler = tcp_v6_rcv, | 1931 | .handler = tcp_v6_rcv, |
1931 | .err_handler = tcp_v6_err, | 1932 | .err_handler = tcp_v6_err, |
1932 | .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, | 1933 | .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, |
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b793ed1d2a36..fd4b1c98a472 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c | |||
@@ -1436,8 +1436,9 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, | |||
1436 | } | 1436 | } |
1437 | #endif | 1437 | #endif |
1438 | 1438 | ||
1439 | static const struct inet6_protocol udpv6_protocol = { | 1439 | static struct inet6_protocol udpv6_protocol = { |
1440 | .early_demux = udp_v6_early_demux, | 1440 | .early_demux = udp_v6_early_demux, |
1441 | .early_demux_handler = udp_v6_early_demux, | ||
1441 | .handler = udpv6_rcv, | 1442 | .handler = udpv6_rcv, |
1442 | .err_handler = udpv6_err, | 1443 | .err_handler = udpv6_err, |
1443 | .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, | 1444 | .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, |