diff options
author | David Ahern <dsa@cumulusnetworks.com> | 2015-12-16 16:20:44 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-12-18 14:43:38 -0500 |
commit | 6dd9a14e92e54895e143f10fef4d0b9abe109aa9 (patch) | |
tree | bb02bd14741b26fc9ba0abcaeb9a7f01a8339e68 | |
parent | 1a8524794fc7c70f44ac28e3a6e8fd637bc41f14 (diff) |
net: Allow accepted sockets to be bound to l3mdev domain
Allow accepted sockets to derive their sk_bound_dev_if setting from the
l3mdev domain in which the packets originated. A sysctl setting is added
to control the behavior, which is similar to sk_mark and
sysctl_tcp_fwmark_accept.
This effectively allows a process to have a "VRF-global" listen socket,
with child sockets bound to the VRF device in which the packet originated.
A similar behavior can be achieved using sk_mark, but a solution using marks
is incomplete as it does not handle duplicate addresses in different L3
domains/VRFs. Allowing sockets to inherit the sk_bound_dev_if from l3mdev
domain provides a complete solution.
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 8 | ||||
-rw-r--r-- | include/net/inet_sock.h | 14 | ||||
-rw-r--r-- | include/net/netns/ipv4.h | 3 | ||||
-rw-r--r-- | net/ipv4/syncookies.c | 4 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 11 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 1 | ||||
-rw-r--r-- | net/ipv6/syncookies.c | 4 |
8 files changed, 42 insertions, 5 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 5de632ed0ec0..ceb44a095a27 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -335,6 +335,14 @@ tcp_keepalive_intvl - INTEGER | |||
335 | after probes started. Default value: 75sec i.e. connection | 335 | after probes started. Default value: 75sec i.e. connection |
336 | will be aborted after ~11 minutes of retries. | 336 | will be aborted after ~11 minutes of retries. |
337 | 337 | ||
338 | tcp_l3mdev_accept - BOOLEAN | ||
339 | Enables child sockets to inherit the L3 master device index. | ||
340 | Enabling this option allows a "global" listen socket to work | ||
341 | across L3 master domains (e.g., VRFs) with connected sockets | ||
342 | derived from the listen socket to be bound to the L3 domain in | ||
343 | which the packets originated. Only valid when the kernel was | ||
344 | compiled with CONFIG_NET_L3_MASTER_DEV. | ||
345 | |||
338 | tcp_low_latency - BOOLEAN | 346 | tcp_low_latency - BOOLEAN |
339 | If set, the TCP stack makes decisions that prefer lower | 347 | If set, the TCP stack makes decisions that prefer lower |
340 | latency as opposed to higher throughput. By default, this | 348 | latency as opposed to higher throughput. By default, this |
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 625bdf95d673..012b1f91f3ec 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <net/request_sock.h> | 28 | #include <net/request_sock.h> |
29 | #include <net/netns/hash.h> | 29 | #include <net/netns/hash.h> |
30 | #include <net/tcp_states.h> | 30 | #include <net/tcp_states.h> |
31 | #include <net/l3mdev.h> | ||
31 | 32 | ||
32 | /** struct ip_options - IP Options | 33 | /** struct ip_options - IP Options |
33 | * | 34 | * |
@@ -113,6 +114,19 @@ static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) | |||
113 | return sk->sk_mark; | 114 | return sk->sk_mark; |
114 | } | 115 | } |
115 | 116 | ||
117 | static inline int inet_request_bound_dev_if(const struct sock *sk, | ||
118 | struct sk_buff *skb) | ||
119 | { | ||
120 | #ifdef CONFIG_NET_L3_MASTER_DEV | ||
121 | struct net *net = sock_net(sk); | ||
122 | |||
123 | if (!sk->sk_bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept) | ||
124 | return l3mdev_master_ifindex_by_index(net, skb->skb_iif); | ||
125 | #endif | ||
126 | |||
127 | return sk->sk_bound_dev_if; | ||
128 | } | ||
129 | |||
116 | struct inet_cork { | 130 | struct inet_cork { |
117 | unsigned int flags; | 131 | unsigned int flags; |
118 | __be32 addr; | 132 | __be32 addr; |
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index c68926b4899c..d75be32650ba 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h | |||
@@ -86,6 +86,9 @@ struct netns_ipv4 { | |||
86 | 86 | ||
87 | int sysctl_fwmark_reflect; | 87 | int sysctl_fwmark_reflect; |
88 | int sysctl_tcp_fwmark_accept; | 88 | int sysctl_tcp_fwmark_accept; |
89 | #ifdef CONFIG_NET_L3_MASTER_DEV | ||
90 | int sysctl_tcp_l3mdev_accept; | ||
91 | #endif | ||
89 | int sysctl_tcp_mtu_probing; | 92 | int sysctl_tcp_mtu_probing; |
90 | int sysctl_tcp_base_mss; | 93 | int sysctl_tcp_base_mss; |
91 | int sysctl_tcp_probe_threshold; | 94 | int sysctl_tcp_probe_threshold; |
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 4cbe9f0a4281..643a86c49020 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c | |||
@@ -351,7 +351,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) | |||
351 | treq->snt_synack.v64 = 0; | 351 | treq->snt_synack.v64 = 0; |
352 | treq->tfo_listener = false; | 352 | treq->tfo_listener = false; |
353 | 353 | ||
354 | ireq->ir_iif = sk->sk_bound_dev_if; | 354 | ireq->ir_iif = inet_request_bound_dev_if(sk, skb); |
355 | 355 | ||
356 | /* We throwed the options of the initial SYN away, so we hope | 356 | /* We throwed the options of the initial SYN away, so we hope |
357 | * the ACK carries the same options again (see RFC1122 4.2.3.8) | 357 | * the ACK carries the same options again (see RFC1122 4.2.3.8) |
@@ -371,7 +371,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) | |||
371 | * hasn't changed since we received the original syn, but I see | 371 | * hasn't changed since we received the original syn, but I see |
372 | * no easy way to do this. | 372 | * no easy way to do this. |
373 | */ | 373 | */ |
374 | flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, | 374 | flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark, |
375 | RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, | 375 | RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, |
376 | inet_sk_flowi_flags(sk), | 376 | inet_sk_flowi_flags(sk), |
377 | opt->srr ? opt->faddr : ireq->ir_rmt_addr, | 377 | opt->srr ? opt->faddr : ireq->ir_rmt_addr, |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a0bd7a55193e..41ff1f87dfd7 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -915,6 +915,17 @@ static struct ctl_table ipv4_net_table[] = { | |||
915 | .mode = 0644, | 915 | .mode = 0644, |
916 | .proc_handler = proc_dointvec, | 916 | .proc_handler = proc_dointvec, |
917 | }, | 917 | }, |
918 | #ifdef CONFIG_NET_L3_MASTER_DEV | ||
919 | { | ||
920 | .procname = "tcp_l3mdev_accept", | ||
921 | .data = &init_net.ipv4.sysctl_tcp_l3mdev_accept, | ||
922 | .maxlen = sizeof(int), | ||
923 | .mode = 0644, | ||
924 | .proc_handler = proc_dointvec_minmax, | ||
925 | .extra1 = &zero, | ||
926 | .extra2 = &one, | ||
927 | }, | ||
928 | #endif | ||
918 | { | 929 | { |
919 | .procname = "tcp_mtu_probing", | 930 | .procname = "tcp_mtu_probing", |
920 | .data = &init_net.ipv4.sysctl_tcp_mtu_probing, | 931 | .data = &init_net.ipv4.sysctl_tcp_mtu_probing, |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2d656eef7f8e..7b1fddc47019 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -6204,7 +6204,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, | |||
6204 | tcp_openreq_init(req, &tmp_opt, skb, sk); | 6204 | tcp_openreq_init(req, &tmp_opt, skb, sk); |
6205 | 6205 | ||
6206 | /* Note: tcp_v6_init_req() might override ir_iif for link locals */ | 6206 | /* Note: tcp_v6_init_req() might override ir_iif for link locals */ |
6207 | inet_rsk(req)->ir_iif = sk->sk_bound_dev_if; | 6207 | inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); |
6208 | 6208 | ||
6209 | af_ops->init_req(req, sk, skb); | 6209 | af_ops->init_req(req, sk, skb); |
6210 | 6210 | ||
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 205e6745393f..46e92fbd26a8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -1276,6 +1276,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, | |||
1276 | ireq = inet_rsk(req); | 1276 | ireq = inet_rsk(req); |
1277 | sk_daddr_set(newsk, ireq->ir_rmt_addr); | 1277 | sk_daddr_set(newsk, ireq->ir_rmt_addr); |
1278 | sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); | 1278 | sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); |
1279 | newsk->sk_bound_dev_if = ireq->ir_iif; | ||
1279 | newinet->inet_saddr = ireq->ir_loc_addr; | 1280 | newinet->inet_saddr = ireq->ir_loc_addr; |
1280 | inet_opt = ireq->opt; | 1281 | inet_opt = ireq->opt; |
1281 | rcu_assign_pointer(newinet->inet_opt, inet_opt); | 1282 | rcu_assign_pointer(newinet->inet_opt, inet_opt); |
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index eaf7ac496d50..2906ef20795e 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c | |||
@@ -193,7 +193,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) | |||
193 | ireq->pktopts = skb; | 193 | ireq->pktopts = skb; |
194 | } | 194 | } |
195 | 195 | ||
196 | ireq->ir_iif = sk->sk_bound_dev_if; | 196 | ireq->ir_iif = inet_request_bound_dev_if(sk, skb); |
197 | /* So that link locals have meaning */ | 197 | /* So that link locals have meaning */ |
198 | if (!sk->sk_bound_dev_if && | 198 | if (!sk->sk_bound_dev_if && |
199 | ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) | 199 | ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) |
@@ -224,7 +224,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) | |||
224 | fl6.daddr = ireq->ir_v6_rmt_addr; | 224 | fl6.daddr = ireq->ir_v6_rmt_addr; |
225 | final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); | 225 | final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); |
226 | fl6.saddr = ireq->ir_v6_loc_addr; | 226 | fl6.saddr = ireq->ir_v6_loc_addr; |
227 | fl6.flowi6_oif = sk->sk_bound_dev_if; | 227 | fl6.flowi6_oif = ireq->ir_iif; |
228 | fl6.flowi6_mark = ireq->ir_mark; | 228 | fl6.flowi6_mark = ireq->ir_mark; |
229 | fl6.fl6_dport = ireq->ir_rmt_port; | 229 | fl6.fl6_dport = ireq->ir_rmt_port; |
230 | fl6.fl6_sport = inet_sk(sk)->inet_sport; | 230 | fl6.fl6_sport = inet_sk(sk)->inet_sport; |