summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Ahern <dsa@cumulusnetworks.com>2015-12-16 16:20:44 -0500
committerDavid S. Miller <davem@davemloft.net>2015-12-18 14:43:38 -0500
commit6dd9a14e92e54895e143f10fef4d0b9abe109aa9 (patch)
treebb02bd14741b26fc9ba0abcaeb9a7f01a8339e68
parent1a8524794fc7c70f44ac28e3a6e8fd637bc41f14 (diff)
net: Allow accepted sockets to be bound to l3mdev domain
Allow accepted sockets to derive their sk_bound_dev_if setting from the l3mdev domain in which the packets originated. A sysctl setting is added to control the behavior which is similar to sk_mark and sysctl_tcp_fwmark_accept. This effectively allows a process to have a "VRF-global" listen socket, with child sockets bound to the VRF device in which the packet originated. A similar behavior can be achieved using sk_mark, but a solution using marks is incomplete as it does not handle duplicate addresses in different L3 domains/VRFs. Allowing sockets to inherit the sk_bound_dev_if from l3mdev domain provides a complete solution. Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/networking/ip-sysctl.txt8
-rw-r--r--include/net/inet_sock.h14
-rw-r--r--include/net/netns/ipv4.h3
-rw-r--r--net/ipv4/syncookies.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c11
-rw-r--r--net/ipv4/tcp_input.c2
-rw-r--r--net/ipv4/tcp_ipv4.c1
-rw-r--r--net/ipv6/syncookies.c4
8 files changed, 42 insertions, 5 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 5de632ed0ec0..ceb44a095a27 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -335,6 +335,14 @@ tcp_keepalive_intvl - INTEGER
335 after probes started. Default value: 75sec i.e. connection 335 after probes started. Default value: 75sec i.e. connection
336 will be aborted after ~11 minutes of retries. 336 will be aborted after ~11 minutes of retries.
337 337
338tcp_l3mdev_accept - BOOLEAN
339 Enables child sockets to inherit the L3 master device index.
340 Enabling this option allows a "global" listen socket to work
341 across L3 master domains (e.g., VRFs) with connected sockets
342 derived from the listen socket to be bound to the L3 domain in
343 which the packets originated. Only valid when the kernel was
344 compiled with CONFIG_NET_L3_MASTER_DEV.
345
338tcp_low_latency - BOOLEAN 346tcp_low_latency - BOOLEAN
339 If set, the TCP stack makes decisions that prefer lower 347 If set, the TCP stack makes decisions that prefer lower
340 latency as opposed to higher throughput. By default, this 348 latency as opposed to higher throughput. By default, this
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 625bdf95d673..012b1f91f3ec 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -28,6 +28,7 @@
28#include <net/request_sock.h> 28#include <net/request_sock.h>
29#include <net/netns/hash.h> 29#include <net/netns/hash.h>
30#include <net/tcp_states.h> 30#include <net/tcp_states.h>
31#include <net/l3mdev.h>
31 32
32/** struct ip_options - IP Options 33/** struct ip_options - IP Options
33 * 34 *
@@ -113,6 +114,19 @@ static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
113 return sk->sk_mark; 114 return sk->sk_mark;
114} 115}
115 116
117static inline int inet_request_bound_dev_if(const struct sock *sk,
118 struct sk_buff *skb)
119{
120#ifdef CONFIG_NET_L3_MASTER_DEV
121 struct net *net = sock_net(sk);
122
123 if (!sk->sk_bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept)
124 return l3mdev_master_ifindex_by_index(net, skb->skb_iif);
125#endif
126
127 return sk->sk_bound_dev_if;
128}
129
116struct inet_cork { 130struct inet_cork {
117 unsigned int flags; 131 unsigned int flags;
118 __be32 addr; 132 __be32 addr;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index c68926b4899c..d75be32650ba 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -86,6 +86,9 @@ struct netns_ipv4 {
86 86
87 int sysctl_fwmark_reflect; 87 int sysctl_fwmark_reflect;
88 int sysctl_tcp_fwmark_accept; 88 int sysctl_tcp_fwmark_accept;
89#ifdef CONFIG_NET_L3_MASTER_DEV
90 int sysctl_tcp_l3mdev_accept;
91#endif
89 int sysctl_tcp_mtu_probing; 92 int sysctl_tcp_mtu_probing;
90 int sysctl_tcp_base_mss; 93 int sysctl_tcp_base_mss;
91 int sysctl_tcp_probe_threshold; 94 int sysctl_tcp_probe_threshold;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 4cbe9f0a4281..643a86c49020 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -351,7 +351,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
351 treq->snt_synack.v64 = 0; 351 treq->snt_synack.v64 = 0;
352 treq->tfo_listener = false; 352 treq->tfo_listener = false;
353 353
354 ireq->ir_iif = sk->sk_bound_dev_if; 354 ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
355 355
356 /* We throwed the options of the initial SYN away, so we hope 356 /* We throwed the options of the initial SYN away, so we hope
357 * the ACK carries the same options again (see RFC1122 4.2.3.8) 357 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -371,7 +371,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
371 * hasn't changed since we received the original syn, but I see 371 * hasn't changed since we received the original syn, but I see
372 * no easy way to do this. 372 * no easy way to do this.
373 */ 373 */
374 flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, 374 flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark,
375 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, 375 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
376 inet_sk_flowi_flags(sk), 376 inet_sk_flowi_flags(sk),
377 opt->srr ? opt->faddr : ireq->ir_rmt_addr, 377 opt->srr ? opt->faddr : ireq->ir_rmt_addr,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a0bd7a55193e..41ff1f87dfd7 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -915,6 +915,17 @@ static struct ctl_table ipv4_net_table[] = {
915 .mode = 0644, 915 .mode = 0644,
916 .proc_handler = proc_dointvec, 916 .proc_handler = proc_dointvec,
917 }, 917 },
918#ifdef CONFIG_NET_L3_MASTER_DEV
919 {
920 .procname = "tcp_l3mdev_accept",
921 .data = &init_net.ipv4.sysctl_tcp_l3mdev_accept,
922 .maxlen = sizeof(int),
923 .mode = 0644,
924 .proc_handler = proc_dointvec_minmax,
925 .extra1 = &zero,
926 .extra2 = &one,
927 },
928#endif
918 { 929 {
919 .procname = "tcp_mtu_probing", 930 .procname = "tcp_mtu_probing",
920 .data = &init_net.ipv4.sysctl_tcp_mtu_probing, 931 .data = &init_net.ipv4.sysctl_tcp_mtu_probing,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2d656eef7f8e..7b1fddc47019 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6204,7 +6204,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6204 tcp_openreq_init(req, &tmp_opt, skb, sk); 6204 tcp_openreq_init(req, &tmp_opt, skb, sk);
6205 6205
6206 /* Note: tcp_v6_init_req() might override ir_iif for link locals */ 6206 /* Note: tcp_v6_init_req() might override ir_iif for link locals */
6207 inet_rsk(req)->ir_iif = sk->sk_bound_dev_if; 6207 inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
6208 6208
6209 af_ops->init_req(req, sk, skb); 6209 af_ops->init_req(req, sk, skb);
6210 6210
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 205e6745393f..46e92fbd26a8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1276,6 +1276,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1276 ireq = inet_rsk(req); 1276 ireq = inet_rsk(req);
1277 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1277 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1278 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1278 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1279 newsk->sk_bound_dev_if = ireq->ir_iif;
1279 newinet->inet_saddr = ireq->ir_loc_addr; 1280 newinet->inet_saddr = ireq->ir_loc_addr;
1280 inet_opt = ireq->opt; 1281 inet_opt = ireq->opt;
1281 rcu_assign_pointer(newinet->inet_opt, inet_opt); 1282 rcu_assign_pointer(newinet->inet_opt, inet_opt);
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index eaf7ac496d50..2906ef20795e 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -193,7 +193,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
193 ireq->pktopts = skb; 193 ireq->pktopts = skb;
194 } 194 }
195 195
196 ireq->ir_iif = sk->sk_bound_dev_if; 196 ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
197 /* So that link locals have meaning */ 197 /* So that link locals have meaning */
198 if (!sk->sk_bound_dev_if && 198 if (!sk->sk_bound_dev_if &&
199 ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) 199 ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
@@ -224,7 +224,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
224 fl6.daddr = ireq->ir_v6_rmt_addr; 224 fl6.daddr = ireq->ir_v6_rmt_addr;
225 final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); 225 final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
226 fl6.saddr = ireq->ir_v6_loc_addr; 226 fl6.saddr = ireq->ir_v6_loc_addr;
227 fl6.flowi6_oif = sk->sk_bound_dev_if; 227 fl6.flowi6_oif = ireq->ir_iif;
228 fl6.flowi6_mark = ireq->ir_mark; 228 fl6.flowi6_mark = ireq->ir_mark;
229 fl6.fl6_dport = ireq->ir_rmt_port; 229 fl6.fl6_dport = ireq->ir_rmt_port;
230 fl6.fl6_sport = inet_sk(sk)->inet_sport; 230 fl6.fl6_sport = inet_sk(sk)->inet_sport;